quantum-espresso/LAXlib/ptoolkit.f90

!
! Copyright (C) 2001-2006 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! ---------------------------------------------------------------------------------

MODULE laxlib_ptoolkit
  IMPLICIT NONE
  SAVE
CONTAINS

SUBROUTINE laxlib_dsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
   !
   !! Double precision SQuare Matrix REDistribution
   !!
   !! Copy a global "na * na" matrix locally stored in "a",
   !! and distributed as described by "desca", into a larger
   !! global "nb * nb" matrix stored in "b" and distributed
   !! as described in "descb".
   !!
   !
   ! If you want to read, get prepared for an headache!
   ! Written struggling by Carlo Cavazzoni.
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   !
   INTEGER, INTENT(IN) :: na
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   REAL(DP)            :: a(lda,lda)
   !!  matrix to be redistributed into b
   TYPE(la_descriptor), INTENT(IN) :: desca
   !! laxlib descriptor of matrix a
   INTEGER, INTENT(IN) :: nb
   !! global dimension of matrix b
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of matrix b
   REAL(DP)            :: b(ldb,ldb)
   !! redistributed matrix
   TYPE(la_descriptor), INTENT(IN) :: descb
   !! laxlib descriptor of matrix b

   INTEGER :: ipc, ipr, npc, npr
   INTEGER :: ipr_old, ir_old, nr_old, irx_old
   INTEGER :: ipc_old, ic_old, nc_old, icx_old
   INTEGER :: myrow, mycol, ierr, rank
   INTEGER :: col_comm, row_comm, comm, sreq
   INTEGER :: nr_new, ir_new, irx_new, ir, nr, nrtot, irb, ire
   INTEGER :: nc_new, ic_new, icx_new, ic, nc, nctot, icb, ice
   INTEGER :: ib, i, j, myid
   INTEGER :: nrsnd( desca%npr )
   INTEGER :: ncsnd( desca%npr )
   INTEGER :: displ( desca%npr )
   INTEGER :: irb_new( desca%npr )
   INTEGER :: ire_new( desca%npr )
   INTEGER :: icb_new( desca%npr )
   INTEGER :: ice_new( desca%npr )
   REAL(DP), ALLOCATABLE :: buf(:)
   REAL(DP), ALLOCATABLE :: ab(:,:)
   REAL(DP), ALLOCATABLE :: tst1(:,:)
   REAL(DP), ALLOCATABLE :: tst2(:,:)
#if defined __MPI
    INTEGER :: istatus( MPI_STATUS_SIZE )
#endif

   IF( desca%active_node <= 0 ) THEN
      RETURN
   END IF

   ! preliminary consistency checks

   IF( nb < na ) &
      CALL lax_error__( " dsqmred ", " nb < na, this sub. work only with nb >= na ", nb )
   IF( nb /= descb%n ) &
      CALL lax_error__( " dsqmred ", " wrong global dim nb ", nb )
   IF( na /= desca%n ) &
      CALL lax_error__( " dsqmred ", " wrong global dim na ", na )
   IF( ldb /= descb%nrcx ) &
      CALL lax_error__( " dsqmred ", " wrong leading dim ldb ", ldb )
   IF( lda /= desca%nrcx ) &
      CALL lax_error__( " dsqmred ", " wrong leading dim lda ", lda )

   npr   = desca%npr
   myrow = desca%myr
   npc   = desca%npc
   mycol = desca%myc
   comm  = desca%comm

#if defined __MPI

   ! split communicator into row and col communicators

   CALL MPI_Comm_rank( comm, myid, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " dsqmred ", " in MPI_Comm_rank 1 ", ABS( ierr ) )

   CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " dsqmred ", " in MPI_Comm_split 1 ", ABS( ierr ) )

   CALL MPI_Comm_split( comm, myrow, mycol, row_comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " dsqmred ", " in MPI_Comm_split 2 ", ABS( ierr ) )

   CALL MPI_Comm_rank( col_comm, rank, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " dsqmred ", " in MPI_Comm_rank 2 ", ABS( ierr ) )
   IF( rank /= myrow ) &
      CALL lax_error__( " dsqmred ", " building col_comm ", rank )

   CALL MPI_Comm_rank( row_comm, rank, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " dsqmred ", " in MPI_Comm_rank 3 ", ABS( ierr ) )
   IF( rank /= mycol ) &
      CALL lax_error__( " dsqmred ", " building row_comm ", rank )

   ALLOCATE( buf( descb%nrcx * descb%nrcx ) )
   ALLOCATE( ab( descb%nrcx, desca%nrcx ) )

   ! write( 3000 + myid, * ) 'na, nb = ', na, nb

   DO j = 1, descb%nc
      DO i = 1, descb%nr
         b( i, j ) = 0.0d0
      END DO
   END DO

   ab = 0.0d0

   ! first redistribute rows, column groups work in parallel

   DO ipr = 1, npr
      !
      CALL descla_local_dims( ir_new, nr_new, nb, descb%nx, npr, ipr-1 )
      !
      irx_new = ir_new + nr_new - 1

      ! write( 3000 + myid, * ) 'ir_new, nr_new, irx_new = ', ir_new, nr_new, irx_new
      !
      DO ipr_old = 1, npr
         !
         CALL descla_local_dims( ir_old, nr_old, na, desca%nx, npr, ipr_old-1 )
         !
         irx_old = ir_old + nr_old - 1
         !
         ! write( 3000 + myid, * ) 'ir_old, nr_old, irx_old = ', ir_old, nr_old, irx_old
         !
         IF( ir_old >= ir_new .AND. ir_old <= irx_new ) THEN
            !
            nrsnd( ipr_old ) = MIN( nr_old, irx_new - ir_old + 1 )
            irb = 1
            ire = nrsnd( ipr_old )
            irb_new( ipr_old ) = ir_old - ir_new + 1
            ire_new( ipr_old ) = irb_new( ipr_old ) + nrsnd( ipr_old ) - 1
            !
         ELSE IF( ir_new >= ir_old .AND. ir_new <= irx_old ) THEN
            !
            nrsnd( ipr_old ) = irx_old - ir_new + 1
            irb = ir_new - ir_old + 1
            ire = nr_old
            irb_new( ipr_old ) = 1
            ire_new( ipr_old ) = nrsnd( ipr_old )
            !
         ELSE
            nrsnd( ipr_old ) = 0
            irb = 0
            ire = 0
            irb_new( ipr_old ) = 0
            ire_new( ipr_old ) = 0
         END IF
         !
         ! write( 3000 + myid, * ) 'ipr_old, nrsnd            = ', ipr_old, nrsnd( ipr_old )
         ! write( 3000 + myid, * ) 'ipr_old, irb, ire         = ', ipr_old, irb, ire
         ! write( 3000 + myid, * ) 'ipr_old, irb_new, ire_new = ', ipr_old, irb_new( ipr_old ), ire_new( ipr_old )
         !
         IF( ( myrow == ipr_old - 1 ) .AND. ( nrsnd( ipr_old ) > 0 ) ) THEN
            IF(  myrow /= ipr - 1 ) THEN
               ib = 0
               DO j = 1, desca%nc
                  DO i = irb, ire
                     ib = ib + 1
                     buf( ib ) = a( i, j )
                  END DO
               END DO
               CALL mpi_isend( buf, ib, MPI_DOUBLE_PRECISION, ipr-1, ipr, col_comm, sreq, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " dsqmred ", " in mpi_isend ", ABS( ierr ) )
            ELSE
               DO j = 1, desca%nc
                  ib = irb
                  DO i = irb_new( ipr_old ), ire_new( ipr_old )
                     ab( i, j ) = a( ib, j )
                     ib = ib + 1
                  END DO
               END DO
            END IF
         END IF
         !
         IF( nrsnd( ipr_old ) /= ire - irb + 1 ) &
            CALL lax_error__( " dsqmred ", " something wrong with row 1 ", nrsnd( ipr_old ) )
         IF( nrsnd( ipr_old ) /= ire_new( ipr_old ) - irb_new( ipr_old ) + 1 ) &
            CALL lax_error__( " dsqmred ", " something wrong with row 2 ", nrsnd( ipr_old ) )
         !
         nrsnd( ipr_old ) = nrsnd( ipr_old ) * desca%nc
         !
      END DO
      !
      IF( myrow == ipr - 1 ) THEN
         DO ipr_old = 1, npr
            IF( nrsnd( ipr_old ) > 0 ) THEN
               IF(  myrow /= ipr_old - 1 ) THEN
                  CALL mpi_recv( buf, nrsnd(ipr_old), MPI_DOUBLE_PRECISION, ipr_old-1, ipr, col_comm, istatus, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " dsqmred ", " in mpi_recv ", ABS( ierr ) )
                  CALL mpi_get_count( istatus, MPI_DOUBLE_PRECISION, ib, ierr)
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " dsqmred ", " in mpi_get_count ", ABS( ierr ) )
                  IF( ib /= nrsnd(ipr_old) ) &
                     CALL lax_error__( " dsqmred ", " something wrong with row 3 ", ib )
                  ib = 0
                  DO j = 1, desca%nc
                     DO i = irb_new( ipr_old ), ire_new( ipr_old )
                        ib = ib + 1
                        ab( i, j ) = buf( ib )
                     END DO
                  END DO
               END IF
            END IF
         END DO
      ELSE
         DO ipr_old = 1, npr
            IF( myrow == ipr_old - 1 .AND. nrsnd( ipr_old ) > 0 ) THEN
               CALL MPI_Wait( sreq, istatus, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " dsqmred ", " in MPI_Wait ", ABS( ierr ) )
            END IF
         END DO
      END IF
      !
   END DO

   ! then redistribute cols, row groups work in parallel

   DO ipc = 1, npc
      !
      CALL descla_local_dims( ic_new, nc_new, nb, descb%nx, npc, ipc-1 )
      !
      icx_new = ic_new + nc_new - 1
      !
      ! write( 3000 + myid, * ) 'ic_new, nc_new, icx_new = ', ic_new, nc_new, icx_new
      !
      DO ipc_old = 1, npc
         !
         CALL descla_local_dims( ic_old, nc_old, na, desca%nx, npc, ipc_old-1 )
         !
         icx_old = ic_old + nc_old - 1
         !
         ! write( 3000 + myid, * ) 'ic_old, nc_old, icx_old = ', ic_old, nc_old, icx_old
         !
         IF( ic_old >= ic_new .AND. ic_old <= icx_new ) THEN
            !
            ncsnd( ipc_old ) = MIN( nc_old, icx_new - ic_old + 1 )
            icb = 1
            ice = ncsnd( ipc_old )
            icb_new( ipc_old ) = ic_old - ic_new + 1
            ice_new( ipc_old ) = icb_new( ipc_old ) + ncsnd( ipc_old ) - 1
            !
         ELSE IF( ic_new >= ic_old .AND. ic_new <= icx_old ) THEN
            !
            ncsnd( ipc_old ) = icx_old - ic_new + 1
            icb = ic_new - ic_old + 1
            ice = nc_old
            icb_new( ipc_old ) = 1
            ice_new( ipc_old ) = ncsnd( ipc_old )
            !
         ELSE
            ncsnd( ipc_old ) = 0
            icb = 0
            ice = 0
            icb_new( ipc_old ) = 0
            ice_new( ipc_old ) = 0
         END IF
         !
         ! write( 3000 + myid, * ) 'ipc_old, ncsnd            = ', ipc_old, ncsnd( ipc_old )
         ! write( 3000 + myid, * ) 'ipc_old, icb, ice         = ', ipc_old, icb, ice
         ! write( 3000 + myid, * ) 'ipc_old, icb_new, ice_new = ', ipc_old, icb_new( ipc_old ), ice_new( ipc_old )

         IF( ( mycol == ipc_old - 1 ) .AND. ( ncsnd( ipc_old ) > 0 ) ) THEN
            IF(  mycol /= ipc - 1 ) THEN
               ib = 0
               DO j = icb, ice
                  DO i = 1, descb%nrcx
                     ib = ib + 1
                     buf( ib ) = ab( i, j )
                  END DO
               END DO
               CALL mpi_isend( buf, ib, MPI_DOUBLE_PRECISION, ipc-1, ipc, row_comm, sreq, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " dsqmred ", " in mpi_isend 2 ", ABS( ierr ) )
            ELSE
               ib = icb
               DO j = icb_new( ipc_old ), ice_new( ipc_old )
                  DO i = 1, descb%nrcx
                        b( i, j ) = ab( i, ib )
                  END DO
                  ib = ib + 1
               END DO
            END IF
         END IF

         IF( ncsnd( ipc_old ) /= ice-icb+1 ) &
            CALL lax_error__( " dsqmred ", " something wrong with col 1 ", ncsnd( ipc_old ) )
         IF( ncsnd( ipc_old ) /= ice_new( ipc_old ) - icb_new( ipc_old ) + 1 ) &
            CALL lax_error__( " dsqmred ", " something wrong with col 2 ", ncsnd( ipc_old ) )
         !
         ncsnd( ipc_old ) = ncsnd( ipc_old ) * descb%nrcx
         !
      END DO
      !
      IF( mycol == ipc - 1 ) THEN
         DO ipc_old = 1, npc
            IF( ncsnd( ipc_old ) > 0 ) THEN
               IF(  mycol /= ipc_old - 1 ) THEN
                  ib = icb_new( ipc_old )
                  CALL mpi_recv( b( 1, ib ), ncsnd(ipc_old), MPI_DOUBLE_PRECISION, ipc_old-1, ipc, row_comm, istatus, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " dsqmred ", " in mpi_recv 2 ", ABS( ierr ) )
                  CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_PRECISION, ib, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " dsqmred ", " in MPI_GET_COUNT 2 ", ABS( ierr ) )
                  IF( ib /= ncsnd(ipc_old) ) &
                     CALL lax_error__( " dsqmred ", " something wrong with col 3 ", ib )
               END IF
            END IF
         END DO
      ELSE
         DO ipc_old = 1, npc
            IF( mycol == ipc_old - 1 .AND. ncsnd( ipc_old ) > 0 ) THEN
               CALL MPI_Wait( sreq, istatus, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " dsqmred ", " in MPI_Wait 2 ", ABS( ierr ) )
            END IF
         END DO
      END IF
      !
   END DO

   DEALLOCATE( ab )
   DEALLOCATE( buf )

   CALL mpi_comm_free( col_comm, ierr )

   IF( ierr /= 0 ) &
      CALL lax_error__( " dsqmred ", " in mpi_comm_free 1 ", ABS( ierr ) )

   CALL mpi_comm_free( row_comm, ierr )

   IF( ierr /= 0 ) &
      CALL lax_error__( " dsqmred ", " in mpi_comm_free 2 ", ABS( ierr ) )

#if defined __PIPPO

   !  this is for debugging, tests through global matrix, if
   !  the two matrix (pre and before the redistribution) coincide.

   ALLOCATE( tst1( nb, nb ) )
   ALLOCATE( tst2( nb, nb ) )
   ALLOCATE( ab( nb, nb ) )

   ab = 0.0d0

   do j = 1, desca%nc
   do i = 1, desca%nr
      ab( i + desca%ir - 1, j + desca%ic - 1 ) = a( i , j )
   end do
   end do

   CALL MPI_REDUCE( ab, tst1, nb*nb, MPI_DOUBLE_PRECISION, MPI_SUM, 0, comm, ierr )

   ab = 0.0d0

   do j = 1, descb%nc
   do i = 1, descb%nr
      ab( i + descb%ir - 1, j + descb%ic - 1 ) = b( i , j )
   end do
   end do

   CALL MPI_REDUCE( ab, tst2, nb*nb, MPI_DOUBLE_PRECISION, MPI_SUM, 0, comm, ierr )

   IF( myid == 0 ) THEN
      write( 1000, * ) na, nb, SUM( ABS( tst2 - tst1 ) )
   END IF

   DEALLOCATE( ab )
   DEALLOCATE( tst2 )
   DEALLOCATE( tst1 )

#endif

#endif

   RETURN
END SUBROUTINE laxlib_dsqmred_x_x

SUBROUTINE laxlib_zsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
   !
   !! double complex (Z) SQuare Matrix REDistribution
   !!
   !! Copy a global "na * na" matrix locally stored in "a",
   !! and distributed as described by "desca", into a larger
   !! global "nb * nb" matrix stored in "b" and distributed
   !! as described in "descb".
   !!
   !
   ! If you want to read, get prepared for an headache!
   ! Written struggling by Carlo Cavazzoni.
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   !
   INTEGER, INTENT(IN) :: na
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   COMPLEX(DP)            :: a(lda,lda)
   !!  matrix to be redistributed into b
   TYPE(la_descriptor), INTENT(IN) :: desca
   !! laxlib descriptor of matrix a
   INTEGER, INTENT(IN) :: nb
   !! global dimension of matrix b
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of matrix b
   COMPLEX(DP)            :: b(ldb,ldb)
   !! redistributed matrix
   TYPE(la_descriptor), INTENT(IN) :: descb
   !! laxlib descriptor matrix b

   INTEGER :: ipc, ipr, npc, npr
   INTEGER :: ipr_old, ir_old, nr_old, irx_old
   INTEGER :: ipc_old, ic_old, nc_old, icx_old
   INTEGER :: myrow, mycol, ierr, rank
   INTEGER :: col_comm, row_comm, comm, sreq
   INTEGER :: nr_new, ir_new, irx_new, ir, nr, nrtot, irb, ire
   INTEGER :: nc_new, ic_new, icx_new, ic, nc, nctot, icb, ice
   INTEGER :: ib, i, j, myid
   INTEGER :: nrsnd( desca%npr )
   INTEGER :: ncsnd( desca%npr )
   INTEGER :: displ( desca%npr )
   INTEGER :: irb_new( desca%npr )
   INTEGER :: ire_new( desca%npr )
   INTEGER :: icb_new( desca%npr )
   INTEGER :: ice_new( desca%npr )
   COMPLEX(DP), ALLOCATABLE :: buf(:)
   COMPLEX(DP), ALLOCATABLE :: ab(:,:)
   COMPLEX(DP), ALLOCATABLE :: tst1(:,:)
   COMPLEX(DP), ALLOCATABLE :: tst2(:,:)
#if defined __MPI
    INTEGER :: istatus( MPI_STATUS_SIZE )
#endif

   IF( desca%active_node <= 0 ) THEN
      RETURN
   END IF

   ! preliminary consistency checks

   IF( nb < na ) &
      CALL lax_error__( " zsqmred ", " nb < na, this sub. work only with nb >= na ", nb )
   IF( nb /= descb%n ) &
      CALL lax_error__( " zsqmred ", " wrong global dim nb ", nb )
   IF( na /= desca%n ) &
      CALL lax_error__( " zsqmred ", " wrong global dim na ", na )
   IF( ldb /= descb%nrcx ) &
      CALL lax_error__( " zsqmred ", " wrong leading dim ldb ", ldb )
   IF( lda /= desca%nrcx ) &
      CALL lax_error__( " zsqmred ", " wrong leading dim lda ", lda )

   npr   = desca%npr
   myrow = desca%myr
   npc   = desca%npc
   mycol = desca%myc
   comm  = desca%comm

#if defined __MPI

   ! split communicator into row and col communicators

   CALL MPI_Comm_rank( comm, myid, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " zsqmred ", " in MPI_Comm_rank 1 ", ABS( ierr ) )

   CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " zsqmred ", " in MPI_Comm_split 1 ", ABS( ierr ) )

   CALL MPI_Comm_split( comm, myrow, mycol, row_comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " zsqmred ", " in MPI_Comm_split 2 ", ABS( ierr ) )

   CALL MPI_Comm_rank( col_comm, rank, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " zsqmred ", " in MPI_Comm_rank 2 ", ABS( ierr ) )
   IF( rank /= myrow ) &
      CALL lax_error__( " zsqmred ", " building col_comm ", rank )

   CALL MPI_Comm_rank( row_comm, rank, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " zsqmred ", " in MPI_Comm_rank 3 ", ABS( ierr ) )
   IF( rank /= mycol ) &
      CALL lax_error__( " zsqmred ", " building row_comm ", rank )

   ALLOCATE( buf( descb%nrcx * descb%nrcx ) )
   ALLOCATE( ab( descb%nrcx, desca%nrcx ) )

   DO j = 1, descb%nc
      DO i = 1, descb%nr
         b( i, j ) = ( 0_DP , 0_DP )
      END DO
   END DO

   ab = ( 0_DP , 0_DP )

   ! first redistribute rows, column groups work in parallel

   DO ipr = 1, npr
      !
      CALL descla_local_dims( ir_new, nr_new, nb, descb%nx, npr, ipr-1 )
      !
      irx_new = ir_new + nr_new - 1
      !
      DO ipr_old = 1, npr
         !
         CALL descla_local_dims( ir_old, nr_old, na, desca%nx, npr, ipr_old-1 )
         !
         irx_old = ir_old + nr_old - 1
         !
         IF( ir_old >= ir_new .AND. ir_old <= irx_new ) THEN
            !
            nrsnd( ipr_old ) = MIN( nr_old, irx_new - ir_old + 1 )
            irb = 1
            ire = nrsnd( ipr_old )
            irb_new( ipr_old ) = ir_old - ir_new + 1
            ire_new( ipr_old ) = irb_new( ipr_old ) + nrsnd( ipr_old ) - 1
            !
         ELSE IF( ir_new >= ir_old .AND. ir_new <= irx_old ) THEN
            !
            nrsnd( ipr_old ) = irx_old - ir_new + 1
            irb = ir_new - ir_old + 1
            ire = nr_old
            irb_new( ipr_old ) = 1
            ire_new( ipr_old ) = nrsnd( ipr_old )
            !
         ELSE
            nrsnd( ipr_old ) = 0
            irb = 0
            ire = 0
            irb_new( ipr_old ) = 0
            ire_new( ipr_old ) = 0
         END IF
         !
         IF( ( myrow == ipr_old - 1 ) .AND. ( nrsnd( ipr_old ) > 0 ) ) THEN
            IF(  myrow /= ipr - 1 ) THEN
               ib = 0
               DO j = 1, desca%nc
                  DO i = irb, ire
                     ib = ib + 1
                     buf( ib ) = a( i, j )
                  END DO
               END DO
               CALL mpi_isend( buf, ib, MPI_DOUBLE_COMPLEX, ipr-1, ipr, col_comm, sreq, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " zsqmred ", " in mpi_isend 1 ", ABS( ierr ) )
            ELSE
               DO j = 1, desca%nc
                  ib = irb
                  DO i = irb_new( ipr_old ), ire_new( ipr_old )
                     ab( i, j ) = a( ib, j )
                     ib = ib + 1
                  END DO
               END DO
            END IF
         END IF
         !
         IF( nrsnd( ipr_old ) /= ire - irb + 1 ) &
            CALL lax_error__( " zsqmred ", " something wrong with row 1 ", nrsnd( ipr_old ) )
         IF( nrsnd( ipr_old ) /= ire_new( ipr_old ) - irb_new( ipr_old ) + 1 ) &
            CALL lax_error__( " zsqmred ", " something wrong with row 2 ", nrsnd( ipr_old ) )
         !
         nrsnd( ipr_old ) = nrsnd( ipr_old ) * desca%nc
         !
      END DO
      !
      IF( myrow == ipr - 1 ) THEN
         DO ipr_old = 1, npr
            IF( nrsnd( ipr_old ) > 0 ) THEN
               IF(  myrow /= ipr_old - 1 ) THEN
                  CALL mpi_recv( buf, nrsnd(ipr_old), MPI_DOUBLE_COMPLEX, ipr_old-1, ipr, col_comm, istatus, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " zsqmred ", " in mpi_recv 1 ", ABS( ierr ) )
                  CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_COMPLEX, ib, ierr)
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " zsqmred ", " in MPI_GET_COUNT 1 ", ABS( ierr ) )
                  IF( ib /= nrsnd(ipr_old) ) &
                     CALL lax_error__( " zsqmred ", " something wrong with row 3 ", ib )
                  ib = 0
                  DO j = 1, desca%nc
                     DO i = irb_new( ipr_old ), ire_new( ipr_old )
                        ib = ib + 1
                        ab( i, j ) = buf( ib )
                     END DO
                  END DO
               END IF
            END IF
         END DO
      ELSE
         DO ipr_old = 1, npr
            IF( myrow == ipr_old - 1 .AND. nrsnd( ipr_old ) > 0 ) THEN
               CALL MPI_Wait( sreq, istatus, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " zsqmred ", " in MPI_Wait 1 ", ABS( ierr ) )
            END IF
         END DO
      END IF
      !
   END DO

   ! then redistribute cols, row groups work in parallel

   DO ipc = 1, npc
      !
      CALL descla_local_dims( ic_new, nc_new, nb, descb%nx, npc, ipc-1 )
      !
      icx_new = ic_new + nc_new - 1
      !
      DO ipc_old = 1, npc
         !
         CALL descla_local_dims( ic_old, nc_old, na, desca%nx, npc, ipc_old-1 )
         !
         icx_old = ic_old + nc_old - 1
         !
         IF( ic_old >= ic_new .AND. ic_old <= icx_new ) THEN
            !
            ncsnd( ipc_old ) = MIN( nc_old, icx_new - ic_old + 1 )
            icb = 1
            ice = ncsnd( ipc_old )
            icb_new( ipc_old ) = ic_old - ic_new + 1
            ice_new( ipc_old ) = icb_new( ipc_old ) + ncsnd( ipc_old ) - 1
            !
         ELSE IF( ic_new >= ic_old .AND. ic_new <= icx_old ) THEN
            !
            ncsnd( ipc_old ) = icx_old - ic_new + 1
            icb = ic_new - ic_old + 1
            ice = nc_old
            icb_new( ipc_old ) = 1
            ice_new( ipc_old ) = ncsnd( ipc_old )
            !
         ELSE
            ncsnd( ipc_old ) = 0
            icb = 0
            ice = 0
            icb_new( ipc_old ) = 0
            ice_new( ipc_old ) = 0
         END IF
         !
         IF( ( mycol == ipc_old - 1 ) .AND. ( ncsnd( ipc_old ) > 0 ) ) THEN
            IF(  mycol /= ipc - 1 ) THEN
               ib = 0
               DO j = icb, ice
                  DO i = 1, descb%nrcx
                     ib = ib + 1
                     buf( ib ) = ab( i, j )
                  END DO
               END DO
               CALL mpi_isend( buf, ib, MPI_DOUBLE_COMPLEX, ipc-1, ipc, row_comm, sreq, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " zsqmred ", " in mpi_isend 2 ", ABS( ierr ) )
            ELSE
               ib = icb
               DO j = icb_new( ipc_old ), ice_new( ipc_old )
                  DO i = 1, descb%nrcx
                        b( i, j ) = ab( i, ib )
                  END DO
                  ib = ib + 1
               END DO
            END IF
         END IF

         IF( ncsnd( ipc_old ) /= ice-icb+1 ) &
            CALL lax_error__( " zsqmred ", " something wrong with col 1 ", ncsnd( ipc_old ) )
         IF( ncsnd( ipc_old ) /= ice_new( ipc_old ) - icb_new( ipc_old ) + 1 ) &
            CALL lax_error__( " zsqmred ", " something wrong with col 2 ", ncsnd( ipc_old ) )
         !
         ncsnd( ipc_old ) = ncsnd( ipc_old ) * descb%nrcx
         !
      END DO
      !
      IF( mycol == ipc - 1 ) THEN
         DO ipc_old = 1, npc
            IF( ncsnd( ipc_old ) > 0 ) THEN
               IF(  mycol /= ipc_old - 1 ) THEN
                  ib = icb_new( ipc_old )
                  CALL mpi_recv( b( 1, ib ), ncsnd(ipc_old), MPI_DOUBLE_COMPLEX, ipc_old-1, ipc, row_comm, istatus, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " zsqmred ", " in mpi_recv 2 ", ABS( ierr ) )
                  CALL MPI_GET_COUNT( istatus, MPI_DOUBLE_COMPLEX, ib, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " zsqmred ", " in MPI_GET_COUNT 2 ", ABS( ierr ) )
                  IF( ib /= ncsnd(ipc_old) ) &
                     CALL lax_error__( " zsqmred ", " something wrong with col 3 ", ib )
               END IF
            END IF
         END DO
      ELSE
         DO ipc_old = 1, npc
            IF( mycol == ipc_old - 1 .AND. ncsnd( ipc_old ) > 0 ) THEN
               CALL MPI_Wait( sreq, istatus, ierr )
               IF( ierr /= 0 ) &
                  CALL lax_error__( " zsqmred ", " in MPI_Wait 2 ", ABS( ierr ) )
            END IF
         END DO
      END IF
      !
   END DO

   DEALLOCATE( ab )
   DEALLOCATE( buf )

   CALL mpi_comm_free( col_comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " zsqmred ", " in mpi_comm_free 1 ", ABS( ierr ) )

   CALL mpi_comm_free( row_comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " zsqmred ", " in mpi_comm_free 2 ", ABS( ierr ) )

#if defined __PIPPO

   !  this is for debugging, tests through global matrix, if
   !  the two matrix (pre and before the redistribution) coincide.

   ALLOCATE( tst1( nb, nb ) )
   ALLOCATE( tst2( nb, nb ) )
   ALLOCATE( ab( nb, nb ) )

   ab = 0.0d0

   do j = 1, desca%nc
   do i = 1, desca%nr
      ab( i + desca%ir - 1, j + desca%ic - 1 ) = a( i , j )
   end do
   end do

   CALL MPI_REDUCE( ab, tst1, nb*nb, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm, ierr )

   ab = 0.0d0

   do j = 1, descb%nc
   do i = 1, descb%nr
      ab( i + descb%ir - 1, j + descb%ic - 1 ) = b( i , j )
   end do
   end do

   CALL MPI_REDUCE( ab, tst2, nb*nb, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm, ierr )

   IF( myid == 0 ) THEN
      write( 4000, * ) na, nb, SUM( ABS( tst2 - tst1 ) )
   END IF

   DEALLOCATE( ab )
   DEALLOCATE( tst2 )
   DEALLOCATE( tst1 )

#endif

#endif

   RETURN
END SUBROUTINE laxlib_zsqmred_x_x

END MODULE laxlib_ptoolkit


! ---------------------------------------------------------------------------------


SUBROUTINE laxlib_dsqmdst_x( n, ar, ldar, a, lda, desc )
  !
  !!  Double precision SQuare Matrix DiSTribution
  !!  This subroutine take a replicated square matrix "ar" and distribute it
  !!  across processors as described by descriptor "desc"
  !
  USE laxlib_descriptor
  !
  implicit none
  include 'laxlib_kinds.fh'
  !
  INTEGER, INTENT(IN) :: n
  !! global dimension
  INTEGER, INTENT(IN) :: ldar
  !! leading dimension of matrix ar
  REAL(DP)            :: ar(ldar,*)
  !! matrix to be splitted, replicated on all proc
  INTEGER, INTENT(IN) :: lda
  !! leading dimension of matrix a
  REAL(DP)            :: a(lda,*)
  !! distributed matrix a
  TYPE(la_descriptor), INTENT(IN) :: desc
  !! laxlib descriptor for matrix a
  !!
  !
  REAL(DP), PARAMETER :: zero = 0_DP
  !
  INTEGER :: i, j, nr, nc, ic, ir, nx
  !
  IF( desc%active_node <= 0 ) THEN
     RETURN
  END IF

  nx  = desc%nrcx
  ir  = desc%ir
  ic  = desc%ic
  nr  = desc%nr
  nc  = desc%nc

  IF( lda < nx ) &
     CALL lax_error__( " dsqmdst ", " inconsistent dimension lda ", lda )
  IF( n /= desc%n ) &
     CALL lax_error__( " dsqmdst ", " inconsistent dimension n ", n )

  DO j = 1, nc
     DO i = 1, nr
        a( i, j ) = ar( i + ir - 1, j + ic - 1 )
     END DO
     DO i = nr+1, nx
        a( i, j ) = zero
     END DO
  END DO
  DO j = nc + 1, nx
     DO i = 1, nx
        a( i, j ) = zero
     END DO
  END DO

  RETURN

END SUBROUTINE laxlib_dsqmdst_x


SUBROUTINE laxlib_zsqmdst_x( n, ar, ldar, a, lda, desc )
  !
  !! double complex (Z) SQuare Matrix DiSTribution
  !! This subroutine take a replicated square matrix "ar" and distribute it
  !! across processors as described by descriptor "desc"
  !
  USE laxlib_descriptor
  !
  implicit none
  include 'laxlib_kinds.fh'
  !
  INTEGER, INTENT(IN) :: n
  !! global dimension
  INTEGER, INTENT(IN) :: ldar
  !! leading dimension of matrix ar
  COMPLEX(DP)            :: ar(ldar,*)
  !! matrix to be splitted, replicated on all proc
  INTEGER, INTENT(IN) :: lda
  !! leading dimension of matrix a
  COMPLEX(DP)            :: a(lda,*)
  !! distributed matrix a
  TYPE(la_descriptor), INTENT(IN) :: desc
  !! laxlib descriptor for matrix a
  !!
  !
  COMPLEX(DP), PARAMETER :: zero = ( 0_DP , 0_DP )
  !
  INTEGER :: i, j, nr, nc, ic, ir, nx
  !
  IF( desc%active_node <= 0 ) THEN
     RETURN
  END IF

  nx  = desc%nrcx
  ir  = desc%ir
  ic  = desc%ic
  nr  = desc%nr
  nc  = desc%nc

  IF( lda < nx ) &
     CALL lax_error__( " zsqmdst ", " inconsistent dimension lda ", lda )
  IF( n /= desc%n ) &
     CALL lax_error__( " zsqmdst ", " inconsistent dimension n ", n )

  DO j = 1, nc
     DO i = 1, nr
        a( i, j ) = ar( i + ir - 1, j + ic - 1 )
     END DO
     DO i = nr+1, nx
        a( i, j ) = zero
     END DO
  END DO
  DO j = nc + 1, nx
     DO i = 1, nx
        a( i, j ) = zero
     END DO
  END DO

  RETURN

END SUBROUTINE laxlib_zsqmdst_x

! ---------------------------------------------------------------------------------

SUBROUTINE laxlib_dsqmcll_x( n, a, lda, ar, ldar, desc, comm )
  !
  !! Double precision SQuare Matrix CoLLect
  !! This sub. take a distributed square matrix "a" and collect
  !! the block assigned to processors into a replicated matrix "ar",
  !! matrix is distributed as described by descriptor desc
  !!
  !
  USE laxlib_descriptor
  USE laxlib_parallel_include
  !
  implicit none
  include 'laxlib_kinds.fh'
  !
  INTEGER, INTENT(IN) :: n
  !! global dimension
  INTEGER, INTENT(IN) :: ldar
  !! leading dimension of matrix ar
  REAL(DP)            :: ar(ldar,*)
  !! matrix to be merged, replicated on all proc
  INTEGER, INTENT(IN) :: lda
  !! leading dimension of matrix a
  REAL(DP)            :: a(lda,*)
  !! distributed matrix a
  TYPE(la_descriptor), INTENT(IN) :: desc
  !! laxlib descriptor for matrix a
  INTEGER, INTENT(IN) :: comm
  !! mpi communicator
  !
  INTEGER :: i, j

#if defined __MPI
  !
  INTEGER :: np, nx, ipc, ipr, npr, npc, noff
  INTEGER :: ierr, ir, ic, nr, nc

  REAL(DP), ALLOCATABLE :: buf(:,:)
  !
  IF( desc%active_node > 0 ) THEN
     !
     np = desc%npr * desc%npc
     nx = desc%nrcx
     npr = desc%npr
     npc = desc%npc
     !
     IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
        ALLOCATE( buf( nx, nx * np ) )
     ELSE
        ALLOCATE( buf( 1, 1 ) )
     END IF
     !
     IF( lda /= nx ) &
        CALL lax_error__( " dsqmcll ", " inconsistent dimension lda ", lda )
     !
     IF( desc%n /= n ) &
        CALL lax_error__( " dsqmcll ", " inconsistent dimension n ", n )
     !
     CALL mpi_gather( a, nx*nx, mpi_double_precision, &
                      buf, nx*nx, mpi_double_precision, 0, desc%comm , ierr )
     !
     IF( ierr /= 0 ) &
        CALL lax_error__( " dsqmcll ", " in gather ", ABS( ierr ) )
     !
     IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
        DO ipc = 1, npc
           CALL descla_local_dims( ic, nc, n, desc%nx, npc, ipc-1 )
           DO ipr = 1, npr
              CALL descla_local_dims( ir, nr, n, desc%nx, npr, ipr-1 )
              noff = ( ipc - 1 + npc * ( ipr - 1 ) ) * nx
              DO j = 1, nc
                 DO i = 1, nr
                    ar( i + ir - 1, j + ic - 1 ) = buf( i, j + noff )
                 END DO
              END DO
           END DO
        END DO
     END IF
     !
     DEALLOCATE( buf )
     !
  END IF
  !
  CALL mpi_bcast( ar,  ldar * n, mpi_double_precision, 0, comm, ierr )
  !
  IF( ierr /= 0 ) &
     CALL lax_error__( " dsqmcll ", " in bcast ", ABS( ierr ) )

#else

  DO j = 1, n
     DO i = 1, n
        ar( i, j ) = a( i, j )
     END DO
  END DO

#endif

  RETURN
END SUBROUTINE laxlib_dsqmcll_x


SUBROUTINE laxlib_zsqmcll_x( n, a, lda, ar, ldar, desc, comm )
  !
  !  double complex (Z) SQuare Matrix CoLLect
  !  This sub. take a distributed square matrix "a" and collect
  !  the block assigned to processors into a replicated matrix "ar",
  !  matrix is distributed as described by descriptor desc
  !
  USE laxlib_descriptor
  USE laxlib_parallel_include
  !
  implicit none
  include 'laxlib_kinds.fh'
  !
  INTEGER, INTENT(IN) :: n
  !! global dimension
  INTEGER, INTENT(IN) :: ldar
  !! leading dimension of matrix ar
  COMPLEX(DP)            :: ar(ldar,*)
  !! matrix to be merged, replicated on all proc
  INTEGER, INTENT(IN) :: lda
  !! leading dimension of matrix a
  COMPLEX(DP)            :: a(lda,*)
  !! distributed matrix a
  TYPE(la_descriptor), INTENT(IN) :: desc
  !! laxlib descriptor for matrix a
  INTEGER, INTENT(IN) :: comm
  !! mpi communicator
  !
  INTEGER :: i, j

#if defined __MPI
  !
  INTEGER :: np, nx, ipc, ipr, npr, npc, noff
  INTEGER :: ierr, ir, ic, nr, nc

  COMPLEX(DP), ALLOCATABLE :: buf(:,:)
  !
  IF( desc%active_node > 0 ) THEN
     !
     np = desc%npr * desc%npc
     nx = desc%nrcx
     npr = desc%npr
     npc = desc%npc
     !
     IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
        ALLOCATE( buf( nx, nx * np ) )
     ELSE
        ALLOCATE( buf( 1, 1 ) )
     END IF
     !
     IF( lda /= nx ) &
        CALL lax_error__( " zsqmcll ", " inconsistent dimension lda ", lda )
     !
     IF( desc%n /= n ) &
        CALL lax_error__( " zsqmcll ", " inconsistent dimension n ", n )
     !
     CALL mpi_gather( a, nx*nx, mpi_double_complex, &
                      buf, nx*nx, mpi_double_complex, 0, desc%comm , ierr )
     !
     IF( ierr /= 0 ) &
        CALL lax_error__( " zsqmcll ", " in gather ", ABS( ierr ) )
     !
     IF( desc%myr == 0 .AND. desc%myc == 0 ) THEN
        DO ipc = 1, npc
           CALL descla_local_dims( ic, nc, n, desc%nx, npc, ipc-1 )
           DO ipr = 1, npr
              CALL descla_local_dims( ir, nr, n, desc%nx, npr, ipr-1 )
              noff = ( ipc - 1 + npc * ( ipr - 1 ) ) * nx
              DO j = 1, nc
                 DO i = 1, nr
                    ar( i + ir - 1, j + ic - 1 ) = buf( i, j + noff )
                 END DO
              END DO
           END DO
        END DO
     END IF
     !
     DEALLOCATE( buf )
     !
  END IF
  !
  CALL mpi_bcast( ar,  ldar * n, mpi_double_complex, 0, comm, ierr )
  !
  IF( ierr /= 0 ) &
     CALL lax_error__( " zsqmcll ", " in bcast ", ABS( ierr ) )

#else

  DO j = 1, n
     DO i = 1, n
        ar( i, j ) = a( i, j )
     END DO
  END DO

#endif

  RETURN
END SUBROUTINE laxlib_zsqmcll_x


! ---------------------------------------------------------------------------------

SUBROUTINE laxlib_dsqmwpb_x( n, a, lda, desc )
   !
   !! Double precision SQuare Matrix WiPe Border subroutine
   !! initialize to zero the distributed matrix border
   !
   USE laxlib_descriptor
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   REAL(DP)            :: a(lda,*)
   !! distributed matrix a
   TYPE(la_descriptor), INTENT(IN) :: desc
   !! laxlib descriptor
   !
   INTEGER :: i, j
   !
   DO j = 1, desc%nc
      DO i = desc%nr + 1, desc%nrcx
         a( i, j ) = 0_DP
      END DO
   END DO
   DO j = desc%nc + 1, desc%nrcx
      DO i = 1, desc%nrcx
         a( i, j ) = 0_DP
      END DO
   END DO
   !
   RETURN
END SUBROUTINE laxlib_dsqmwpb_x

! ---------------------------------------------------------------------------------

SUBROUTINE laxlib_dsqmsym_x( n, a, lda, idesc )
   !
   !! Double precision SQuare Matrix SYMmetrization
   !!
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   REAL(DP)            :: a(lda,*)
   !! distributed matrix a
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
#if defined __MPI
   INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
   INTEGER :: i, j
   INTEGER :: comm
   INTEGER :: nr, nc, dest, sreq, ierr, sour
   REAL(DP) :: atmp

#if defined __MPI

   CALL laxlib_intarray_to_desc(desc,idesc)

   IF( desc%active_node <= 0 ) THEN
      RETURN
   END IF

   IF( n /= desc%n ) &
      CALL lax_error__( " dsqmsym ", " wrong global dim n ", n )
   IF( lda /= desc%nrcx ) &
      CALL lax_error__( " dsqmsym ", " wrong leading dim lda ", lda )

   comm = desc%comm

   nr = desc%nr
   nc = desc%nc
   IF( desc%myc == desc%myr ) THEN
      !
      !  diagonal block, procs work locally
      !
      DO j = 1, nc
         DO i = j + 1, nr
            a(i,j) = a(j,i)
         END DO
      END DO
      !
   ELSE IF( desc%myc > desc%myr ) THEN
      !
      !  super diagonal block, procs send the block to sub diag.
      !
      CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
                             desc%myc, desc%myr, dest )
      CALL mpi_isend( a, lda*lda, MPI_DOUBLE_PRECISION, dest, 1, comm, sreq, ierr )
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " dsqmsym ", " in isend ", ABS( ierr ) )
      !
   ELSE IF( desc%myc < desc%myr ) THEN
      !
      !  sub diagonal block, procs receive the block from super diag,
      !  then transpose locally
      !
      CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
                             desc%myc, desc%myr, sour )
      CALL mpi_recv( a, lda*lda, MPI_DOUBLE_PRECISION, sour, 1, comm, istatus, ierr )
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " dsqmsym ", " in recv ", ABS( ierr ) )
      !
      DO j = 1, lda
         DO i = j + 1, lda
            atmp = a(i,j)
            a(i,j) = a(j,i)
            a(j,i) = atmp
         END DO
      END DO
      !
   END IF

   IF( desc%myc > desc%myr ) THEN
      !
      CALL MPI_Wait( sreq, istatus, ierr )
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " dsqmsym ", " in wait ", ABS( ierr ) )
      !
   END IF

#else

   DO j = 1, n
      !
      DO i = j + 1, n
         !
         a(i,j) = a(j,i)
         !
      END DO
      !
   END DO

#endif

   RETURN
END SUBROUTINE laxlib_dsqmsym_x

! ---------------------------------------------------------------------------------
#if defined (__CUDA)
SUBROUTINE laxlib_dsqmsym_gpu_x( n, a, lda, idesc )
   !
   !! Double precision SQuare Matrix SYMmetrization
   !! GPU version
   !!
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   USE cudafor
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   REAL(DP), INTENT(INOUT), DEVICE :: a(:,:)
   ATTRIBUTES(DEVICE) :: a
   !! distributed matrix a
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
#if defined __MPI
   INTEGER :: istatus( MPI_STATUS_SIZE )
#if ! defined(__GPU_MPI)
   REAL(DP), ALLOCATABLE :: a_h(:,:)
#endif
#endif
   INTEGER :: i, j
   INTEGER :: comm
   INTEGER :: nr, nc, dest, sreq, ierr, sour
   REAL(DP) :: atmp

#if defined __MPI

   CALL laxlib_intarray_to_desc(desc,idesc)

   IF( desc%active_node <= 0 ) THEN
      RETURN
   END IF

   IF( n /= desc%n ) &
      CALL lax_error__( " dsqmsym ", " wrong global dim n ", n )
   IF( lda /= desc%nrcx ) &
      CALL lax_error__( " dsqmsym ", " wrong leading dim lda ", lda )

   comm = desc%comm

   nr = desc%nr
   nc = desc%nc
   IF( desc%myc == desc%myr ) THEN
      !
      !  diagonal block, procs work locally
      !
!$cuf kernel do(1) <<<*,*>>>
      DO j = 1, nc
         DO i = j + 1, nr
            a(i,j) = a(j,i)
         END DO
      END DO
      !
   ELSE IF( desc%myc > desc%myr ) THEN
      !
      !  super diagonal block, procs send the block to sub diag.
      !
      CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
                             desc%myc, desc%myr, dest )
#if defined(__GPU_MPI)
      ierr = cudaDeviceSynchronize()
      CALL mpi_isend( a, SIZE(a), MPI_DOUBLE_PRECISION, dest, 1, comm, sreq, ierr )
#else
      ALLOCATE(a_h, SOURCE = a )
      CALL mpi_isend( a_h, SIZE(a_h), MPI_DOUBLE_PRECISION, dest, 1, comm, sreq, ierr )
#endif
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " dsqmsym ", " in isend ", ABS( ierr ) )
      !
   ELSE IF( desc%myc < desc%myr ) THEN
      !
      !  sub diagonal block, procs receive the block from super diag,
      !  then transpose locally
      !
      CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
                             desc%myc, desc%myr, sour )

#if defined(__GPU_MPI)
      ierr = cudaDeviceSynchronize()
      CALL mpi_recv( a, SIZE(a), MPI_DOUBLE_PRECISION, sour, 1, comm, istatus, ierr )
!$cuf kernel do(1) <<<*,*>>>
      DO j = 1, lda
         DO i = j + 1, lda
            atmp = a(i,j)
            a(i,j) = a(j,i)
            a(j,i) = atmp
         END DO
      END DO
#else
      ALLOCATE(a_h(SIZE(a,1),SIZE(a,2)))
      CALL mpi_recv( a_h, SIZE(a_h), MPI_DOUBLE_PRECISION, sour, 1, comm, istatus, ierr )
      DO j = 1, lda
         DO i = j + 1, lda
            atmp = a_h(i,j)
            a_h(i,j) = a_h(j,i)
            a_h(j,i) = atmp
         END DO
      END DO
      a = a_h
      DEALLOCATE(a_h)
#endif
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " dsqmsym ", " in recv ", ABS( ierr ) )
      !
   END IF

   IF( desc%myc > desc%myr ) THEN
      !
      CALL MPI_Wait( sreq, istatus, ierr )
      !
#if defined(__GPU_MPI)
#else
      DEALLOCATE(a_h)
#endif
      IF( ierr /= 0 ) &
         CALL lax_error__( " dsqmsym ", " in wait ", ABS( ierr ) )
      !
   END IF

#else

!$cuf kernel do(1) <<<*,*>>>
   DO j = 1, n
      DO i = j + 1, n
         a(i,j) = a(j,i)
      END DO
   END DO

#endif

   RETURN
END SUBROUTINE laxlib_dsqmsym_gpu_x
#endif
! ---------------------------------------------------------------------------------

SUBROUTINE laxlib_zsqmher_x( n, a, lda, idesc )
   !
   !! double complex (Z) SQuare Matrix HERmitianize
   !!
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   COMPLEX(DP)            :: a(lda,lda)
   !! distributed matrix a
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
#if defined __MPI
   INTEGER :: istatus( MPI_STATUS_SIZE )
#endif
   INTEGER :: i, j
   INTEGER :: comm, myid
   INTEGER :: nr, nc, dest, sreq, ierr, sour
   COMPLEX(DP) :: atmp
   COMPLEX(DP), ALLOCATABLE :: tst1(:,:)
   COMPLEX(DP), ALLOCATABLE :: tst2(:,:)

#if defined __MPI

   CALL laxlib_intarray_to_desc(desc,idesc)

   IF( desc%active_node <= 0 ) THEN
      RETURN
   END IF

   IF( n /= desc%n ) &
      CALL lax_error__( " zsqmsym ", " wrong global dim n ", n )
   IF( lda /= desc%nrcx ) &
      CALL lax_error__( " zsqmsym ", " wrong leading dim lda ", lda )

   comm = desc%comm

   nr = desc%nr
   nc = desc%nc
   IF( desc%myc == desc%myr ) THEN
      !
      !  diagonal block, procs work locally
      !
      DO j = 1, nc
         a(j,j) = CMPLX( DBLE( a(j,j) ), 0_DP, KIND=DP )
         DO i = j + 1, nr
            a(i,j) = CONJG( a(j,i) )
         END DO
      END DO
      !
   ELSE IF( desc%myc > desc%myr ) THEN
      !
      !  super diagonal block, procs send the block to sub diag.
      !
      CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
                             desc%myc, desc%myr, dest )
      CALL mpi_isend( a, lda*lda, MPI_DOUBLE_COMPLEX, dest, 1, comm, sreq, ierr )
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " zsqmher ", " in mpi_isend ", ABS( ierr ) )
      !
   ELSE IF( desc%myc < desc%myr ) THEN
      !
      !  sub diagonal block, procs receive the block from super diag,
      !  then transpose locally
      !
      CALL GRID2D_RANK( 'R', desc%npr, desc%npc, &
                             desc%myc, desc%myr, sour )
      CALL mpi_recv( a, lda*lda, MPI_DOUBLE_COMPLEX, sour, 1, comm, istatus, ierr )
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " zsqmher ", " in mpi_recv ", ABS( ierr ) )
      !
      DO j = 1, lda
         DO i = j + 1, lda
            atmp   = a(i,j)
            a(i,j) = a(j,i)
            a(j,i) = atmp
         END DO
      END DO
      DO j = 1, nc
         DO i = 1, nr
            a(i,j)  = CONJG( a(i,j) )
         END DO
      END DO
      !
   END IF

   IF( desc%myc > desc%myr ) THEN
      !
      CALL MPI_Wait( sreq, istatus, ierr )
      !
      IF( ierr /= 0 ) &
         CALL lax_error__( " zsqmher ", " in MPI_Wait ", ABS( ierr ) )
      !
   END IF

#if defined __PIPPO
   CALL MPI_Comm_rank( comm, myid, ierr )
   ALLOCATE( tst1( n, n ) )
   ALLOCATE( tst2( n, n ) )
   tst1 = 0.0d0
   tst2 = 0.0d0
   do j = 1, desc%nc
   do i = 1, desc%nr
      tst1( i + desc%ir - 1, j + desc%ic - 1 ) = a( i , j )
   end do
   end do
   CALL MPI_REDUCE( tst1, tst2, n*n, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, comm, ierr )
   IF( myid == 0 ) THEN
   DO j = 1, n
      !
      IF( tst2(j,j) /=  CMPLX( DBLE( tst2(j,j) ), 0_DP, KIND=DP ) ) &
        WRITE( 4000, * ) j, tst2(j,j)
      !
      DO i = j + 1, n
         !
         IF( tst2(i,j) /= CONJG( tst2(j,i) ) )  WRITE( 4000, * ) i,j, tst2(i,j)
         !
      END DO
      !
   END DO
   END IF

   DEALLOCATE( tst1 )
   DEALLOCATE( tst2 )
#endif

#else

   DO j = 1, n
      !
      a(j,j) = CMPLX( DBLE( a(j,j) ), 0_DP, KIND=DP )
      !
      DO i = j + 1, n
         !
         a(i,j) = CONJG( a(j,i) )
         !
      END DO
      !
   END DO

#endif

   RETURN
END SUBROUTINE laxlib_zsqmher_x


! ---------------------------------------------------------------------------------

SUBROUTINE laxlib_dsqmred_x( na, a, lda, idesca, nb, b, ldb, idescb )
   !
   !! Double precision SQuare Matrix REDistribution
   !!
   !! Copy a global "na * na" matrix locally stored in "a",
   !! and distributed as described by integer "idesca", into a larger
   !! global "nb * nb" matrix stored in "b" and distributed
   !! as described in integer "idescb".
   !
   !
   USE laxlib_descriptor
   USE laxlib_ptoolkit
   !
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: na
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   REAL(DP)            :: a(lda,lda)
   !! matrix to be redistributed into b
   INTEGER, INTENT(IN) :: idesca(LAX_DESC_SIZE)
   !! integer laxlib descriptor matrix a
   INTEGER, INTENT(IN) :: nb
   !! global dimension of matrix b
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of matrix b
   REAL(DP)            :: b(ldb,ldb)
   !! redistributed matrix
   INTEGER, INTENT(IN) :: idescb(LAX_DESC_SIZE)
   !! integer laxlib descriptor matrix b
   !
   TYPE(la_descriptor) :: desca
   TYPE(la_descriptor) :: descb
   !
   CALL laxlib_intarray_to_desc(desca,idesca)
   CALL laxlib_intarray_to_desc(descb,idescb)
   CALL laxlib_dsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
END SUBROUTINE

SUBROUTINE laxlib_zsqmred_x( na, a, lda, idesca, nb, b, ldb, idescb )
   !
   !! double complex (Z) SQuare Matrix REDistribution
   !!
   !! Copy a global "na * na" matrix locally stored in "a",
   !! and distributed as described by integer "idesca", into a larger
   !! global "nb * nb" matrix stored in "b" and distributed
   !! as described in integer "idescb".
   !!
   USE laxlib_descriptor
   USE laxlib_ptoolkit
   !
   IMPLICIT NONE
   !
   include 'laxlib_param.fh'
   include 'laxlib_kinds.fh'
   !
   INTEGER, INTENT(IN) :: na
   !! global dimension of matrix a
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of matrix a
   COMPLEX(DP)            :: a(lda,lda)
   !! matrix to be redistributed into b
   INTEGER, INTENT(IN) :: idesca(LAX_DESC_SIZE)
   !! integer laxlib descriptor matrix a
   INTEGER, INTENT(IN) :: nb
   !! global dimension of matrix b
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of matrix b
   COMPLEX(DP)            :: b(ldb,ldb)
   !! redistributed matrix
   INTEGER, INTENT(IN) :: idescb(LAX_DESC_SIZE)
   !! integer laxlib descriptor matrix b
   !
   TYPE(la_descriptor) :: desca
   TYPE(la_descriptor) :: descb
   !
   CALL laxlib_intarray_to_desc(desca,idesca)
   CALL laxlib_intarray_to_desc(descb,idescb)
   CALL laxlib_zsqmred_x_x( na, a, lda, desca, nb, b, ldb, descb )
END SUBROUTINE


! ---------------------------------------------------------------------------------


SUBROUTINE rep_matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, comm )
  !
  !!  Parallel matrix multiplication with replicated matrix
  !!
  !!  DGEMM  PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
  !!
  !!     C := ALPHA*OP( A )*OP( B ) + BETA*C,
  !!
  !!  WHERE  OP( X ) IS ONE OF
  !!
  !!     OP( X ) = X   OR   OP( X ) = X',
  !!
  !!  ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
  !!  AN M BY K MATRIX,  OP( B )  A  K BY N MATRIX AND  C AN M BY N MATRIX.
  !
  !
  !  written by Carlo Cavazzoni
  !
  USE laxlib_parallel_include
  implicit none
  include 'laxlib_kinds.fh'
  !
  CHARACTER(LEN=1), INTENT(IN) :: transa
  !! specifies the form of op( A ) to be used in the matrix multiplication as follows:
  !! 'N' or 'n',  op( A ) = A.
  !! 'T' or 't',  op( A ) = A**T.
  !! 'C' or 'c',  op( A ) = A**T.
  CHARACTER(LEN=1), INTENT(IN) :: transb
  !! specifies the form of op( B ) to be used in the matrix multiplication as
  !follows:
  !! 'N' or 'n',  op( B ) = B.
  !! 'T' or 't',  op( B ) = B**T.
  !! 'C' or 'c',  op( B ) = B**T.
  INTEGER, INTENT(IN) :: m
  !! number of rows of the matrix A and C
  INTEGER, INTENT(IN) :: n
  !! number of columns of the matrix B and C
  INTEGER, INTENT(IN) :: k
  !! number of columns of A and rows of B
  REAL(DP), INTENT(IN) :: alpha
  !! scalar alpha
  REAL(DP), INTENT(IN) :: beta
  !! scalar beta
  INTEGER, INTENT(IN) :: lda
  !! leading dimension of A
  INTEGER, INTENT(IN) :: ldb
  !! leading dimension of B
  INTEGER, INTENT(IN) :: ldc
  !! leading dimension of C
  REAL(DP) :: a(lda,*)
  !! matrix A
  REAL(DP) :: b(ldb,*)
  !! matrix B
  REAL(DP) :: c(ldc,*)
  !! matrix C
  INTEGER, INTENT(IN) :: comm
  !! mpi communicator
  !
#if defined __MPI
  !
  INTEGER :: ME, I, II, J, JJ, IP, SOUR, DEST, INFO, IERR, ioff, ldx
  INTEGER :: NB, IB_S, NB_SOUR, IB_SOUR, IBUF
  INTEGER :: nproc, mpime, q, r

  REAL(DP), ALLOCATABLE :: auxa( : )
  REAL(DP), ALLOCATABLE :: auxc( : )
  !
  ! ... BODY
  !
  CALL MPI_COMM_SIZE(comm, NPROC, IERR)
  CALL MPI_COMM_RANK(comm, MPIME, IERR)

  IF ( NPROC == 1 ) THEN

     !  if there is only one proc no need of using parallel alg.

     CALL dgemm(TRANSA, TRANSB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)

     RETURN

  END IF

  ME = MPIME + 1
  Q = INT( m / NPROC )
  R = MOD( m , NPROC )

  ! ... Find out the number of elements in the local block
  !     along "M" first dimension os matrix A

  NB = Q
  IF( ME <= R ) NB = NB + 1

  ! ... Find out the global index of the local first row

  IF( ME <= R ) THEN
     ib_s = (Q+1)*(ME-1) + 1
  ELSE
     ib_s = Q*(ME-1) + R + 1
  END IF

  ldx = m / nproc + 1

  ALLOCATE( auxa( MAX( n, k ) * ldx ) )
  ALLOCATE( auxc( MAX( n, m ) * ldx ) )

  IF( TRANSA == 'N' .OR. TRANSA == 'n' ) THEN
     ibuf = 0
     ioff = ib_s - 1
     DO J = 1, k
        DO I = 1, NB
           auxa( ibuf + I ) = A( I + ioff, J )
        END DO
        ibuf = ibuf + ldx
     END DO
  ELSE
     ibuf = 0
     ioff = ib_s - 1
     DO J = 1, k
        DO I = 1, NB
           auxa( ibuf + I ) = A( J, I + ioff )
        END DO
        ibuf = ibuf + ldx
     END DO
     !ioff = ib_s - 1
     !call mytranspose( A( 1, ioff + 1 ), lda, auxa(1), ldx, m, nb)
  END IF

  IF( beta /= 0.0_DP ) THEN
     ibuf = 0
     ioff = ib_s - 1
     DO J = 1, n
        DO I = 1, NB
           auxc( ibuf + I ) = C( I + ioff, J )
        END DO
        ibuf = ibuf + ldx
     END DO
  END IF

  CALL dgemm( 'N', transb, nb, n, k, alpha, auxa(1), ldx, B, ldb, beta, auxc(1), ldx )

  ! ... Here processors exchange blocks

  DO IP = 0, NPROC-1

     ! ...    Find out the number of elements in the block of processor SOUR

     NB_SOUR = q
     IF( (IP+1) .LE. r ) NB_SOUR = NB_SOUR+1

     ! ...    Find out the global index of the first row owned by SOUR

     IF( (IP+1) .LE. r ) THEN
        ib_sour = (Q+1)*IP + 1
     ELSE
        ib_sour = Q*IP + R + 1
     END IF

     IF( mpime == ip ) auxa(1:n*ldx) = auxc(1:n*ldx)

     CALL MPI_BCAST( auxa(1), ldx*n, mpi_double_precision, ip, comm, IERR)

     IF( ierr /= 0 ) &
        CALL lax_error__( " rep_matmul_drv ", " in MPI_BCAST ", ABS( ierr ) )

     IBUF = 0
     ioff = IB_SOUR - 1
     DO J = 1, N
        DO I = 1, NB_SOUR
           C( I + ioff, J ) = AUXA( IBUF + I )
        END DO
        IBUF = IBUF + ldx
     END DO

  END DO

  DEALLOCATE( auxa, auxc )

#else

     !  if we are not compiling with __MPI this is equivalent to a blas call

     CALL dgemm(TRANSA, TRANSB, m, N, k, alpha, A, lda, B, ldb, beta, C, ldc)

#endif

  RETURN

END SUBROUTINE rep_matmul_drv


SUBROUTINE zrep_matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, comm )
  !
  !!  Parallel matrix multiplication with replicated matrix
  !!
  !!  DGEMM  PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
  !!
  !!     C := ALPHA*OP( A )*OP( B ) + BETA*C,
  !!
  !!  WHERE  OP( X ) IS ONE OF
  !!
  !!     OP( X ) = X   OR   OP( X ) = X',
  !!
  !!  ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
  !!  AN M BY K MATRIX,  OP( B )  A  K BY N MATRIX AND  C AN M BY N MATRIX.
  !!
  !  written by Carlo Cavazzoni
  !
  USE laxlib_parallel_include
  implicit none
  include 'laxlib_kinds.fh'
  !
  CHARACTER(LEN=1), INTENT(IN) :: transa
  !! specifies the form of op( A ) to be used in the matrix multiplication as
  !follows:
  !! 'N' or 'n',  op( A ) = A.
  !! 'T' or 't',  op( A ) = A**T.
  !! 'C' or 'c',  op( A ) = A**T.
  CHARACTER(LEN=1), INTENT(IN) :: transb
  !! specifies the form of op( B ) to be used in the matrix multiplication as
  !follows:
  !! 'N' or 'n',  op( B ) = B.
  !! 'T' or 't',  op( B ) = B**T.
  !! 'C' or 'c',  op( B ) = B**T.
  INTEGER, INTENT(IN) :: m
  !! number of rows of the matrix A and C
  INTEGER, INTENT(IN) :: n
  !! number of columns of the matrix B and C
  INTEGER, INTENT(IN) :: k
  !! number of columns of A and rows of B
  REAL(DP), INTENT(IN) :: alpha
  !! scalar alpha
  REAL(DP), INTENT(IN) :: beta
  !! scalar beta
  INTEGER, INTENT(IN) :: lda
  !! leading dimension of A
  INTEGER, INTENT(IN) :: ldb
  !! leading dimension of B
  INTEGER, INTENT(IN) :: ldc
  !! leading dimension of C
  COMPLEX(DP) :: a(lda,*)
  !! matrix A
  COMPLEX(DP) :: b(ldb,*)
  !! matrix B
  COMPLEX(DP) :: c(ldc,*)
  !! matrix C
  INTEGER, INTENT(IN) :: comm
  !! mpi communicator
  !
#if defined __MPI
  !
  INTEGER :: ME, I, II, J, JJ, IP, SOUR, DEST, INFO, IERR, ioff, ldx
  INTEGER :: NB, IB_S, NB_SOUR, IB_SOUR, IBUF
  INTEGER :: nproc, mpime, q, r

  COMPLEX(DP), ALLOCATABLE :: auxa( : )
  COMPLEX(DP), ALLOCATABLE :: auxc( : )
  !
  ! ... BODY
  !
  CALL MPI_COMM_SIZE(comm, NPROC, IERR)
  CALL MPI_COMM_RANK(comm, MPIME, IERR)

  IF ( NPROC == 1 ) THEN

     !  if there is only one proc no need of using parallel alg.

     CALL zgemm(TRANSA, TRANSB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)

     RETURN

  END IF

  ME = MPIME + 1
  Q = INT( m / NPROC )
  R = MOD( m , NPROC )

  ! ... Find out the number of elements in the local block
  !     along "M" first dimension os matrix A

  NB = Q
  IF( ME <= R ) NB = NB + 1

  ! ... Find out the global index of the local first row

  IF( ME <= R ) THEN
     ib_s = (Q+1)*(ME-1) + 1
  ELSE
     ib_s = Q*(ME-1) + R + 1
  END IF

  ldx = m / nproc + 1

  ALLOCATE( auxa( MAX( n, k ) * ldx ) )
  ALLOCATE( auxc( MAX( n, m ) * ldx ) )

  IF( TRANSA == 'N' .OR. TRANSA == 'n' ) THEN
     ibuf = 0
     ioff = ib_s - 1
     DO J = 1, k
        DO I = 1, NB
           auxa( ibuf + I ) = A( I + ioff, J )
        END DO
        ibuf = ibuf + ldx
     END DO
  ELSE
     ibuf = 0
     ioff = ib_s - 1
     DO J = 1, k
        DO I = 1, NB
           auxa( ibuf + I ) = CONJG( A( J, I + ioff ) )
        END DO
        ibuf = ibuf + ldx
     END DO
     !ioff = ib_s - 1
     !call mytranspose( A( 1, ioff + 1 ), lda, auxa(1), ldx, m, nb)
  END IF

  IF( beta /= 0.0_DP ) THEN
     ibuf = 0
     ioff = ib_s - 1
     DO J = 1, n
        DO I = 1, NB
           auxc( ibuf + I ) = C( I + ioff, J )
        END DO
        ibuf = ibuf + ldx
     END DO
  END IF

  CALL zgemm( 'N', transb, nb, n, k, alpha, auxa(1), ldx, B, ldb, beta, auxc(1), ldx )

  ! ... Here processors exchange blocks

  DO IP = 0, NPROC-1

     ! ...    Find out the number of elements in the block of processor SOUR

     NB_SOUR = q
     IF( (IP+1) .LE. r ) NB_SOUR = NB_SOUR+1

     ! ...    Find out the global index of the first row owned by SOUR

     IF( (IP+1) .LE. r ) THEN
        ib_sour = (Q+1)*IP + 1
     ELSE
        ib_sour = Q*IP + R + 1
     END IF

     IF( mpime == ip ) auxa(1:n*ldx) = auxc(1:n*ldx)

     CALL MPI_BCAST( auxa(1), ldx*n, mpi_double_complex, ip, comm, IERR)

     IF( ierr /= 0 ) &
        CALL lax_error__( " zrep_matmul_drv ", " in MPI_BCAST ", ABS( ierr ) )

     IBUF = 0
     ioff = IB_SOUR - 1
     DO J = 1, N
        DO I = 1, NB_SOUR
           C( I + ioff, J ) = AUXA( IBUF + I )
        END DO
        IBUF = IBUF + ldx
     END DO

  END DO

  DEALLOCATE( auxa, auxc )

#else

     !  if we are not compiling with __MPI this is equivalent to a blas call

     CALL zgemm(TRANSA, TRANSB, m, N, k, alpha, A, lda, B, ldb, beta, C, ldc)

#endif

  RETURN

END SUBROUTINE zrep_matmul_drv

!
!
!=----------------------------------------------------------------------------=!
!
!
!  Cannon's algorithms for parallel matrix multiplication
!  written by Carlo Cavazzoni
!
!
!

SUBROUTINE sqr_dmm_cannon_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
   !
   !!
   !!  Double precision parallel square matrix multiplication with Cannon's algorithm
   !!  performs one of the matrix-matrix operations
   !!
   !!     C := ALPHA*OP( A )*OP( B ) + BETA*C,
   !!
   !!  where  op( x ) is one of
   !!
   !!     OP( X ) = X   OR   OP( X ) = X',
   !!
   !!  alpha and beta are scalars, and a, b and c are square matrices
   !!
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   CHARACTER(LEN=1), INTENT(IN) :: transa
   !! specifies the form of op( A ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( A ) = A.
   !! 'T' or 't',  op( A ) = A**T.
   !! 'C' or 'c',  op( A ) = A**T.
   CHARACTER(LEN=1), INTENT(IN) :: transb
   !! specifies the form of op( B ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( B ) = B.
   !! 'T' or 't',  op( B ) = B**T.
   !! 'C' or 'c',  op( B ) = B**T.
   INTEGER, INTENT(IN) :: n
   !! global dimension
   REAL(DP), INTENT(IN) :: alpha
   !! scalar alpha
   REAL(DP), INTENT(IN) :: beta
   !! scalar beta
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of A
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of B
   INTEGER, INTENT(IN) :: ldc
   !! leading dimension of C
   REAL(DP) :: a(lda,*)
   !! matrix A
   REAL(DP) :: b(ldb,*)
   !! matrix B
   REAL(DP) :: c(ldc,*)
   !! matrix C
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   !
   integer :: ierr
   integer :: np
   integer :: i, j, nr, nc, nb, iter, rowid, colid
   logical :: ta, tb
   INTEGER :: comm
   !
   !
   real(DP), allocatable :: bblk(:,:), ablk(:,:)
   !
#if defined (__MPI)
   !
   integer :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   CALL laxlib_intarray_to_desc(desc,idesc)
   !
   IF( desc%active_node < 0 ) THEN
      !
      !  processors not interested in this computation return quickly
      !
      RETURN
      !
   END IF

   IF( n < 1 ) THEN
      RETURN
   END IF

   IF( desc%npr == 1 ) THEN
      !
      !  quick return if only one processor is used
      !
      CALL dgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
      !
      RETURN
      !
   END IF

   IF( desc%npr /= desc%npc ) &
      CALL lax_error__( ' sqr_mm_cannon ', ' works only with square processor mesh ', 1 )
   !
   !  Retrieve communicator and mesh geometry
   !
   np    = desc%npr
   comm  = desc%comm
   rowid = desc%myr
   colid = desc%myc
   !
   !  Retrieve the size of the local block
   !
   nr    = desc%nr
   nc    = desc%nc
   nb    = desc%nrcx
   !
#if defined (__MPI)
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " sqr_mm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
   !
   allocate( ablk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         ablk( i, j ) = a( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
   DO j = nc+1, nb
      DO i = 1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
   !
   !
   allocate( bblk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         bblk( i, j ) = b( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
   DO j = nc+1, nb
      DO i = 1, nb
         bblk( i, j ) = 0.0_DP
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         bblk( i, j ) = 0.0_DP
      END DO
   END DO
   !
   !
   ta = ( TRANSA == 'T' .OR. TRANSA == 't' )
   tb = ( TRANSB == 'T' .OR. TRANSB == 't' )
   !
   !  Shift A rowid+1 places to the west
   !
   IF( ta ) THEN
      CALL shift_exch_block( ablk, 'W', 1 )
   ELSE
      CALL shift_block( ablk, 'W', rowid+1, 1 )
   END IF
   !
   !  Shift B colid+1 places to the north
   !
   IF( tb ) THEN
      CALL shift_exch_block( bblk, 'N', np+1 )
   ELSE
      CALL shift_block( bblk, 'N', colid+1, np+1 )
   END IF
   !
   !  Accumulate on C
   !
   CALL dgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
   !
   DO iter = 2, np
      !
      !  Shift A 1 places to the east
      !
      CALL shift_block( ablk, 'E', 1, iter )
      !
      !  Shift B 1 places to the south
      !
      CALL shift_block( bblk, 'S', 1, np+iter )
      !
      !  Accumulate on C
      !
      CALL dgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, 1.0_DP, c, ldc)
      !
   END DO

   deallocate( ablk, bblk )

   RETURN

CONTAINS

   SUBROUTINE shift_block( blk, dir, ln, tag )
      !
      !   Block shift
      !
      IMPLICIT NONE
      REAL(DP) :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir      ! shift direction
      INTEGER,          INTENT(IN) :: ln       ! shift length
      INTEGER,          INTENT(IN) :: tag      ! communication tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      !
      IF( dir == 'W' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid - ln + np, np )
         icsrc = MOD( colid + ln + np, np )
         !
      ELSE IF( dir == 'E' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid + ln + np, np )
         icsrc = MOD( colid - ln + np, np )
         !
      ELSE IF( dir == 'N' ) THEN

         irdst = MOD( rowid - ln + np, np )
         irsrc = MOD( rowid + ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE IF( dir == 'S' ) THEN

         irdst = MOD( rowid + ln + np, np )
         irsrc = MOD( rowid - ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE

         CALL lax_error__( ' sqr_mm_cannon ', ' unknown shift direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_mm_cannon ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
      !
#endif
      RETURN
   END SUBROUTINE shift_block

   SUBROUTINE shift_exch_block( blk, dir, tag )
      !
      !   Combined block shift and exchange
      !   only used for the first step
      !
      IMPLICIT NONE
      REAL(DP) :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir
      INTEGER,          INTENT(IN) :: tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      INTEGER :: icol, irow
      !
      IF( dir == 'W' ) THEN
         !
         icol = rowid
         irow = colid
         !
         irdst = irow
         icdst = MOD( icol - irow-1 + np, np )
         !
         irow = rowid
         icol = MOD( colid + rowid+1 + np, np )
         !
         irsrc = icol
         icsrc = irow
         !
      ELSE IF( dir == 'N' ) THEN
         !
         icol = rowid
         irow = colid
         !
         icdst = icol
         irdst = MOD( irow - icol-1 + np, np )
         !
         irow = MOD( rowid + colid+1 + np, np )
         icol = colid
         !
         irsrc = icol
         icsrc = irow

      ELSE

         CALL lax_error__( ' sqr_mm_cannon ', ' unknown shift_exch direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_mm_cannon ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
      !
#endif
      RETURN
   END SUBROUTINE shift_exch_block

END SUBROUTINE sqr_dmm_cannon_x

!=----------------------------------------------------------------------------=!
#if defined (__CUDA)
SUBROUTINE sqr_dmm_cannon_gpu_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
   !
   !!
   !!  Double precision parallel square matrix multiplication with Cannon's algorithm
   !!  performs one of the matrix-matrix operations
   !!
   !!  GPU version
   !!
   !!     C := ALPHA*OP( A )*OP( B ) + BETA*C,
   !!
   !!  where  op( x ) is one of
   !!
   !!     OP( X ) = X   OR   OP( X ) = X',
   !!
   !!  alpha and beta are scalars, and a, b and c are square matrices
   !!
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   USE cudafor
   USE cublas
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   CHARACTER(LEN=1), INTENT(IN) :: transa
   !! specifies the form of op( A ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( A ) = A.
   !! 'T' or 't',  op( A ) = A**T.
   !! 'C' or 'c',  op( A ) = A**T.
   CHARACTER(LEN=1), INTENT(IN) :: transb
   !! specifies the form of op( B ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( B ) = B.
   !! 'T' or 't',  op( B ) = B**T.
   !! 'C' or 'c',  op( B ) = B**T.
   INTEGER, INTENT(IN) :: n
   !! global dimension
   REAL(DP), INTENT(IN) :: alpha
   !! scalar alpha
   REAL(DP), INTENT(IN) :: beta
   !! scalar beta
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of A
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of B
   INTEGER, INTENT(IN) :: ldc
   !! leading dimension of C
   REAL(DP), DEVICE :: a(lda,*)
   !! matrix A
   REAL(DP), DEVICE :: b(ldb,*)
   !! matrix B
   REAL(DP), DEVICE :: c(ldc,*)
   !! matrix C
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   !
   integer :: ierr
   integer :: np
   integer :: i, j, nr, nc, nb, iter, rowid, colid
   logical :: ta, tb
   INTEGER :: comm
   !
   !
   real(DP), DEVICE, allocatable :: bblk(:,:), ablk(:,:)
   !
#if defined (__MPI)
   !
   integer :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   ierr = cudaDeviceSynchronize()
   !
   CALL laxlib_intarray_to_desc(desc,idesc)
   !
   IF( desc%active_node < 0 ) THEN
      !
      !  processors not interested in this computation return quickly
      !
      RETURN
      !
   END IF

   IF( n < 1 ) THEN
      RETURN
   END IF

   IF( desc%npr == 1 ) THEN
      !
      !  quick return if only one processor is used
      !
      CALL cublasdgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
      !
      RETURN
      !
   END IF

   IF( desc%npr /= desc%npc ) &
      CALL lax_error__( ' sqr_mm_cannon ', ' works only with square processor mesh ', 1 )
   !
   !  Retrieve communicator and mesh geometry
   !
   np    = desc%npr
   comm  = desc%comm
   rowid = desc%myr
   colid = desc%myc
   !
   !  Retrieve the size of the local block
   !
   nr    = desc%nr
   nc    = desc%nc
   nb    = desc%nrcx
   !
#if defined (__MPI)
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " sqr_mm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
   !
   allocate( ablk( nb, nb ) )

!$cuf kernel do(2) <<<*,*>>>
   DO j = 1, nc
      DO i = 1, nr
         ablk( i, j ) = a( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
!$cuf kernel do(2) <<<*,*>>>
   DO j = nc+1, nb
      DO i = 1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
!$cuf kernel do(2) <<<*,*>>>
   DO j = 1, nb
      DO i = nr+1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
   !
   !
   allocate( bblk( nb, nb ) )
!$cuf kernel do(2) <<<*,*>>>
   DO j = 1, nc
      DO i = 1, nr
         bblk( i, j ) = b( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
!$cuf kernel do(2) <<<*,*>>>
   DO j = nc+1, nb
      DO i = 1, nb
         bblk( i, j ) = 0.0_DP
      END DO
   END DO
!$cuf kernel do(2) <<<*,*>>>
   DO j = 1, nb
      DO i = nr+1, nb
         bblk( i, j ) = 0.0_DP
      END DO
   END DO
   !
   ierr = cudaDeviceSynchronize()
   !
   ta = ( TRANSA == 'T' .OR. TRANSA == 't' )
   tb = ( TRANSB == 'T' .OR. TRANSB == 't' )
   !
   !  Shift A rowid+1 places to the west
   !
   IF( ta ) THEN
      CALL shift_exch_block( ablk, 'W', 1 )
   ELSE
      CALL shift_block( ablk, 'W', rowid+1, 1 )
   END IF
   !
   !  Shift B colid+1 places to the north
   !
   IF( tb ) THEN
      CALL shift_exch_block( bblk, 'N', np+1 )
   ELSE
      CALL shift_block( bblk, 'N', colid+1, np+1 )
   END IF
   !
   !  Accumulate on C
   !
   CALL cublasdgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
   !
   DO iter = 2, np
      !
      ierr = cudaDeviceSynchronize()
      !
      !  Shift A 1 places to the east
      !
      CALL shift_block( ablk, 'E', 1, iter )
      !
      !  Shift B 1 places to the south
      !
      CALL shift_block( bblk, 'S', 1, np+iter )
      !
      !  Accumulate on C
      !
      CALL cublasdgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, 1.0_DP, c, ldc)
      !
   END DO

   ierr = cudaDeviceSynchronize()
   deallocate( ablk, bblk )

   RETURN

CONTAINS

   SUBROUTINE shift_block( blk, dir, ln, tag )
      !
      !   Block shift
      !
      IMPLICIT NONE
      REAL(DP), DEVICE :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir      ! shift direction
      INTEGER,          INTENT(IN) :: ln       ! shift length
      INTEGER,          INTENT(IN) :: tag      ! communication tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
#if ! defined(__GPU_MPI)
      REAL(DP), ALLOCATABLE :: blk_h( :, : )
      ALLOCATE( blk_h, SOURCE =  blk )
      ierr = cudaDeviceSynchronize()
#endif
      !
      IF( dir == 'W' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid - ln + np, np )
         icsrc = MOD( colid + ln + np, np )
         !
      ELSE IF( dir == 'E' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid + ln + np, np )
         icsrc = MOD( colid - ln + np, np )
         !
      ELSE IF( dir == 'N' ) THEN

         irdst = MOD( rowid - ln + np, np )
         irsrc = MOD( rowid + ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE IF( dir == 'S' ) THEN

         irdst = MOD( rowid + ln + np, np )
         irsrc = MOD( rowid - ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE

         CALL lax_error__( ' sqr_mm_cannon ', ' unknown shift direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
#if defined(__GPU_MPI)
      ierr = cudaDeviceSynchronize()
      CALL MPI_SENDRECV_REPLACE(blk, SIZE(blk), MPI_DOUBLE_PRECISION, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
#else
      CALL MPI_SENDRECV_REPLACE(blk_h, SIZE(blk_h), MPI_DOUBLE_PRECISION, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
      blk = blk_h
      ierr = cudaDeviceSynchronize()
      DEALLOCATE( blk_h )
#endif
      !
#endif
      RETURN
   END SUBROUTINE shift_block

   SUBROUTINE shift_exch_block( blk, dir, tag )
      !
      !   Combined block shift and exchange
      !   only used for the first step
      !
      IMPLICIT NONE
      REAL(DP), DEVICE :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir
      INTEGER,          INTENT(IN) :: tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      INTEGER :: icol, irow
#if ! defined(__GPU_MPI)
      REAL(DP), ALLOCATABLE :: blk_h( :, : )
      ALLOCATE( blk_h, SOURCE =  blk )
      ierr = cudaDeviceSynchronize()
#endif
      !
      IF( dir == 'W' ) THEN
         !
         icol = rowid
         irow = colid
         !
         irdst = irow
         icdst = MOD( icol - irow-1 + np, np )
         !
         irow = rowid
         icol = MOD( colid + rowid+1 + np, np )
         !
         irsrc = icol
         icsrc = irow
         !
      ELSE IF( dir == 'N' ) THEN
         !
         icol = rowid
         irow = colid
         !
         icdst = icol
         irdst = MOD( irow - icol-1 + np, np )
         !
         irow = MOD( rowid + colid+1 + np, np )
         icol = colid
         !
         irsrc = icol
         icsrc = irow

      ELSE

         CALL lax_error__( ' sqr_mm_cannon_gpu ', ' unknown shift_exch direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
#if defined(__GPU_MPI)
      ierr = cudaDeviceSynchronize()
      CALL MPI_SENDRECV_REPLACE(blk, SIZE(blk), MPI_DOUBLE_PRECISION, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
#else
      CALL MPI_SENDRECV_REPLACE(blk_h, SIZE(blk_h), MPI_DOUBLE_PRECISION, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_mm_cannon_gpu ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
      blk = blk_h
      ierr = cudaDeviceSynchronize()
      DEALLOCATE( blk_h )
#endif
      !
#endif
      RETURN
   END SUBROUTINE shift_exch_block

END SUBROUTINE sqr_dmm_cannon_gpu_x
#endif
!=----------------------------------------------------------------------------=!

SUBROUTINE sqr_smm_cannon_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
   !
   !!  Single precision parallel square matrix multiplication with Cannon's algorithm
   !!  performs one of the matrix-matrix operations
   !!
   !!     C := ALPHA*OP( A )*OP( B ) + BETA*C,
   !!
   !!  where  op( x ) is one of
   !!
   !!     OP( X ) = X   OR   OP( X ) = X',
   !!
   !!  alpha and beta are scalars, and a, b and c are square matrices
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   CHARACTER(LEN=1), INTENT(IN) :: transa
   !! specifies the form of op( A ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( A ) = A.
   !! 'T' or 't',  op( A ) = A**T.
   !! 'C' or 'c',  op( A ) = A**T.
   CHARACTER(LEN=1), INTENT(IN) :: transb
   !! specifies the form of op( B ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( B ) = B.
   !! 'T' or 't',  op( B ) = B**T.
   !! 'C' or 'c',  op( B ) = B**T.
   INTEGER, INTENT(IN) :: n
   !! global dimension
   REAL(SP), INTENT(IN) :: alpha
   !! scalar alpha
   REAL(SP), INTENT(IN) :: beta
   !! scalar beta
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of A
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of B
   INTEGER, INTENT(IN) :: ldc
   !! leading dimension of C
   REAL(SP) :: a(lda,*)
   !! matrix A
   REAL(SP) :: b(ldb,*)
   !! matrix B
   REAL(SP) :: c(ldc,*)
   !! matrix C
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   !
   integer :: ierr
   integer :: np
   integer :: i, j, nr, nc, nb, iter, rowid, colid
   logical :: ta, tb
   INTEGER :: comm
   !
   !
   real(SP), allocatable :: bblk(:,:), ablk(:,:)
   !
#if defined (__MPI)
   !
   integer :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   CALL laxlib_intarray_to_desc(desc,idesc)
   !
   IF( desc%active_node < 0 ) THEN
      !
      !  processors not interested in this computation return quickly
      !
      RETURN
      !
   END IF

   IF( n < 1 ) THEN
      RETURN
   END IF

   IF( desc%npr == 1 ) THEN
      !
      !  quick return if only one processor is used
      !
      CALL sgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
      !
      RETURN
      !
   END IF

   IF( desc%npr /= desc%npc ) &
      CALL lax_error__( ' sqr_smm_cannon ', ' works only with square processor mesh ', 1 )
   !
   !  Retrieve communicator and mesh geometry
   !
   np    = desc%npr
   comm  = desc%comm
   rowid = desc%myr
   colid = desc%myc
   !
   !  Retrieve the size of the local block
   !
   nr    = desc%nr
   nc    = desc%nc
   nb    = desc%nrcx
   !
#if defined (__MPI)
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " sqr_smm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
   !
   allocate( ablk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         ablk( i, j ) = a( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
   DO j = nc+1, nb
      DO i = 1, nb
         ablk( i, j ) = 0.0_SP
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         ablk( i, j ) = 0.0_SP
      END DO
   END DO
   !
   !
   allocate( bblk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         bblk( i, j ) = b( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
   DO j = nc+1, nb
      DO i = 1, nb
         bblk( i, j ) = 0.0_SP
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         bblk( i, j ) = 0.0_SP
      END DO
   END DO
   !
   !
   ta = ( TRANSA == 'T' .OR. TRANSA == 't' )
   tb = ( TRANSB == 'T' .OR. TRANSB == 't' )
   !
   !  Shift A rowid+1 places to the west
   !
   IF( ta ) THEN
      CALL shift_exch_block( ablk, 'W', 1 )
   ELSE
      CALL shift_block( ablk, 'W', rowid+1, 1 )
   END IF
   !
   !  Shift B colid+1 places to the north
   !
   IF( tb ) THEN
      CALL shift_exch_block( bblk, 'N', np+1 )
   ELSE
      CALL shift_block( bblk, 'N', colid+1, np+1 )
   END IF
   !
   !  Accumulate on C
   !
   CALL sgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
   !
   DO iter = 2, np
      !
      !  Shift A 1 places to the east
      !
      CALL shift_block( ablk, 'E', 1, iter )
      !
      !  Shift B 1 places to the south
      !
      CALL shift_block( bblk, 'S', 1, np+iter )
      !
      !  Accumulate on C
      !
      CALL sgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, 1.0_SP, c, ldc)
      !
   END DO

   deallocate( ablk, bblk )

   RETURN

CONTAINS

   SUBROUTINE shift_block( blk, dir, ln, tag )
      !
      !   Block shift
      !
      IMPLICIT NONE
      REAL(SP) :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir      ! shift direction
      INTEGER,          INTENT(IN) :: ln       ! shift length
      INTEGER,          INTENT(IN) :: tag      ! communication tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      !
      IF( dir == 'W' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid - ln + np, np )
         icsrc = MOD( colid + ln + np, np )
         !
      ELSE IF( dir == 'E' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid + ln + np, np )
         icsrc = MOD( colid - ln + np, np )
         !
      ELSE IF( dir == 'N' ) THEN

         irdst = MOD( rowid - ln + np, np )
         irsrc = MOD( rowid + ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE IF( dir == 'S' ) THEN

         irdst = MOD( rowid + ln + np, np )
         irsrc = MOD( rowid - ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE

         CALL lax_error__( ' sqr_smm_cannon ', ' unknown shift direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_REAL, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_smm_cannon ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
      !
#endif
      RETURN
   END SUBROUTINE shift_block

   SUBROUTINE shift_exch_block( blk, dir, tag )
      !
      !   Combined block shift and exchange
      !   only used for the first step
      !
      IMPLICIT NONE
      REAL(SP) :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir
      INTEGER,          INTENT(IN) :: tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      INTEGER :: icol, irow
      !
      IF( dir == 'W' ) THEN
         !
         icol = rowid
         irow = colid
         !
         irdst = irow
         icdst = MOD( icol - irow-1 + np, np )
         !
         irow = rowid
         icol = MOD( colid + rowid+1 + np, np )
         !
         irsrc = icol
         icsrc = irow
         !
      ELSE IF( dir == 'N' ) THEN
         !
         icol = rowid
         irow = colid
         !
         icdst = icol
         irdst = MOD( irow - icol-1 + np, np )
         !
         irow = MOD( rowid + colid+1 + np, np )
         icol = colid
         !
         irsrc = icol
         icsrc = irow

      ELSE

         CALL lax_error__( ' sqr_smm_cannon ', ' unknown shift_exch direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_REAL, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_smm_cannon ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
      !
#endif
      RETURN
   END SUBROUTINE shift_exch_block

END SUBROUTINE sqr_smm_cannon_x


!=----------------------------------------------------------------------------=!

SUBROUTINE sqr_zmm_cannon_x( transa, transb, n, alpha, a, lda, b, ldb, beta, c, ldc, idesc )
   !
   !!  Double precision complex (Z) parallel square matrix multiplication with Cannon's algorithm
   !!  performs one of the matrix-matrix operations
   !!
   !!     C := ALPHA*OP( A )*OP( B ) + BETA*C,
   !!
   !!  where  op( x ) is one of
   !!
   !!     OP( X ) = X   OR   OP( X ) = X',
   !!
   !!  alpha and beta are scalars, and a, b and c are square matrices
   !
   USE laxlib_descriptor
   !
   USE laxlib_parallel_include
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   CHARACTER(LEN=1), INTENT(IN) :: transa
   !! specifies the form of op( A ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( A ) = A.
   !! 'T' or 't',  op( A ) = A**T.
   !! 'C' or 'c',  op( A ) = A**T.
   CHARACTER(LEN=1), INTENT(IN) :: transb
   !! specifies the form of op( B ) to be used in the matrix multiplication as
   !follows:
   !! 'N' or 'n',  op( B ) = B.
   !! 'T' or 't',  op( B ) = B**T.
   !! 'C' or 'c',  op( B ) = B**T.
   INTEGER, INTENT(IN) :: n
   !! global dimension
   COMPLEX(DP), INTENT(IN) :: alpha
   !! scalar alpha
   COMPLEX(DP), INTENT(IN) :: beta
   !! scalar beta
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of A
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of B
   INTEGER, INTENT(IN) :: ldc
   !! leading dimension of C
   COMPLEX(DP) :: a(lda,*)
   !! matrix A
   COMPLEX(DP) :: b(ldb,*)
   !! matrix B
   COMPLEX(DP) :: c(ldc,*)
   !! matrix C
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   !
   INTEGER :: ierr
   INTEGER :: np
   INTEGER :: i, j, nr, nc, nb, iter, rowid, colid
   LOGICAL :: ta, tb
   INTEGER :: comm
   !
   !
   COMPLEX(DP), ALLOCATABLE :: bblk(:,:), ablk(:,:)
   COMPLEX(DP) :: zone = ( 1.0_DP, 0.0_DP )
   COMPLEX(DP) :: zzero = ( 0.0_DP, 0.0_DP )
   !
#if defined (__MPI)
   !
   integer :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   CALL laxlib_intarray_to_desc(desc,idesc)
   !
   IF( desc%active_node < 0 ) THEN
      !
      !  processors not interested in this computation return quickly
      !
      RETURN
      !
   END IF

   IF( n < 1 ) THEN
      RETURN
   END IF

   IF( desc%npr == 1 ) THEN
      !
      !  quick return if only one processor is used
      !
      CALL zgemm( TRANSA, TRANSB, n, n, n, alpha, a, lda, b, ldb, beta, c, ldc)
      !
      RETURN
      !
   END IF

   IF( desc%npr /= desc%npc ) &
      CALL lax_error__( ' sqr_zmm_cannon ', ' works only with square processor mesh ', 1 )
   !
   !  Retrieve communicator and mesh geometry
   !
   np    = desc%npr
   comm  = desc%comm
   rowid = desc%myr
   colid = desc%myc
   !
   !  Retrieve the size of the local block
   !
   nr    = desc%nr
   nc    = desc%nc
   nb    = desc%nrcx
   !
#if defined (__MPI)
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " sqr_zmm_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
   !
   allocate( ablk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         ablk( i, j ) = a( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
   DO j = nc+1, nb
      DO i = 1, nb
         ablk( i, j ) = zzero
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         ablk( i, j ) = zzero
      END DO
   END DO
   !
   !
   allocate( bblk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         bblk( i, j ) = b( i, j )
      END DO
   END DO
   !
   !  Clear memory outside the matrix block
   !
   DO j = nc+1, nb
      DO i = 1, nb
         bblk( i, j ) = zzero
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         bblk( i, j ) = zzero
      END DO
   END DO
   !
   !
   ta = ( TRANSA == 'C' .OR. TRANSA == 'c' )
   tb = ( TRANSB == 'C' .OR. TRANSB == 'c' )
   !
   !  Shift A rowid+1 places to the west
   !
   IF( ta ) THEN
      CALL shift_exch_block( ablk, 'W', 1 )
   ELSE
      CALL shift_block( ablk, 'W', rowid+1, 1 )
   END IF
   !
   !  Shift B colid+1 places to the north
   !
   IF( tb ) THEN
      CALL shift_exch_block( bblk, 'N', np+1 )
   ELSE
      CALL shift_block( bblk, 'N', colid+1, np+1 )
   END IF
   !
   !  Accumulate on C
   !
   CALL zgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, beta, c, ldc)
   !
   DO iter = 2, np
      !
      !  Shift A 1 places to the east
      !
      CALL shift_block( ablk, 'E', 1, iter )
      !
      !  Shift B 1 places to the south
      !
      CALL shift_block( bblk, 'S', 1, np+iter )
      !
      !  Accumulate on C
      !
      CALL zgemm( TRANSA, TRANSB, nr, nc, nb, alpha, ablk, nb, bblk, nb, zone, c, ldc)
      !
   END DO

   deallocate( ablk, bblk )

   RETURN

CONTAINS

   SUBROUTINE shift_block( blk, dir, ln, tag )
      !
      !   Block shift
      !
      IMPLICIT NONE
      COMPLEX(DP) :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir      ! shift direction
      INTEGER,          INTENT(IN) :: ln       ! shift length
      INTEGER,          INTENT(IN) :: tag      ! communication tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      !
      IF( dir == 'W' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid - ln + np, np )
         icsrc = MOD( colid + ln + np, np )
         !
      ELSE IF( dir == 'E' ) THEN
         !
         irdst = rowid
         irsrc = rowid
         icdst = MOD( colid + ln + np, np )
         icsrc = MOD( colid - ln + np, np )
         !
      ELSE IF( dir == 'N' ) THEN

         irdst = MOD( rowid - ln + np, np )
         irsrc = MOD( rowid + ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE IF( dir == 'S' ) THEN

         irdst = MOD( rowid + ln + np, np )
         irsrc = MOD( rowid - ln + np, np )
         icdst = colid
         icsrc = colid

      ELSE

         CALL lax_error__( ' sqr_zmm_cannon ', ' unknown shift direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_COMPLEX, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_zmm_cannon ", " in MPI_SENDRECV_REPLACE 1 ", ABS( ierr ) )
      !
#endif
      RETURN
   END SUBROUTINE shift_block
   !
   SUBROUTINE shift_exch_block( blk, dir, tag )
      !
      !   Combined block shift and exchange
      !   only used for the first step
      !
      IMPLICIT NONE
      COMPLEX(DP) :: blk( :, : )
      CHARACTER(LEN=1), INTENT(IN) :: dir
      INTEGER,          INTENT(IN) :: tag
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      INTEGER :: icol, irow
      !
      IF( dir == 'W' ) THEN
         !
         icol = rowid
         irow = colid
         !
         irdst = irow
         icdst = MOD( icol - irow-1 + np, np )
         !
         irow = rowid
         icol = MOD( colid + rowid+1 + np, np )
         !
         irsrc = icol
         icsrc = irow
         !
      ELSE IF( dir == 'N' ) THEN
         !
         icol = rowid
         irow = colid
         !
         icdst = icol
         irdst = MOD( irow - icol-1 + np, np )
         !
         irow = MOD( rowid + colid+1 + np, np )
         icol = colid
         !
         irsrc = icol
         icsrc = irow

      ELSE

         CALL lax_error__( ' sqr_zmm_cannon ', ' unknown shift_exch direction ', 1 )

      END IF
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_COMPLEX, &
           idest, tag, isour, tag, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_zmm_cannon ", " in MPI_SENDRECV_REPLACE 2 ", ABS( ierr ) )
      !
#endif
      RETURN
   END SUBROUTINE shift_exch_block

END SUBROUTINE sqr_zmm_cannon_x

!
!
!
!

SUBROUTINE sqr_tr_cannon_x( n, a, lda, b, ldb, idesc )
   !
   !!  Parallel square matrix transposition with Cannon's algorithm
   !!
   !
   USE laxlib_parallel_include
   IMPLICIT NONE
   !
   include 'laxlib_param.fh'
   include 'laxlib_kinds.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of A
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of B
   REAL(DP)            :: a(lda,*)
   !! matrix A
   REAL(DP)            :: b(ldb,*)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   INTEGER :: ierr
   INTEGER :: np, rowid, colid
   INTEGER :: i, j, nr, nc, nb
   INTEGER :: comm
   !
   REAL(DP), ALLOCATABLE :: ablk(:,:)
   !
#if defined (__MPI)
   !
   INTEGER :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
      RETURN
   END IF

   IF( n < 1 ) THEN
     RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) == 1 ) THEN
      CALL mytranspose( a, lda, b, ldb, n, n )
      RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' works only with square processor mesh ', 1 )
   IF( n /= idesc(LAX_DESC_N) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size n  ', 1 )
   IF( lda /= idesc(LAX_DESC_NRCX) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size lda  ', 1 )
   IF( ldb /= idesc(LAX_DESC_NRCX) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size ldb  ', 1 )

   comm = idesc(LAX_DESC_COMM)

   rowid = idesc(LAX_DESC_MYR)
   colid = idesc(LAX_DESC_MYC)
   np    = idesc(LAX_DESC_NPR)
   !
   !  Compute the size of the local block
   !
   nr = idesc(LAX_DESC_NR)
   nc = idesc(LAX_DESC_NC)
   nb = idesc(LAX_DESC_NRCX)
   !
   allocate( ablk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         ablk( i, j ) = a( i, j )
      END DO
   END DO
   DO j = nc+1, nb
      DO i = 1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
   !
   CALL exchange_block( ablk )
   !
#if defined (__MPI)
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " sqr_tr_cannon ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
   !
   DO j = 1, nr
      DO i = 1, nc
         b( j, i ) = ablk( i, j )
      END DO
   END DO
   !
   deallocate( ablk )

   RETURN

CONTAINS

   SUBROUTINE exchange_block( blk )
      !
      !   Block exchange ( transpose )
      !
      IMPLICIT NONE
      REAL(DP) :: blk( :, : )
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      !
      irdst = colid
      icdst = rowid
      irsrc = colid
      icsrc = rowid
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_DOUBLE_PRECISION, &
           idest, np+np+1, isour, np+np+1, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_tr_cannon ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
      !
#endif

      RETURN
   END SUBROUTINE


END SUBROUTINE

!
#if defined (__CUDA)
SUBROUTINE sqr_tr_cannon_gpu_x( n, a, lda, b, ldb, idesc )
   !
   !!  Parallel square matrix transposition with Cannon's algorithm
   !!  GPU version
   !!
   !
   USE laxlib_parallel_include
   USE cudafor
   USE cublas
   !
   IMPLICIT NONE
   !
   include 'laxlib_param.fh'
   include 'laxlib_kinds.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of A
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of B
   REAL(DP), INTENT(IN),  DEVICE :: a(:,:)
   !! matrix A
   REAL(DP), INTENT(OUT), DEVICE :: b(:,:)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   INTEGER :: ierr
   INTEGER :: np, rowid, colid
   INTEGER :: i, j, nr, nc, nb, ldx
   INTEGER :: comm
   !
   REAL(DP) :: tmp
#if defined (__GPU_MPI)
   REAL(DP), ALLOCATABLE, DEVICE :: ablk(:,:)
#else
   REAL(DP), ALLOCATABLE :: ablk(:,:)
#endif
   !
#if defined (__MPI)
   !
   INTEGER :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   IF( n < 2 ) THEN
     GOTO 1000
   END IF
   !
   IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
      GOTO 1000
   END IF
   !
   IF( idesc(LAX_DESC_NPR) == 1 .AND. idesc(LAX_DESC_NPC) == 1 ) THEN
!$cuf kernel do(2) <<<*,*>>>
      DO j = 1, n
         DO i = 1, n
            b( j, i ) = a( i, j )
         END DO
      END DO
      GOTO 1000
   END IF


   IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
      CALL lax_error__( ' sqr_tr_cannon_gpu ', ' works only with square processor mesh ', 1 )
   IF( n /= idesc(LAX_DESC_N) ) &
      CALL lax_error__( ' sqr_tr_cannon_gpu ', ' inconsistent size n  ', 1 )
   IF( lda /= idesc(LAX_DESC_NRCX) .OR. lda /= SIZE(a,1) ) &
      CALL lax_error__( ' sqr_tr_cannon_gpu ', ' inconsistent size lda  ', 1 )
   IF( ldb /= idesc(LAX_DESC_NRCX) .OR. ldb /= SIZE(b,1) ) &
      CALL lax_error__( ' sqr_tr_cannon_gpu ', ' inconsistent size ldb  ', 1 )

   comm = idesc(LAX_DESC_COMM)

   rowid = idesc(LAX_DESC_MYR)
   colid = idesc(LAX_DESC_MYC)
   np    = idesc(LAX_DESC_NPR)
   !
   !  Compute the size of the local block
   !
   nr = idesc(LAX_DESC_NR)
   nc = idesc(LAX_DESC_NC)
   nb = idesc(LAX_DESC_NRCX)
   ldx = MAX(lda,ldb)
   !
   ALLOCATE( ablk(nb,nb) )
   ablk(1:nr,1:nc) = a(1:nr,1:nc)
   DO j = nc+1, nb
      DO i = 1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         ablk( i, j ) = 0.0_DP
      END DO
   END DO
   !
   CALL exchange_block( ablk )
   !
#if defined (__MPI)
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " sqr_tr_cannon_gpu ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
   !
#if defined (__GPU_MPI)
!$cuf kernel do(2) <<<*,*>>>
   DO j = 1, nr
      DO i = 1, nc
         b( j, i ) = ablk( i, j )
      END DO
   END DO
   ierr = cudaDeviceSynchronize()
#else
   DO j = 1, ldx
      DO i = j + 1, ldx
            tmp = ablk(i,j)
            ablk(i,j) = ablk(j,i)
            ablk(j,i) = tmp
      END DO
   END DO
   !ierr = cudaMemcpy2D(b, SIZE(b,1), ablk, ldx, nc, nr, cudaMemcpyHostToDevice )
   b(1:nr,1:nc) = ablk(1:nr,1:nc)
   !DO j = 1, nr
   !   DO i = 1, nc
   !      b( j, i ) = ablk( i, j )
   !   END DO
   !END DO
#endif
   !
   !
   deallocate( ablk )

1000 CONTINUE

   ierr = cudaDeviceSynchronize()

   RETURN

CONTAINS

   SUBROUTINE exchange_block( blk )
      !
      !   Block exchange ( transpose )
      !
      IMPLICIT NONE
#if defined (__GPU_MPI)
      REAL(DP), DEVICE :: blk( :, : )
#else
      REAL(DP) :: blk( :, : )
#endif
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      !
      irdst = colid
      icdst = rowid
      irsrc = colid
      icsrc = rowid
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
#if defined (__GPU_MPI)
      ierr = cudaDeviceSynchronize()
#endif
      CALL MPI_SENDRECV_REPLACE(blk, SIZE(blk), MPI_DOUBLE_PRECISION, &
           idest, np+np+1, isour, np+np+1, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_tr_cannon_gpu ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
      !
#endif

      RETURN
   END SUBROUTINE

END SUBROUTINE sqr_tr_cannon_gpu_x
#endif

!

SUBROUTINE sqr_tr_cannon_sp_x( n, a, lda, b, ldb, idesc )
   !
   !! Parallel square matrix transposition with Cannon's algorithm
   !! single precision version
   !
   USE laxlib_parallel_include
   IMPLICIT NONE
   !
   include 'laxlib_param.fh'
   include 'laxlib_kinds.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of A
   INTEGER, INTENT(IN) :: ldb
   !! leading dimension of B
   REAL(SP)            :: a(lda,*)
   !! matrix A
   REAL(SP)            :: b(ldb,*)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   INTEGER :: ierr
   INTEGER :: np, rowid, colid
   INTEGER :: i, j, nr, nc, nb
   INTEGER :: comm
   !
   REAL(SP), ALLOCATABLE :: ablk(:,:)
   !
#if defined (__MPI)
   !
   INTEGER :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
      RETURN
   END IF

   IF( n < 1 ) THEN
     RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) == 1 ) THEN
      CALL mytranspose_sp( a, lda, b, ldb, n, n )
      RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' works only with square processor mesh ', 1 )
   IF( n /= idesc(LAX_DESC_N) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size n  ', 1 )
   IF( lda /= idesc(LAX_DESC_NRCX) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size lda  ', 1 )
   IF( ldb /= idesc(LAX_DESC_NRCX) ) &
      CALL lax_error__( ' sqr_tr_cannon ', ' inconsistent size ldb  ', 1 )

   comm = idesc(LAX_DESC_COMM)

   rowid = idesc(LAX_DESC_MYR)
   colid = idesc(LAX_DESC_MYC)
   np    = idesc(LAX_DESC_NPR)
   !
   !  Compute the size of the local block
   !
   nr = idesc(LAX_DESC_NR)
   nc = idesc(LAX_DESC_NC)
   nb = idesc(LAX_DESC_NRCX)
   !
   allocate( ablk( nb, nb ) )
   DO j = 1, nc
      DO i = 1, nr
         ablk( i, j ) = a( i, j )
      END DO
   END DO
   DO j = nc+1, nb
      DO i = 1, nb
         ablk( i, j ) = 0.0_SP
      END DO
   END DO
   DO j = 1, nb
      DO i = nr+1, nb
         ablk( i, j ) = 0.0_SP
      END DO
   END DO
   !
   CALL exchange_block( ablk )
   !
#if defined (__MPI)
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " sqr_tr_cannon_sp ", " in MPI_BARRIER ", ABS( ierr ) )
#endif
   !
   DO j = 1, nr
      DO i = 1, nc
         b( j, i ) = ablk( i, j )
      END DO
   END DO
   !
   deallocate( ablk )

   RETURN

CONTAINS

   SUBROUTINE exchange_block( blk )
      !
      !   Block exchange ( transpose )
      !
      IMPLICIT NONE
      REAL(SP) :: blk( :, : )
      !
      INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
      !
      irdst = colid
      icdst = rowid
      irsrc = colid
      icsrc = rowid
      !
      CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
      CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
      !
#if defined (__MPI)
      !
      CALL MPI_SENDRECV_REPLACE(blk, nb*nb, MPI_REAL, &
           idest, np+np+1, isour, np+np+1, comm, istatus, ierr)
      IF( ierr /= 0 ) &
         CALL lax_error__( " sqr_tr_cannon_sp ", " in MPI_SENDRECV_REPLACE ", ABS( ierr ) )
      !
#endif

      RETURN
   END SUBROUTINE


END SUBROUTINE


SUBROUTINE redist_row2col_x( n, a, b, ldx, nx, idesc )
   !
   !!  redistribute a, array whose second dimension is distributed over processor row,
   !!  to obtain b, with the second dim. distributed over processor column
   !
   !
   USE laxlib_parallel_include
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: ldx
   !! local rows
   INTEGER, INTENT(IN) :: nx
   !! local columns
   REAL(DP)            :: a(ldx,nx)
   !! matrix A
   REAL(DP)            :: b(ldx,nx)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   INTEGER :: ierr
   INTEGER :: np, rowid, colid
   INTEGER :: comm
   INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
   !
#if defined (__MPI)
   !
   INTEGER :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
      RETURN
   END IF

   IF( n < 1 ) THEN
     RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) == 1 ) THEN
      b = a
      RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
      CALL lax_error__( ' redist_row2col ', ' works only with square processor mesh ', 1 )
   IF( n /= idesc(LAX_DESC_N) ) &
      CALL lax_error__( ' redist_row2col ', ' inconsistent size n  ', 1 )
   IF( nx /= idesc(LAX_DESC_NRCX) ) &
      CALL lax_error__( ' redist_row2col ', ' inconsistent size lda  ', 1 )

   comm = idesc(LAX_DESC_COMM)

   rowid = idesc(LAX_DESC_MYR)
   colid = idesc(LAX_DESC_MYC)
   np    = idesc(LAX_DESC_NPR)
   !
   irdst = colid
   icdst = rowid
   irsrc = colid
   icsrc = rowid
   !
   CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
   CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
   !
#if defined (__MPI)
   !
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col ", " in MPI_BARRIER ", ABS( ierr ) )
   !
   CALL MPI_SENDRECV(a, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
                     b, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col ", " in MPI_SENDRECV ", ABS( ierr ) )
   !
#else
   b = a
#endif
   !
   RETURN

END SUBROUTINE redist_row2col_x

#if defined (__CUDA)
SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
   !
   !!  redistribute a, array whose second dimension is distributed over processor row,
   !!  to obtain b, with the second dim. distributed over processor column
   !!  GPU version
   !
   USE cudafor
   USE laxlib_parallel_include
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: ldx
   !! local rows
   INTEGER, INTENT(IN) :: nx
   !! local columns
   REAL(DP), DEVICE    :: a(:,:)
   !! matrix A
   REAL(DP), DEVICE    :: b(:,:)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   REAL(DP), ALLOCATABLE :: a_h(:,:), b_h(:,:)
   INTEGER :: ierr
   INTEGER :: np, rowid, colid
   INTEGER :: comm
   INTEGER :: icdst, irdst, icsrc, irsrc, idest, isour
   !
#if defined (__MPI)
   !
   INTEGER :: istatus( MPI_STATUS_SIZE )
   !
#endif
   !
   IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
      RETURN
   END IF

   IF( n < 1 ) THEN
      RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) == 1 ) THEN
      b = a
      RETURN
   END IF

   IF( idesc(LAX_DESC_NPR) /= idesc(LAX_DESC_NPC) ) &
      CALL lax_error__( ' redist_row2col_gpu ', ' works only with square processor mesh ', 1 )
   IF( n /= idesc(LAX_DESC_N) ) &
      CALL lax_error__( ' redist_row2col_gpu ', ' inconsistent size n  ', 1 )
   IF( nx /= idesc(LAX_DESC_NRCX) ) &
      CALL lax_error__( ' redist_row2col_gpu ', ' inconsistent size lda  ', 1 )

   comm = idesc(LAX_DESC_COMM)

   rowid = idesc(LAX_DESC_MYR)
   colid = idesc(LAX_DESC_MYC)
   np    = idesc(LAX_DESC_NPR)
   !
   irdst = colid
   icdst = rowid
   irsrc = colid
   icsrc = rowid
   !
   CALL GRID2D_RANK( 'R', np, np, irdst, icdst, idest )
   CALL GRID2D_RANK( 'R', np, np, irsrc, icsrc, isour )
   !
#if defined (__MPI)
   !
   CALL MPI_BARRIER( comm, ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col_gpu ", " in MPI_BARRIER ", ABS( ierr ) )
   !
#if defined(__GPU_MPI)
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col_gpu ", " allocating wrk ", ABS( ierr ) )
   !
   ierr = cudaDeviceSynchronize()
   CALL MPI_SENDRECV(a, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
                     b, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
#else
   ALLOCATE( a_h, SOURCE=a, STAT=ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col_gpu ", " allocating a_h ", ABS( ierr ) )
   ALLOCATE( b_h, MOLD=b, STAT=ierr )
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col_gpu ", " allocating b_h ", ABS( ierr ) )
   !
   CALL MPI_SENDRECV(a_h, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
                     b_h, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
   IF( ierr /= 0 ) &
      CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
   !
   b = b_h
   !
   DEALLOCATE( a_h, b_h )
#endif
   !
#else
   b = a
#endif
   !
   RETURN

END SUBROUTINE redist_row2col_gpu_x

#endif

!
!
!

SUBROUTINE cyc2blk_redist_x( n, a, lda, nca, b, ldb, ncb, idesc )
   !
   !!  Parallel square matrix redistribution. Double precision
   !!  A (input) is cyclically distributed by rows across processors
   !!  B (output) is distributed by block across 2D processors grid
   !!
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: lda
   !! local rows of A
   INTEGER, INTENT(IN) :: nca
   !! local columns of A
   INTEGER, INTENT(IN) :: ldb
   !! local rows of B
   INTEGER, INTENT(IN) :: ncb
   !! local columns of B
   REAL(DP)            :: a(lda,nca)
   !! matrix A
   REAL(DP)            :: b(ldb,ncb)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   integer :: ierr, itag
   integer :: np, ip, me, nproc, comm_a
   integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
   integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
   INTEGER :: me_ortho(2), np_ortho(2)
   !
   real(DP), allocatable :: rcvbuf(:,:,:)
   real(DP), allocatable :: sndbuf(:,:)
   TYPE(la_descriptor) :: ip_desc
   !
   character(len=256) :: msg
   !
#if defined (__MPI)

   CALL laxlib_intarray_to_desc(desc,idesc)

   IF( desc%active_node < 0 ) THEN
      RETURN
   END IF

   np = desc%npr  !  dimension of the processor mesh
   nb = desc%nrcx    !  leading dimension of the local matrix block
   me = desc%mype   !  my processor id (starting from 0)
   comm_a = desc%comm
   nproc  = desc%npr * desc%npc

   IF( np /= desc%npc ) &
      CALL lax_error__( ' cyc2blk_redist ', ' works only with square processor mesh ', 1 )
   IF( n < 1 ) &
      CALL lax_error__( ' cyc2blk_redist ', ' incorrect first argument (n <= 0)', 1 )
   IF( desc%n < nproc ) &
      CALL lax_error__( ' cyc2blk_redist ', ' number of bands < number of proc ', 1 )

   nbuf = (nb/nproc+2) * nb
   !
   ALLOCATE( sndbuf( nb/nproc+2, nb ) )
   ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
   !
   ! loop over all processors
   !
   DO ip = 0, nproc - 1
      !
      ! 2D proc ortho grid sizes
      !
      np_ortho(1) = desc%npr
      np_ortho(2) = desc%npc
      !
      ! compute other processor coordinates me_ortho
      !
      CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
      !
      ! initialize other processor descriptor
      !
      CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )

      IF( ip_desc%nrcx /= nb ) &
         CALL lax_error__( ' cyc2blk_redist ', ' inconsistent block dim nb ', 1 )
      !
      IF( ip_desc%active_node > 0 ) THEN

         ip_nr = ip_desc%nr
         ip_nc = ip_desc%nc
         ip_ir = ip_desc%ir
         ip_ic = ip_desc%ic
         !
         DO j = 1, ip_nc
            jj = j + ip_ic - 1
            il = 1
            DO i = 1, ip_nr
               ii = i + ip_ir - 1
               IF( MOD( ii - 1, nproc ) == me ) THEN
                  CALL check_sndbuf_index()
                  sndbuf( il, j ) = a( ( ii - 1 )/nproc + 1, jj )
                  il = il + 1
               END IF
            END DO
         END DO

      END IF

      CALL mpi_barrier( comm_a, ierr )

      CALL mpi_gather( sndbuf, nbuf, mpi_double_precision, &
                       rcvbuf, nbuf, mpi_double_precision, ip, comm_a, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " cyc2blk_redist ", " in mpi_gather ", ABS( ierr ) )

   END DO

   !
   nr  = desc%nr
   nc  = desc%nc
   ir  = desc%ir
   ic  = desc%ic
   !
   DO ip = 0, nproc - 1
      DO j = 1, nc
         il = 1
         DO i = 1, nr
            ii = i + ir - 1
            IF( MOD( ii - 1, nproc ) == ip ) THEN
               CALL check_rcvbuf_index()
               b( i, j ) = rcvbuf( il, j, ip+1 )
               il = il + 1
            END IF
         END DO
      END DO
   END DO
   !
   DEALLOCATE( rcvbuf )
   DEALLOCATE( sndbuf )

#else

   b( 1:n, 1:n ) = a( 1:n, 1:n )

#endif

   RETURN

CONTAINS

   SUBROUTINE check_sndbuf_index()
      CHARACTER(LEN=38), SAVE :: msg = ' check_sndbuf_index in cyc2blk_redist '
      IF( j  > SIZE(sndbuf,2) ) CALL lax_error__( msg, ' j > SIZE(sndbuf,2) ', ip+1 )
      IF( il > SIZE(sndbuf,1) ) CALL lax_error__( msg, ' il > SIZE(sndbuf,1) ', ip+1 )
      IF( ( ii - 1 )/nproc + 1 < 1 ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 < 1 ', ip+1 )
      IF( ( ii - 1 )/nproc + 1 > lda ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 > SIZE(a,1) ', ip+1 )
      IF( jj < 1 ) CALL lax_error__( msg, ' jj < 1 ', ip+1 )
      IF( jj > n ) CALL lax_error__( msg, ' jj > n ', ip+1 )
      RETURN
   END SUBROUTINE check_sndbuf_index

   SUBROUTINE check_rcvbuf_index()
      CHARACTER(LEN=38), SAVE :: msg = ' check_rcvbuf_index in cyc2blk_redist '
      IF( i > ldb ) CALL lax_error__( msg, ' i > ldb ', ip+1 )
      IF( j > ldb ) CALL lax_error__( msg, ' j > ldb ', ip+1 )
      IF( j > nb  ) CALL lax_error__( msg, ' j > nb  ', ip+1 )
      IF( il > SIZE( rcvbuf, 1 ) ) CALL lax_error__( msg, ' il too large ', ip+1 )
      RETURN
   END SUBROUTINE check_rcvbuf_index

END SUBROUTINE cyc2blk_redist_x


SUBROUTINE cyc2blk_zredist_x( n, a, lda, nca, b, ldb, ncb, idesc )
   !
   !! Parallel square matrix redistribution. Double precision complex (Z)
   !! A (input) is cyclically distributed by rows across processors
   !! B (output) is distributed by block across 2D processors grid
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: lda
   !! local rows of A
   INTEGER, INTENT(IN) :: nca
   !! local columns of A
   INTEGER, INTENT(IN) :: ldb
   !! local rows of B
   INTEGER, INTENT(IN) :: ncb
   !! local columns of B
   COMPLEX(DP)            :: a(lda,nca)
   !! matrix A
   COMPLEX(DP)            :: b(ldb,ncb)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   !
   integer :: ierr, itag
   integer :: np, ip, me, nproc, comm_a
   integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
   integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
   INTEGER :: me_ortho(2), np_ortho(2)
   !
   COMPLEX(DP), allocatable :: rcvbuf(:,:,:)
   COMPLEX(DP), allocatable :: sndbuf(:,:)
   TYPE(la_descriptor) :: ip_desc
   !
   character(len=256) :: msg
   !
#if defined (__MPI)

   CALL laxlib_intarray_to_desc(desc,idesc)

   IF( desc%active_node < 0 ) THEN
      RETURN
   END IF

   np = desc%npr  !  dimension of the processor mesh
   nb = desc%nrcx    !  leading dimension of the local matrix block
   me = desc%mype   !  my processor id (starting from 0)
   comm_a = desc%comm
   nproc  = desc%npr * desc%npc

   IF( np /= desc%npc ) &
      CALL lax_error__( ' cyc2blk_zredist ', ' works only with square processor mesh ', 1 )
   IF( n < 1 ) &
      CALL lax_error__( ' cyc2blk_zredist ', ' n less or equal zero ', 1 )
   IF( desc%n < nproc ) &
      CALL lax_error__( ' cyc2blk_zredist ', ' nb less than the number of proc ', 1 )
   !
   nbuf = (nb/nproc+2) * nb
   !
   ALLOCATE( sndbuf( nb/nproc+2, nb ) )
   ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )

   DO ip = 0, nproc - 1
      !
      ! 2D proc ortho grid sizes
      !
      np_ortho(1) = desc%npr
      np_ortho(2) = desc%npc
      !
      ! compute other processor coordinates me_ortho
      !
      CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
      !
      ! initialize other processor descriptor
      !
      CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )

      ip_nr = ip_desc%nr
      ip_nc = ip_desc%nc
      ip_ir = ip_desc%ir
      ip_ic = ip_desc%ic
      !
      DO j = 1, ip_nc
         jj = j + ip_ic - 1
         il = 1
         DO i = 1, ip_nr
            ii = i + ip_ir - 1
            IF( MOD( ii - 1, nproc ) == me ) THEN
               CALL check_sndbuf_index()
               sndbuf( il, j ) = a( ( ii - 1 )/nproc + 1, jj )
               il = il + 1
            END IF
         END DO
      END DO

      CALL mpi_barrier( comm_a, ierr )

      CALL mpi_gather( sndbuf, nbuf, mpi_double_complex, &
                       rcvbuf, nbuf, mpi_double_complex, ip, comm_a, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " cyc2blk_zredist ", " in mpi_gather ", ABS( ierr ) )

   END DO

   !
   nr  = desc%nr
   nc  = desc%nc
   ir  = desc%ir
   ic  = desc%ic
   !
   DO ip = 0, nproc - 1
      DO j = 1, nc
         il = 1
         DO i = 1, nr
            ii = i + ir - 1
            IF( MOD( ii - 1, nproc ) == ip ) THEN
               CALL check_rcvbuf_index()
               b( i, j ) = rcvbuf( il, j, ip+1 )
               il = il + 1
            END IF
         END DO
      END DO
   END DO
   !
   !
   DEALLOCATE( rcvbuf )
   DEALLOCATE( sndbuf )

#else

   b( 1:n, 1:n ) = a( 1:n, 1:n )

#endif

   RETURN

CONTAINS

   SUBROUTINE check_sndbuf_index()
      CHARACTER(LEN=40), SAVE :: msg = ' check_sndbuf_index in cyc2blk_zredist  '
      IF( j  > SIZE(sndbuf,2) ) CALL lax_error__( msg, ' j > SIZE(sndbuf,2) ', ip+1 )
      IF( il > SIZE(sndbuf,1) ) CALL lax_error__( msg, ' il > SIZE(sndbuf,1) ', ip+1 )
      IF( ( ii - 1 )/nproc + 1 < 1 ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 < 1 ', ip+1 )
      IF( ( ii - 1 )/nproc + 1 > SIZE(a,1) ) CALL lax_error__( msg, ' ( ii - 1 )/nproc + 1 > SIZE(a,1) ', ip+1 )
      IF( jj < 1 ) CALL lax_error__( msg, ' jj < 1 ', ip+1 )
      IF( jj > n ) CALL lax_error__( msg, ' jj > n ', ip+1 )
      RETURN
   END SUBROUTINE check_sndbuf_index

   SUBROUTINE check_rcvbuf_index()
      CHARACTER(LEN=40), SAVE :: msg = ' check_rcvbuf_index in cyc2blk_zredist  '
      IF( i > ldb ) CALL lax_error__( msg, ' i > ldb ', ip+1 )
      IF( j > ldb ) CALL lax_error__( msg, ' j > ldb ', ip+1 )
      IF( j > nb  ) CALL lax_error__( msg, ' j > nb  ', ip+1 )
      IF( il > SIZE( rcvbuf, 1 ) ) CALL lax_error__( msg, ' il too large ', ip+1 )
      RETURN
   END SUBROUTINE check_rcvbuf_index

END SUBROUTINE cyc2blk_zredist_x


SUBROUTINE blk2cyc_redist_x( n, a, lda, nca, b, ldb, ncb, idesc )
   !
   !!  Parallel square matrix redistribution. Double precision.
   !!  A (output) is cyclically distributed by rows across processors
   !!  B (input) is distributed by block across 2D processors grid
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: lda
   !! local rows of A
   INTEGER, INTENT(IN) :: nca
   !! local columns of A
   INTEGER, INTENT(IN) :: ldb
   !! local rows of B
   INTEGER, INTENT(IN) :: ncb
   !! local columns of B
   REAL(DP)            :: a(lda,nca)
   !! matrix A
   REAL(DP)            :: b(ldb,ncb)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   integer :: ierr, itag
   integer :: np, ip, me, comm_a, nproc
   integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
   integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
   INTEGER :: me_ortho(2), np_ortho(2)
   !
   REAL(DP), allocatable :: rcvbuf(:,:,:)
   REAL(DP), allocatable :: sndbuf(:,:)
   TYPE(la_descriptor) :: ip_desc
   !
   character(len=256) :: msg
   !
#if defined (__MPI)

   CALL laxlib_intarray_to_desc(desc,idesc)

   IF( desc%active_node < 0 ) THEN
      RETURN
   END IF

   np = desc%npr  !  dimension of the processor mesh
   nb = desc%nrcx    !  leading dimension of the local matrix block
   me = desc%mype   !  my processor id (starting from 0)
   comm_a = desc%comm
   nproc  = desc%npr * desc%npc

   IF( np /= desc%npc ) &
      CALL lax_error__( ' blk2cyc_redist ', ' works only with square processor mesh ', 1 )
   IF( n < 1 ) &
      CALL lax_error__( ' blk2cyc_redist ', ' n less or equal zero ', 1 )
   IF( desc%n < nproc ) &
      CALL lax_error__( ' blk2cyc_redist ', ' nb less than the number of proc ', 1 )
   !
   nbuf = (nb/nproc+2) * nb
   !
   ALLOCATE( sndbuf( nb/nproc+2, nb ) )
   ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
   !
   nr  = desc%nr
   nc  = desc%nc
   ir  = desc%ir
   ic  = desc%ic
   !
   DO ip = 0, nproc - 1
      DO j = 1, nc
         il = 1
         DO i = 1, nr
            ii = i + ir - 1
            IF( MOD( ii - 1, nproc ) == ip ) THEN
               sndbuf( il, j ) = b( i, j )
               il = il + 1
            END IF
         END DO
      END DO
      CALL mpi_barrier( comm_a, ierr )
      CALL mpi_gather( sndbuf, nbuf, mpi_double_precision, &
                       rcvbuf, nbuf, mpi_double_precision, ip, comm_a, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " blk2cyc_redist ", " in mpi_gather ", ABS( ierr ) )
   END DO
   !

   DO ip = 0, nproc - 1
      !
      ! 2D proc ortho grid sizes
      !
      np_ortho(1) = desc%npr
      np_ortho(2) = desc%npc
      !
      ! compute other processor coordinates me_ortho
      !
      CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
      !
      ! initialize other processor descriptor
      !
      CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )
      !
      ip_nr = ip_desc%nr
      ip_nc = ip_desc%nc
      ip_ir = ip_desc%ir
      ip_ic = ip_desc%ic
      !
      DO j = 1, ip_nc
         jj = j + ip_ic - 1
         il = 1
         DO i = 1, ip_nr
            ii = i + ip_ir - 1
            IF( MOD( ii - 1, nproc ) == me ) THEN
               a( ( ii - 1 )/nproc + 1, jj ) = rcvbuf( il, j, ip+1 )
               il = il + 1
            END IF
         END DO
      END DO

   END DO
   !
   DEALLOCATE( rcvbuf )
   DEALLOCATE( sndbuf )

#else

   a( 1:n, 1:n ) = b( 1:n, 1:n )

#endif

   RETURN

END SUBROUTINE blk2cyc_redist_x


SUBROUTINE blk2cyc_zredist_x( n, a, lda, nca, b, ldb, ncb, idesc )
   !
   !!  Parallel square matrix redistribution. Double precision complex (Z)
   !!  A (output) is cyclically distributed by rows across processors
   !!  B (input) is distributed by block across 2D processors grid
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: lda
   !! local rows of A
   INTEGER, INTENT(IN) :: nca
   !! local columns of A
   INTEGER, INTENT(IN) :: ldb
   !! local rows of B
   INTEGER, INTENT(IN) :: ncb
   !! local columns of B
   COMPLEX(DP)            :: a(lda,nca)
   !! matrix A
   COMPLEX(DP)            :: b(ldb,ncb)
   !! matrix B
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   !
   TYPE(la_descriptor) :: desc
   !
   integer :: ierr, itag
   integer :: np, ip, me, comm_a, nproc
   integer :: ip_ir, ip_ic, ip_nr, ip_nc, il, nbuf, ip_irl
   integer :: i, ii, j, jj, nr, nc, nb, nrl, irl, ir, ic
   INTEGER :: me_ortho(2), np_ortho(2)
   !
   COMPLEX(DP), allocatable :: rcvbuf(:,:,:)
   COMPLEX(DP), allocatable :: sndbuf(:,:)
   TYPE(la_descriptor) :: ip_desc
   !
   character(len=256) :: msg
   !
#if defined (__MPI)

   CALL laxlib_intarray_to_desc(desc,idesc)

   IF( desc%active_node < 0 ) THEN
      RETURN
   END IF

   np = desc%npr  !  dimension of the processor mesh
   nb = desc%nrcx    !  leading dimension of the local matrix block
   me = desc%mype   !  my processor id (starting from 0)
   comm_a = desc%comm
   nproc  = desc%npr * desc%npc

   IF( np /= desc%npc ) &
      CALL lax_error__( ' blk2cyc_zredist ', ' works only with square processor mesh ', 1 )
   IF( n < 1 ) &
      CALL lax_error__( ' blk2cyc_zredist ', ' n less or equal zero ', 1 )
   IF( desc%n < nproc ) &
      CALL lax_error__( ' blk2cyc_zredist ', ' nb less than the number of proc ', 1 )
   !
   nbuf = (nb/nproc+2) * nb
   !
   ALLOCATE( sndbuf( nb/nproc+2, nb ) )
   ALLOCATE( rcvbuf( nb/nproc+2, nb, nproc ) )
   !
   nr  = desc%nr
   nc  = desc%nc
   ir  = desc%ir
   ic  = desc%ic
   !
   DO ip = 0, nproc - 1
      DO j = 1, nc
         il = 1
         DO i = 1, nr
            ii = i + ir - 1
            IF( MOD( ii - 1, nproc ) == ip ) THEN
               sndbuf( il, j ) = b( i, j )
               il = il + 1
            END IF
         END DO
      END DO
      CALL mpi_barrier( comm_a, ierr )
      CALL mpi_gather( sndbuf, nbuf, mpi_double_complex, &
                       rcvbuf, nbuf, mpi_double_complex, ip, comm_a, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " blk2cyc_zredist ", " in mpi_gather ", ABS( ierr ) )
   END DO
   !

   DO ip = 0, nproc - 1
      !
      ! 2D proc ortho grid sizes
      !
      np_ortho(1) = desc%npr
      np_ortho(2) = desc%npc
      !
      ! compute other processor coordinates me_ortho
      !
      CALL GRID2D_COORDS( 'R', ip, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
      !
      ! initialize other processor descriptor
      !
      CALL descla_init( ip_desc, desc%n, desc%nx, np_ortho, me_ortho, desc%comm, desc%cntx, 1 )
      !
      ip_nr = ip_desc%nr
      ip_nc = ip_desc%nc
      ip_ir = ip_desc%ir
      ip_ic = ip_desc%ic
      !
      DO j = 1, ip_nc
         jj = j + ip_ic - 1
         il = 1
         DO i = 1, ip_nr
            ii = i + ip_ir - 1
            IF( MOD( ii - 1, nproc ) == me ) THEN
               a( ( ii - 1 )/nproc + 1, jj ) = rcvbuf( il, j, ip+1 )
               il = il + 1
            END IF
         END DO
      END DO

   END DO
   !
   DEALLOCATE( rcvbuf )
   DEALLOCATE( sndbuf )

#else

   a( 1:n, 1:n ) = b( 1:n, 1:n )

#endif

   RETURN

END SUBROUTINE blk2cyc_zredist_x
!
!
!
!  Double Complex and Double Precision Cholesky Factorization of
!  an Hermitan/Symmetric block distributed matrix
!  written by Carlo Cavazzoni
!
!

SUBROUTINE laxlib_pzpotrf_x( sll, ldx, n, idesc )
   !
   !! Double precision Complex (Z) Cholesky Factorization of
   !! an Hermitan/Symmetric block distributed matrix
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   implicit none
   !
   include 'laxlib_param.fh'
   include 'laxlib_kinds.fh'
   !
   integer :: n
   !! global dimension
   integer :: ldx
   !! leading dimension of sll
   integer, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   complex(DP) :: sll( ldx, ldx )
   !! matrix sll
   !
   real(DP)  :: one, zero
   complex(DP) :: cone, czero
   integer :: myrow, mycol, ierr
   integer :: jb, info, ib, kb
   integer :: jnr, jir, jic, jnc
   integer :: inr, iir, iic, inc
   integer :: knr, kir, kic, knc
   integer :: nr, nc
   integer :: rcomm, ccomm, color, key, myid, np
   complex(DP), allocatable :: ssnd( :, : ), srcv( :, : )
   TYPE(la_descriptor) :: desc

   one   = 1.0_DP
   cone  = 1.0_DP
   zero  = 0.0_DP
   czero = 0.0_DP

#if defined __MPI

   CALL laxlib_intarray_to_desc(desc,idesc)

   myrow = desc%myr
   mycol = desc%myc
   myid  = desc%mype
   np    = desc%npr

   IF( desc%npr /= desc%npc ) THEN
      CALL lax_error__( ' pzpotrf ', ' only square grid are allowed ', 1 )
   END IF

   IF( ldx /= desc%nrcx ) THEN
      CALL lax_error__( ' pzpotrf ', ' wrong leading dimension ldx ', ldx )
   END IF

   nr = desc%nr
   nc = desc%nc

   ALLOCATE( ssnd( ldx, ldx ) )
   ALLOCATE( srcv( ldx, ldx ) )

   DO jb = 1, np
      !
      !    Update and factorize the current diagonal block and test
      !    for non-positive-definiteness.
      !
      CALL descla_local_dims( jir, jnr, n, desc%nx, np, jb-1 )
      !
      !    since we loop on diagonal blocks/procs we have jnc == jnr
      !
      jnc = jnr
      !
      !    prepare row and colum communicators
      IF( ( myrow >= ( jb-1 ) ) .AND. ( mycol <= ( jb-1 ) ) ) THEN
          color = mycol
          key   = myrow
      ELSE
          color = np
          key   = myid
      END IF
      !
      CALL mpi_comm_split( desc%comm , color, key, ccomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pzpotrf ", " in mpi_comm_split 1 ", ABS( ierr ) )
      !
      IF( myrow >= jb-1 .and. mycol <= jb-1 ) THEN
          color = myrow
          key   = mycol
      ELSE
          color = np
          key   = myid
      END IF
      !
      CALL mpi_comm_split( desc%comm, color, key, rcomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pzpotrf ", " in mpi_comm_split 2 ", ABS( ierr ) )
      !
      !    here every process can work independently, then we need a reduce.
      !
      IF( jb > 1 ) THEN
         !
         DO ib = 1, jb - 1
            IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( ib - 1 ) ) ) THEN
               !
               !  this is because only the lover triangle of ssnd will be set to 0 by ZHERK
               !
               ssnd = 0.0_DP
               !
               !  remember: matrix ssnd is nr*nr, and procs on the diagonale have nr == nc
               !
               CALL ZHERK( 'L', 'N', nr, nc, -ONE, sll, ldx, zero, ssnd, ldx )
               !
            END IF
         END DO
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
            ssnd = sll
         END IF
         !
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
            !
            !  accumulate on the diagonal block/proc
            !
            CALL mpi_barrier( rcomm, ierr )

            CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_COMPLEX, MPI_SUM, jb-1, rcomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pzpotrf ", " in MPI_REDUCE 1 ", ABS( ierr ) )
            !
         END IF
         !
      END IF
      !
      ! Only proj ( jb-1, jb-1 ) operates this
      !
      info = 0
      !
      IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
         CALL ZPOTF2( 'L', jnr, sll, ldx, INFO )
         IF( info /= 0 ) &
            CALL lax_error__( " pzpotrf ", " problems computing cholesky decomposition ", ABS( info ) )
      END IF
      !
      IF( ( jb > 1 ) .AND. ( jb < np ) ) THEN
         !
         !           Compute the current block column.
         !
         ! processors ( 1 : jb - 1, jb ) should bcast their blocs
         ! along column to processor ( 1 : jb - 1, jb + 1 : nb )
         !
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( sll,  ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pzpotrf ", " in mpi_bcast 1 ", ABS( ierr ) )
         ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pzpotrf ", " in mpi_bcast 2 ", ABS( ierr ) )
         END IF
         !
         DO ib = jb + 1, np
            CALL descla_local_dims( iir, inr, n, desc%nx, np, ib-1 )
            DO kb = 1, jb - 1
               CALL descla_local_dims( kic, knc, n, desc%nx, np, kb-1 )
               IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( kb - 1 ) ) ) THEN
                  CALL zgemm( 'N', 'C', inr, jnr, knc, -CONE, sll, ldx, srcv, ldx, czero, ssnd, ldx )
               END IF
            END DO
            IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
               ssnd = sll
            END IF
         END DO
         !
         ! processors ( jb, jb + 1 : nb ) should collect block along row,
         ! from processors  ( 1 : jb - 1, jb + 1 : nb )
         !
         DO kb = jb + 1, np
            IF( ( myrow == ( kb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
               IF( ( jb == 1 ) ) THEN
                  IF( mycol == ( jb - 1 ) ) THEN
                     sll = ssnd
                  END IF
               ELSE
                  CALL mpi_barrier( rcomm, ierr )
                  CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_COMPLEX, MPI_SUM, jb-1, rcomm, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " pzpotrf ", " in mpi_reduce 2 ", ABS( ierr ) )
               END IF
            END IF
         END DO
         !
      END IF
      !
      IF( jb < np ) THEN
         !
         ! processor "jb,jb" should broadcast his block to procs ( jb+1 : nb, jb )
         !
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( sll,  ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pzpotrf ", " in mpi_bcast 3 ", ABS( ierr ) )
         ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( srcv,  ldx*ldx, MPI_DOUBLE_COMPLEX, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pzpotrf ", " in mpi_bcast 4 ", ABS( ierr ) )
         END IF
         !
         DO ib = jb + 1, np
            IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
               CALL ZTRSM( 'R', 'L', 'C', 'N', nr, nc, CONE, srcv, ldx, sll, ldx )
            END IF
         END DO
         !
      END IF
      !
      CALL mpi_comm_free( rcomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pzpotrf ", " in mpi_comm_free 1 ", ABS( ierr ) )
      !
      CALL mpi_comm_free( ccomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pzpotrf ", " in mpi_comm_free 2 ", ABS( ierr ) )
      !
   END DO

   DEALLOCATE( srcv, ssnd )

#else

   CALL ZPOTRF( 'L', n, sll, ldx, info )

   IF( info /= 0 ) &
      CALL lax_error__( " pzpotrf ", " problems computing cholesky decomposition ", ABS( info ) )

#endif

   return
END SUBROUTINE laxlib_pzpotrf_x

!  now the Double Precision subroutine

SUBROUTINE laxlib_pdpotrf_x( sll, ldx, n, idesc )
   !
   !! Double precision Cholesky Factorization of
   !! an Hermitan/Symmetric block distributed matrix
   !
   USE laxlib_descriptor
   USE laxlib_parallel_include
   !
   implicit none
   !
   include 'laxlib_param.fh'
   include 'laxlib_kinds.fh'
   !
   integer  :: n
   !! global dimension
   integer  :: ldx
   !! leading dimension of sll
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   REAL(DP) :: sll( ldx, ldx )
   !! matrix sll
   REAL(DP) :: one, zero
   integer  :: myrow, mycol, ierr
   integer  :: jb, info, ib, kb
   integer  :: jnr, jir, jic, jnc
   integer  :: inr, iir, iic, inc
   integer  :: knr, kir, kic, knc
   integer  :: nr, nc
   integer  :: rcomm, ccomm, color, key, myid, np
   REAL(DP), ALLOCATABLE :: ssnd( :, : ), srcv( :, : )
   TYPE(la_descriptor) :: desc

   one   = 1.0_DP
   zero  = 0.0_DP

#if defined __MPI

   CALL laxlib_intarray_to_desc(desc,idesc)

   myrow = desc%myr
   mycol = desc%myc
   myid  = desc%mype
   np    = desc%npr

   IF( desc%npr /= desc%npc ) THEN
      CALL lax_error__( ' pdpotrf ', ' only square grid are allowed ', 1 )
   END IF

   IF( ldx /= desc%nrcx ) THEN
      CALL lax_error__( ' pdpotrf ', ' wrong leading dimension ldx ', ldx )
   END IF

   nr = desc%nr
   nc = desc%nc

   ALLOCATE( ssnd( ldx, ldx ) )
   ALLOCATE( srcv( ldx, ldx ) )

   DO jb = 1, np
      !
      !    Update and factorize the current diagonal block and test
      !    for non-positive-definiteness.
      !
      CALL descla_local_dims( jir, jnr, n, desc%nx, np, jb-1 )
      !
      !    since we loop on diagonal blocks/procs we have jnc == jnr
      !
      jnc = jnr
      !
      !    prepare row and colum communicators
      IF( ( myrow >= ( jb-1 ) ) .AND. ( mycol <= ( jb-1 ) ) ) THEN
          color = mycol
          key   = myrow
      ELSE
          color = np
          key   = myid
      END IF
      !
      CALL mpi_comm_split( desc%comm , color, key, ccomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pdpotrf ", " in mpi_comm_split 1 ", ABS( ierr ) )
      !
      IF( myrow >= jb-1 .and. mycol <= jb-1 ) THEN
          color = myrow
          key   = mycol
      ELSE
          color = np
          key   = myid
      END IF
      !
      CALL mpi_comm_split( desc%comm, color, key, rcomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pdpotrf ", " in mpi_comm_split 2 ", ABS( ierr ) )
      !
      !    here every process can work independently, then we need a reduce.
      !
      IF( jb > 1 ) THEN
         !
         DO ib = 1, jb - 1
            IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( ib - 1 ) ) ) THEN
               !
               !  this is because only the lover triangle of ssnd will be set to 0 by ZHERK
               !
               ssnd = 0_DP
               !
               !  remember: matrix ssnd is nr*nr, and procs on the diagonale have nr == nc
               !
               CALL DSYRK( 'L', 'N', nr, nc, -ONE, sll, ldx, zero, ssnd, ldx )
               !
            END IF
         END DO
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
            ssnd = sll
         END IF
         !
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
            !
            !  accumulate on the diagonal block/proc
            !
            CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_PRECISION, MPI_SUM, jb-1, rcomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pdpotrf ", " in MPI_REDUCE 1 ", ABS( ierr ) )
            !
         END IF
         !
      END IF
      !
      ! Only proj ( jb-1, jb-1 ) operates this
      !
      info = 0
      !
      IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
         CALL DPOTRF( 'L', jnr, sll, ldx, INFO )
         IF( info /= 0 ) &
            CALL lax_error__( " pdpotrf ", " problems computing cholesky decomposition ", ABS( info ) )
      END IF
      !
      IF( ( jb > 1 ) .AND. ( jb < np ) ) THEN
         !
         !           Compute the current block column.
         !
         ! processors ( 1 : jb - 1, jb ) should bcast their blocs
         ! along column to processor ( 1 : jb - 1, jb + 1 : nb )
         !
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( sll,  ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pdpotrf ", " in mpi_bcast 1 ", ABS( ierr ) )
         ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol < ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( srcv, ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pdpotrf ", " in mpi_bcast 2 ", ABS( ierr ) )
         END IF
         !
         DO ib = jb + 1, np
            CALL descla_local_dims( iir, inr, n, desc%nx, np, ib-1 )
            DO kb = 1, jb - 1
               CALL descla_local_dims( kic, knc, n, desc%nx, np, kb-1 )
               IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( kb - 1 ) ) ) THEN
                  CALL dgemm( 'N', 'T', inr, jnr, knc, -ONE, sll, ldx, srcv, ldx, zero, ssnd, ldx )
               END IF
            END DO
            IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
               ssnd = sll
            END IF
         END DO
         !
         ! processors ( jb, jb + 1 : nb ) should collect block along row,
         ! from processors  ( 1 : jb - 1, jb + 1 : nb )
         !
         DO kb = jb + 1, np
            IF( ( myrow == ( kb - 1 ) ) .AND. ( mycol <= ( jb - 1 ) ) ) THEN
               IF( ( jb == 1 ) ) THEN
                  IF( mycol == ( jb - 1 ) ) THEN
                     sll = ssnd
                  END IF
               ELSE
                  CALL MPI_REDUCE( ssnd, sll, ldx*ldx, MPI_DOUBLE_PRECISION, MPI_SUM, jb-1, rcomm, ierr )
                  IF( ierr /= 0 ) &
                     CALL lax_error__( " pdpotrf ", " in mpi_reduce 2 ", ABS( ierr ) )
               END IF
            END IF
         END DO
         !
      END IF
      !
      IF( jb < np ) THEN
         !
         ! processor "jb,jb" should broadcast his block to procs ( jb+1 : nb, jb )
         !
         IF( ( myrow == ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( sll,  ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pdpotrf ", " in mpi_bcast 3 ", ABS( ierr ) )
         ELSE IF( ( myrow > ( jb - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
            CALL mpi_barrier( ccomm, ierr )
            CALL mpi_bcast( srcv,  ldx*ldx, MPI_DOUBLE_PRECISION, 0, ccomm, ierr )
            IF( ierr /= 0 ) &
               CALL lax_error__( " pdpotrf ", " in mpi_bcast 4 ", ABS( ierr ) )
         END IF
         !
         DO ib = jb + 1, np
            IF( ( myrow == ( ib - 1 ) ) .AND. ( mycol == ( jb - 1 ) ) ) THEN
               CALL DTRSM( 'R', 'L', 'T', 'N', nr, nc, ONE, srcv, ldx, sll, ldx )
            END IF
         END DO
         !
      END IF
      !
      CALL mpi_comm_free( rcomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pdpotrf ", " in mpi_comm_free 1 ", ABS( ierr ) )

      CALL mpi_comm_free( ccomm, ierr )
      IF( ierr /= 0 ) &
         CALL lax_error__( " pdpotrf ", " in mpi_comm_free 2 ", ABS( ierr ) )
      !
   END DO

   DEALLOCATE( srcv, ssnd )

#else

   CALL DPOTRF( 'L', n, sll, ldx, info )

   IF( info /= 0 ) &
      CALL lax_error__( " pzpotrf ", " problems computing cholesky decomposition ", ABS( info ) )

#endif

   return
END SUBROUTINE laxlib_pdpotrf_x

!
!
!
!

SUBROUTINE laxlib_pztrtri_x ( sll, ldx, n, idesc )
    !
    !! pztrtri computes the parallel inversion of a lower triangular matrix
    !! distribuited among the processes using a 2-D block partitioning.
    !! The algorithm is based on the schema below and executes the model
    !! recursively to each column C2 under the diagonal.
    !!
    !!     |-------|-------|      |--------------------|--------------------|
    !!     |   A1  |   0   |      |   C1 = trtri(A1)   |          0         |
    !! A = |-------|-------|  C = |--------------------|--------------------|
    !!     |   A2  |   A3  |      | C2 = -C3 * A2 * C1 |   C3 = trtri(A3)   |
    !!     |-------|-------|      |--------------------|--------------------|
    !!
    !! The recursive steps of multiplication (C2 = -C3 * A2 * C1) is based on the Cannon's algorithms
    !! for parallel matrix multiplication and is done with BLACS(dgemm)
    !!
    !!
    !
    ! Arguments
    ! ============
    !
    ! sll   = local block of data
    ! ldx   = leading dimension of one block
    ! n     = size of the global array diributed among the blocks
    ! desc  = descriptor of the matrix distribution
    !
    !
    !  written by Ivan Girotto
    !

    USE laxlib_descriptor
    USE laxlib_parallel_include

    IMPLICIT NONE
    include 'laxlib_kinds.fh'
    include 'laxlib_param.fh'

    INTEGER, INTENT(IN)  :: n
    !! global dimension
    INTEGER, INTENT(IN)  :: ldx
    !! leading dimension of sll
    INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
    !! integer laxlib descriptor
    COMPLEX(DP), INTENT( INOUT ) :: sll( ldx, ldx )
    !! matrix sll

    COMPLEX(DP), PARAMETER :: ONE = (1.0_DP, 0.0_DP)
    COMPLEX(DP), PARAMETER :: ZERO = (0.0_DP, 0.0_DP)

#if defined __MPI
    INTEGER :: status(MPI_STATUS_SIZE)
#endif
    INTEGER :: req(2), ierr, col_comm
    INTEGER :: send, recv, group_rank, group_size
    INTEGER :: myrow, mycol, np, myid, comm

    ! counters
    INTEGER :: k, i, j, count, step_count, shiftcount, cicle
    INTEGER :: C3dim   ! Dimension of submatrix B
    INTEGER :: nc, nr ! Local dimension of block
    INTEGER :: info, sup_recv
    INTEGER :: idrowref, idcolref, idref, idrecv

    ! B and BUF_RECV are used to overload the computation of matrix multiplication and the shift of the blocks
    COMPLEX(DP), ALLOCATABLE, DIMENSION( :, : ) :: B, C, BUF_RECV
    COMPLEX(DP) :: first
    TYPE(la_descriptor) :: desc

    CALL laxlib_intarray_to_desc(desc,idesc)
    myrow = desc%myr
    mycol = desc%myc
    myid  = desc%mype
    np    = desc%npr
    comm  = desc%comm

    IF( desc%npr /= desc%npc ) THEN
       CALL lax_error__( ' pztrtri ', ' only square grid are allowed ', 1 )
    END IF
    IF( ldx /= desc%nrcx ) THEN
       CALL lax_error__( ' pztrtri ', ' wrong leading dimension ldx ', ldx )
    END IF

    nr = desc%nr
    nc = desc%nc

    !  clear elements outside local meaningful block nr*nc

    DO j = nc+1, ldx
       DO i = 1, ldx
          sll( i, j ) = zero
       END DO
    END DO
    DO j = 1, ldx
       DO i = nr+1, ldx
          sll( i, j ) = zero
       END DO
    END DO

#if defined __MPI

    ALLOCATE( B( ldx, ldx ) )
    ALLOCATE( C( ldx, ldx ) )
    ALLOCATE( BUF_RECV ( ldx, ldx ) )

    IF( np == 2 ) THEN
       !
       !  special case with 4 proc, 2x2 grid
       !
       IF( myrow == mycol ) THEN
          CALL compute_ztrtri()
       END IF
       !
       CALL GRID2D_RANK( 'R', np, np, 1, 0, idref )
       !
       IF( myrow == 0 .AND. mycol == 0 ) THEN
          CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idref, 0, comm, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pztrtri ", " in mpi_send 1 ", ABS( ierr ) )
       END IF
       !
       IF( myrow == 1 .AND. mycol == 1 ) THEN
          CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idref, 1, comm, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pztrtri ", " in mpi_send 2 ", ABS( ierr ) )
       END IF
       !
       IF( myrow == 1 .AND. mycol == 0 ) THEN
          !
          CALL GRID2D_RANK( 'R', np, np, 0, 0, i )
          CALL GRID2D_RANK( 'R', np, np, 1, 1, j )
          !
          CALL MPI_Irecv( B, ldx*ldx, MPI_DOUBLE_COMPLEX, i, 0, comm, req(1), ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pztrtri ", " in mpi_irecv 3 ", ABS( ierr ) )
          !
          CALL MPI_Irecv( C, ldx*ldx, MPI_DOUBLE_COMPLEX, j, 1, comm, req(2), ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pztrtri ", " in mpi_irecv 4 ", ABS( ierr ) )
          !
          CALL MPI_Wait(req(1), status, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pztrtri ", " in MPI_Wait 5 ", ABS( ierr ) )
          !
          CALL zgemm('N', 'N', ldx, ldx, ldx, ONE, sll, ldx, b, ldx, ZERO, buf_recv, ldx)
          !
          CALL MPI_Wait(req(2), status, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pztrtri ", " in MPI_Wait 6 ", ABS( ierr ) )
          !
          CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, c, ldx, buf_recv, ldx, ZERO, sll, ldx)
          !
       END IF
       !
       IF( myrow == 0 .AND. mycol == 1 ) THEN
          !
          sll = zero
          !
       END IF
       !
       DEALLOCATE( b, c, buf_recv )
       !
       RETURN
       !
    END IF


    IF( myrow >= mycol ) THEN
       !
       !  only procs on lower triangle partecipates
       !
       CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pztrtri ", " in MPI_Comm_split 9 ", ABS( ierr ) )

       CALL MPI_Comm_size( col_comm, group_size, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pztrtri ", " in MPI_Comm_size 10 ", ABS( ierr ) )
       !
       CALL MPI_Comm_rank( col_comm, group_rank, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pztrtri ", " in MPI_Comm_rank 11 ", ABS( ierr ) )
       !
    ELSE
       !
       !  other procs stay at the window!
       !
       CALL MPI_Comm_split( comm, MPI_UNDEFINED, MPI_UNDEFINED, col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pztrtri ", " in MPI_Comm_split 12 ", ABS( ierr ) )
       !
       sll = zero
       !
    END IF
    !

    ! Compute the inverse of a lower triangular
    ! along the diagonal of the global array with BLAS(ztrtri)
    !
    IF( mycol == myrow ) THEN
       !
       CALL compute_ztrtri()
       !
    ELSE IF( myrow > mycol ) THEN
       !
       buf_recv = sll
       !
    END IF

    IF( myrow >= mycol ) THEN
       !
       ! Broadcast the diagonal blocks to the processors under the diagonal
       !
       CALL MPI_Bcast( sll, ldx*ldx, MPI_DOUBLE_COMPLEX, 0, col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pztrtri ", " in MPI_Bcast 13 ", ABS( ierr ) )
       !
    END IF

    ! Compute A2 * C1 and start the Cannon's algorithm shifting the blocks of column one place to the North
    !
    IF( myrow > mycol ) THEN
       !
       CALL zgemm( 'N', 'N', ldx, ldx, ldx, ONE, buf_recv, ldx, sll, ldx, ZERO, c, ldx )
       !
       send = shift( 1, group_rank, 1, ( group_size - 1 ), 'N' )
       recv = shift( 1, group_rank, 1, ( group_size - 1 ), 'S' )
       !
       CALL MPI_Sendrecv( c, ldx*ldx, MPI_DOUBLE_COMPLEX, send, 0, buf_recv, &
            ldx*ldx, MPI_DOUBLE_COMPLEX, recv, 0, col_comm, status, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pztrtri ", " in MPI_Sendrecv 14 ", ABS( ierr ) )
       !
    END IF

    ! Execute the Cannon's algorithm to compute ricorsively the multiplication of C2 = -C3 * A2 * C1
    !
    DO count = ( np - 2 ), 0, -1
       C3dim = (np-1) - count ! Dimension of the submatrix C3
       first = ZERO
       cicle = 0
       IF( ( myrow > count ) .AND. ( mycol >= count ) ) THEN
          idcolref = count + 1
          idrowref = myrow
          CALL GRID2D_RANK( 'R', np, np, idrowref, idcolref, idref )
          idrecv = idref - 1
          ! Compute C2 = -C3 * A2 * C1
          DO shiftcount = count, np-2
             IF(mycol>count)THEN
                ! Execute the virtual shift of the matrix C3 along the row in order to know which processor
                ! have to send the block to C2
                IF( cicle == 0)THEN
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   send = shift(idref, myid, myrow-count, C3dim, 'W')
                ELSE
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   send = shift(idref, send, 1, C3dim, 'E')
                END IF
                IF(send==idref)THEN
                   CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_COMPLEX, idrecv, myid, comm, ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pztrtri ", " in MPI_Send 15 ", ABS( ierr ) )
                END IF
             ELSE
                IF( cicle == 0)THEN
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   sup_recv = shift(idref, myid+1, myrow-count, C3dim, 'E')
                ELSE
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   sup_recv = shift(idref, sup_recv, 1, C3dim, 'W')
                END IF
                CALL MPI_Recv(C, ldx*ldx, MPI_DOUBLE_COMPLEX, sup_recv, sup_recv, comm, status, ierr)
                IF( ierr /= 0 ) &
                   CALL lax_error__( " pztrtri ", " in MPI_Recv 16 ", ABS( ierr ) )
                send = shift(1, group_rank, 1, (group_size-1), 'S')
                recv = shift(1, group_rank, 1, (group_size-1), 'N')
                ! with the no-blocking communication the computation and the shift of the column block are overapped
                !
                IF( MOD( cicle, 2 ) == 0 ) THEN
                   CALL MPI_Isend(BUF_RECV, ldx*ldx, MPI_DOUBLE_COMPLEX, send, group_rank+cicle, col_comm, req(1), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pztrtri ", " in MPI_Isend 17 ", ABS( ierr ) )
                   CALL MPI_Irecv(B, ldx*ldx, MPI_DOUBLE_COMPLEX, recv, recv+cicle, col_comm, req(2), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pztrtri ", " in MPI_Irecv 18 ", ABS( ierr ) )
                   CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, BUF_RECV, ldx, first, sll, ldx)
                ELSE
                   CALL MPI_Isend(B, ldx*ldx, MPI_DOUBLE_COMPLEX, send, group_rank+cicle, col_comm, req(1), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pztrtri ", " in MPI_Isend 19 ", ABS( ierr ) )
                   CALL MPI_Irecv(BUF_RECV, ldx*ldx, MPI_DOUBLE_COMPLEX, recv, recv+cicle, col_comm, req(2), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pztrtri ", " in MPI_Irecv 20 ", ABS( ierr ) )
                   CALL zgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, B, ldx, ONE, sll, ldx)
                END IF
                !
                CALL MPI_Wait(req(1), status, ierr)
                IF( ierr /= 0 ) &
                   CALL lax_error__( " pztrtri ", " in MPI_Wait 21 ", ABS( ierr ) )
                !
                CALL MPI_Wait(req(2), status, ierr)
                IF( ierr /= 0 ) &
                   CALL lax_error__( " pztrtri ", " in MPI_Wait 22 ", ABS( ierr ) )
                !
             END IF
             cicle = cicle + 1
             first = ONE
          END DO
       END IF
    END DO

    ! split communicator is present and must be freed on all processors
    IF( col_comm /= MPI_COMM_NULL ) THEN
       CALL mpi_comm_free( col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pztrtri ", " in mpi_comm_free 25 ", ABS( ierr ) )
    ENDIF

    DEALLOCATE(B)
    DEALLOCATE(C)
    DEALLOCATE(BUF_RECV)

#else

    CALL compute_ztrtri()

#endif

  CONTAINS

     SUBROUTINE compute_ztrtri()
       !
       !  clear the upper triangle (excluding diagonal terms) and
       !
       DO j = 1, ldx
          DO i = 1, j-1
             sll ( i, j ) = zero
          END DO
       END DO
       !
       CALL ztrtri( 'L', 'N', nr, sll, ldx, info )
       !
       IF( info /= 0 ) THEN
          CALL lax_error__( ' pztrtri ', ' problem in the local inversion ', info )
       END IF
       !
     END SUBROUTINE compute_ztrtri


     INTEGER FUNCTION shift ( idref, id, pos, size, dir )

       IMPLICIT NONE

       INTEGER :: idref, id, pos, size
       CHARACTER ( LEN = 1 ) :: dir

       IF( ( dir == 'E' ) .OR. ( dir == 'S' ) ) THEN
          shift = idref + MOD ( ( id - idref ) + pos, size )
       ELSE IF( ( dir == 'W' ) .OR. ( dir == 'N' ) ) THEN
          shift = idref + MOD ( ( id - idref ) - pos + size, size )
       ELSE
          shift = -1
       END IF

       RETURN

     END FUNCTION shift

END SUBROUTINE laxlib_pztrtri_x

!  now the Double Precision subroutine

SUBROUTINE laxlib_pdtrtri_x ( sll, ldx, n, idesc )

    !
    !!
    !! pdtrtri computes the parallel inversion of a lower triangular matrix
    !! distribuited among the processes using a 2-D block partitioning.
    !! The algorithm is based on the schema below and executes the model
    !! recursively to each column C2 under the diagonal.
    !!
    !!     |-------|-------|      |--------------------|--------------------|
    !!     |   A1  |   0   |      |   C1 = trtri(A1)   |          0         |
    !! A = |-------|-------|  C = |--------------------|--------------------|
    !!     |   A2  |   A3  |      | C2 = -C3 * A2 * C1 |   C3 = trtri(A3)   |
    !!     |-------|-------|      |--------------------|--------------------|
    !!
    !! The recursive steps of multiplication (C2 = -C3 * A2 * C1) is based on the Cannon's algorithms
    !! for parallel matrix multiplication and is done with BLACS(dgemm)
    !!
    !
    ! Arguments
    ! ============
    !
    ! sll   = local block of data
    ! ldx   = leading dimension of one block
    ! n     = size of the global array diributed among the blocks
    ! idesc  = descriptor of the matrix distribution
    !
    !
    !  written by Ivan Girotto
    !

    USE laxlib_descriptor
    USE laxlib_parallel_include

    IMPLICIT NONE
    include 'laxlib_kinds.fh'
    include 'laxlib_param.fh'

    INTEGER, INTENT(IN)  :: n
    !! global dimension
    INTEGER, INTENT(IN)  :: ldx
    !! leading dimension of sll
    INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
    !! integer laxlib descriptor
    REAL(DP), INTENT( INOUT ) :: sll( ldx, ldx )
    !! matrix sll

    REAL(DP), PARAMETER :: ONE = 1.0_DP
    REAL(DP), PARAMETER :: ZERO = 0.0_DP

#if defined __MPI
    INTEGER :: status(MPI_STATUS_SIZE)
#endif
    INTEGER :: req(2), ierr, col_comm
    INTEGER :: send, recv, group_rank, group_size
    INTEGER :: myrow, mycol, np, myid, comm

    ! counters
    INTEGER :: k, i, j, count, step_count, shiftcount, cicle
    INTEGER :: C3dim   ! Dimension of submatrix B
    INTEGER :: nc, nr ! Local dimension of block
    INTEGER :: info, sup_recv
    INTEGER :: idrowref, idcolref, idref, idrecv

    ! B and BUF_RECV are used to overload the computation of matrix multiplication and the shift of the blocks
    REAL(DP), ALLOCATABLE, DIMENSION( :, : ) :: B, C, BUF_RECV
    REAL(DP) :: first
    TYPE(la_descriptor) :: desc

    CALL laxlib_intarray_to_desc(desc,idesc)

    myrow = desc%myr
    mycol = desc%myc
    myid  = desc%mype
    np    = desc%npr
    comm  = desc%comm

    IF( desc%npr /= desc%npc ) THEN
       CALL lax_error__( ' pdtrtri ', ' only square grid are allowed ', 1 )
    END IF
    IF( ldx /= desc%nrcx ) THEN
       CALL lax_error__( ' pdtrtri ', ' wrong leading dimension ldx ', ldx )
    END IF

    nr = desc%nr
    nc = desc%nc

    !  clear elements outside local meaningful block nr*nc

    DO j = nc+1, ldx
       DO i = 1, ldx
          sll( i, j ) = zero
       END DO
    END DO
    DO j = 1, ldx
       DO i = nr+1, ldx
          sll( i, j ) = zero
       END DO
    END DO

#if defined __MPI

    ALLOCATE( B( ldx, ldx ) )
    ALLOCATE( C( ldx, ldx ) )
    ALLOCATE( BUF_RECV ( ldx, ldx ) )

    IF( np == 2 ) THEN
       !
       !  special case with 4 proc, 2x2 grid
       !
       IF( myrow == mycol ) THEN
          CALL compute_dtrtri()
       END IF
       !
       CALL GRID2D_RANK( 'R', np, np, 1, 0, idref )
       !
       IF( myrow == 0 .AND. mycol == 0 ) THEN
          CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idref, 0, comm, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pdtrtri ", " in MPI_Send 1 ", ABS( ierr ) )
       END IF
       !
       IF( myrow == 1 .AND. mycol == 1 ) THEN
          CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idref, 1, comm, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pdtrtri ", " in MPI_Send 2 ", ABS( ierr ) )
       END IF
       !
       IF( myrow == 1 .AND. mycol == 0 ) THEN
          !
          CALL GRID2D_RANK( 'R', np, np, 0, 0, i )
          CALL GRID2D_RANK( 'R', np, np, 1, 1, j )
          !
          CALL MPI_Irecv( B, ldx*ldx, MPI_DOUBLE_PRECISION, i, 0, comm, req(1), ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pdtrtri ", " in MPI_Irecv 3 ", ABS( ierr ) )
          !
          CALL MPI_Irecv( C, ldx*ldx, MPI_DOUBLE_PRECISION, j, 1, comm, req(2), ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pdtrtri ", " in MPI_Irecv 4 ", ABS( ierr ) )
          !
          CALL MPI_Wait(req(1), status, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pdtrtri ", " in MPI_Wait 5 ", ABS( ierr ) )
          !
          CALL dgemm('N', 'N', ldx, ldx, ldx, ONE, sll, ldx, b, ldx, ZERO, buf_recv, ldx)
          !
          CALL MPI_Wait(req(2), status, ierr)
          IF( ierr /= 0 ) &
             CALL lax_error__( " pdtrtri ", " in MPI_Wait 6 ", ABS( ierr ) )
          !
          CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, c, ldx, buf_recv, ldx, ZERO, sll, ldx)
          !
       END IF
       !
       IF( myrow == 0 .AND. mycol == 1 ) THEN
          !
          sll = zero
          !
       END IF
       !
       DEALLOCATE( b, c, buf_recv )
       !
       RETURN
       !
    END IF


    IF( myrow >= mycol ) THEN
       !
       !  only procs on lower triangle partecipates
       !
       CALL MPI_Comm_split( comm, mycol, myrow, col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pdtrtri ", " in MPI_Comm_split 9 ", ABS( ierr ) )

       CALL MPI_Comm_size( col_comm, group_size, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pdtrtri ", " in MPI_Comm_size 10 ", ABS( ierr ) )

       CALL MPI_Comm_rank( col_comm, group_rank, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pdtrtri ", " in MPI_Comm_rank 11 ", ABS( ierr ) )
       !
    ELSE
       !
       !  other procs stay at the window!
       !
       CALL MPI_Comm_split( comm, MPI_UNDEFINED, MPI_UNDEFINED, col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pdtrtri ", " in MPI_Comm_split 12 ", ABS( ierr ) )
       !
       sll = zero
       !
    END IF
    !

    ! Compute the inverse of a lower triangular
    ! along the diagonal of the global array with BLAS(ztrtri)
    !
    IF( mycol == myrow ) THEN
       !
       CALL compute_dtrtri()
       !
    ELSE IF( myrow > mycol ) THEN
       !
       buf_recv = sll
       !
    END IF

    IF( myrow >= mycol ) THEN
       !
       ! Broadcast the diagonal blocks to the processors under the diagonal
       !
       CALL MPI_Bcast( sll, ldx*ldx, MPI_DOUBLE_PRECISION, 0, col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pdtrtri ", " in MPI_Bcast 13 ", ABS( ierr ) )
       !
    END IF

    ! Compute A2 * C1 and start the Cannon's algorithm shifting the blocks of column one place to the North
    !
    IF( myrow > mycol ) THEN
       !
       CALL dgemm( 'N', 'N', ldx, ldx, ldx, ONE, buf_recv, ldx, sll, ldx, ZERO, c, ldx )
       !
       send = shift( 1, group_rank, 1, ( group_size - 1 ), 'N' )
       recv = shift( 1, group_rank, 1, ( group_size - 1 ), 'S' )
       !
       CALL MPI_Sendrecv( c, ldx*ldx, MPI_DOUBLE_PRECISION, send, 0, buf_recv, &
            ldx*ldx, MPI_DOUBLE_PRECISION, recv, 0, col_comm, status, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pdtrtri ", " in MPI_Sendrecv 14 ", ABS( ierr ) )
       !
    END IF

    ! Execute the Cannon's algorithm to compute ricorsively the multiplication of C2 = -C3 * A2 * C1
    !
    DO count = ( np - 2 ), 0, -1
       C3dim = (np-1) - count ! Dimension of the submatrix C3
       first = ZERO
       cicle = 0
       IF( ( myrow > count ) .AND. ( mycol >= count ) ) THEN
          idcolref = count + 1
          idrowref = myrow
          CALL GRID2D_RANK( 'R', np, np, idrowref, idcolref, idref )
          idrecv = idref - 1
          ! Compute C2 = -C3 * A2 * C1
          DO shiftcount = count, np-2
             IF(mycol>count)THEN
                ! Execute the virtual shift of the matrix C3 along the row in order to know which processor
                ! have to send the block to C2
                IF( cicle == 0)THEN
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   send = shift(idref, myid, myrow-count, C3dim, 'W')
                ELSE
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   send = shift(idref, send, 1, C3dim, 'E')
                END IF
                IF(send==idref)THEN
                   CALL MPI_Send(sll, ldx*ldx, MPI_DOUBLE_PRECISION, idrecv, myid, comm, ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pdtrtri ", " in MPI_Send 15 ", ABS( ierr ) )
                END IF
             ELSE
                IF( cicle == 0)THEN
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   sup_recv = shift(idref, myid+1, myrow-count, C3dim, 'E')
                ELSE
                   ! virtual shift of the block i,j of the submatrix C3 i place to West
                   sup_recv = shift(idref, sup_recv, 1, C3dim, 'W')
                END IF
                CALL MPI_Recv(C, ldx*ldx, MPI_DOUBLE_PRECISION, sup_recv, sup_recv, comm, status, ierr)
                IF( ierr /= 0 ) &
                   CALL lax_error__( " pdtrtri ", " in MPI_Recv 16 ", ABS( ierr ) )
                send = shift(1, group_rank, 1, (group_size-1), 'S')
                recv = shift(1, group_rank, 1, (group_size-1), 'N')
                ! with the no-blocking communication the computation and the shift of the column block are overapped
                IF( MOD( cicle, 2 ) == 0 ) THEN
                   !
                   CALL MPI_Isend(BUF_RECV, ldx*ldx, MPI_DOUBLE_PRECISION, send, group_rank+cicle, col_comm, req(1), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pdtrtri ", " in MPI_Isend 17 ", ABS( ierr ) )
                   CALL MPI_Irecv(B, ldx*ldx, MPI_DOUBLE_PRECISION, recv, recv+cicle, col_comm, req(2), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pdtrtri ", " in MPI_Irecv 18 ", ABS( ierr ) )
                   !
                   CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, BUF_RECV, ldx, first, sll, ldx)
                   !
                ELSE
                   !
                   CALL MPI_Isend(B, ldx*ldx, MPI_DOUBLE_PRECISION, send, group_rank+cicle, col_comm, req(1), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pdtrtri ", " in MPI_Isend 19 ", ABS( ierr ) )
                   CALL MPI_Irecv(BUF_RECV, ldx*ldx, MPI_DOUBLE_PRECISION, recv, recv+cicle, col_comm, req(2), ierr)
                   IF( ierr /= 0 ) &
                      CALL lax_error__( " pdtrtri ", " in MPI_Irecv 20 ", ABS( ierr ) )
                   !
                   CALL dgemm('N', 'N', ldx, ldx, ldx, -ONE, C, ldx, B, ldx, ONE, sll, ldx)
                   !
                END IF
                !
                CALL MPI_Wait(req(1), status, ierr)
                IF( ierr /= 0 ) &
                   CALL lax_error__( " pdtrtri ", " in MPI_Wait 21 ", ABS( ierr ) )
                CALL MPI_Wait(req(2), status, ierr)
                IF( ierr /= 0 ) &
                   CALL lax_error__( " pdtrtri ", " in MPI_Wait 22 ", ABS( ierr ) )
                !
             END IF
             cicle = cicle + 1
             first = ONE
          END DO
       END IF
    END DO

    ! split communicator is present and must be freed on all processors
    IF( col_comm /= MPI_COMM_NULL ) THEN
       CALL mpi_comm_free( col_comm, ierr )
       IF( ierr /= 0 ) &
          CALL lax_error__( " pdtrtri ", " in mpi_comm_free 25 ", ABS( ierr ) )
    ENDIF

    DEALLOCATE(B)
    DEALLOCATE(C)
    DEALLOCATE(BUF_RECV)

#else

    CALL compute_dtrtri()

#endif

  CONTAINS

     SUBROUTINE compute_dtrtri()
       !
       !  clear the upper triangle (excluding diagonal terms) and
       !
       DO j = 1, ldx
          DO i = 1, j-1
             sll ( i, j ) = zero
          END DO
       END DO
       !
       CALL dtrtri( 'L', 'N', nr, sll, ldx, info )
       !
       IF( info /= 0 ) THEN
          CALL lax_error__( ' pdtrtri ', ' problem in the local inversion ', info )
       END IF
       !
     END SUBROUTINE compute_dtrtri


     INTEGER FUNCTION shift ( idref, id, pos, size, dir )

       IMPLICIT NONE

       INTEGER :: idref, id, pos, size
       CHARACTER ( LEN = 1 ) :: dir

       IF( ( dir == 'E' ) .OR. ( dir == 'S' ) ) THEN
          shift = idref + MOD ( ( id - idref ) + pos, size )
       ELSE IF( ( dir == 'W' ) .OR. ( dir == 'N' ) ) THEN
          shift = idref + MOD ( ( id - idref ) - pos + size, size )
       ELSE
          shift = -1
       END IF

       RETURN

     END FUNCTION shift

END SUBROUTINE laxlib_pdtrtri_x


SUBROUTINE laxlib_pdsyevd_x( tv, n, idesc, hh, ldh, e )
   !
   !! Parallel version of the HOUSEHOLDER tridiagonalization Algorithm for simmetric matrix.
   !! double precision version
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   include 'laxlib_low.fh'
   LOGICAL, INTENT(IN) :: tv
   !! if true compute eigenvalues and eigenvectors (not used)
   INTEGER, INTENT(IN) :: n
   !! global dimension
   INTEGER, INTENT(IN) :: ldh
   !! leading dimension of hh
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   REAL(DP) :: hh( ldh, ldh )
   !! matrix to be diagonalized and output eigenvectors
   REAL(DP) :: e( n )
   !! eigenvalues

   INTEGER :: nrlx, nrl, nproc
   REAL(DP), ALLOCATABLE :: diag(:,:), vv(:,:)
   CHARACTER :: jobv

   nrl   = idesc(LAX_DESC_NRL)
   nrlx  = idesc(LAX_DESC_NRLX)
   nproc = idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR)

   ALLOCATE( diag( nrlx, n ) )
   ALLOCATE( vv( nrlx, n ) )

   jobv = 'N'
   IF( tv ) jobv = 'V'
   !
   !  Redistribute matrix "hh" into "diag",
   !  matrix "hh" is block distributed, matrix diag is cyclic distributed
   CALL blk2cyc_redist( n, diag, nrlx, n, hh, ldh, ldh, idesc )
   !
   CALL dspev_drv( jobv, diag, nrlx, e, vv, nrlx, nrl, n, &
        nproc, idesc(LAX_DESC_MYPE), idesc(LAX_DESC_COMM) )
   !
   IF( tv ) CALL cyc2blk_redist( n, vv, nrlx, n, hh, ldh, ldh, idesc )
   !
   DEALLOCATE( vv )
   DEALLOCATE( diag )

   RETURN
END SUBROUTINE


SUBROUTINE laxlib_pzheevd_x( tv, n, idesc, hh, ldh, e )
   !
   !! Parallel version of the HOUSEHOLDER tridiagonalization Algorithm for simmetric matrix.
   !! double precision complex(Z) version
   !
   IMPLICIT NONE
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   include 'laxlib_low.fh'
   LOGICAL, INTENT(IN) :: tv
   !! if true compute eigenvalues and eigenvectors (not used)
   INTEGER, INTENT(IN) :: n
   !! global dimensio of matrix
   INTEGER, INTENT(IN) :: ldh
   !! leading dimension of hh
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor
   COMPLEX(DP) :: hh( ldh, ldh )
   !! matrix to be diagonalized and output eigenvectors
   REAL(DP) :: e( n )
   !! eigenvalues

   INTEGER :: nrlx, nrl
   COMPLEX(DP), ALLOCATABLE :: diag(:,:), vv(:,:)
   CHARACTER :: jobv

   nrl  = idesc(LAX_DESC_NRL)
   nrlx = idesc(LAX_DESC_NRLX)
   !
   ALLOCATE( diag( nrlx, n ) )
   ALLOCATE( vv( nrlx, n ) )
   !
   jobv = 'N'
   IF( tv ) jobv = 'V'

   CALL blk2cyc_redist( n, diag, nrlx, n, hh, ldh, ldh, idesc )
   !
   CALL zhpev_drv( jobv, diag, nrlx, e, vv, nrlx, nrl, n, &
        idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR), idesc(LAX_DESC_MYPE), idesc(LAX_DESC_COMM) )
   !
   if( tv ) CALL cyc2blk_redist( n, vv, nrlx, n, hh, ldh, ldh, idesc )
   !
   DEALLOCATE( vv )
   DEALLOCATE( diag )

   RETURN
END SUBROUTINE


SUBROUTINE sqr_dsetmat_x( what, n, alpha, a, lda, idesc )
   !
   !!
   !!  Set the double precision values of a square distributed matrix
   !!
   !
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   CHARACTER(LEN=1), INTENT(IN) :: what
   !! 'A' set all the values of "a" equal to alpha
   !! 'U' set the values in the upper triangle of "a" equal to alpha
   !! 'L' set the values in the lower triangle of "a" equal to alpha
   !! 'D' set the values in the diagonal of "a" equal to alpha
   INTEGER, INTENT(IN) :: n
   !! global dimension of the matrix
   REAL(DP), INTENT(IN) :: alpha
   !! value to be assigned to elements of "a"
   INTEGER, INTENT(IN) :: lda
   !!! leading dimension of a
   REAL(DP) :: a(lda,*)
   !! matrix whose values have to be set
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor of matrix a

   INTEGER :: i, j

   IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
      !
      !  processors not interested in this computation return quickly
      !
      RETURN
      !
   END IF

   SELECT CASE( what )
     CASE( 'U', 'u' )
        IF( idesc(LAX_DESC_MYC) > idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = 1, idesc(LAX_DESC_NR)
                 a( i, j ) = alpha
              END DO
           END DO
        ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = 1, j - 1
                 a( i, j ) = alpha
              END DO
           END DO
        END IF
     CASE( 'L', 'l' )
        IF( idesc(LAX_DESC_MYC) < idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = 1, idesc(LAX_DESC_NR)
                 a( i, j ) = alpha
              END DO
           END DO
        ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = j + 1, idesc(LAX_DESC_NR)
                 a( i, j ) = alpha
              END DO
           END DO
        END IF
     CASE( 'D', 'd' )
        IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
           DO i = 1, idesc(LAX_DESC_NR)
              a( i, i ) = alpha
           END DO
        END IF
     CASE DEFAULT
        DO j = 1, idesc(LAX_DESC_NC)
           DO i = 1, idesc(LAX_DESC_NR)
              a( i, j ) = alpha
           END DO
        END DO
   END SELECT
   !
   RETURN
END SUBROUTINE sqr_dsetmat_x


SUBROUTINE sqr_zsetmat_x( what, n, alpha, a, lda, idesc )
   !
   !!  Set the double precision complex(Z) values of a square distributed matrix
   !
   IMPLICIT NONE
   !
   include 'laxlib_kinds.fh'
   include 'laxlib_param.fh'
   !
   CHARACTER(LEN=1), INTENT(IN) :: what
   !! 'A' set all the values of "a" equal to alpha
   !! 'U' set the values in the upper triangle of "a" equal to alpha
   !! 'L' set the values in the lower triangle of "a" equal to alpha
   !! 'D' set the values in the diagonal of "a" equal to alpha
   !! 'H' clear the imaginary part of the diagonal of "a"
   INTEGER, INTENT(IN) :: n
   !! global dimension of the matrix
   COMPLEX(DP), INTENT(IN) :: alpha
   !! value to be assigned to elements of "a"
   INTEGER, INTENT(IN) :: lda
   !! leading dimension of a
   COMPLEX(DP) :: a(lda,*)
   !! matrix whose values have to be set
   INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
   !! integer laxlib descriptor of matrix a

   INTEGER :: i, j

   IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
      !
      !  processors not interested in this computation return quickly
      !
      RETURN
      !
   END IF

   SELECT CASE( what )
     CASE( 'U', 'u' )
        IF( idesc(LAX_DESC_MYC) > idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = 1, idesc(LAX_DESC_NR)
                 a( i, j ) = alpha
              END DO
           END DO
        ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = 1, j - 1
                 a( i, j ) = alpha
              END DO
           END DO
        END IF
     CASE( 'L', 'l' )
        IF( idesc(LAX_DESC_MYC) < idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = 1, idesc(LAX_DESC_NR)
                 a( i, j ) = alpha
              END DO
           END DO
        ELSE IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
           DO j = 1, idesc(LAX_DESC_NC)
              DO i = j + 1, idesc(LAX_DESC_NR)
                 a( i, j ) = alpha
              END DO
           END DO
        END IF
     CASE( 'D', 'd' )
        IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
           DO i = 1, idesc(LAX_DESC_NR)
              a( i, i ) = alpha
           END DO
        END IF
     CASE( 'H', 'h' )
        IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
           DO i = 1, idesc(LAX_DESC_NR)
              a( i, i ) = CMPLX( DBLE( a(i,i) ), 0_DP, KIND=DP )
           END DO
        END IF
     CASE DEFAULT
        DO j = 1, idesc(LAX_DESC_NC)
           DO i = 1, idesc(LAX_DESC_NR)
              a( i, j ) = alpha
           END DO
        END DO
   END SELECT
   !
   RETURN
END SUBROUTINE sqr_zsetmat_x

!------------------------------------------------------------------------
    SUBROUTINE distribute_lambda_x( lambda_repl, lambda_dist, idesc )
!------------------------------------------------------------------------
       IMPLICIT NONE
       include 'laxlib_kinds.fh'
       include 'laxlib_param.fh'
       REAL(DP), INTENT(IN)  :: lambda_repl(:,:)
       REAL(DP), INTENT(OUT) :: lambda_dist(:,:)
       INTEGER, INTENT(IN)  :: idesc(LAX_DESC_SIZE)
       INTEGER :: i, j, ic, ir
       IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
          ir = idesc(LAX_DESC_IR)
          ic = idesc(LAX_DESC_IC)
          DO j = 1, idesc(LAX_DESC_NC)
             DO i = 1, idesc(LAX_DESC_NR)
                lambda_dist( i, j ) = lambda_repl( i + ir - 1, j + ic - 1 )
             END DO
          END DO
       END IF
       RETURN
    END SUBROUTINE distribute_lambda_x
    !
!------------------------------------------------------------------------
    SUBROUTINE collect_lambda_x( lambda_repl, lambda_dist, idesc )
!------------------------------------------------------------------------
       USE laxlib_processors_grid,   ONLY: ortho_parent_comm
       USE laxlib_parallel_include
       IMPLICIT NONE
       include 'laxlib_kinds.fh'
       include 'laxlib_param.fh'
       REAL(DP), INTENT(OUT) :: lambda_repl(:,:)
       REAL(DP), INTENT(IN)  :: lambda_dist(:,:)
       INTEGER, INTENT(IN)  :: idesc(LAX_DESC_SIZE)
       INTEGER :: i, j, ic, ir, ierr
       lambda_repl = 0.0d0
       IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
          ir = idesc(LAX_DESC_IR)
          ic = idesc(LAX_DESC_IC)
          DO j = 1, idesc(LAX_DESC_NC)
             DO i = 1, idesc(LAX_DESC_NR)
                lambda_repl( i + ir - 1, j + ic - 1 ) = lambda_dist( i, j )
             END DO
          END DO
       END IF
#if defined __MPI
       CALL MPI_ALLREDUCE( MPI_IN_PLACE, lambda_repl, SIZE(lambda_repl), MPI_DOUBLE_PRECISION, &
                           MPI_SUM, ortho_parent_comm, ierr )
#endif
       RETURN
    END SUBROUTINE collect_lambda_x

!------------------------------------------------------------------------
    SUBROUTINE collect_zmat_x( zmat_repl, zmat_dist, idesc )
!------------------------------------------------------------------------
       USE laxlib_processors_grid,   ONLY: ortho_parent_comm
       USE laxlib_parallel_include
       IMPLICIT NONE
       include 'laxlib_kinds.fh'
       include 'laxlib_param.fh'
       REAL(DP), INTENT(OUT) :: zmat_repl(:,:)
       REAL(DP), INTENT(IN)  :: zmat_dist(:,:)
       INTEGER, INTENT(IN)  :: idesc(LAX_DESC_SIZE)
       INTEGER :: i, ii, j, me, np, nrl, ierr
       zmat_repl = 0.0d0
       me = idesc(LAX_DESC_MYPE)
       np = idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR)
       nrl = idesc(LAX_DESC_NRL)
       IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
          DO j = 1, idesc(LAX_DESC_N)
             ii = me + 1
             DO i = 1, nrl
                zmat_repl( ii, j ) = zmat_dist( i, j )
                ii = ii + np
             END DO
          END DO
       END IF
#if defined __MPI
       CALL MPI_ALLREDUCE( MPI_IN_PLACE, zmat_repl, SIZE(zmat_repl), MPI_DOUBLE_PRECISION, &
                           MPI_SUM, ortho_parent_comm, ierr )
#endif
       RETURN
    END SUBROUTINE collect_zmat_x

!------------------------------------------------------------------------
    SUBROUTINE setval_lambda_x( lambda_dist, i, j, val, idesc )
!------------------------------------------------------------------------
       IMPLICIT NONE
       include 'laxlib_kinds.fh'
       include 'laxlib_param.fh'
       REAL(DP), INTENT(OUT) :: lambda_dist(:,:)
       INTEGER,  INTENT(IN)  :: i, j
       REAL(DP), INTENT(IN)  :: val
       INTEGER, INTENT(IN)  :: idesc(LAX_DESC_SIZE)
       INTEGER :: ir, ic
       IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
          ir = idesc(LAX_DESC_IR)
          ic = idesc(LAX_DESC_IC)
          IF( ( i >= ir ) .AND. ( i - ir + 1 <= idesc(LAX_DESC_NR) ) ) THEN
             IF( ( j >= ic ) .AND. ( j - ic + 1 <= idesc(LAX_DESC_NC) ) ) THEN
                lambda_dist( i - ir + 1, j - ic + 1 ) = val
             END IF
          END IF
       END IF
       RETURN
    END SUBROUTINE setval_lambda_x

!------------------------------------------------------------------------
    SUBROUTINE distribute_zmat_x( zmat_repl, zmat_dist, idesc )
!------------------------------------------------------------------------
       IMPLICIT NONE
       include 'laxlib_kinds.fh'
       include 'laxlib_param.fh'
       REAL(DP), INTENT(IN)  :: zmat_repl(:,:)
       REAL(DP), INTENT(OUT) :: zmat_dist(:,:)
       INTEGER, INTENT(IN)  :: idesc(LAX_DESC_SIZE)
       INTEGER :: i, ii, j, me, np
       me = idesc(LAX_DESC_MYPE)
       np = idesc(LAX_DESC_NPC) * idesc(LAX_DESC_NPR)
       IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
          DO j = 1, idesc(LAX_DESC_N)
             ii = me + 1
             DO i = 1, idesc(LAX_DESC_NRL)
                zmat_dist( i, j ) = zmat_repl( ii, j )
                ii = ii + np
             END DO
          END DO
       END IF
       RETURN
    END SUBROUTINE distribute_zmat_x
    !