quantum-espresso/CPV/ortho_base.f90

1400 lines
42 KiB
Fortran

!
! Copyright (C) 2002-2009 Quantum ESPRESSO groups
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
MODULE orthogonalize_base
USE kinds
USE dspev_module, ONLY: pdspev_drv, dspev_drv
IMPLICIT NONE
SAVE
PRIVATE
REAL(DP) :: one, zero, two, minus_one, minus_two
PARAMETER ( one = 1.0d0, zero = 0.0d0, two = 2.0d0, minus_one = -1.0d0 )
PARAMETER ( minus_two = -2.0d0 )
COMPLEX(DP) :: cone, czero, mcone
PARAMETER ( cone = (1.0d0, 0.0d0), czero = (0.0d0, 0.0d0) )
PARAMETER ( mcone = (-1.0d0, 0.0d0) )
REAL(DP) :: small = 1.0d-14
LOGICAL :: use_parallel_diag
PUBLIC :: sigset
PUBLIC :: tauset
PUBLIC :: rhoset
PUBLIC :: ortho_iterate
PUBLIC :: ortho_alt_iterate
PUBLIC :: updatc, calphi
PUBLIC :: mesure_diag_perf
PUBLIC :: mesure_mmul_perf
PUBLIC :: diagonalize_parallel
PUBLIC :: diagonalize_serial
PUBLIC :: use_parallel_diag
CONTAINS
! ----------------------------------------------
SUBROUTINE diagonalize_serial( n, rhos, rhod )
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
REAL(DP) :: rhos(:,:)
REAL(DP) :: rhod(:)
!
! inputs:
! n size of the eigenproblem
! rhos the symmetric matrix
! outputs:
! rhos eigenvectors
! rhod eigenvalues
!
REAL(DP), ALLOCATABLE :: aux(:)
INTEGER :: i, j, k
IF( n < 1 ) RETURN
ALLOCATE( aux( n * ( n + 1 ) / 2 ) )
! pack lower triangle of rho into aux
!
k = 0
DO j = 1, n
DO i = j, n
k = k + 1
aux( k ) = rhos( i, j )
END DO
END DO
CALL dspev_drv( 'V', 'L', n, aux, rhod, rhos, SIZE(rhos,1) )
DEALLOCATE( aux )
RETURN
END SUBROUTINE diagonalize_serial
! ----------------------------------------------
SUBROUTINE diagonalize_parallel( n, rhos, rhod, s, desc )
USE descriptors, ONLY: lambda_node_ , nlax_
#ifdef __SCALAPACK
USE mp_global, ONLY: ortho_cntx
USE dspev_module, ONLY: pdsyevd_drv
#endif
IMPLICIT NONE
REAL(DP), INTENT(IN) :: rhos(:,:) ! input symmetric matrix
REAL(DP) :: rhod(:) ! output eigenvalues
REAL(DP) :: s(:,:) ! output eigenvectors
INTEGER, INTENT(IN) :: n ! size of the global matrix
INTEGER, INTENT(IN) :: desc(:)
IF( n < 1 ) RETURN
! Matrix is distributed on the same processors group
! used for parallel matrix multiplication
!
IF( SIZE(s,1) /= SIZE(rhos,1) .OR. SIZE(s,2) /= SIZE(rhos,2) ) &
CALL errore( " diagonalize_parallel ", " inconsistent dimension for s and rhos ", 1 )
IF ( desc( lambda_node_ ) > 0 ) THEN
!
IF( SIZE(s,1) /= desc( nlax_ ) ) &
CALL errore( " diagonalize_parallel ", " inconsistent dimension ", 1 )
!
! Compute local dimension of the cyclically distributed matrix
!
s = rhos
!
#ifdef __SCALAPACK
CALL pdsyevd_drv( .true. , n, desc( nlax_ ), s, SIZE(s,1), rhod, ortho_cntx )
#else
CALL qe_pdsyevd( .true., n, desc, s, SIZE(s,1), rhod )
#endif
!
END IF
RETURN
END SUBROUTINE diagonalize_parallel
! ----------------------------------------------
SUBROUTINE mesure_diag_perf( n )
!
USE mp_global, ONLY: nproc_image, me_image, intra_image_comm, root_image
USE mp_global, ONLY: np_ortho2, np_ortho, me_ortho, ortho_comm, ortho_comm_id
USE io_global, ONLY: ionode, stdout
USE mp, ONLY: mp_sum, mp_bcast, mp_barrier
USE mp, ONLY: mp_max
USE descriptors, ONLY: descla_siz_ , descla_init , nlar_ , nlac_ , &
ilar_ , ilac_ , nlax_ , lambda_node_ , la_myc_ , la_myr_
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
REAL(DP), ALLOCATABLE :: s(:,:), a(:,:), d(:)
REAL(DP) :: t1, tpar, tser
INTEGER :: nr, nc, ir, ic, nx
INTEGER :: desc( descla_siz_ )
REAL(DP) :: cclock
EXTERNAL :: cclock
INTEGER, PARAMETER :: paradim = 1000
!
! Check if number of PEs for orthogonalization/diagonalization is given from the input
!
IF( np_ortho2 > 0 ) THEN
use_parallel_diag = .TRUE.
RETURN
END IF
ALLOCATE( d( n ) )
!
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, ortho_comm_id )
nx = 1
IF( desc( lambda_node_ ) > 0 ) nx = desc( nlax_ )
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
ALLOCATE( s( nx, nx ) )
ALLOCATE( a( nx, nx ) )
!
CALL set_a()
!
! some MPIs (OpenMPI) the first time they call a collective routine take too much
! time to perform initializations, then perform a dummy call to get meaningful time
!
CALL diagonalize_parallel( n, a, d, s, desc )
!
CALL set_a()
!
CALL mp_barrier( intra_image_comm )
t1 = cclock()
!
CALL diagonalize_parallel( n, a, d, s, desc )
!
tpar = cclock() - t1
CALL mp_max( tpar, intra_image_comm )
DEALLOCATE( s, a )
!
IF( desc( la_myc_ ) == 0 .AND. desc( la_myr_ ) == 0 .AND. &
desc( lambda_node_ ) > 0 .AND. n < paradim ) THEN
! when n >= paradim do not mesure serial perf, go parallel
ALLOCATE( a( n, n ) )
nr = n
nc = n
ir = 1
ic = 1
CALL set_a()
t1 = cclock()
CALL diagonalize_serial( n, a, d )
tser = cclock() - t1
DEALLOCATE( a )
ELSE
tser = 0_DP
END IF
CALL mp_max( tser, intra_image_comm )
#if defined __PARA
IF( ionode ) THEN
use_parallel_diag = .FALSE.
WRITE( stdout, 90 )
IF( n < paradim ) WRITE( stdout, 100 ) tser
WRITE( stdout, 110 ) tpar, np_ortho(1) * np_ortho(2)
90 FORMAT(/,3X,'Diagonalization Performances')
100 FORMAT(3X,'ortho diag, time for serial driver = ', 1F9.5)
110 FORMAT(3X,'ortho diag, time for parallel driver = ', 1F9.5, ' with ', I4, ' procs' )
IF( n < paradim ) THEN
IF( tpar < tser ) use_parallel_diag = .TRUE.
ELSE
use_parallel_diag = .TRUE.
END IF
END IF
#else
use_parallel_diag = .FALSE.
#endif
CALL mp_bcast( use_parallel_diag, root_image, intra_image_comm )
DEALLOCATE( d )
RETURN
CONTAINS
SUBROUTINE set_a()
INTEGER :: i, j, ii, jj
IF( desc( lambda_node_ ) < 0 ) RETURN
DO j = 1, nc
DO i = 1, nr
ii = i + ir - 1
jj = j + ic - 1
IF( ii == jj ) THEN
a(i,j) = ( DBLE( n-ii+1 ) ) / DBLE( n ) + 1.0d0 / ( DBLE( ii+jj ) - 1.0d0 )
ELSE
a(i,j) = 1.0d0 / ( DBLE( ii+jj ) - 1.0d0 )
END IF
END DO
END DO
RETURN
END SUBROUTINE set_a
END SUBROUTINE mesure_diag_perf
! ----------------------------------------------
SUBROUTINE mesure_mmul_perf( n )
!
USE mp_global, ONLY: nproc_image, me_image, intra_image_comm, &
root_image, ortho_comm, np_ortho2, np_ortho, &
me_ortho, init_ortho_group, ortho_comm_id
USE io_global, ONLY: ionode, stdout
USE mp, ONLY: mp_sum, mp_bcast, mp_barrier
USE mp, ONLY: mp_max
USE descriptors, ONLY: descla_siz_ , descla_init , nlar_ , nlac_ , la_comm_ , lambda_node_
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n
!
REAL(DP), ALLOCATABLE :: c(:,:), a(:,:), b(:,:)
REAL(DP) :: t1, tcan
INTEGER :: nr, nc, ir, ic, np, lnode
INTEGER :: desc( descla_siz_ )
!
REAL(DP) :: cclock
EXTERNAL :: cclock
!
IF( np_ortho2 > 0 ) THEN
!
! Here the number of processors is suggested on input
!
np = np_ortho2
if( np > MIN( n, nproc_image ) ) np = MIN( n, nproc_image )
np = MAX( INT( SQRT( DBLE( np ) + 0.1d0 ) ), 1 )
!
ELSE
!
! Take the maximum number of processors
!
np = MIN( n, nproc_image )
np = MAX( INT( SQRT( DBLE( np ) + 0.1d0 ) ), 1 )
!
END IF
!
! Now test the allowed processors mesh sizes
!
CALL init_ortho_group( np * np, intra_image_comm )
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, ortho_comm_id )
nr = desc( nlar_ )
nc = desc( nlac_ )
ALLOCATE( a( nr, nc ), c( nr, nc ), b( nr, nc ) )
a = 1.0d0 / DBLE( n )
b = 1.0d0 / DBLE( n )
! some MPIs (OpenMPI) the first time they call a collective routine take too much
! time to perform initializations, then perform a dummy call to get meaningful time
CALL sqr_mm_cannon( 'N', 'N', n, 1.0d0, a, nr, b, nr, 0.0d0, c, nr, desc)
CALL mp_barrier( intra_image_comm )
t1 = cclock()
CALL sqr_mm_cannon( 'N', 'N', n, 1.0d0, a, nr, b, nr, 0.0d0, c, nr, desc)
tcan = cclock() - t1
CALL mp_max( tcan, intra_image_comm )
DEALLOCATE( a, c, b )
#if defined __PARA
IF( ionode ) THEN
!
WRITE( stdout, 90 )
WRITE( stdout, 120 ) tcan, np*np
90 FORMAT(/,3X,'Matrix Multiplication Performances')
120 FORMAT(3X,'ortho mmul, time for parallel driver = ', 1F9.5, ' with ', I4, ' procs')
!
END IF
#else
np = 1
#endif
CALL init_ortho_group( np*np, intra_image_comm )
#if defined __PARA
IF( ionode ) THEN
WRITE( stdout, '(/,3X,"Constraints matrixes will be distributed block like on")' )
WRITE( stdout, '(3X,"ortho sub-group = ", I4, "*", I4, " procs",/)' ) np_ortho(1), np_ortho(2)
END IF
#endif
RETURN
END SUBROUTINE mesure_mmul_perf
!=----------------------------------------------------------------------------=!
SUBROUTINE ortho_iterate( iter, diff, u, ldx, diag, xloc, nx0, sig, rhor, rhos, tau, nss, desc )
! this iterative loop uses Cannon's parallel matrix multiplication
! matrix are distributed over a square processor grid: 1x1 2x2 3x3 ...
! But the subroutine work with any number of processors, when
! nproc is not a square, some procs are left idle
USE kinds, ONLY: DP
USE io_global, ONLY: stdout
USE control_flags, ONLY: ortho_eps, ortho_max
USE mp_global, ONLY: intra_image_comm, me_image, nproc_image
USE mp, ONLY: mp_sum, mp_max
USE descriptors, ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , &
la_myr_ , la_myc_ , la_comm_ , descla_siz_ , nlax_
IMPLICIT NONE
INTEGER, INTENT(IN) :: nss, ldx, nx0
INTEGER, INTENT(IN) :: desc( descla_siz_ )
REAL(DP) :: u ( ldx, ldx )
REAL(DP) :: diag( nss )
REAL(DP) :: xloc( nx0, nx0 )
REAL(DP) :: rhor( ldx, ldx )
REAL(DP) :: rhos( ldx, ldx )
REAL(DP) :: tau ( ldx, ldx )
REAL(DP) :: sig ( ldx, ldx )
INTEGER, INTENT(OUT) :: iter
REAL(DP), INTENT(OUT) :: diff
INTEGER :: i, j
INTEGER :: nr, nc, ir, ic
REAL(DP), ALLOCATABLE :: tmp1(:,:), tmp2(:,:), dd(:,:), tr1(:,:), tr2(:,:)
REAL(DP), ALLOCATABLE :: con(:,:), x1(:,:)
!
IF( nss < 1 ) RETURN
!
! all processors not involved in the parallel orthogonalization
! jump at the end of the subroutine
!
IF( ldx/= nx0 ) &
CALL errore( " ortho_iterate ", " inconsistent dimensions ldx, nx0 ", nx0 )
IF( desc( lambda_node_ ) < 0 ) then
xloc = 0.0d0
iter = 0
go to 100
endif
!
! Compute the size of the local block
!
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
IF( ldx/= desc( nlax_ ) ) &
CALL errore( " ortho_iterate ", " inconsistent dimensions ldx ", ldx )
ALLOCATE( tr1(ldx,ldx), tr2(ldx,ldx) )
ALLOCATE( tmp1(ldx,ldx), tmp2(ldx,ldx), dd(ldx,ldx), x1(ldx,ldx), con(ldx,ldx) )
! Clear elements not involved in the orthogonalization
!
do j = nc + 1, nx0
do i = 1, nx0
xloc( i, j ) = 0.0d0
end do
end do
do j = 1, nx0
do i = nr + 1, nx0
xloc( i, j ) = 0.0d0
end do
end do
ITERATIVE_LOOP: DO iter = 1, ortho_max
!
! the following calls do the following matrix multiplications:
! tmp1 = x0*rhor (1st call)
! dd = x0*tau*x0 (2nd and 3rd call)
! tmp2 = x0*rhos (4th call)
!
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, rhor, ldx, 0.0d0, tmp1, ldx, desc)
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, tau, ldx, xloc, nx0, 0.0d0, tmp2, ldx, desc)
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, tmp2, ldx, 0.0d0, dd, ldx, desc)
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, rhos, ldx, 0.0d0, tmp2, ldx, desc)
!
CALL sqr_tr_cannon( nss, tmp1, ldx, tr1, ldx, desc )
CALL sqr_tr_cannon( nss, tmp2, ldx, tr2, ldx, desc )
!
!$omp parallel do default(shared), private(j)
DO i=1,nr
DO j=1,nc
x1(i,j) = sig(i,j)-tmp1(i,j)-tr1(i,j)-dd(i,j)
con(i,j)= x1(i,j)-tmp2(i,j)-tr2(i,j)
END DO
END DO
!
! x1 = sig -x0*rho -x0*rho^t -x0*tau*x0
!
diff = 0.d0
DO i=1,nr
DO j=1,nc
IF(ABS(con(i,j)).GT.diff) diff=ABS(con(i,j))
END DO
END DO
CALL mp_max( diff, desc( la_comm_ ) )
IF( diff < ortho_eps ) EXIT ITERATIVE_LOOP
!
! the following calls do:
! tmp1 = x1*u
! tmp2 = ut*x1*u
!
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, x1, ldx, u, ldx, 0.0d0, tmp1, ldx, desc )
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, tmp2, ldx, desc )
!
! g=ut*x1*u/d (g is stored in tmp1)
!
!$omp parallel do default(shared), private(j)
DO i=1,nr
DO j=1,nc
tmp1(i,j)=tmp2(i,j)/(diag(i+ir-1)+diag(j+ic-1))
END DO
END DO
!
! the following calls do:
! tmp2 = g*ut
! x0 = u*g*ut
!
CALL sqr_mm_cannon( 'N', 'T', nss, 1.0d0, tmp1, ldx, u, ldx, 0.0d0, tmp2, ldx, desc )
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, u, ldx, tmp2, ldx, 0.0d0, xloc, nx0, desc)
!
END DO ITERATIVE_LOOP
DEALLOCATE( tmp1, tmp2, dd, x1, con, tr1, tr2 )
100 CONTINUE
CALL mp_max( iter, intra_image_comm )
RETURN
END SUBROUTINE ortho_iterate
!=----------------------------------------------------------------------------=!
!
! Alternative iterative cycle
!
!=----------------------------------------------------------------------------=!
!
SUBROUTINE ortho_alt_iterate( iter, diff, u, ldx, diag, xloc, nx0, sig, rhor, tau, nss, desc )
USE kinds, ONLY: DP
USE io_global, ONLY: stdout
USE control_flags, ONLY: ortho_eps, ortho_max
USE mp_global, ONLY: intra_image_comm, me_image, nproc_image
USE mp, ONLY: mp_sum, mp_max
USE descriptors, ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , &
la_myr_ , la_myc_ , la_comm_ , descla_siz_ , nlax_
IMPLICIT NONE
INTEGER, INTENT(IN) :: nss, ldx, nx0
INTEGER, INTENT(IN) :: desc( descla_siz_ )
REAL(DP) :: u ( ldx, ldx )
REAL(DP) :: diag( nss )
REAL(DP) :: xloc( nx0, nx0 )
REAL(DP) :: rhor( ldx, ldx )
REAL(DP) :: tau ( ldx, ldx )
REAL(DP) :: sig ( ldx, ldx )
INTEGER, INTENT(OUT) :: iter
REAL(DP), INTENT(OUT) :: diff
INTEGER :: i, j
INTEGER :: nr, nc, ir, ic
REAL(DP), ALLOCATABLE :: tmp1(:,:), tmp2(:,:)
REAL(DP), ALLOCATABLE :: x1(:,:)
REAL(DP), ALLOCATABLE :: sigd(:)
REAL(DP) :: den, dx
!
IF( nss < 1 ) RETURN
IF( ldx/= nx0 ) &
CALL errore( " ortho_alt_iterate ", " inconsistent dimensions ldx, nx0 ", nx0 )
if( desc( lambda_node_ ) < 0 ) then
xloc = 0.0d0
iter = 0
go to 100
endif
!
! Compute the size of the local block
!
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
IF( ldx/= desc( nlax_ ) ) &
CALL errore( " ortho_alt_iterate ", " inconsistent dimensions ldx ", ldx )
ALLOCATE( tmp1(ldx,ldx), tmp2(ldx,ldx), x1(ldx,ldx), sigd(nss) )
! Clear elements not involved in the orthogonalization
!
do j = nc + 1, nx0
do i = 1, nx0
xloc( i, j ) = 0.0d0
end do
end do
do j = 1, nx0
do i = nr + 1, nx0
xloc( i, j ) = 0.0d0
end do
end do
!
! ... Transform "sig", "rhoa" and "tau" in the new basis through matrix "s"
!
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, sig, ldx, u, ldx, 0.0d0, tmp1, ldx, desc)
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, sig, ldx, desc)
!
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, rhor, ldx, u, ldx, 0.0d0, tmp1, ldx, desc)
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, rhor, ldx, desc)
!
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, tau, ldx, u, ldx, 0.0d0, tmp1, ldx, desc)
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, tau, ldx, desc)
!
! ... Initialize x0 with preconditioning
!
DO J = 1, nc
DO I = 1, nr
den = ( diag( i + ir - 1 ) + diag( j + ic - 1 ) )
IF( ABS( den ) <= small ) den = SIGN( small, den )
xloc( i, j ) = sig( i, j ) / den
ENDDO
ENDDO
!
! ... Starting iteration
!
ITERATIVE_LOOP: DO iter = 0, ortho_max
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, rhor, ldx, 0.0d0, tmp2, ldx, desc)
CALL sqr_tr_cannon( nss, tmp2, ldx, tmp1, ldx, desc )
DO J=1,nc
DO I=1,nr
tmp2(I,J) = tmp2(I,J) + tmp1(I,J)
ENDDO
ENDDO
!
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, tau, ldx, xloc, nx0, 0.0d0, tmp1, ldx, desc)
!
sigd = 0.0d0
IF( desc( la_myr_ ) == desc( la_myc_ ) ) THEN
DO i = 1, nr
SIGD( i + ir - 1 ) = tmp1(i,i)
tmp1(i,i) = -SIGD( i + ir - 1 )
ENDDO
END IF
CALL mp_sum( sigd, desc( la_comm_ ) )
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, xloc, nx0, tmp1, ldx, 0.0d0, x1, ldx, desc)
!
CALL sqr_tr_cannon( nss, x1, ldx, tmp1, ldx, desc )
! ... X1 = SIG - tmp2 - 0.5d0 * ( X1 + X1^t )
diff = 0.0d0
!
DO j = 1, nc
DO i = 1, nr
!
den = ( diag(i+ir-1) + sigd(i+ir-1) + diag(j+ic-1) + sigd(j+ic-1) )
IF( ABS( den ) <= small ) den = SIGN( small, den )
x1(i,j) = sig(i,j) - tmp2(i,j) - 0.5d0 * (x1(i,j)+tmp1(i,j))
x1(i,j) = x1(i,j) / den
diff = MAX( ABS( x1(i,j) - xloc(i,j) ), diff )
xloc(i,j) = x1(i,j)
!
END DO
END DO
CALL mp_max( diff, desc( la_comm_ ) )
IF( diff < ortho_eps ) EXIT ITERATIVE_LOOP
END DO ITERATIVE_LOOP
!
! ... Transform x0 back to the original basis
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, u, ldx, xloc, nx0, 0.0d0, tmp1, ldx, desc)
CALL sqr_mm_cannon( 'N', 'T', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, xloc, nx0, desc)
DEALLOCATE( tmp1, tmp2, x1, sigd )
100 CONTINUE
CALL mp_max( iter, intra_image_comm )
RETURN
END SUBROUTINE ortho_alt_iterate
!-------------------------------------------------------------------------
SUBROUTINE sigset( cp, ngwx, becp, nkbx, qbecp, n, nss, ist, sig, ldx, desc )
!-----------------------------------------------------------------------
! input: cp (non-orthonormal), becp, qbecp
! computes the matrix
! sig = 1 - a , a = <cp|s|cp> = <cp|cp> + sum q_ij <cp|i><j|cp>
! where s=s(r(t+dt))
! routine makes use of c(-q)=c*(q)
!
USE kinds, ONLY: DP
USE uspp, ONLY: nkbus
USE cvan, ONLY: nvb
USE gvecw, ONLY: ngw
USE reciprocal_vectors, ONLY: gstart
USE mp, ONLY: mp_root_sum
USE control_flags, ONLY: iprsta
USE io_global, ONLY: stdout
USE mp_global, ONLY: intra_image_comm, leg_ortho
USE descriptors, ONLY: lambda_node_ , la_npc_ , la_npr_ , descla_siz_ , &
descla_init , la_comm_ , ilar_ , ilac_ , nlar_ , &
nlac_ , la_myr_ , la_myc_ , la_nx_ , la_n_ , nlax_
USE parallel_toolkit, ONLY: dsqmsym
!
IMPLICIT NONE
!
INTEGER :: nss, ist, ngwx, nkbx, n, ldx, nx
COMPLEX(DP) :: cp( ngwx, n )
REAL(DP) :: becp( nkbx, n ), qbecp( nkbx, ldx )
REAL(DP) :: sig( ldx, ldx )
INTEGER :: desc( descla_siz_ )
!
INTEGER :: i, j, ipr, ipc, nr, nc, ir, ic, npr, npc
INTEGER :: ii, jj, root
INTEGER :: desc_ip( descla_siz_ )
INTEGER :: np( 2 ), coor_ip( 2 )
!
REAL(DP), ALLOCATABLE :: sigp(:,:)
!
IF( nss < 1 ) RETURN
np(1) = desc( la_npr_ )
np(2) = desc( la_npc_ )
nx = desc( nlax_ )
ALLOCATE( sigp( nx, nx ) )
IF( desc( lambda_node_ ) > 0 ) THEN
IF( desc( nlax_ ) /= ldx ) &
CALL errore( " sigset ", " inconsistent dimension ldx ", ldx )
IF( nx /= ldx ) &
CALL errore( " sigset ", " inconsistent dimension nx ", nx )
END IF
DO ipc = 1, np(2)
DO ipr = 1, ipc ! np(1) use symmetry
coor_ip(1) = ipr - 1
coor_ip(2) = ipc - 1
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
nr = desc_ip( nlar_ )
nc = desc_ip( nlac_ )
ir = desc_ip( ilar_ )
ic = desc_ip( ilac_ )
!
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
root = root * leg_ortho
CALL dgemm( 'T', 'N', nr, nc, 2*ngw, -2.0d0, cp( 1, ist + ir - 1), 2*ngwx, &
cp( 1, ist + ic - 1 ), 2*ngwx, 0.0d0, sigp, nx )
!
! q = 0 components has weight 1.0
!
IF ( gstart == 2 ) THEN
CALL DGER( nr, nc, 1.D0, cp(1,ist+ir-1), 2*ngwx, cp(1,ist+ic-1), 2*ngwx, sigp, nx )
END IF
!
CALL mp_root_sum( sigp, sig, root, intra_image_comm )
!
END DO
!
END DO
!
DEALLOCATE( sigp )
!
CALL dsqmsym( nss, sig, nx, desc )
!
IF( desc( lambda_node_ ) > 0 ) THEN
!
nr = desc( nlar_ )
nc = desc( nlac_ )
ir = desc( ilar_ )
ic = desc( ilac_ )
!
IF( desc( la_myr_ ) == desc( la_myc_ ) ) THEN
DO i = 1, nr
sig(i,i) = sig(i,i) + 1.0d0
END DO
END IF
!
IF( nvb > 0 ) THEN
CALL dgemm( 'T', 'N', nr, nc, nkbus, -1.0d0, becp( 1, ist+ir-1 ), &
nkbx, qbecp( 1, 1 ), nkbx, 1.0d0, sig, ldx )
ENDIF
!
IF( iprsta > 4 ) THEN
WRITE( stdout,*)
WRITE( stdout,'(26x,a)') ' sig '
DO i = 1, nr
WRITE( stdout,'(7f11.6)' ) ( sig(i,j), j=1, nc )
END DO
ENDIF
!
END IF
!
RETURN
END SUBROUTINE sigset
!
!-----------------------------------------------------------------------
SUBROUTINE rhoset( cp, ngwx, phi, bephi, nkbx, qbecp, n, nss, ist, rho, ldx, desc )
!-----------------------------------------------------------------------
! input: cp (non-orthonormal), phi, bephi, qbecp
! computes the matrix
! rho = <s'c0|s cp> = <phi|s cp>
! where |phi> = s'|c0> = |c0> + sum q_ij |i><j|c0>
! where s=s(r(t+dt)) and s'=s(r(t))
! routine makes use of c(-q)=c*(q)
!
USE gvecw, ONLY: ngw
USE reciprocal_vectors, ONLY: gstart
USE uspp, ONLY: nkbus
USE cvan, ONLY: nvb
USE kinds, ONLY: DP
USE mp, ONLY: mp_root_sum
USE mp_global, ONLY: intra_image_comm, me_image, leg_ortho
USE control_flags, ONLY: iprsta
USE io_global, ONLY: stdout
USE descriptors, ONLY: lambda_node_ , la_npc_ , la_npr_ , descla_siz_ , &
descla_init , la_comm_ , ilar_ , ilac_ , nlar_ , &
nlac_ , la_myr_ , la_myc_ , la_nx_ , la_n_ , nlax_
!
IMPLICIT NONE
!
INTEGER :: nss, ist, ngwx, nkbx, ldx, n
COMPLEX(DP) :: cp( ngwx, n ), phi( ngwx, n )
REAL(DP) :: bephi( nkbx, ldx ), qbecp( nkbx, ldx )
REAL(DP) :: rho( ldx, ldx )
INTEGER :: desc( descla_siz_ )
!
INTEGER :: i, j, ipr, ipc, nr, nc, ir, ic, npr, npc
INTEGER :: ii, jj, root, nx
INTEGER :: desc_ip( descla_siz_ )
INTEGER :: np( 2 ), coor_ip( 2 )
REAL(DP), ALLOCATABLE :: rhop(:,:)
!
! <phi|cp>
!
!
IF( nss < 1 ) RETURN
np(1) = desc( la_npr_ )
np(2) = desc( la_npc_ )
nx = desc( nlax_ )
IF( desc( lambda_node_ ) > 0 ) THEN
IF( desc( nlax_ ) /= ldx ) &
CALL errore( " rhoset ", " inconsistent dimension ldx ", ldx )
IF( nx /= ldx ) &
CALL errore( " rhoset ", " inconsistent dimension nx ", nx )
END IF
ALLOCATE( rhop( nx, nx ) )
rhop = 0.0d0
DO ipc = 1, np(2)
DO ipr = 1, np(1)
coor_ip(1) = ipr - 1
coor_ip(2) = ipc - 1
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
nr = desc_ip( nlar_ )
nc = desc_ip( nlac_ )
ir = desc_ip( ilar_ )
ic = desc_ip( ilac_ )
!
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
!
root = root * leg_ortho
CALL dgemm( 'T', 'N', nr, nc, 2*ngw, 2.0d0, phi( 1, ist + ir - 1 ), 2*ngwx, &
cp( 1, ist + ic - 1 ), 2*ngwx, 0.0d0, rhop, nx )
!
! q = 0 components has weight 1.0
!
IF (gstart == 2) THEN
CALL DGER( nr, nc, -1.D0, phi(1,ist+ir-1), 2*ngwx, cp(1,ist+ic-1), 2*ngwx, rhop, nx )
END IF
CALL mp_root_sum( rhop, rho, root, intra_image_comm )
END DO
END DO
DEALLOCATE( rhop )
IF( desc( lambda_node_ ) > 0 ) THEN
!
nr = desc( nlar_ )
nc = desc( nlac_ )
!
! bephi is distributed among processor rows
! qbephi is distributed among processor columns
! tau is block distributed among the whole processor 2D grid
!
!
IF( nvb > 0 ) THEN
!
! rho(i,j) = rho(i,j) + SUM_b bephi( b, i ) * qbecp( b, j )
!
CALL dgemm( 'T', 'N', nr, nc, nkbus, 1.0d0, bephi, nkbx, qbecp, nkbx, 1.0d0, rho, ldx )
END IF
IF ( iprsta > 4 ) THEN
WRITE( stdout,*)
WRITE( stdout,'(26x,a)') ' rho '
DO i=1,nr
WRITE( stdout,'(7f11.6)') (rho(i,j),j=1,nc)
END DO
END IF
END IF
!
RETURN
END SUBROUTINE rhoset
!-------------------------------------------------------------------------
SUBROUTINE tauset( phi, ngwx, bephi, nkbx, qbephi, n, nss, ist, tau, ldx, desc )
!-----------------------------------------------------------------------
! input: phi
! computes the matrix
! tau = <s'c0|s|s'c0> = <phi|s|phi>, where |phi> = s'|c0>
! where s=s(r(t+dt)) and s'=s(r(t))
! routine makes use of c(-q)=c*(q)
!
USE kinds, ONLY: DP
USE cvan, ONLY: nvb
USE uspp, ONLY: nkbus
USE gvecw, ONLY: ngw
USE reciprocal_vectors, ONLY: gstart
USE mp, ONLY: mp_root_sum
USE control_flags, ONLY: iprsta
USE io_global, ONLY: stdout
USE mp_global, ONLY: intra_image_comm, leg_ortho
USE descriptors, ONLY: lambda_node_ , la_npc_ , la_npr_ , descla_siz_ , &
descla_init , la_comm_ , ilar_ , ilac_ , nlar_ , &
nlac_ , la_myr_ , la_myc_ , la_nx_ , la_n_ , nlax_
USE parallel_toolkit, ONLY: dsqmsym
!
IMPLICIT NONE
!
INTEGER :: nss, ist, ngwx, nkbx, n, ldx, nx
COMPLEX(DP) :: phi( ngwx, n )
REAL(DP) :: bephi( nkbx, ldx ), qbephi( nkbx, ldx )
REAL(DP) :: tau( ldx, ldx )
INTEGER :: desc( descla_siz_ )
!
INTEGER :: i, j, ipr, ipc, nr, nc, ir, ic, npr, npc
INTEGER :: ii, jj, root
INTEGER :: desc_ip( descla_siz_ )
INTEGER :: np( 2 ), coor_ip( 2 )
REAL(DP), ALLOCATABLE :: taup( :, : )
!
IF( nss < 1 ) RETURN
!
! get dimensions of the square processor grid
!
np(1) = desc( la_npr_ )
np(2) = desc( la_npc_ )
!
nx = desc( nlax_ )
!
IF( desc( lambda_node_ ) > 0 ) THEN
IF( desc( nlax_ ) /= ldx ) &
CALL errore( " tauset ", " inconsistent dimension ldx ", ldx )
IF( nx /= ldx ) &
CALL errore( " tauset ", " inconsistent dimension nx ", nx )
END IF
!
ALLOCATE( taup( nx, nx ) )
!
taup = 0.0d0
!
! loop on processors coordinates
!
DO ipc = 1, np(2)
!
DO ipr = 1, ipc ! np(1) use symmetry
coor_ip(1) = ipr - 1
coor_ip(2) = ipc - 1
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
nr = desc_ip( nlar_ )
nc = desc_ip( nlac_ )
ir = desc_ip( ilar_ )
ic = desc_ip( ilac_ )
!
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
!
root = root * leg_ortho
!
! All processors contribute to the tau block of processor (ipr,ipc)
! with their own part of wavefunctions
!
CALL dgemm( 'T', 'N', nr, nc, 2*ngw, 2.0d0, phi( 1, ist + ir - 1 ), 2*ngwx, &
phi( 1, ist + ic - 1 ), 2*ngwx, 0.0d0, taup, nx )
!
! q = 0 components has weight 1.0
!
IF (gstart == 2) THEN
CALL DGER( nr, nc, -1.D0, phi(1,ist+ir-1), 2*ngwx, phi(1,ist+ic-1), 2*ngwx, taup, nx )
END IF
!
CALL mp_root_sum( taup, tau, root, intra_image_comm )
!
END DO
!
END DO
!
DEALLOCATE( taup )
!
CALL dsqmsym( nss, tau, nx, desc )
!
IF( desc( lambda_node_ ) > 0 ) THEN
!
nr = desc( nlar_ )
nc = desc( nlac_ )
!
! bephi is distributed among processor rows
! qbephi is distributed among processor columns
! tau is block distributed among the whole processor 2D grid
!
IF( nvb > 0 ) THEN
!
CALL dgemm( 'T', 'N', nr, nc, nkbus, 1.0d0, bephi, nkbx, qbephi, nkbx, 1.0d0, tau, ldx )
!
END IF
IF( iprsta > 4 ) THEN
WRITE( stdout,*)
WRITE( stdout,'(26x,a)') ' tau '
DO i=1,nr
WRITE( stdout,'(7f11.6)') (tau(i,j),j=1,nc)
END DO
ENDIF
!
ENDIF
!
RETURN
END SUBROUTINE tauset
!
!-------------------------------------------------------------------------
SUBROUTINE updatc( ccc, n, x0, nx0, phi, ngwx, bephi, nkbx, becp, bec, cp, nss, istart, desc )
!-----------------------------------------------------------------------
!
! input ccc : dt**2/emass OR 1.0d0 demending on ortho
! input x0 : converged lambdas from ortho-loop (unchanged in output)
! input cp : non-orthonormal cp=c0+dh/dc*ccc
! input bec : <cp|beta_i>
! input phi
! output cp : orthonormal cp=cp+lambda*phi
! output bec: bec=becp+lambda*bephi
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nsp, na
USE io_global, ONLY: stdout
USE cvan, ONLY: nvb, ish
USE uspp, ONLY: nkb, nkbus
USE uspp_param, ONLY: nh
USE gvecw, ONLY: ngw
USE control_flags, ONLY: iprint, iprsta
USE mp, ONLY: mp_sum, mp_bcast
USE mp_global, ONLY: intra_image_comm, leg_ortho, me_image
USE descriptors, ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , descla_siz_ , la_comm_ , &
la_npc_ , la_npr_ , nlax_ , la_n_ , la_nx_ , la_myr_ , la_myc_ , &
descla_init
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: n, nx0, ngwx, nkbx, istart, nss
INTEGER, INTENT(IN) :: desc( descla_siz_ )
COMPLEX(DP) :: cp( ngwx, n ), phi( ngwx, n )
REAL(DP), INTENT(IN) :: ccc
REAL(DP) :: bec( nkbx, n ), x0( nx0, nx0 )
REAL(DP) :: bephi( :, : ), becp( nkbx, n )
! local variables
INTEGER :: i, j, ig, is, iv, ia, inl, nr, nc, ir, ic
REAL(DP), ALLOCATABLE :: wtemp(:,:)
REAL(DP), ALLOCATABLE :: xd(:,:)
REAL(DP), ALLOCATABLE :: bephi_tmp(:,:)
REAL(DP) :: beta
INTEGER :: ipr, ipc, nx, root
INTEGER :: np( 2 ), coor_ip( 2 )
INTEGER :: desc_ip( descla_siz_ )
!
! lagrange multipliers
!
IF( nss < 1 ) RETURN
!
IF( desc( lambda_node_ ) > 0 ) THEN
IF( nx0 /= desc( nlax_ ) ) &
CALL errore( " updatc ", " inconsistent dimension nx0 ", nx0 )
END IF
!
! size of the local block
!
nx = desc( nlax_ )
!
np(1) = desc( la_npr_ )
np(2) = desc( la_npc_ )
!
CALL start_clock( 'updatc' )
ALLOCATE( xd( nx, nx ) )
IF( nvb > 0 )THEN
ALLOCATE( wtemp( nx, nkb ) )
ALLOCATE( bephi_tmp( nkbx, nx ) )
DO i = 1, nss
DO inl = 1, nkbus
bec( inl, i + istart - 1 ) = 0.0d0
END DO
END DO
END IF
DO ipc = 1, np(2)
!
IF( nvb > 0 )THEN
!
! For the inner loop we need the block of bebhi( :, ic : ic + nc - 1 )
! this is the same of block bephi( :, ir : ir + nr - 1 ) on processor
! with coords ipr == ipc
!
! get the right processor owning the block of bephi
!
CALL GRID2D_RANK( 'R', np(1), np(2), ipc-1, ipc-1, root )
root = root * leg_ortho
!
! broadcast the block to all processors
!
IF( me_image == root ) bephi_tmp = bephi
CALL mp_bcast( bephi_tmp, root, intra_image_comm )
!
END IF
DO ipr = 1, np(1)
!
coor_ip(1) = ipr - 1
coor_ip(2) = ipc - 1
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
nr = desc_ip( nlar_ )
nc = desc_ip( nlac_ )
ir = desc_ip( ilar_ )
ic = desc_ip( ilac_ )
!
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
root = root * leg_ortho
IF( desc( la_myr_ ) == ipr - 1 .AND. desc( la_myc_ ) == ipc - 1 .AND. desc( lambda_node_ ) > 0 ) THEN
xd = x0 * ccc
END IF
CALL mp_bcast( xd, root, intra_image_comm )
CALL dgemm( 'N', 'N', 2*ngw, nc, nr, 1.0d0, phi(1,istart+ir-1), 2*ngwx, &
xd, nx, 1.0d0, cp(1,istart+ic-1), 2*ngwx )
IF( nvb > 0 )THEN
! updating of the <beta|c(n,g)>
!
! bec of vanderbilt species are updated
!
CALL dgemm( 'N', 'T', nr, nkbus, nc, 1.0d0, xd, nx, bephi_tmp, nkbx, 0.0d0, wtemp, nx )
!
! here nr and ir are still valid, since they are the same for all procs in the same row
!
DO i = 1, nr
DO inl = 1, nkbus
bec( inl, i + istart + ir - 2 ) = bec( inl, i + istart + ir - 2 ) + wtemp( i, inl )
END DO
END DO
!
END IF
END DO
!
END DO
IF( nvb > 0 )THEN
DEALLOCATE( wtemp )
DEALLOCATE( bephi_tmp )
DO i = istart, istart + nss - 1
DO inl = 1, nkbus
bec( inl, i ) = bec( inl, i ) + becp( inl, i )
END DO
END DO
END IF
!
IF ( iprsta > 2 ) THEN
WRITE( stdout,*)
DO is = 1, nvb
IF( nvb > 1 ) THEN
WRITE( stdout,'(33x,a,i4)') ' updatc: bec (is)',is
WRITE( stdout,'(8f9.4)') &
& ((bec(ish(is)+(iv-1)*na(is)+1,i+istart-1),iv=1,nh(is)),i=1,nss)
ELSE
DO ia=1,na(is)
WRITE( stdout,'(33x,a,i4)') ' updatc: bec (ia)',ia
WRITE( stdout,'(8f9.4)') &
& ((bec(ish(is)+(iv-1)*na(is)+ia,i+istart-1),iv=1,nh(is)),i=1,nss)
END DO
END IF
WRITE( stdout,*)
END DO
ENDIF
!
DEALLOCATE( xd )
!
CALL stop_clock( 'updatc' )
!
RETURN
END SUBROUTINE updatc
!-------------------------------------------------------------------------
SUBROUTINE calphi( c0, ngwx, bec, nkbx, betae, phi, n, ema0bg )
!-----------------------------------------------------------------------
! input: c0 (orthonormal with s(r(t)), bec=<c0|beta>, betae=|beta>
! computes the matrix phi (with the old positions)
! where |phi> = s'|c0> = |c0> + sum q_ij |i><j|c0>
! where s'=s(r(t))
!
USE kinds, ONLY: DP
USE ions_base, ONLY: na, nsp
USE io_global, ONLY: stdout
USE mp_global, ONLY: intra_image_comm
USE cvan, ONLY: ish, nvb
USE uspp_param, ONLY: nh
USE uspp, ONLY: nkbus, qq
USE gvecw, ONLY: ngw
USE constants, ONLY: pi, fpi
USE control_flags, ONLY: iprint, iprsta
USE mp, ONLY: mp_sum
!
IMPLICIT NONE
INTEGER, INTENT(IN) :: ngwx, nkbx, n
COMPLEX(DP) :: c0( ngwx, n ), phi( ngwx, n ), betae( ngwx, nkbx )
REAL(DP) :: bec( nkbx, n ), emtot
REAL(DP), OPTIONAL :: ema0bg( ngwx )
! local variables
!
INTEGER :: is, iv, jv, ia, inl, jnl, i, j
REAL(DP), ALLOCATABLE :: qtemp( : , : )
REAL(DP) :: qqf
!
IF( n < 1 ) RETURN
!
CALL start_clock( 'calphi' )
!
IF ( nvb > 0 ) THEN
ALLOCATE( qtemp( nkbus, n ) )
qtemp (:,:) = 0.d0
DO is=1,nvb
DO iv=1,nh(is)
inl = ish(is)+(iv-1)*na(is)
DO jv=1,nh(is)
jnl = ish(is)+(jv-1)*na(is)
IF(ABS(qq(iv,jv,is)) > 1.d-5) THEN
qqf = qq(iv,jv,is)
DO i=1,n
CALL daxpy( na(is), qqf, bec(jnl+1,i),1,qtemp(inl+1,i), 1 )
END DO
ENDIF
END DO
END DO
END DO
!
CALL dgemm ( 'N', 'N', 2*ngw, n, nkbus, 1.0d0, betae, &
2*ngwx, qtemp, nkbus, 0.0d0, phi, 2*ngwx )
DEALLOCATE( qtemp )
ELSE
phi = (0.d0, 0.d0)
END IF
!
IF( PRESENT( ema0bg ) ) THEN
!$omp parallel do default(shared), private(i)
DO j=1,n
DO i=1,ngw
phi(i,j)=(phi(i,j)+c0(i,j))*ema0bg(i)
END DO
END DO
!$omp end parallel do
ELSE
!$omp parallel do default(shared), private(i)
DO j=1,n
DO i=1,ngw
phi(i,j)=phi(i,j)+c0(i,j)
END DO
END DO
!$omp end parallel do
END IF
!
IF(iprsta > 2) THEN
emtot=0.0d0
IF( PRESENT( ema0bg ) ) THEN
DO j=1,n
DO i=1,ngw
emtot=emtot +2.0d0*DBLE(phi(i,j)*CONJG(c0(i,j)))*ema0bg(i)**(-2.0d0)
END DO
END DO
ELSE
DO j=1,n
DO i=1,ngw
emtot=emtot +2.0d0*DBLE(phi(i,j)*CONJG(c0(i,j)))
END DO
END DO
END IF
emtot=emtot/n
CALL mp_sum( emtot, intra_image_comm )
WRITE( stdout,*) 'in calphi sqrt(emtot)=',SQRT(emtot)
WRITE( stdout,*)
DO is = 1, nvb
IF( nvb > 1 ) THEN
WRITE( stdout,'(33x,a,i4)') ' calphi: bec (is)',is
WRITE( stdout,'(8f9.4)') &
& ((bec(ish(is)+(iv-1)*na(is)+1,i),iv=1,nh(is)),i=1,n)
ELSE
DO ia=1,na(is)
WRITE( stdout,'(33x,a,i4)') ' calphi: bec (ia)',ia
WRITE( stdout,'(8f9.4)') &
& ((bec(ish(is)+(iv-1)*na(is)+ia,i),iv=1,nh(is)),i=1,n)
END DO
END IF
END DO
ENDIF
CALL stop_clock( 'calphi' )
!
RETURN
END SUBROUTINE calphi
END MODULE orthogonalize_base