mirror of https://gitlab.com/QEF/q-e.git
1400 lines
42 KiB
Fortran
1400 lines
42 KiB
Fortran
!
|
|
! Copyright (C) 2002-2009 Quantum ESPRESSO groups
|
|
! This file is distributed under the terms of the
|
|
! GNU General Public License. See the file `License'
|
|
! in the root directory of the present distribution,
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
!
|
|
|
|
|
|
MODULE orthogonalize_base
|
|
|
|
|
|
USE kinds
|
|
USE dspev_module, ONLY: pdspev_drv, dspev_drv
|
|
|
|
IMPLICIT NONE
|
|
|
|
SAVE
|
|
|
|
PRIVATE
|
|
|
|
REAL(DP) :: one, zero, two, minus_one, minus_two
|
|
PARAMETER ( one = 1.0d0, zero = 0.0d0, two = 2.0d0, minus_one = -1.0d0 )
|
|
PARAMETER ( minus_two = -2.0d0 )
|
|
COMPLEX(DP) :: cone, czero, mcone
|
|
PARAMETER ( cone = (1.0d0, 0.0d0), czero = (0.0d0, 0.0d0) )
|
|
PARAMETER ( mcone = (-1.0d0, 0.0d0) )
|
|
REAL(DP) :: small = 1.0d-14
|
|
LOGICAL :: use_parallel_diag
|
|
|
|
PUBLIC :: sigset
|
|
PUBLIC :: tauset
|
|
PUBLIC :: rhoset
|
|
PUBLIC :: ortho_iterate
|
|
PUBLIC :: ortho_alt_iterate
|
|
PUBLIC :: updatc, calphi
|
|
PUBLIC :: mesure_diag_perf
|
|
PUBLIC :: mesure_mmul_perf
|
|
PUBLIC :: diagonalize_parallel
|
|
PUBLIC :: diagonalize_serial
|
|
PUBLIC :: use_parallel_diag
|
|
|
|
CONTAINS
|
|
|
|
|
|
! ----------------------------------------------
|
|
|
|
|
|
SUBROUTINE diagonalize_serial( n, rhos, rhod )
|
|
IMPLICIT NONE
|
|
INTEGER, INTENT(IN) :: n
|
|
REAL(DP) :: rhos(:,:)
|
|
REAL(DP) :: rhod(:)
|
|
!
|
|
! inputs:
|
|
! n size of the eigenproblem
|
|
! rhos the symmetric matrix
|
|
! outputs:
|
|
! rhos eigenvectors
|
|
! rhod eigenvalues
|
|
!
|
|
REAL(DP), ALLOCATABLE :: aux(:)
|
|
INTEGER :: i, j, k
|
|
|
|
IF( n < 1 ) RETURN
|
|
|
|
ALLOCATE( aux( n * ( n + 1 ) / 2 ) )
|
|
|
|
! pack lower triangle of rho into aux
|
|
!
|
|
k = 0
|
|
DO j = 1, n
|
|
DO i = j, n
|
|
k = k + 1
|
|
aux( k ) = rhos( i, j )
|
|
END DO
|
|
END DO
|
|
|
|
CALL dspev_drv( 'V', 'L', n, aux, rhod, rhos, SIZE(rhos,1) )
|
|
|
|
DEALLOCATE( aux )
|
|
|
|
RETURN
|
|
|
|
END SUBROUTINE diagonalize_serial
|
|
|
|
|
|
! ----------------------------------------------
|
|
|
|
|
|
SUBROUTINE diagonalize_parallel( n, rhos, rhod, s, desc )
|
|
|
|
USE descriptors, ONLY: lambda_node_ , nlax_
|
|
#ifdef __SCALAPACK
|
|
USE mp_global, ONLY: ortho_cntx
|
|
USE dspev_module, ONLY: pdsyevd_drv
|
|
#endif
|
|
|
|
IMPLICIT NONE
|
|
REAL(DP), INTENT(IN) :: rhos(:,:) ! input symmetric matrix
|
|
REAL(DP) :: rhod(:) ! output eigenvalues
|
|
REAL(DP) :: s(:,:) ! output eigenvectors
|
|
INTEGER, INTENT(IN) :: n ! size of the global matrix
|
|
INTEGER, INTENT(IN) :: desc(:)
|
|
|
|
IF( n < 1 ) RETURN
|
|
|
|
! Matrix is distributed on the same processors group
|
|
! used for parallel matrix multiplication
|
|
!
|
|
IF( SIZE(s,1) /= SIZE(rhos,1) .OR. SIZE(s,2) /= SIZE(rhos,2) ) &
|
|
CALL errore( " diagonalize_parallel ", " inconsistent dimension for s and rhos ", 1 )
|
|
|
|
IF ( desc( lambda_node_ ) > 0 ) THEN
|
|
!
|
|
IF( SIZE(s,1) /= desc( nlax_ ) ) &
|
|
CALL errore( " diagonalize_parallel ", " inconsistent dimension ", 1 )
|
|
!
|
|
! Compute local dimension of the cyclically distributed matrix
|
|
!
|
|
s = rhos
|
|
!
|
|
#ifdef __SCALAPACK
|
|
CALL pdsyevd_drv( .true. , n, desc( nlax_ ), s, SIZE(s,1), rhod, ortho_cntx )
|
|
#else
|
|
CALL qe_pdsyevd( .true., n, desc, s, SIZE(s,1), rhod )
|
|
#endif
|
|
!
|
|
END IF
|
|
|
|
RETURN
|
|
|
|
END SUBROUTINE diagonalize_parallel
|
|
|
|
|
|
! ----------------------------------------------
|
|
|
|
|
|
SUBROUTINE mesure_diag_perf( n )
|
|
!
|
|
USE mp_global, ONLY: nproc_image, me_image, intra_image_comm, root_image
|
|
USE mp_global, ONLY: np_ortho2, np_ortho, me_ortho, ortho_comm, ortho_comm_id
|
|
USE io_global, ONLY: ionode, stdout
|
|
USE mp, ONLY: mp_sum, mp_bcast, mp_barrier
|
|
USE mp, ONLY: mp_max
|
|
USE descriptors, ONLY: descla_siz_ , descla_init , nlar_ , nlac_ , &
|
|
ilar_ , ilac_ , nlax_ , lambda_node_ , la_myc_ , la_myr_
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER, INTENT(IN) :: n
|
|
REAL(DP), ALLOCATABLE :: s(:,:), a(:,:), d(:)
|
|
REAL(DP) :: t1, tpar, tser
|
|
INTEGER :: nr, nc, ir, ic, nx
|
|
INTEGER :: desc( descla_siz_ )
|
|
REAL(DP) :: cclock
|
|
EXTERNAL :: cclock
|
|
INTEGER, PARAMETER :: paradim = 1000
|
|
!
|
|
! Check if number of PEs for orthogonalization/diagonalization is given from the input
|
|
!
|
|
IF( np_ortho2 > 0 ) THEN
|
|
use_parallel_diag = .TRUE.
|
|
RETURN
|
|
END IF
|
|
|
|
ALLOCATE( d( n ) )
|
|
!
|
|
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, ortho_comm_id )
|
|
|
|
nx = 1
|
|
IF( desc( lambda_node_ ) > 0 ) nx = desc( nlax_ )
|
|
|
|
nr = desc( nlar_ )
|
|
nc = desc( nlac_ )
|
|
ir = desc( ilar_ )
|
|
ic = desc( ilac_ )
|
|
|
|
ALLOCATE( s( nx, nx ) )
|
|
ALLOCATE( a( nx, nx ) )
|
|
!
|
|
CALL set_a()
|
|
!
|
|
! some MPIs (OpenMPI) the first time they call a collective routine take too much
|
|
! time to perform initializations, then perform a dummy call to get meaningful time
|
|
!
|
|
CALL diagonalize_parallel( n, a, d, s, desc )
|
|
!
|
|
CALL set_a()
|
|
!
|
|
CALL mp_barrier( intra_image_comm )
|
|
t1 = cclock()
|
|
!
|
|
CALL diagonalize_parallel( n, a, d, s, desc )
|
|
!
|
|
tpar = cclock() - t1
|
|
CALL mp_max( tpar, intra_image_comm )
|
|
|
|
DEALLOCATE( s, a )
|
|
!
|
|
IF( desc( la_myc_ ) == 0 .AND. desc( la_myr_ ) == 0 .AND. &
|
|
desc( lambda_node_ ) > 0 .AND. n < paradim ) THEN
|
|
|
|
! when n >= paradim do not mesure serial perf, go parallel
|
|
|
|
ALLOCATE( a( n, n ) )
|
|
nr = n
|
|
nc = n
|
|
ir = 1
|
|
ic = 1
|
|
|
|
CALL set_a()
|
|
|
|
t1 = cclock()
|
|
|
|
CALL diagonalize_serial( n, a, d )
|
|
|
|
tser = cclock() - t1
|
|
|
|
DEALLOCATE( a )
|
|
|
|
ELSE
|
|
|
|
tser = 0_DP
|
|
|
|
END IF
|
|
|
|
CALL mp_max( tser, intra_image_comm )
|
|
|
|
#if defined __PARA
|
|
|
|
IF( ionode ) THEN
|
|
use_parallel_diag = .FALSE.
|
|
WRITE( stdout, 90 )
|
|
IF( n < paradim ) WRITE( stdout, 100 ) tser
|
|
WRITE( stdout, 110 ) tpar, np_ortho(1) * np_ortho(2)
|
|
90 FORMAT(/,3X,'Diagonalization Performances')
|
|
100 FORMAT(3X,'ortho diag, time for serial driver = ', 1F9.5)
|
|
110 FORMAT(3X,'ortho diag, time for parallel driver = ', 1F9.5, ' with ', I4, ' procs' )
|
|
IF( n < paradim ) THEN
|
|
IF( tpar < tser ) use_parallel_diag = .TRUE.
|
|
ELSE
|
|
use_parallel_diag = .TRUE.
|
|
END IF
|
|
END IF
|
|
|
|
#else
|
|
|
|
use_parallel_diag = .FALSE.
|
|
|
|
#endif
|
|
|
|
CALL mp_bcast( use_parallel_diag, root_image, intra_image_comm )
|
|
|
|
DEALLOCATE( d )
|
|
|
|
RETURN
|
|
|
|
CONTAINS
|
|
|
|
SUBROUTINE set_a()
|
|
INTEGER :: i, j, ii, jj
|
|
IF( desc( lambda_node_ ) < 0 ) RETURN
|
|
DO j = 1, nc
|
|
DO i = 1, nr
|
|
ii = i + ir - 1
|
|
jj = j + ic - 1
|
|
IF( ii == jj ) THEN
|
|
a(i,j) = ( DBLE( n-ii+1 ) ) / DBLE( n ) + 1.0d0 / ( DBLE( ii+jj ) - 1.0d0 )
|
|
ELSE
|
|
a(i,j) = 1.0d0 / ( DBLE( ii+jj ) - 1.0d0 )
|
|
END IF
|
|
END DO
|
|
END DO
|
|
RETURN
|
|
END SUBROUTINE set_a
|
|
|
|
END SUBROUTINE mesure_diag_perf
|
|
|
|
|
|
! ----------------------------------------------
|
|
|
|
|
|
SUBROUTINE mesure_mmul_perf( n )
|
|
!
|
|
USE mp_global, ONLY: nproc_image, me_image, intra_image_comm, &
|
|
root_image, ortho_comm, np_ortho2, np_ortho, &
|
|
me_ortho, init_ortho_group, ortho_comm_id
|
|
USE io_global, ONLY: ionode, stdout
|
|
USE mp, ONLY: mp_sum, mp_bcast, mp_barrier
|
|
USE mp, ONLY: mp_max
|
|
USE descriptors, ONLY: descla_siz_ , descla_init , nlar_ , nlac_ , la_comm_ , lambda_node_
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER, INTENT(IN) :: n
|
|
!
|
|
REAL(DP), ALLOCATABLE :: c(:,:), a(:,:), b(:,:)
|
|
REAL(DP) :: t1, tcan
|
|
INTEGER :: nr, nc, ir, ic, np, lnode
|
|
INTEGER :: desc( descla_siz_ )
|
|
!
|
|
REAL(DP) :: cclock
|
|
EXTERNAL :: cclock
|
|
!
|
|
IF( np_ortho2 > 0 ) THEN
|
|
!
|
|
! Here the number of processors is suggested on input
|
|
!
|
|
np = np_ortho2
|
|
if( np > MIN( n, nproc_image ) ) np = MIN( n, nproc_image )
|
|
np = MAX( INT( SQRT( DBLE( np ) + 0.1d0 ) ), 1 )
|
|
!
|
|
ELSE
|
|
!
|
|
! Take the maximum number of processors
|
|
!
|
|
np = MIN( n, nproc_image )
|
|
np = MAX( INT( SQRT( DBLE( np ) + 0.1d0 ) ), 1 )
|
|
!
|
|
END IF
|
|
|
|
!
|
|
! Now test the allowed processors mesh sizes
|
|
!
|
|
|
|
CALL init_ortho_group( np * np, intra_image_comm )
|
|
|
|
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, ortho_comm_id )
|
|
|
|
nr = desc( nlar_ )
|
|
nc = desc( nlac_ )
|
|
|
|
ALLOCATE( a( nr, nc ), c( nr, nc ), b( nr, nc ) )
|
|
|
|
a = 1.0d0 / DBLE( n )
|
|
b = 1.0d0 / DBLE( n )
|
|
|
|
! some MPIs (OpenMPI) the first time they call a collective routine take too much
|
|
! time to perform initializations, then perform a dummy call to get meaningful time
|
|
CALL sqr_mm_cannon( 'N', 'N', n, 1.0d0, a, nr, b, nr, 0.0d0, c, nr, desc)
|
|
|
|
CALL mp_barrier( intra_image_comm )
|
|
t1 = cclock()
|
|
|
|
CALL sqr_mm_cannon( 'N', 'N', n, 1.0d0, a, nr, b, nr, 0.0d0, c, nr, desc)
|
|
|
|
tcan = cclock() - t1
|
|
CALL mp_max( tcan, intra_image_comm )
|
|
|
|
DEALLOCATE( a, c, b )
|
|
|
|
|
|
#if defined __PARA
|
|
|
|
IF( ionode ) THEN
|
|
!
|
|
WRITE( stdout, 90 )
|
|
WRITE( stdout, 120 ) tcan, np*np
|
|
90 FORMAT(/,3X,'Matrix Multiplication Performances')
|
|
120 FORMAT(3X,'ortho mmul, time for parallel driver = ', 1F9.5, ' with ', I4, ' procs')
|
|
!
|
|
END IF
|
|
|
|
#else
|
|
|
|
np = 1
|
|
|
|
#endif
|
|
|
|
CALL init_ortho_group( np*np, intra_image_comm )
|
|
|
|
#if defined __PARA
|
|
IF( ionode ) THEN
|
|
WRITE( stdout, '(/,3X,"Constraints matrixes will be distributed block like on")' )
|
|
WRITE( stdout, '(3X,"ortho sub-group = ", I4, "*", I4, " procs",/)' ) np_ortho(1), np_ortho(2)
|
|
END IF
|
|
#endif
|
|
|
|
RETURN
|
|
|
|
END SUBROUTINE mesure_mmul_perf
|
|
|
|
|
|
|
|
|
|
!=----------------------------------------------------------------------------=!
|
|
|
|
|
|
|
|
SUBROUTINE ortho_iterate( iter, diff, u, ldx, diag, xloc, nx0, sig, rhor, rhos, tau, nss, desc )
|
|
|
|
! this iterative loop uses Cannon's parallel matrix multiplication
|
|
! matrix are distributed over a square processor grid: 1x1 2x2 3x3 ...
|
|
! But the subroutine work with any number of processors, when
|
|
! nproc is not a square, some procs are left idle
|
|
|
|
USE kinds, ONLY: DP
|
|
USE io_global, ONLY: stdout
|
|
USE control_flags, ONLY: ortho_eps, ortho_max
|
|
USE mp_global, ONLY: intra_image_comm, me_image, nproc_image
|
|
USE mp, ONLY: mp_sum, mp_max
|
|
USE descriptors, ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , &
|
|
la_myr_ , la_myc_ , la_comm_ , descla_siz_ , nlax_
|
|
|
|
IMPLICIT NONE
|
|
|
|
INTEGER, INTENT(IN) :: nss, ldx, nx0
|
|
INTEGER, INTENT(IN) :: desc( descla_siz_ )
|
|
REAL(DP) :: u ( ldx, ldx )
|
|
REAL(DP) :: diag( nss )
|
|
REAL(DP) :: xloc( nx0, nx0 )
|
|
REAL(DP) :: rhor( ldx, ldx )
|
|
REAL(DP) :: rhos( ldx, ldx )
|
|
REAL(DP) :: tau ( ldx, ldx )
|
|
REAL(DP) :: sig ( ldx, ldx )
|
|
INTEGER, INTENT(OUT) :: iter
|
|
REAL(DP), INTENT(OUT) :: diff
|
|
|
|
INTEGER :: i, j
|
|
INTEGER :: nr, nc, ir, ic
|
|
REAL(DP), ALLOCATABLE :: tmp1(:,:), tmp2(:,:), dd(:,:), tr1(:,:), tr2(:,:)
|
|
REAL(DP), ALLOCATABLE :: con(:,:), x1(:,:)
|
|
!
|
|
IF( nss < 1 ) RETURN
|
|
|
|
!
|
|
! all processors not involved in the parallel orthogonalization
|
|
! jump at the end of the subroutine
|
|
!
|
|
|
|
IF( ldx/= nx0 ) &
|
|
CALL errore( " ortho_iterate ", " inconsistent dimensions ldx, nx0 ", nx0 )
|
|
|
|
IF( desc( lambda_node_ ) < 0 ) then
|
|
xloc = 0.0d0
|
|
iter = 0
|
|
go to 100
|
|
endif
|
|
!
|
|
! Compute the size of the local block
|
|
!
|
|
nr = desc( nlar_ )
|
|
nc = desc( nlac_ )
|
|
ir = desc( ilar_ )
|
|
ic = desc( ilac_ )
|
|
|
|
IF( ldx/= desc( nlax_ ) ) &
|
|
CALL errore( " ortho_iterate ", " inconsistent dimensions ldx ", ldx )
|
|
|
|
ALLOCATE( tr1(ldx,ldx), tr2(ldx,ldx) )
|
|
ALLOCATE( tmp1(ldx,ldx), tmp2(ldx,ldx), dd(ldx,ldx), x1(ldx,ldx), con(ldx,ldx) )
|
|
|
|
! Clear elements not involved in the orthogonalization
|
|
!
|
|
do j = nc + 1, nx0
|
|
do i = 1, nx0
|
|
xloc( i, j ) = 0.0d0
|
|
end do
|
|
end do
|
|
do j = 1, nx0
|
|
do i = nr + 1, nx0
|
|
xloc( i, j ) = 0.0d0
|
|
end do
|
|
end do
|
|
|
|
|
|
ITERATIVE_LOOP: DO iter = 1, ortho_max
|
|
!
|
|
! the following calls do the following matrix multiplications:
|
|
! tmp1 = x0*rhor (1st call)
|
|
! dd = x0*tau*x0 (2nd and 3rd call)
|
|
! tmp2 = x0*rhos (4th call)
|
|
!
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, rhor, ldx, 0.0d0, tmp1, ldx, desc)
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, tau, ldx, xloc, nx0, 0.0d0, tmp2, ldx, desc)
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, tmp2, ldx, 0.0d0, dd, ldx, desc)
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, rhos, ldx, 0.0d0, tmp2, ldx, desc)
|
|
!
|
|
CALL sqr_tr_cannon( nss, tmp1, ldx, tr1, ldx, desc )
|
|
CALL sqr_tr_cannon( nss, tmp2, ldx, tr2, ldx, desc )
|
|
!
|
|
!$omp parallel do default(shared), private(j)
|
|
DO i=1,nr
|
|
DO j=1,nc
|
|
x1(i,j) = sig(i,j)-tmp1(i,j)-tr1(i,j)-dd(i,j)
|
|
con(i,j)= x1(i,j)-tmp2(i,j)-tr2(i,j)
|
|
END DO
|
|
END DO
|
|
!
|
|
! x1 = sig -x0*rho -x0*rho^t -x0*tau*x0
|
|
!
|
|
diff = 0.d0
|
|
DO i=1,nr
|
|
DO j=1,nc
|
|
IF(ABS(con(i,j)).GT.diff) diff=ABS(con(i,j))
|
|
END DO
|
|
END DO
|
|
|
|
CALL mp_max( diff, desc( la_comm_ ) )
|
|
|
|
IF( diff < ortho_eps ) EXIT ITERATIVE_LOOP
|
|
|
|
!
|
|
! the following calls do:
|
|
! tmp1 = x1*u
|
|
! tmp2 = ut*x1*u
|
|
!
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, x1, ldx, u, ldx, 0.0d0, tmp1, ldx, desc )
|
|
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, tmp2, ldx, desc )
|
|
!
|
|
! g=ut*x1*u/d (g is stored in tmp1)
|
|
!
|
|
!$omp parallel do default(shared), private(j)
|
|
DO i=1,nr
|
|
DO j=1,nc
|
|
tmp1(i,j)=tmp2(i,j)/(diag(i+ir-1)+diag(j+ic-1))
|
|
END DO
|
|
END DO
|
|
!
|
|
! the following calls do:
|
|
! tmp2 = g*ut
|
|
! x0 = u*g*ut
|
|
!
|
|
CALL sqr_mm_cannon( 'N', 'T', nss, 1.0d0, tmp1, ldx, u, ldx, 0.0d0, tmp2, ldx, desc )
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, u, ldx, tmp2, ldx, 0.0d0, xloc, nx0, desc)
|
|
!
|
|
END DO ITERATIVE_LOOP
|
|
|
|
DEALLOCATE( tmp1, tmp2, dd, x1, con, tr1, tr2 )
|
|
|
|
100 CONTINUE
|
|
|
|
CALL mp_max( iter, intra_image_comm )
|
|
|
|
RETURN
|
|
END SUBROUTINE ortho_iterate
|
|
|
|
|
|
!=----------------------------------------------------------------------------=!
|
|
!
|
|
! Alternative iterative cycle
|
|
!
|
|
!=----------------------------------------------------------------------------=!
|
|
!
|
|
|
|
|
|
SUBROUTINE ortho_alt_iterate( iter, diff, u, ldx, diag, xloc, nx0, sig, rhor, tau, nss, desc )
|
|
|
|
USE kinds, ONLY: DP
|
|
USE io_global, ONLY: stdout
|
|
USE control_flags, ONLY: ortho_eps, ortho_max
|
|
USE mp_global, ONLY: intra_image_comm, me_image, nproc_image
|
|
USE mp, ONLY: mp_sum, mp_max
|
|
USE descriptors, ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , &
|
|
la_myr_ , la_myc_ , la_comm_ , descla_siz_ , nlax_
|
|
|
|
IMPLICIT NONE
|
|
|
|
INTEGER, INTENT(IN) :: nss, ldx, nx0
|
|
INTEGER, INTENT(IN) :: desc( descla_siz_ )
|
|
REAL(DP) :: u ( ldx, ldx )
|
|
REAL(DP) :: diag( nss )
|
|
REAL(DP) :: xloc( nx0, nx0 )
|
|
REAL(DP) :: rhor( ldx, ldx )
|
|
REAL(DP) :: tau ( ldx, ldx )
|
|
REAL(DP) :: sig ( ldx, ldx )
|
|
INTEGER, INTENT(OUT) :: iter
|
|
REAL(DP), INTENT(OUT) :: diff
|
|
|
|
INTEGER :: i, j
|
|
INTEGER :: nr, nc, ir, ic
|
|
REAL(DP), ALLOCATABLE :: tmp1(:,:), tmp2(:,:)
|
|
REAL(DP), ALLOCATABLE :: x1(:,:)
|
|
REAL(DP), ALLOCATABLE :: sigd(:)
|
|
REAL(DP) :: den, dx
|
|
!
|
|
IF( nss < 1 ) RETURN
|
|
|
|
IF( ldx/= nx0 ) &
|
|
CALL errore( " ortho_alt_iterate ", " inconsistent dimensions ldx, nx0 ", nx0 )
|
|
|
|
if( desc( lambda_node_ ) < 0 ) then
|
|
xloc = 0.0d0
|
|
iter = 0
|
|
go to 100
|
|
endif
|
|
!
|
|
! Compute the size of the local block
|
|
!
|
|
nr = desc( nlar_ )
|
|
nc = desc( nlac_ )
|
|
ir = desc( ilar_ )
|
|
ic = desc( ilac_ )
|
|
|
|
IF( ldx/= desc( nlax_ ) ) &
|
|
CALL errore( " ortho_alt_iterate ", " inconsistent dimensions ldx ", ldx )
|
|
|
|
ALLOCATE( tmp1(ldx,ldx), tmp2(ldx,ldx), x1(ldx,ldx), sigd(nss) )
|
|
|
|
! Clear elements not involved in the orthogonalization
|
|
!
|
|
do j = nc + 1, nx0
|
|
do i = 1, nx0
|
|
xloc( i, j ) = 0.0d0
|
|
end do
|
|
end do
|
|
do j = 1, nx0
|
|
do i = nr + 1, nx0
|
|
xloc( i, j ) = 0.0d0
|
|
end do
|
|
end do
|
|
!
|
|
! ... Transform "sig", "rhoa" and "tau" in the new basis through matrix "s"
|
|
!
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, sig, ldx, u, ldx, 0.0d0, tmp1, ldx, desc)
|
|
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, sig, ldx, desc)
|
|
!
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, rhor, ldx, u, ldx, 0.0d0, tmp1, ldx, desc)
|
|
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, rhor, ldx, desc)
|
|
!
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, tau, ldx, u, ldx, 0.0d0, tmp1, ldx, desc)
|
|
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, tau, ldx, desc)
|
|
!
|
|
! ... Initialize x0 with preconditioning
|
|
!
|
|
DO J = 1, nc
|
|
DO I = 1, nr
|
|
den = ( diag( i + ir - 1 ) + diag( j + ic - 1 ) )
|
|
IF( ABS( den ) <= small ) den = SIGN( small, den )
|
|
xloc( i, j ) = sig( i, j ) / den
|
|
ENDDO
|
|
ENDDO
|
|
|
|
!
|
|
! ... Starting iteration
|
|
!
|
|
|
|
ITERATIVE_LOOP: DO iter = 0, ortho_max
|
|
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, xloc, nx0, rhor, ldx, 0.0d0, tmp2, ldx, desc)
|
|
|
|
CALL sqr_tr_cannon( nss, tmp2, ldx, tmp1, ldx, desc )
|
|
|
|
DO J=1,nc
|
|
DO I=1,nr
|
|
tmp2(I,J) = tmp2(I,J) + tmp1(I,J)
|
|
ENDDO
|
|
ENDDO
|
|
!
|
|
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, tau, ldx, xloc, nx0, 0.0d0, tmp1, ldx, desc)
|
|
!
|
|
sigd = 0.0d0
|
|
IF( desc( la_myr_ ) == desc( la_myc_ ) ) THEN
|
|
DO i = 1, nr
|
|
SIGD( i + ir - 1 ) = tmp1(i,i)
|
|
tmp1(i,i) = -SIGD( i + ir - 1 )
|
|
ENDDO
|
|
END IF
|
|
CALL mp_sum( sigd, desc( la_comm_ ) )
|
|
|
|
CALL sqr_mm_cannon( 'T', 'N', nss, 1.0d0, xloc, nx0, tmp1, ldx, 0.0d0, x1, ldx, desc)
|
|
!
|
|
CALL sqr_tr_cannon( nss, x1, ldx, tmp1, ldx, desc )
|
|
|
|
! ... X1 = SIG - tmp2 - 0.5d0 * ( X1 + X1^t )
|
|
|
|
diff = 0.0d0
|
|
!
|
|
DO j = 1, nc
|
|
DO i = 1, nr
|
|
!
|
|
den = ( diag(i+ir-1) + sigd(i+ir-1) + diag(j+ic-1) + sigd(j+ic-1) )
|
|
IF( ABS( den ) <= small ) den = SIGN( small, den )
|
|
x1(i,j) = sig(i,j) - tmp2(i,j) - 0.5d0 * (x1(i,j)+tmp1(i,j))
|
|
x1(i,j) = x1(i,j) / den
|
|
diff = MAX( ABS( x1(i,j) - xloc(i,j) ), diff )
|
|
xloc(i,j) = x1(i,j)
|
|
!
|
|
END DO
|
|
END DO
|
|
|
|
CALL mp_max( diff, desc( la_comm_ ) )
|
|
|
|
IF( diff < ortho_eps ) EXIT ITERATIVE_LOOP
|
|
|
|
END DO ITERATIVE_LOOP
|
|
!
|
|
! ... Transform x0 back to the original basis
|
|
|
|
CALL sqr_mm_cannon( 'N', 'N', nss, 1.0d0, u, ldx, xloc, nx0, 0.0d0, tmp1, ldx, desc)
|
|
CALL sqr_mm_cannon( 'N', 'T', nss, 1.0d0, u, ldx, tmp1, ldx, 0.0d0, xloc, nx0, desc)
|
|
|
|
DEALLOCATE( tmp1, tmp2, x1, sigd )
|
|
|
|
100 CONTINUE
|
|
|
|
CALL mp_max( iter, intra_image_comm )
|
|
|
|
RETURN
|
|
END SUBROUTINE ortho_alt_iterate
|
|
|
|
|
|
!-------------------------------------------------------------------------
|
|
SUBROUTINE sigset( cp, ngwx, becp, nkbx, qbecp, n, nss, ist, sig, ldx, desc )
|
|
!-----------------------------------------------------------------------
|
|
! input: cp (non-orthonormal), becp, qbecp
|
|
! computes the matrix
|
|
! sig = 1 - a , a = <cp|s|cp> = <cp|cp> + sum q_ij <cp|i><j|cp>
|
|
! where s=s(r(t+dt))
|
|
! routine makes use of c(-q)=c*(q)
|
|
!
|
|
USE kinds, ONLY: DP
|
|
USE uspp, ONLY: nkbus
|
|
USE cvan, ONLY: nvb
|
|
USE gvecw, ONLY: ngw
|
|
USE reciprocal_vectors, ONLY: gstart
|
|
USE mp, ONLY: mp_root_sum
|
|
USE control_flags, ONLY: iprsta
|
|
USE io_global, ONLY: stdout
|
|
USE mp_global, ONLY: intra_image_comm, leg_ortho
|
|
USE descriptors, ONLY: lambda_node_ , la_npc_ , la_npr_ , descla_siz_ , &
|
|
descla_init , la_comm_ , ilar_ , ilac_ , nlar_ , &
|
|
nlac_ , la_myr_ , la_myc_ , la_nx_ , la_n_ , nlax_
|
|
USE parallel_toolkit, ONLY: dsqmsym
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: nss, ist, ngwx, nkbx, n, ldx, nx
|
|
COMPLEX(DP) :: cp( ngwx, n )
|
|
REAL(DP) :: becp( nkbx, n ), qbecp( nkbx, ldx )
|
|
REAL(DP) :: sig( ldx, ldx )
|
|
INTEGER :: desc( descla_siz_ )
|
|
!
|
|
INTEGER :: i, j, ipr, ipc, nr, nc, ir, ic, npr, npc
|
|
INTEGER :: ii, jj, root
|
|
INTEGER :: desc_ip( descla_siz_ )
|
|
INTEGER :: np( 2 ), coor_ip( 2 )
|
|
!
|
|
REAL(DP), ALLOCATABLE :: sigp(:,:)
|
|
!
|
|
IF( nss < 1 ) RETURN
|
|
|
|
np(1) = desc( la_npr_ )
|
|
np(2) = desc( la_npc_ )
|
|
|
|
nx = desc( nlax_ )
|
|
|
|
ALLOCATE( sigp( nx, nx ) )
|
|
|
|
IF( desc( lambda_node_ ) > 0 ) THEN
|
|
IF( desc( nlax_ ) /= ldx ) &
|
|
CALL errore( " sigset ", " inconsistent dimension ldx ", ldx )
|
|
IF( nx /= ldx ) &
|
|
CALL errore( " sigset ", " inconsistent dimension nx ", nx )
|
|
END IF
|
|
|
|
DO ipc = 1, np(2)
|
|
DO ipr = 1, ipc ! np(1) use symmetry
|
|
|
|
coor_ip(1) = ipr - 1
|
|
coor_ip(2) = ipc - 1
|
|
|
|
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
|
|
|
|
nr = desc_ip( nlar_ )
|
|
nc = desc_ip( nlac_ )
|
|
ir = desc_ip( ilar_ )
|
|
ic = desc_ip( ilac_ )
|
|
!
|
|
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
|
|
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
|
|
|
|
root = root * leg_ortho
|
|
|
|
CALL dgemm( 'T', 'N', nr, nc, 2*ngw, -2.0d0, cp( 1, ist + ir - 1), 2*ngwx, &
|
|
cp( 1, ist + ic - 1 ), 2*ngwx, 0.0d0, sigp, nx )
|
|
!
|
|
! q = 0 components has weight 1.0
|
|
!
|
|
IF ( gstart == 2 ) THEN
|
|
CALL DGER( nr, nc, 1.D0, cp(1,ist+ir-1), 2*ngwx, cp(1,ist+ic-1), 2*ngwx, sigp, nx )
|
|
END IF
|
|
!
|
|
CALL mp_root_sum( sigp, sig, root, intra_image_comm )
|
|
!
|
|
END DO
|
|
!
|
|
END DO
|
|
!
|
|
DEALLOCATE( sigp )
|
|
!
|
|
CALL dsqmsym( nss, sig, nx, desc )
|
|
!
|
|
IF( desc( lambda_node_ ) > 0 ) THEN
|
|
!
|
|
nr = desc( nlar_ )
|
|
nc = desc( nlac_ )
|
|
ir = desc( ilar_ )
|
|
ic = desc( ilac_ )
|
|
!
|
|
IF( desc( la_myr_ ) == desc( la_myc_ ) ) THEN
|
|
DO i = 1, nr
|
|
sig(i,i) = sig(i,i) + 1.0d0
|
|
END DO
|
|
END IF
|
|
!
|
|
IF( nvb > 0 ) THEN
|
|
CALL dgemm( 'T', 'N', nr, nc, nkbus, -1.0d0, becp( 1, ist+ir-1 ), &
|
|
nkbx, qbecp( 1, 1 ), nkbx, 1.0d0, sig, ldx )
|
|
ENDIF
|
|
!
|
|
IF( iprsta > 4 ) THEN
|
|
WRITE( stdout,*)
|
|
WRITE( stdout,'(26x,a)') ' sig '
|
|
DO i = 1, nr
|
|
WRITE( stdout,'(7f11.6)' ) ( sig(i,j), j=1, nc )
|
|
END DO
|
|
ENDIF
|
|
!
|
|
END IF
|
|
!
|
|
RETURN
|
|
END SUBROUTINE sigset
|
|
|
|
|
|
!
|
|
!-----------------------------------------------------------------------
|
|
SUBROUTINE rhoset( cp, ngwx, phi, bephi, nkbx, qbecp, n, nss, ist, rho, ldx, desc )
|
|
!-----------------------------------------------------------------------
|
|
! input: cp (non-orthonormal), phi, bephi, qbecp
|
|
! computes the matrix
|
|
! rho = <s'c0|s cp> = <phi|s cp>
|
|
! where |phi> = s'|c0> = |c0> + sum q_ij |i><j|c0>
|
|
! where s=s(r(t+dt)) and s'=s(r(t))
|
|
! routine makes use of c(-q)=c*(q)
|
|
!
|
|
USE gvecw, ONLY: ngw
|
|
USE reciprocal_vectors, ONLY: gstart
|
|
USE uspp, ONLY: nkbus
|
|
USE cvan, ONLY: nvb
|
|
USE kinds, ONLY: DP
|
|
USE mp, ONLY: mp_root_sum
|
|
USE mp_global, ONLY: intra_image_comm, me_image, leg_ortho
|
|
USE control_flags, ONLY: iprsta
|
|
USE io_global, ONLY: stdout
|
|
USE descriptors, ONLY: lambda_node_ , la_npc_ , la_npr_ , descla_siz_ , &
|
|
descla_init , la_comm_ , ilar_ , ilac_ , nlar_ , &
|
|
nlac_ , la_myr_ , la_myc_ , la_nx_ , la_n_ , nlax_
|
|
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: nss, ist, ngwx, nkbx, ldx, n
|
|
COMPLEX(DP) :: cp( ngwx, n ), phi( ngwx, n )
|
|
REAL(DP) :: bephi( nkbx, ldx ), qbecp( nkbx, ldx )
|
|
REAL(DP) :: rho( ldx, ldx )
|
|
INTEGER :: desc( descla_siz_ )
|
|
!
|
|
INTEGER :: i, j, ipr, ipc, nr, nc, ir, ic, npr, npc
|
|
INTEGER :: ii, jj, root, nx
|
|
INTEGER :: desc_ip( descla_siz_ )
|
|
INTEGER :: np( 2 ), coor_ip( 2 )
|
|
|
|
REAL(DP), ALLOCATABLE :: rhop(:,:)
|
|
!
|
|
! <phi|cp>
|
|
!
|
|
!
|
|
|
|
IF( nss < 1 ) RETURN
|
|
|
|
np(1) = desc( la_npr_ )
|
|
np(2) = desc( la_npc_ )
|
|
|
|
nx = desc( nlax_ )
|
|
|
|
IF( desc( lambda_node_ ) > 0 ) THEN
|
|
IF( desc( nlax_ ) /= ldx ) &
|
|
CALL errore( " rhoset ", " inconsistent dimension ldx ", ldx )
|
|
IF( nx /= ldx ) &
|
|
CALL errore( " rhoset ", " inconsistent dimension nx ", nx )
|
|
END IF
|
|
|
|
ALLOCATE( rhop( nx, nx ) )
|
|
|
|
rhop = 0.0d0
|
|
|
|
DO ipc = 1, np(2)
|
|
DO ipr = 1, np(1)
|
|
|
|
coor_ip(1) = ipr - 1
|
|
coor_ip(2) = ipc - 1
|
|
|
|
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
|
|
|
|
nr = desc_ip( nlar_ )
|
|
nc = desc_ip( nlac_ )
|
|
ir = desc_ip( ilar_ )
|
|
ic = desc_ip( ilac_ )
|
|
!
|
|
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
|
|
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
|
|
!
|
|
root = root * leg_ortho
|
|
|
|
CALL dgemm( 'T', 'N', nr, nc, 2*ngw, 2.0d0, phi( 1, ist + ir - 1 ), 2*ngwx, &
|
|
cp( 1, ist + ic - 1 ), 2*ngwx, 0.0d0, rhop, nx )
|
|
!
|
|
! q = 0 components has weight 1.0
|
|
!
|
|
IF (gstart == 2) THEN
|
|
CALL DGER( nr, nc, -1.D0, phi(1,ist+ir-1), 2*ngwx, cp(1,ist+ic-1), 2*ngwx, rhop, nx )
|
|
END IF
|
|
|
|
CALL mp_root_sum( rhop, rho, root, intra_image_comm )
|
|
|
|
END DO
|
|
END DO
|
|
|
|
DEALLOCATE( rhop )
|
|
|
|
IF( desc( lambda_node_ ) > 0 ) THEN
|
|
!
|
|
nr = desc( nlar_ )
|
|
nc = desc( nlac_ )
|
|
!
|
|
! bephi is distributed among processor rows
|
|
! qbephi is distributed among processor columns
|
|
! tau is block distributed among the whole processor 2D grid
|
|
!
|
|
!
|
|
IF( nvb > 0 ) THEN
|
|
!
|
|
! rho(i,j) = rho(i,j) + SUM_b bephi( b, i ) * qbecp( b, j )
|
|
!
|
|
CALL dgemm( 'T', 'N', nr, nc, nkbus, 1.0d0, bephi, nkbx, qbecp, nkbx, 1.0d0, rho, ldx )
|
|
|
|
END IF
|
|
|
|
IF ( iprsta > 4 ) THEN
|
|
WRITE( stdout,*)
|
|
WRITE( stdout,'(26x,a)') ' rho '
|
|
DO i=1,nr
|
|
WRITE( stdout,'(7f11.6)') (rho(i,j),j=1,nc)
|
|
END DO
|
|
END IF
|
|
|
|
END IF
|
|
!
|
|
RETURN
|
|
END SUBROUTINE rhoset
|
|
|
|
|
|
!-------------------------------------------------------------------------
|
|
SUBROUTINE tauset( phi, ngwx, bephi, nkbx, qbephi, n, nss, ist, tau, ldx, desc )
|
|
!-----------------------------------------------------------------------
|
|
! input: phi
|
|
! computes the matrix
|
|
! tau = <s'c0|s|s'c0> = <phi|s|phi>, where |phi> = s'|c0>
|
|
! where s=s(r(t+dt)) and s'=s(r(t))
|
|
! routine makes use of c(-q)=c*(q)
|
|
!
|
|
USE kinds, ONLY: DP
|
|
USE cvan, ONLY: nvb
|
|
USE uspp, ONLY: nkbus
|
|
USE gvecw, ONLY: ngw
|
|
USE reciprocal_vectors, ONLY: gstart
|
|
USE mp, ONLY: mp_root_sum
|
|
USE control_flags, ONLY: iprsta
|
|
USE io_global, ONLY: stdout
|
|
USE mp_global, ONLY: intra_image_comm, leg_ortho
|
|
USE descriptors, ONLY: lambda_node_ , la_npc_ , la_npr_ , descla_siz_ , &
|
|
descla_init , la_comm_ , ilar_ , ilac_ , nlar_ , &
|
|
nlac_ , la_myr_ , la_myc_ , la_nx_ , la_n_ , nlax_
|
|
USE parallel_toolkit, ONLY: dsqmsym
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER :: nss, ist, ngwx, nkbx, n, ldx, nx
|
|
COMPLEX(DP) :: phi( ngwx, n )
|
|
REAL(DP) :: bephi( nkbx, ldx ), qbephi( nkbx, ldx )
|
|
REAL(DP) :: tau( ldx, ldx )
|
|
INTEGER :: desc( descla_siz_ )
|
|
!
|
|
INTEGER :: i, j, ipr, ipc, nr, nc, ir, ic, npr, npc
|
|
INTEGER :: ii, jj, root
|
|
INTEGER :: desc_ip( descla_siz_ )
|
|
INTEGER :: np( 2 ), coor_ip( 2 )
|
|
|
|
REAL(DP), ALLOCATABLE :: taup( :, : )
|
|
!
|
|
IF( nss < 1 ) RETURN
|
|
!
|
|
! get dimensions of the square processor grid
|
|
!
|
|
np(1) = desc( la_npr_ )
|
|
np(2) = desc( la_npc_ )
|
|
!
|
|
nx = desc( nlax_ )
|
|
!
|
|
IF( desc( lambda_node_ ) > 0 ) THEN
|
|
IF( desc( nlax_ ) /= ldx ) &
|
|
CALL errore( " tauset ", " inconsistent dimension ldx ", ldx )
|
|
IF( nx /= ldx ) &
|
|
CALL errore( " tauset ", " inconsistent dimension nx ", nx )
|
|
END IF
|
|
!
|
|
ALLOCATE( taup( nx, nx ) )
|
|
!
|
|
taup = 0.0d0
|
|
!
|
|
! loop on processors coordinates
|
|
!
|
|
DO ipc = 1, np(2)
|
|
!
|
|
DO ipr = 1, ipc ! np(1) use symmetry
|
|
|
|
coor_ip(1) = ipr - 1
|
|
coor_ip(2) = ipc - 1
|
|
|
|
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
|
|
|
|
nr = desc_ip( nlar_ )
|
|
nc = desc_ip( nlac_ )
|
|
ir = desc_ip( ilar_ )
|
|
ic = desc_ip( ilac_ )
|
|
!
|
|
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
|
|
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
|
|
!
|
|
root = root * leg_ortho
|
|
!
|
|
! All processors contribute to the tau block of processor (ipr,ipc)
|
|
! with their own part of wavefunctions
|
|
!
|
|
CALL dgemm( 'T', 'N', nr, nc, 2*ngw, 2.0d0, phi( 1, ist + ir - 1 ), 2*ngwx, &
|
|
phi( 1, ist + ic - 1 ), 2*ngwx, 0.0d0, taup, nx )
|
|
!
|
|
! q = 0 components has weight 1.0
|
|
!
|
|
IF (gstart == 2) THEN
|
|
CALL DGER( nr, nc, -1.D0, phi(1,ist+ir-1), 2*ngwx, phi(1,ist+ic-1), 2*ngwx, taup, nx )
|
|
END IF
|
|
!
|
|
CALL mp_root_sum( taup, tau, root, intra_image_comm )
|
|
!
|
|
END DO
|
|
!
|
|
END DO
|
|
!
|
|
DEALLOCATE( taup )
|
|
!
|
|
CALL dsqmsym( nss, tau, nx, desc )
|
|
!
|
|
IF( desc( lambda_node_ ) > 0 ) THEN
|
|
!
|
|
nr = desc( nlar_ )
|
|
nc = desc( nlac_ )
|
|
!
|
|
! bephi is distributed among processor rows
|
|
! qbephi is distributed among processor columns
|
|
! tau is block distributed among the whole processor 2D grid
|
|
!
|
|
IF( nvb > 0 ) THEN
|
|
!
|
|
CALL dgemm( 'T', 'N', nr, nc, nkbus, 1.0d0, bephi, nkbx, qbephi, nkbx, 1.0d0, tau, ldx )
|
|
!
|
|
END IF
|
|
|
|
IF( iprsta > 4 ) THEN
|
|
WRITE( stdout,*)
|
|
WRITE( stdout,'(26x,a)') ' tau '
|
|
DO i=1,nr
|
|
WRITE( stdout,'(7f11.6)') (tau(i,j),j=1,nc)
|
|
END DO
|
|
ENDIF
|
|
!
|
|
ENDIF
|
|
!
|
|
RETURN
|
|
END SUBROUTINE tauset
|
|
|
|
!
|
|
!-------------------------------------------------------------------------
|
|
SUBROUTINE updatc( ccc, n, x0, nx0, phi, ngwx, bephi, nkbx, becp, bec, cp, nss, istart, desc )
|
|
!-----------------------------------------------------------------------
|
|
!
|
|
! input ccc : dt**2/emass OR 1.0d0 demending on ortho
|
|
! input x0 : converged lambdas from ortho-loop (unchanged in output)
|
|
! input cp : non-orthonormal cp=c0+dh/dc*ccc
|
|
! input bec : <cp|beta_i>
|
|
! input phi
|
|
! output cp : orthonormal cp=cp+lambda*phi
|
|
! output bec: bec=becp+lambda*bephi
|
|
!
|
|
USE kinds, ONLY: DP
|
|
USE ions_base, ONLY: nsp, na
|
|
USE io_global, ONLY: stdout
|
|
USE cvan, ONLY: nvb, ish
|
|
USE uspp, ONLY: nkb, nkbus
|
|
USE uspp_param, ONLY: nh
|
|
USE gvecw, ONLY: ngw
|
|
USE control_flags, ONLY: iprint, iprsta
|
|
USE mp, ONLY: mp_sum, mp_bcast
|
|
USE mp_global, ONLY: intra_image_comm, leg_ortho, me_image
|
|
USE descriptors, ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , descla_siz_ , la_comm_ , &
|
|
la_npc_ , la_npr_ , nlax_ , la_n_ , la_nx_ , la_myr_ , la_myc_ , &
|
|
descla_init
|
|
!
|
|
IMPLICIT NONE
|
|
!
|
|
INTEGER, INTENT(IN) :: n, nx0, ngwx, nkbx, istart, nss
|
|
INTEGER, INTENT(IN) :: desc( descla_siz_ )
|
|
COMPLEX(DP) :: cp( ngwx, n ), phi( ngwx, n )
|
|
REAL(DP), INTENT(IN) :: ccc
|
|
REAL(DP) :: bec( nkbx, n ), x0( nx0, nx0 )
|
|
REAL(DP) :: bephi( :, : ), becp( nkbx, n )
|
|
|
|
! local variables
|
|
|
|
INTEGER :: i, j, ig, is, iv, ia, inl, nr, nc, ir, ic
|
|
REAL(DP), ALLOCATABLE :: wtemp(:,:)
|
|
REAL(DP), ALLOCATABLE :: xd(:,:)
|
|
REAL(DP), ALLOCATABLE :: bephi_tmp(:,:)
|
|
REAL(DP) :: beta
|
|
INTEGER :: ipr, ipc, nx, root
|
|
INTEGER :: np( 2 ), coor_ip( 2 )
|
|
INTEGER :: desc_ip( descla_siz_ )
|
|
!
|
|
! lagrange multipliers
|
|
!
|
|
IF( nss < 1 ) RETURN
|
|
!
|
|
IF( desc( lambda_node_ ) > 0 ) THEN
|
|
IF( nx0 /= desc( nlax_ ) ) &
|
|
CALL errore( " updatc ", " inconsistent dimension nx0 ", nx0 )
|
|
END IF
|
|
!
|
|
! size of the local block
|
|
!
|
|
nx = desc( nlax_ )
|
|
!
|
|
np(1) = desc( la_npr_ )
|
|
np(2) = desc( la_npc_ )
|
|
!
|
|
CALL start_clock( 'updatc' )
|
|
|
|
ALLOCATE( xd( nx, nx ) )
|
|
|
|
IF( nvb > 0 )THEN
|
|
ALLOCATE( wtemp( nx, nkb ) )
|
|
ALLOCATE( bephi_tmp( nkbx, nx ) )
|
|
DO i = 1, nss
|
|
DO inl = 1, nkbus
|
|
bec( inl, i + istart - 1 ) = 0.0d0
|
|
END DO
|
|
END DO
|
|
END IF
|
|
|
|
|
|
DO ipc = 1, np(2)
|
|
!
|
|
IF( nvb > 0 )THEN
|
|
!
|
|
! For the inner loop we need the block of bebhi( :, ic : ic + nc - 1 )
|
|
! this is the same of block bephi( :, ir : ir + nr - 1 ) on processor
|
|
! with coords ipr == ipc
|
|
!
|
|
! get the right processor owning the block of bephi
|
|
!
|
|
CALL GRID2D_RANK( 'R', np(1), np(2), ipc-1, ipc-1, root )
|
|
root = root * leg_ortho
|
|
!
|
|
! broadcast the block to all processors
|
|
!
|
|
IF( me_image == root ) bephi_tmp = bephi
|
|
CALL mp_bcast( bephi_tmp, root, intra_image_comm )
|
|
!
|
|
END IF
|
|
|
|
DO ipr = 1, np(1)
|
|
!
|
|
coor_ip(1) = ipr - 1
|
|
coor_ip(2) = ipc - 1
|
|
|
|
CALL descla_init( desc_ip, desc( la_n_ ), desc( la_nx_ ), np, coor_ip, desc( la_comm_ ), 1 )
|
|
|
|
nr = desc_ip( nlar_ )
|
|
nc = desc_ip( nlac_ )
|
|
ir = desc_ip( ilar_ )
|
|
ic = desc_ip( ilac_ )
|
|
!
|
|
CALL GRID2D_RANK( 'R', desc_ip( la_npr_ ), desc_ip( la_npc_ ), &
|
|
desc_ip( la_myr_ ), desc_ip( la_myc_ ), root )
|
|
|
|
root = root * leg_ortho
|
|
|
|
IF( desc( la_myr_ ) == ipr - 1 .AND. desc( la_myc_ ) == ipc - 1 .AND. desc( lambda_node_ ) > 0 ) THEN
|
|
xd = x0 * ccc
|
|
END IF
|
|
|
|
CALL mp_bcast( xd, root, intra_image_comm )
|
|
|
|
CALL dgemm( 'N', 'N', 2*ngw, nc, nr, 1.0d0, phi(1,istart+ir-1), 2*ngwx, &
|
|
xd, nx, 1.0d0, cp(1,istart+ic-1), 2*ngwx )
|
|
|
|
IF( nvb > 0 )THEN
|
|
|
|
! updating of the <beta|c(n,g)>
|
|
!
|
|
! bec of vanderbilt species are updated
|
|
!
|
|
CALL dgemm( 'N', 'T', nr, nkbus, nc, 1.0d0, xd, nx, bephi_tmp, nkbx, 0.0d0, wtemp, nx )
|
|
!
|
|
! here nr and ir are still valid, since they are the same for all procs in the same row
|
|
!
|
|
DO i = 1, nr
|
|
DO inl = 1, nkbus
|
|
bec( inl, i + istart + ir - 2 ) = bec( inl, i + istart + ir - 2 ) + wtemp( i, inl )
|
|
END DO
|
|
END DO
|
|
!
|
|
END IF
|
|
|
|
END DO
|
|
!
|
|
END DO
|
|
|
|
IF( nvb > 0 )THEN
|
|
DEALLOCATE( wtemp )
|
|
DEALLOCATE( bephi_tmp )
|
|
DO i = istart, istart + nss - 1
|
|
DO inl = 1, nkbus
|
|
bec( inl, i ) = bec( inl, i ) + becp( inl, i )
|
|
END DO
|
|
END DO
|
|
END IF
|
|
!
|
|
IF ( iprsta > 2 ) THEN
|
|
WRITE( stdout,*)
|
|
DO is = 1, nvb
|
|
IF( nvb > 1 ) THEN
|
|
WRITE( stdout,'(33x,a,i4)') ' updatc: bec (is)',is
|
|
WRITE( stdout,'(8f9.4)') &
|
|
& ((bec(ish(is)+(iv-1)*na(is)+1,i+istart-1),iv=1,nh(is)),i=1,nss)
|
|
ELSE
|
|
DO ia=1,na(is)
|
|
WRITE( stdout,'(33x,a,i4)') ' updatc: bec (ia)',ia
|
|
WRITE( stdout,'(8f9.4)') &
|
|
& ((bec(ish(is)+(iv-1)*na(is)+ia,i+istart-1),iv=1,nh(is)),i=1,nss)
|
|
END DO
|
|
END IF
|
|
WRITE( stdout,*)
|
|
END DO
|
|
ENDIF
|
|
!
|
|
DEALLOCATE( xd )
|
|
!
|
|
CALL stop_clock( 'updatc' )
|
|
!
|
|
RETURN
|
|
END SUBROUTINE updatc
|
|
|
|
|
|
|
|
|
|
!-------------------------------------------------------------------------
|
|
SUBROUTINE calphi( c0, ngwx, bec, nkbx, betae, phi, n, ema0bg )
|
|
!-----------------------------------------------------------------------
|
|
! input: c0 (orthonormal with s(r(t)), bec=<c0|beta>, betae=|beta>
|
|
! computes the matrix phi (with the old positions)
|
|
! where |phi> = s'|c0> = |c0> + sum q_ij |i><j|c0>
|
|
! where s'=s(r(t))
|
|
!
|
|
USE kinds, ONLY: DP
|
|
USE ions_base, ONLY: na, nsp
|
|
USE io_global, ONLY: stdout
|
|
USE mp_global, ONLY: intra_image_comm
|
|
USE cvan, ONLY: ish, nvb
|
|
USE uspp_param, ONLY: nh
|
|
USE uspp, ONLY: nkbus, qq
|
|
USE gvecw, ONLY: ngw
|
|
USE constants, ONLY: pi, fpi
|
|
USE control_flags, ONLY: iprint, iprsta
|
|
USE mp, ONLY: mp_sum
|
|
!
|
|
IMPLICIT NONE
|
|
|
|
INTEGER, INTENT(IN) :: ngwx, nkbx, n
|
|
COMPLEX(DP) :: c0( ngwx, n ), phi( ngwx, n ), betae( ngwx, nkbx )
|
|
REAL(DP) :: bec( nkbx, n ), emtot
|
|
REAL(DP), OPTIONAL :: ema0bg( ngwx )
|
|
|
|
! local variables
|
|
!
|
|
INTEGER :: is, iv, jv, ia, inl, jnl, i, j
|
|
REAL(DP), ALLOCATABLE :: qtemp( : , : )
|
|
REAL(DP) :: qqf
|
|
!
|
|
IF( n < 1 ) RETURN
|
|
!
|
|
CALL start_clock( 'calphi' )
|
|
|
|
!
|
|
IF ( nvb > 0 ) THEN
|
|
|
|
ALLOCATE( qtemp( nkbus, n ) )
|
|
|
|
qtemp (:,:) = 0.d0
|
|
DO is=1,nvb
|
|
DO iv=1,nh(is)
|
|
inl = ish(is)+(iv-1)*na(is)
|
|
DO jv=1,nh(is)
|
|
jnl = ish(is)+(jv-1)*na(is)
|
|
IF(ABS(qq(iv,jv,is)) > 1.d-5) THEN
|
|
qqf = qq(iv,jv,is)
|
|
DO i=1,n
|
|
CALL daxpy( na(is), qqf, bec(jnl+1,i),1,qtemp(inl+1,i), 1 )
|
|
END DO
|
|
ENDIF
|
|
END DO
|
|
END DO
|
|
END DO
|
|
!
|
|
CALL dgemm ( 'N', 'N', 2*ngw, n, nkbus, 1.0d0, betae, &
|
|
2*ngwx, qtemp, nkbus, 0.0d0, phi, 2*ngwx )
|
|
|
|
DEALLOCATE( qtemp )
|
|
|
|
ELSE
|
|
|
|
phi = (0.d0, 0.d0)
|
|
|
|
END IF
|
|
!
|
|
IF( PRESENT( ema0bg ) ) THEN
|
|
!$omp parallel do default(shared), private(i)
|
|
DO j=1,n
|
|
DO i=1,ngw
|
|
phi(i,j)=(phi(i,j)+c0(i,j))*ema0bg(i)
|
|
END DO
|
|
END DO
|
|
!$omp end parallel do
|
|
ELSE
|
|
!$omp parallel do default(shared), private(i)
|
|
DO j=1,n
|
|
DO i=1,ngw
|
|
phi(i,j)=phi(i,j)+c0(i,j)
|
|
END DO
|
|
END DO
|
|
!$omp end parallel do
|
|
END IF
|
|
|
|
!
|
|
|
|
IF(iprsta > 2) THEN
|
|
emtot=0.0d0
|
|
IF( PRESENT( ema0bg ) ) THEN
|
|
DO j=1,n
|
|
DO i=1,ngw
|
|
emtot=emtot +2.0d0*DBLE(phi(i,j)*CONJG(c0(i,j)))*ema0bg(i)**(-2.0d0)
|
|
END DO
|
|
END DO
|
|
ELSE
|
|
DO j=1,n
|
|
DO i=1,ngw
|
|
emtot=emtot +2.0d0*DBLE(phi(i,j)*CONJG(c0(i,j)))
|
|
END DO
|
|
END DO
|
|
END IF
|
|
emtot=emtot/n
|
|
|
|
CALL mp_sum( emtot, intra_image_comm )
|
|
|
|
WRITE( stdout,*) 'in calphi sqrt(emtot)=',SQRT(emtot)
|
|
WRITE( stdout,*)
|
|
DO is = 1, nvb
|
|
IF( nvb > 1 ) THEN
|
|
WRITE( stdout,'(33x,a,i4)') ' calphi: bec (is)',is
|
|
WRITE( stdout,'(8f9.4)') &
|
|
& ((bec(ish(is)+(iv-1)*na(is)+1,i),iv=1,nh(is)),i=1,n)
|
|
ELSE
|
|
DO ia=1,na(is)
|
|
WRITE( stdout,'(33x,a,i4)') ' calphi: bec (ia)',ia
|
|
WRITE( stdout,'(8f9.4)') &
|
|
& ((bec(ish(is)+(iv-1)*na(is)+ia,i),iv=1,nh(is)),i=1,n)
|
|
END DO
|
|
END IF
|
|
END DO
|
|
ENDIF
|
|
|
|
|
|
CALL stop_clock( 'calphi' )
|
|
!
|
|
RETURN
|
|
END SUBROUTINE calphi
|
|
|
|
END MODULE orthogonalize_base
|