quantum-espresso/KS_Solvers/Davidson/regterg.f90

1614 lines
48 KiB
Fortran

!
! Copyright (C) 2001-2015 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
! NOTE (Ivan Carnimeo, May, 05th, 2022):
! cegterg and regterg have been ported to GPU with OpenACC,
! the previous CUF versions (cegterg_gpu and regterg_gpu) have been removed,
! and now cegterg and regterg are used for both CPU and GPU execution.
! If you want to see the previous code checkout to commit: df3080b231c5daf52295c23501fbcaa9bfc4bfcc (on Thu Apr 21 06:18:02 2022 +0000)
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
!
!
!----------------------------------------------------------------------------
SUBROUTINE regterg( h_psi_ptr, s_psi_ptr, uspp, g_psi_ptr, &
npw, npwx, nvec, nvecx, evc, ethr, &
e, btype, notcnv, lrot, dav_iter, nhpsi )
!----------------------------------------------------------------------------
!
! ... iterative solution of the eigenvalue problem:
!
! ... ( H - e S ) * evc = 0
!
! ... where H is an hermitean operator, e is a real scalar,
! ... S is an uspp matrix, evc is a complex vector
! ... (real wavefunctions with only half plane waves stored)
!
#if defined(__CUDA)
use cublas
#endif
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, &
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart
USE mp, ONLY : mp_sum, mp_bcast
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
INTEGER, INTENT(IN) :: npw, npwx, nvec, nvecx
! dimension of the matrix to be diagonalized
! leading dimension of matrix evc, as declared in the calling pgm unit
! integer number of searched low-lying roots
! maximum dimension of the reduced basis set
! (the basis set is refreshed when its dimension would exceed nvecx)
COMPLEX(DP), INTENT(INOUT) :: evc(npwx,nvec)
! evc contains the refined estimates of the eigenvectors
REAL(DP), INTENT(IN) :: ethr
! energy threshold for convergence: root improvement is stopped,
! when two consecutive estimates of the root differ by less than ethr.
LOGICAL, INTENT(IN) :: uspp
! if .FALSE. : S|psi> not needed
INTEGER, INTENT(IN) :: btype(nvec)
! band type ( 1 = occupied, 0 = empty )
LOGICAL, INTENT(IN) :: lrot
! .TRUE. if the wfc have already been rotated
REAL(DP), INTENT(OUT) :: e(nvec)
! contains the estimated roots.
INTEGER, INTENT(OUT) :: dav_iter, notcnv
! integer number of iterations performed
! number of unconverged roots
INTEGER, INTENT(OUT) :: nhpsi
! number of individual Hpsi made
!
! ... LOCAL variables
!
INTEGER, PARAMETER :: maxter = 20
! maximum number of iterations
!
INTEGER :: kter, nbase, np, npw2, npwx2, n, m, nb1, nbn
! counter on iterations
! dimension of the reduced basis
! counter on the reduced basis vectors
! do-loop counters
! counter on the bands
INTEGER :: n_start, n_end, my_n
INTEGER :: ierr
REAL(DP), ALLOCATABLE :: hr(:,:), sr(:,:), vr(:,:), ew(:)
!$acc declare device_resident(hr, sr, vr, ew)
! Hamiltonian on the reduced basis
! S matrix on the reduced basis
! eigenvectors of the Hamiltonian
! eigenvalues of the reduced hamiltonian
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
!$acc declare device_resident(psi, hpsi, spsi)
! work space, contains psi
! the product of H and psi
! the product of S and psi
LOGICAL, ALLOCATABLE :: conv(:)
! true if the root is converged
REAL(DP) :: empty_ethr
! threshold for empty bands
INTEGER :: i,j,k
!
REAL(DP), EXTERNAL :: MYDDOT_VECTOR_GPU
!$acc routine(MYDDOT_VECTOR_GPU) vector
!
EXTERNAL h_psi_ptr, s_psi_ptr, g_psi_ptr
! h_psi_ptr(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi_ptr(npwx,npw,nvec,psi,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,nvec)
! g_psi_ptr(npwx,npw,notcnv,psi,e)
! calculates (diag(h)-e)^-1 * psi, diagonal approx. to (h-e)^-1*psi
! the first nvec columns contain the trial eigenvectors
!
CALL start_clock( 'regterg' ) !; write(6,*) 'enter regterg' ; FLUSH(6)
!
!$acc data deviceptr(e)
!
IF ( nvec > nvecx / 2 ) CALL errore( 'regter', 'nvecx is too small', 1 )
!
IF ( gstart == -1 ) CALL errore( 'regter', 'gstart variable not initialized', 1 )
!
! ... threshold for empty bands
!
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
!
npw2 = 2*npw
npwx2 = 2*npwx
!
ALLOCATE( psi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate psi ', ABS(ierr) )
ALLOCATE( hpsi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate hpsi ', ABS(ierr) )
!
IF ( uspp ) THEN
ALLOCATE( spsi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' regterg ',' cannot allocate spsi ', ABS(ierr) )
END IF
!
ALLOCATE( sr( nvecx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate sr ', ABS(ierr) )
ALLOCATE( hr( nvecx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate hr ', ABS(ierr) )
ALLOCATE( vr( nvecx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate vr ', ABS(ierr) )
ALLOCATE( ew( nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate ew ', ABS(ierr) )
ALLOCATE( conv( nvec ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'regterg ',' cannot allocate conv ', ABS(ierr) )
!
notcnv = nvec
nbase = nvec
conv = .FALSE.
!
!$acc kernels
hpsi = ZERO
psi = ZERO
IF ( uspp ) spsi = ZERO
!$acc end kernels
!
!$acc parallel vector_length(64)
!$acc loop gang independent
DO k=1,nvec
psi(1,k) = evc(1,k)
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (gstart == 2) psi(1,k) = CMPLX( DBLE( psi(1,k) ), 0.D0 ,kind=DP)
!$acc loop vector
DO i=2,npwx
psi(i,k) = evc(i,k)
END DO
END DO
!$acc end parallel
!
! ... hpsi contains h times the basis vectors
!
!$acc host_data use_device(psi, hpsi, spsi)
CALL h_psi_ptr( npwx, npw, nvec, psi, hpsi ) ; nhpsi = nvec
!
! ... spsi contains s times the basis vectors
!
IF ( uspp ) CALL s_psi_ptr( npwx, npw, nvec, psi, spsi )
!$acc end host_data
!
! ... hr contains the projection of the hamiltonian onto the reduced
! ... space vr contains the eigenvectors of hr
!
CALL start_clock( 'regterg:init' )
!$acc kernels
hr(:,:) = 0.D0
sr(:,:) = 0.D0
vr(:,:) = 0.D0
!$acc end kernels
!
!$acc host_data use_device(psi, hpsi, spsi, hr, sr)
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0 , psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nvecx )
CALL mp_sum( hr( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( hr( :, 1:nbase ), intra_bgrp_comm )
!
IF ( uspp ) THEN
!
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nvecx )
!
ELSE
!
if (n_start .le. n_end) &
CALL DGEMM( 'T','N', nbase, my_n, npw2, 2.D0, psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nvecx )
IF ( gstart == 2 ) CALL MYDGER( nbase, my_n, -1.D0, psi, npwx2, psi(1,n_start), npwx2, sr(1,n_start), nvecx )
!
END IF
CALL mp_sum( sr( :, 1:nbase ), inter_bgrp_comm )
!
CALL mp_sum( sr( :, 1:nbase ), intra_bgrp_comm )
!$acc end host_data
!
CALL stop_clock( 'regterg:init' )
!
IF ( lrot ) THEN
!
!$acc parallel loop
DO n = 1, nbase
!
e(n) = hr(n,n)
vr(n,n) = 1.D0
!
END DO
!
ELSE
!
! ... diagonalize the reduced hamiltonian
!
!$acc host_data use_device(hr, sr, vr, ew)
CALL start_clock( 'regterg:diag' )
IF( my_bgrp_id == root_bgrp_id ) THEN
CALL diaghg( nbase, nvec, hr, sr, nvecx, ew, vr, me_bgrp, root_bgrp, intra_bgrp_comm )
END IF
IF( nbgrp > 1 ) THEN
CALL mp_bcast( vr, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
ENDIF
CALL stop_clock( 'regterg:diag' )
!$acc end host_data
!
!$acc parallel loop
DO i = 1, nvec
e(i) = ew(i)
END DO
!
END IF
!
! ... iterate
!
iterate: DO kter = 1, maxter
!
dav_iter = kter ; !write(*,*) kter, notcnv, conv
!
CALL start_clock( 'regterg:update' )
!
np = 0
!
DO n = 1, nvec
!
IF ( .NOT. conv(n) ) THEN
!
! ... this root not yet converged ...
!
np = np + 1
!
! ... reorder eigenvectors so that coefficients for unconverged
! ... roots come first. This allows to use quick matrix-matrix
! ... multiplications to set a new basis vector (see below)
!
IF ( np /= n ) THEN
!$acc parallel loop
DO i = 1, nvecx
vr(i,np) = vr(i,n)
END DO
END IF
!
! ... for use in g_psi_ptr
!
!$acc kernels
ew(nbase+np) = e(n)
!$acc end kernels
!
END IF
!
END DO
!
nb1 = nbase + 1
!
! ... expand the basis set with new basis vectors ( H - e*S )|psi> ...
!
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!$acc parallel loop collapse(2)
DO i=1, notcnv
DO k=1,npwx
psi(k,nbase+i)=ZERO
END DO
END DO
!$acc host_data use_device(psi, spsi, vr)
IF ( uspp ) THEN
!
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
!
ELSE
!
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nb1), npwx2 )
!
END IF
!$acc end host_data
! NB: must not call mp_sum over inter_bgrp_comm here because it is done later to the full correction
!
!$acc parallel loop collapse(2)
DO np=1,notcnv
DO k=1,npwx
psi(k,nbase+np) = - ew(nbase+np) * psi(k,nbase+np)
END DO
END DO
!
!$acc host_data use_device(psi, hpsi, vr, ew)
if (n_start .le. n_end) &
CALL DGEMM( 'N','N', npw2, notcnv, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 1.D0, psi(1,nb1), npwx2 )
CALL mp_sum( psi(:,nb1:nbase+notcnv), inter_bgrp_comm )
!
CALL stop_clock( 'regterg:update' )
!
! ... approximate inverse iteration
!
CALL g_psi_ptr( npwx, npw, notcnv, 1, psi(1,nb1), ew(nb1) )
!$acc end host_data
!
! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
! ... order to improve numerical stability of subspace diagonalization
! ... (rdiaghg) ew is used as work array :
!
! ... ew = <psi_i|psi_i>, i = nbase + 1, nbase + notcnv
!
!$acc parallel vector_length(96)
!$acc loop gang private(nbn)
DO n = 1, notcnv
!
nbn = nbase + n
!
ew(n) = 2.D0 * MYDDOT_VECTOR_GPU( npw2, psi(1,nbn), psi(1,nbn) )
IF (gstart == 2) ew(n) = ew(n) - DBLE(psi(1,nbn) * psi(1,nbn)) ! psi(1,nbn) * psi(1,nbn)
!
END DO
!$acc end parallel
!
!$acc host_data use_device(ew)
CALL mp_sum( ew( 1:notcnv ), intra_bgrp_comm )
!$acc end host_data
!
!$acc parallel vector_length(96)
!$acc loop gang
DO i = 1,notcnv
psi(1,nbase+i) = psi(1,nbase+i)/SQRT( ew(i) )
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (gstart == 2) psi(1,nbase+i) = CMPLX( DBLE(psi(1,nbase+i)), 0.D0 ,kind=DP)
!$acc loop vector
DO k=2,npwx
psi(k,nbase+i) = psi(k,nbase+i)/SQRT( ew(i) )
END DO
END DO
!$acc end parallel
!
! ... here compute the hpsi and spsi of the new functions
!
!$acc host_data use_device(psi, hpsi, spsi)
CALL h_psi_ptr( npwx, npw, notcnv, psi(1,nb1), hpsi(1,nb1) ) ; nhpsi = nhpsi + notcnv
!
IF ( uspp ) CALL s_psi_ptr( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
!$acc end host_data
!
! ... update the reduced hamiltonian
!
CALL start_clock( 'regterg:overlap' )
!
!$acc parallel loop collapse(2)
DO i=0,notcnv-1
DO j=1, nvecx
hr( j, nb1+i )=0.d0
END DO
END DO
!
!$acc host_data use_device(psi, hpsi, hr)
CALL divide(inter_bgrp_comm,nbase+notcnv,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, 0.D0, hr(n_start,nb1), nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, hpsi(1,nb1), npwx2, hr(n_start,nb1), nvecx )
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
!
CALL mp_sum( hr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!$acc end host_data
!
!$acc parallel loop collapse(2)
DO i=0,notcnv-1
DO j=1, nvecx
sr( j, nb1+i )=0.d0
END DO
END DO
!
!$acc host_data use_device(psi, spsi, sr)
CALL divide(inter_bgrp_comm,nbase+notcnv,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase+notcnv,n_start,n_end
IF ( uspp ) THEN
!
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, 0.D0, sr(n_start,nb1), nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, spsi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
!
ELSE
!
CALL DGEMM( 'T','N', my_n, notcnv, npw2, 2.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, 0.D0, sr(n_start,nb1) , nvecx )
IF ( gstart == 2 ) CALL MYDGER( my_n, notcnv, -1.D0, psi(1,n_start), npwx2, psi(1,nb1), npwx2, sr(n_start,nb1), nvecx )
!
END IF
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), inter_bgrp_comm )
!
CALL mp_sum( sr( :, nb1:nb1+notcnv-1 ), intra_bgrp_comm )
!$acc end host_data
!
CALL stop_clock( 'regterg:overlap' )
!
nbase = nbase + notcnv
!
!$acc parallel
!$acc loop gang
DO n = 1, nbase
!
!$acc loop vector
DO m = n + 1, nbase
!
hr(m,n) = hr(n,m)
sr(m,n) = sr(n,m)
!
END DO
!
END DO
!$acc end parallel
!
! ... diagonalize the reduced hamiltonian
!
CALL start_clock( 'regterg:diag' )
!$acc host_data use_device(hr, sr, ew, vr)
IF( my_bgrp_id == root_bgrp_id ) THEN
CALL diaghg( nbase, nvec, hr, sr, nvecx, ew, vr, me_bgrp, root_bgrp, intra_bgrp_comm )
END IF
IF( nbgrp > 1 ) THEN
CALL mp_bcast( vr, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
ENDIF
!$acc end host_data
CALL stop_clock( 'regterg:diag' )
!
! ... test for convergence
!
!$acc parallel loop copy(conv(1:nvec)) copyin(btype(1:nvec))
DO i = 1, nvec
IF(btype(i) == 1) THEN
conv(i) = ( ( ABS( ew(i) - e(i) ) < ethr ) )
ELSE
conv(i) = ( ( ABS( ew(i) - e(i) ) < empty_ethr ) )
END IF
END DO
!
! ... next line useful for band parallelization of exact exchange
IF ( nbgrp > 1 ) CALL mp_bcast(conv,root_bgrp_id,inter_bgrp_comm)
!
notcnv = COUNT( .NOT. conv(:) )
!
!$acc parallel loop
DO i=1,nvec
e(i) = ew(i)
END DO
!
! ... if overall convergence has been achieved, or the dimension of
! ... the reduced basis set is becoming too large, or in any case if
! ... we are at the last iteration refresh the basis set. i.e. replace
! ... the first nvec elements with the current estimate of the
! ... eigenvectors; set the basis dimension to nvec.
!
IF ( notcnv == 0 .OR. &
nbase+notcnv > nvecx .OR. dav_iter == maxter ) THEN
!
CALL start_clock( 'regterg:last' )
!
!$acc parallel loop collapse(2)
DO k=1,nvec
DO i=1,npwx
evc(i,k) = ZERO
END DO
END DO
!
CALL divide(inter_bgrp_comm,nbase,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nbase,n_start,n_end
!$acc host_data use_device(evc, psi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, psi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, evc, npwx2 )
CALL mp_sum( evc, inter_bgrp_comm )
!$acc end host_data
!
IF ( notcnv == 0 ) THEN
!
! ... all roots converged: return
!
CALL stop_clock( 'regterg:last' )
!
EXIT iterate
!
ELSE IF ( dav_iter == maxter ) THEN
!
! ... last iteration, some roots not converged: return
!
!!!WRITE( stdout, '(5X,"WARNING: ",I5, &
!!! & " eigenvalues not converged in regterg")' ) notcnv
!
CALL stop_clock( 'regterg:last' )
!
EXIT iterate
!
END IF
!
! ... refresh psi, H*psi and S*psi
!
!$acc parallel loop collapse(2)
DO i=1,nvec
DO k=1,npwx
psi(k,i) = evc(k,i)
END DO
END DO
!
IF ( uspp ) THEN
!
!$acc parallel loop collapse(2)
DO i = 1, npwx
DO j = nvec+1, nvec+nvec
psi(i,j) = ZERO
END DO
END DO
!
!$acc host_data use_device(psi, spsi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, spsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
!$acc end host_data
!
!$acc parallel loop collapse(2)
DO i=1,nvec
DO k=1,npwx
spsi(k,i) = psi(k,i+nvec)
END DO
END DO
!
END IF
!
!$acc kernels
psi(:,nvec+1:nvec+nvec) = ZERO
!$acc end kernels
!$acc host_data use_device(psi, hpsi, vr)
CALL DGEMM( 'N','N', npw2, nvec, my_n, 1.D0, hpsi(1,n_start), npwx2, vr(n_start,1), nvecx, 0.D0, psi(1,nvec+1), npwx2 )
CALL mp_sum( psi(:,nvec+1:nvec+nvec), inter_bgrp_comm )
!$acc end host_data
!
!$acc parallel loop collapse(2)
DO i=1,nvec
DO k=1, npwx
hpsi(k,i) = psi(k,i+nvec)
END DO
END DO
!
! ... refresh the reduced hamiltonian
!
nbase = nvec
!
!$acc parallel loop collapse(2)
DO i = 1, nvecx
DO j = 1, nbase
hr(i,j) = 0.D0
sr(i,j) = 0.D0
vr(i,j) = 0.D0
END DO
END DO
!
!$acc parallel loop
DO j = 1, nbase
hr(j,j) = e(j)
sr(j,j) = 1.D0
vr(j,j) = 1.D0
END DO
!
CALL stop_clock( 'regterg:last' )
!
END IF
!
END DO iterate
!
DEALLOCATE( conv )
DEALLOCATE( ew )
DEALLOCATE( vr )
DEALLOCATE( hr )
DEALLOCATE( sr )
!
IF ( uspp ) DEALLOCATE( spsi )
!
DEALLOCATE( hpsi )
DEALLOCATE( psi )
!
!$acc end data
!
CALL stop_clock( 'regterg' )
!call print_clock( 'regterg' )
!call print_clock( 'regterg:init' )
!call print_clock( 'regterg:diag' )
!call print_clock( 'regterg:update' )
!call print_clock( 'regterg:overlap' )
!call print_clock( 'regterg:last' )
!
RETURN
!
END SUBROUTINE regterg
!
!
! Subroutine with distributed matrixes
! (written by Carlo Cavazzoni)
!
!----------------------------------------------------------------------------
SUBROUTINE pregterg(h_psi_ptr, s_psi_ptr, uspp, g_psi_ptr, &
npw, npwx, nvec, nvecx, evc, ethr, &
e, btype, notcnv, lrot, dav_iter, nhpsi )
!----------------------------------------------------------------------------
!
! ... iterative solution of the eigenvalue problem:
!
! ... ( H - e S ) * evc = 0
!
! ... where H is an hermitean operator, e is a real scalar,
! ... S is an uspp matrix, evc is a complex vector
! ... (real wavefunctions with only half plane waves stored)
!
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id
USE mp_bands_util, ONLY : gstart
USE mp, ONLY : mp_bcast, mp_root_sum, mp_sum
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
INTEGER, INTENT(IN) :: npw, npwx, nvec, nvecx
! dimension of the matrix to be diagonalized
! leading dimension of matrix evc, as declared in the calling pgm unit
! integer number of searched low-lying roots
! maximum dimension of the reduced basis set
! (the basis set is refreshed when its dimension would exceed nvecx)
COMPLEX(DP), INTENT(INOUT) :: evc(npwx,nvec)
! evc contains the refined estimates of the eigenvectors
REAL(DP), INTENT(IN) :: ethr
! energy threshold for convergence: root improvement is stopped,
! when two consecutive estimates of the root differ by less than ethr.
LOGICAL, INTENT(IN) :: uspp
! if .FALSE. : S|psi> not needed
INTEGER, INTENT(IN) :: btype(nvec)
! band type ( 1 = occupied, 0 = empty )
LOGICAL, INTENT(IN) :: lrot
! .TRUE. if the wfc have already be rotated
REAL(DP), INTENT(OUT) :: e(nvec)
! contains the estimated roots.
INTEGER, INTENT(OUT) :: dav_iter, notcnv
! integer number of iterations performed
! number of unconverged roots
INTEGER, INTENT(OUT) :: nhpsi
! number of individual Hpsi made
!
! ... LOCAL variables
!
INTEGER, PARAMETER :: maxter = 20
! maximum number of iterations
!
INTEGER :: kter, nbase, np, n, m, nb1
! counter on iterations
! dimension of the reduced basis
! counter on the reduced basis vectors
! do-loop counters
INTEGER :: ierr
REAL(DP), ALLOCATABLE :: ew(:)
REAL(DP), ALLOCATABLE :: hl(:,:), sl(:,:), vl(:,:)
! Hamiltonian on the reduced basis
! S matrix on the reduced basis
! eigenvectors of the Hamiltonian
! eigenvalues of the reduced hamiltonian
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
! work space, contains psi
! the product of H and psi
! the product of S and psi
LOGICAL, ALLOCATABLE :: conv(:)
! true if the root is converged
REAL(DP) :: empty_ethr
! threshold for empty bands
INTEGER :: npw2, npwx2
INTEGER :: idesc(LAX_DESC_SIZE), idesc_old(LAX_DESC_SIZE)
INTEGER, ALLOCATABLE :: irc_ip( : )
INTEGER, ALLOCATABLE :: nrc_ip( : )
INTEGER, ALLOCATABLE :: rank_ip( :, : )
! matrix distribution descriptors
INTEGER :: nx
! maximum local block dimension
LOGICAL :: la_proc
! flag to distinguish procs involved in linear algebra
INTEGER, ALLOCATABLE :: notcnv_ip( : )
INTEGER, ALLOCATABLE :: ic_notcnv( : )
!
INTEGER :: np_ortho(2), ortho_parent_comm
LOGICAL :: do_distr_diag_inside_bgrp
!
REAL(DP), EXTERNAL :: ddot
!
EXTERNAL h_psi_ptr, s_psi_ptr, g_psi_ptr
! h_psi_ptr(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi_ptr(npwx,npw,nvec,psi,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,nvec)
! g_psi_ptr(npwx,npw,notcnv,psi,e)
! calculates (diag(h)-e)^-1 * psi, diagonal approx. to (h-e)^-1*psi
! the first nvec columns contain the trial eigenvectors
!
!
CALL start_clock( 'regterg' ) !; write(6,*) 'enter pregterg' ; FLUSH(6)
!
CALL laxlib_getval( np_ortho = np_ortho, ortho_parent_comm = ortho_parent_comm, &
do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp )
!
IF ( nvec > nvecx / 2 ) CALL errore( 'pregter', 'nvecx is too small', 1 )
!
IF ( gstart == -1 ) CALL errore( 'pregter', 'gstart variable not initialized', 1 )
!
! ... threshold for empty bands
!
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
!
ALLOCATE( psi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate psi ', ABS(ierr) )
!
ALLOCATE( hpsi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate hpsi ', ABS(ierr) )
!
IF ( uspp ) THEN
ALLOCATE( spsi( npwx, nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate spsi ', ABS(ierr) )
END IF
!
! ... Initialize the matrix descriptor
!
ALLOCATE( ic_notcnv( np_ortho(2) ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate ic_notcnv ', ABS(ierr) )
!
ALLOCATE( notcnv_ip( np_ortho(2) ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate notcnv_ip ', ABS(ierr) )
!
CALL desc_init( nvec, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
!
IF( la_proc ) THEN
!
! only procs involved in the diagonalization need to allocate local
! matrix block.
!
ALLOCATE( vl( nx , nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate vl ', ABS(ierr) )
!
ALLOCATE( sl( nx , nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate sl ', ABS(ierr) )
!
ALLOCATE( hl( nx , nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate hl ', ABS(ierr) )
!
ELSE
!
ALLOCATE( vl( 1 , 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate vl ', ABS(ierr) )
!
ALLOCATE( sl( 1 , 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate sl ', ABS(ierr) )
!
ALLOCATE( hl( 1 , 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate hl ', ABS(ierr) )
!
END IF
!
ALLOCATE( ew( nvecx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate ew ', ABS(ierr) )
!
ALLOCATE( conv( nvec ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate conv ', ABS(ierr) )
!
npw2 = 2*npw
npwx2 = 2*npwx
notcnv = nvec
nbase = nvec
conv = .FALSE.
!
IF ( uspp ) spsi = ZERO
!
hpsi = ZERO
psi = ZERO
psi(:,1:nvec) = evc(:,1:nvec)
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) psi(1,1:nvec) = CMPLX( DBLE( psi(1,1:nvec) ), 0.D0 ,kind=DP)
!
! ... hpsi contains h times the basis vectors
!
CALL h_psi_ptr( npwx, npw, nvec, psi, hpsi ) ; nhpsi = nvec
!
IF ( uspp ) CALL s_psi_ptr( npwx, npw, nvec, psi, spsi )
!
! ... hl contains the projection of the hamiltonian onto the reduced
! ... space, vl contains the eigenvectors of hl. Remember hl, vl and sl
! ... are all distributed across processors, global replicated matrixes
! ... here are never allocated
!
CALL start_clock( 'regterg:init' )
CALL compute_distmat( hl, psi, hpsi )
!
IF ( uspp ) THEN
!
CALL compute_distmat( sl, psi, spsi )
!
ELSE
!
CALL compute_distmat( sl, psi, psi )
!
END IF
CALL stop_clock( 'regterg:init' )
!
!
IF ( lrot ) THEN
!
CALL set_e_from_h()
!
CALL set_to_identity( vl, idesc )
!
ELSE
!
! ... diagonalize the reduced hamiltonian
! Calling block parallel algorithm
!
CALL start_clock( 'regterg:diag' )
IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg ew and vl are the same across ortho_parent_comm
! only the first bgrp performs the diagonalization
IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
IF( nbgrp > 1 ) THEN ! results must be brodcast to the other band groups
CALL mp_bcast( vl, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
ENDIF
ELSE
CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
END IF
CALL stop_clock( 'regterg:diag' )
!
e(1:nvec) = ew(1:nvec)
!
END IF
!
! ... iterate
!
iterate: DO kter = 1, maxter
!
dav_iter = kter ; !write(*,*) kter, notcnv, conv
!
CALL start_clock( 'regterg:update' )
!
CALL reorder_v()
!
nb1 = nbase + 1
!
! ... expand the basis set with new basis vectors ( H - e*S )|psi> ...
!
CALL hpsi_dot_v()
!
CALL stop_clock( 'regterg:update' )
!
! ... approximate inverse iteration
!
CALL g_psi_ptr( npwx, npw, notcnv, 1, psi(1,nb1), ew(nb1) )
!
! ... "normalize" correction vectors psi(:,nb1:nbase+notcnv) in
! ... order to improve numerical stability of subspace diagonalization
! ... (rdiaghg) ew is used as work array :
!
! ... ew = <psi_i|psi_i>, i = nbase + 1, nbase + notcnv
!
DO n = 1, notcnv
!
ew(n) = 2.D0 * ddot( npw2, psi(1,nbase+n), 1, psi(1,nbase+n), 1 )
!
IF ( gstart == 2 ) ew(n) = ew(n) - psi(1,nbase+n) * psi(1,nbase+n)
!
END DO
!
CALL mp_sum( ew( 1:notcnv ), intra_bgrp_comm )
!
DO n = 1, notcnv
!
psi(:,nbase+n) = psi(:,nbase+n) / SQRT( ew(n) )
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) psi(1,nbase+n) = CMPLX( DBLE(psi(1,nbase+n)), 0.D0 ,kind=DP)
!
END DO
!
! ... here compute the hpsi and spsi of the new functions
!
CALL h_psi_ptr( npwx, npw, notcnv, psi(1,nb1), hpsi(1,nb1) ) ; nhpsi = nhpsi + notcnv
!
IF ( uspp ) CALL s_psi_ptr( npwx, npw, notcnv, psi(1,nb1), spsi(1,nb1) )
!
! ... update the reduced hamiltonian
!
CALL start_clock( 'regterg:overlap' )
!
! we need to save the old descriptor in order to redistribute matrices
!
idesc_old = idesc
!
! ... RE-Initialize the matrix descriptor
!
CALL desc_init( nbase+notcnv, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
!
IF( la_proc ) THEN
! redistribute hl and sl (see dsqmred), since the dimension of the subspace has changed
!
vl = hl
DEALLOCATE( hl )
ALLOCATE( hl( nx , nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate hl ', ABS(ierr) )
CALL laxlib_dsqmred( nbase, vl, idesc_old(LAX_DESC_NRCX), idesc_old, nbase+notcnv, hl, nx, idesc )
vl = sl
DEALLOCATE( sl )
ALLOCATE( sl( nx , nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate sl ', ABS(ierr) )
CALL laxlib_dsqmred( nbase, vl, idesc_old(LAX_DESC_NRCX), idesc_old, nbase+notcnv, sl, nx, idesc )
DEALLOCATE( vl )
ALLOCATE( vl( nx , nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate vl ', ABS(ierr) )
END IF
!
!
CALL update_distmat( hl, psi, hpsi )
!
IF ( uspp ) THEN
!
CALL update_distmat( sl, psi, spsi )
!
ELSE
!
CALL update_distmat( sl, psi, psi )
!
END IF
!
CALL stop_clock( 'regterg:overlap' )
!
nbase = nbase + notcnv
!
! ... diagonalize the reduced hamiltonian
! Call block parallel algorithm
!
CALL start_clock( 'regterg:diag' )
IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg ew and vl are the same across ortho_parent_comm
! only the first bgrp performs the diagonalization
IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
IF( nbgrp > 1 ) THEN ! results must be brodcast to the other bnd groups
CALL mp_bcast( vl, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( ew, root_bgrp_id, inter_bgrp_comm )
ENDIF
ELSE
CALL pdiaghg( nbase, hl, sl, nx, ew, vl, idesc )
END IF
CALL stop_clock( 'regterg:diag' )
!
! ... test for convergence
!
WHERE( btype(1:nvec) == 1 )
!
conv(1:nvec) = ( ( ABS( ew(1:nvec) - e(1:nvec) ) < ethr ) )
!
ELSEWHERE
!
conv(1:nvec) = ( ( ABS( ew(1:nvec) - e(1:nvec) ) < empty_ethr ) )
!
END WHERE
! ... next line useful for band parallelization of exact exchange
IF ( nbgrp > 1 ) CALL mp_bcast(conv,root_bgrp_id,inter_bgrp_comm)
!
notcnv = COUNT( .NOT. conv(:) )
!
e(1:nvec) = ew(1:nvec)
!
! ... if overall convergence has been achieved, or the dimension of
! ... the reduced basis set is becoming too large, or in any case if
! ... we are at the last iteration refresh the basis set. i.e. replace
! ... the first nvec elements with the current estimate of the
! ... eigenvectors; set the basis dimension to nvec.
!
IF ( notcnv == 0 .OR. nbase+notcnv > nvecx .OR. dav_iter == maxter ) THEN
!
CALL start_clock( 'regterg:last' )
!
CALL refresh_evc()
!
IF ( notcnv == 0 ) THEN
!
! ... all roots converged: return
!
CALL stop_clock( 'regterg:last' )
!
EXIT iterate
!
ELSE IF ( dav_iter == maxter ) THEN
!
! ... last iteration, some roots not converged: return
!
!!!WRITE( stdout, '(5X,"WARNING: ",I5, &
!!! & " eigenvalues not converged in regterg")' ) notcnv
!
CALL stop_clock( 'regterg:last' )
!
EXIT iterate
!
END IF
!
! ... refresh psi, H*psi and S*psi
!
psi(:,1:nvec) = evc(:,1:nvec)
!
IF ( uspp ) THEN
!
CALL refresh_spsi()
!
END IF
!
CALL refresh_hpsi()
!
! ... refresh the reduced hamiltonian
!
nbase = nvec
!
CALL desc_init( nvec, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
!
IF( la_proc ) THEN
!
! note that nx has been changed by desc_init
! we need to re-alloc with the new size.
!
DEALLOCATE( vl, hl, sl )
ALLOCATE( vl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate vl ', ABS(ierr) )
ALLOCATE( hl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate hl ', ABS(ierr) )
ALLOCATE( sl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'pregterg ',' cannot allocate sl ', ABS(ierr) )
!
END IF
!
CALL set_h_from_e( )
!
CALL set_to_identity( vl, idesc )
CALL set_to_identity( sl, idesc )
!
CALL stop_clock( 'regterg:last' )
!
END IF
!
END DO iterate
!
!
DEALLOCATE( vl, hl, sl )
!
DEALLOCATE( rank_ip )
DEALLOCATE( irc_ip )
DEALLOCATE( nrc_ip )
DEALLOCATE( ic_notcnv )
DEALLOCATE( notcnv_ip )
DEALLOCATE( conv )
DEALLOCATE( ew )
!
IF ( uspp ) DEALLOCATE( spsi )
!
DEALLOCATE( hpsi )
DEALLOCATE( psi )
!
CALL stop_clock( 'regterg' )
!call print_clock( 'regterg' )
!call print_clock( 'regterg:init' )
!call print_clock( 'regterg:diag' )
!call print_clock( 'regterg:update' )
!call print_clock( 'regterg:overlap' )
!call print_clock( 'regterg:last' )
!
RETURN
!
!
CONTAINS
!
!
SUBROUTINE set_to_identity( distmat, idesc )
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
REAL(DP), INTENT(OUT) :: distmat(:,:)
INTEGER :: i
distmat = 0_DP
IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) .AND. idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
DO i = 1, idesc(LAX_DESC_NC)
distmat( i, i ) = 1_DP
END DO
END IF
RETURN
END SUBROUTINE set_to_identity
!
!
SUBROUTINE reorder_v()
!
INTEGER :: ipc, ipr
INTEGER :: nc, ic
INTEGER :: nl, npl
!
np = 0
!
notcnv_ip = 0
!
n = 0
!
DO ipc = 1, idesc(LAX_DESC_NPC)
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
npl = 0
!
IF( ic <= nvec ) THEN
!
DO nl = 1, min( nvec - ic + 1, nc )
!
n = n + 1
!
IF ( .NOT. conv(n) ) THEN
!
! ... this root not yet converged ...
!
np = np + 1
npl = npl + 1
IF( npl == 1 ) ic_notcnv( ipc ) = np
!
! ... reorder eigenvectors so that coefficients for unconverged
! ... roots come first. This allows to use quick matrix-matrix
! ... multiplications to set a new basis vector (see below)
!
notcnv_ip( ipc ) = notcnv_ip( ipc ) + 1
!
IF ( npl /= nl ) THEN
IF( la_proc .AND. idesc(LAX_DESC_MYC) == ipc-1 ) THEN
vl( :, npl) = vl( :, nl )
END IF
END IF
!
! ... for use in g_psi_ptr
!
ew(nbase+np) = e(n)
!
END IF
!
END DO
!
END IF
!
END DO
!
END SUBROUTINE reorder_v
!
!
SUBROUTINE hpsi_dot_v()
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, notcl, root, np
REAL(DP), ALLOCATABLE :: vtmp( :, : )
COMPLEX(DP), ALLOCATABLE :: ptmp( :, : )
REAL(DP) :: beta
ALLOCATE( vtmp( nx, nx ) )
ALLOCATE( ptmp( npwx, nx ) )
DO ipc = 1, idesc(LAX_DESC_NPC)
!
IF( notcnv_ip( ipc ) > 0 ) THEN
notcl = notcnv_ip( ipc )
ic = ic_notcnv( ipc )
beta = 0.0d0
DO ipr = 1, idesc(LAX_DESC_NPR)
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
root = rank_ip( ipr, ipc )
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
vtmp(:,1:notcl) = vl(:,1:notcl)
END IF
CALL mp_bcast( vtmp(:,1:notcl), root, ortho_parent_comm )
!
IF ( uspp ) THEN
!
CALL DGEMM( 'N', 'N', npw2, notcl, nr, 1.D0, &
spsi( 1, ir ), npwx2, vtmp, nx, beta, psi(1,nb1+ic-1), npwx2 )
!
ELSE
!
CALL DGEMM( 'N', 'N', npw2, notcl, nr, 1.D0, &
psi( 1, ir ), npwx2, vtmp, nx, beta, psi(1,nb1+ic-1), npwx2 )
!
END IF
!
CALL DGEMM( 'N', 'N', npw2, notcl, nr, 1.D0, &
hpsi( 1, ir ), npwx2, vtmp, nx, beta, ptmp, npwx2 )
beta = 1.0d0
END DO
DO np = 1, notcl
!
psi(1:npw,nbase+np+ic-1) = ptmp(1:npw,np) - ew(nbase+np+ic-1) * psi(1:npw,nbase+np+ic-1)
!
END DO
!
END IF
!
END DO
DEALLOCATE( vtmp )
DEALLOCATE( ptmp )
RETURN
END SUBROUTINE hpsi_dot_v
!
!
SUBROUTINE refresh_evc( )
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
REAL(DP), ALLOCATABLE :: vtmp( :, : )
REAL(DP) :: beta
ALLOCATE( vtmp( nx, nx ) )
!
DO ipc = 1, idesc(LAX_DESC_NPC)
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
IF( ic <= nvec ) THEN
!
nc = min( nc, nvec - ic + 1 )
!
beta = 0.0d0
DO ipr = 1, idesc(LAX_DESC_NPR)
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
root = rank_ip( ipr, ipc )
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends his block
!
CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
psi(1,ir), npwx2, vl, nx, beta, evc(1,ic), npwx2 )
ELSE
!
! all other procs receive
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
psi(1,ir), npwx2, vtmp, nx, beta, evc(1,ic), npwx2 )
END IF
!
beta = 1.0d0
END DO
!
END IF
!
END DO
!
DEALLOCATE( vtmp )
RETURN
END SUBROUTINE refresh_evc
!
!
SUBROUTINE refresh_spsi( )
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
REAL(DP), ALLOCATABLE :: vtmp( :, : )
REAL(DP) :: beta
ALLOCATE( vtmp( nx, nx ) )
!
DO ipc = 1, idesc(LAX_DESC_NPC)
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
IF( ic <= nvec ) THEN
!
nc = min( nc, nvec - ic + 1 )
!
beta = 0_DP
!
DO ipr = 1, idesc(LAX_DESC_NPR)
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
root = rank_ip( ipr, ipc )
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends his block
!
CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
spsi(1,ir), npwx2, vl, nx, beta, psi(1,nvec+ic), npwx2 )
ELSE
!
! all other procs receive
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
spsi(1,ir), npwx2, vtmp, nx, beta, psi(1,nvec+ic), npwx2 )
END IF
!
beta = 1_DP
END DO
!
END IF
!
END DO
!
spsi(:,1:nvec) = psi(:,nvec+1:nvec+nvec)
!
DEALLOCATE( vtmp )
RETURN
END SUBROUTINE refresh_spsi
!
!
!
SUBROUTINE refresh_hpsi( )
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
REAL(DP), ALLOCATABLE :: vtmp( :, : )
REAL(DP) :: beta
ALLOCATE( vtmp( nx, nx ) )
!
DO ipc = 1, idesc(LAX_DESC_NPC)
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
IF( ic <= nvec ) THEN
!
nc = min( nc, nvec - ic + 1 )
!
beta = 0.0d0
!
DO ipr = 1, idesc(LAX_DESC_NPR)
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
root = rank_ip( ipr, ipc )
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends his block
!
CALL mp_bcast( vl(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
hpsi(1,ir), npwx2, vl, nx, beta, psi(1,nvec+ic), npwx2 )
ELSE
!
! all other procs receive
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
hpsi(1,ir), npwx2, vtmp, nx, beta, psi(1,nvec+ic), npwx2 )
END IF
!
beta = 1.0d0
END DO
!
END IF
!
END DO
!
DEALLOCATE( vtmp )
hpsi(:,1:nvec) = psi(:,nvec+1:nvec+nvec)
RETURN
END SUBROUTINE refresh_hpsi
!
!
SUBROUTINE compute_distmat( dm, v, w )
!
! This subroutine compute <vi|wj> and store the
! result in distributed matrix dm
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
REAL(DP), INTENT(OUT) :: dm( :, : )
COMPLEX(DP) :: v(:,:), w(:,:)
REAL(DP), ALLOCATABLE :: work( :, : )
!
ALLOCATE( work( nx, nx ) )
!
work = 0.0d0
!
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
DO ipr = 1, ipc ! use symmetry for the loop on row procs
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
! rank of the processor for which this block (ipr,ipc) is destinated
!
root = rank_ip( ipr, ipc )
! use blas subs. on the matrix block
CALL DGEMM( 'T', 'N', nr, nc, npw2, 2.D0 , &
v(1,ir), npwx2, w(1,ic), npwx2, 0.D0, work, nx )
IF ( gstart == 2 ) &
CALL DGER( nr, nc, -1.D0, v(1,ir), npwx2, w(1,ic), npwx2, work, nx )
! accumulate result on dm of root proc.
CALL mp_root_sum( work, dm, root, ortho_parent_comm )
END DO
!
END DO
IF (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) dm = dm/nbgrp
!
CALL laxlib_dsqmsym( nbase, dm, nx, idesc )
!
DEALLOCATE( work )
!
RETURN
END SUBROUTINE compute_distmat
!
!
SUBROUTINE update_distmat( dm, v, w )
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root, icc, ii
REAL(DP) :: dm( :, : )
COMPLEX(DP) :: v(:,:), w(:,:)
REAL(DP), ALLOCATABLE :: vtmp( :, : )
ALLOCATE( vtmp( nx, nx ) )
!
vtmp = 0.0d0
!
DO ipc = 1, idesc(LAX_DESC_NPC)
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
IF( ic+nc-1 >= nb1 ) THEN
nc = MIN( nc, ic+nc-1 - nb1 + 1 )
IF( ic >= nb1 ) THEN
ii = ic
icc = 1
ELSE
ii = nb1
icc = nb1-ic+1
END IF
DO ipr = 1, ipc ! desc%npr use symmetry
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
root = rank_ip( ipr, ipc )
CALL DGEMM( 'T', 'N', nr, nc, npw2, 2.D0, v( 1, ir ), &
npwx2, w(1,ii), npwx2, 0.D0, vtmp, nx )
!
IF ( gstart == 2 ) &
CALL DGER( nr, nc, -1.D0, v( 1, ir ), npwx2, w(1,ii), npwx2, vtmp, nx )
IF (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) vtmp = vtmp/nbgrp
IF( (idesc(LAX_DESC_ACTIVE_NODE) > 0) .AND. &
(ipr-1 == idesc(LAX_DESC_MYR)) .AND. (ipc-1 == idesc(LAX_DESC_MYC)) ) THEN
CALL mp_root_sum( vtmp(:,1:nc), dm(:,icc:icc+nc-1), root, ortho_parent_comm )
ELSE
CALL mp_root_sum( vtmp(:,1:nc), dm, root, ortho_parent_comm )
END IF
END DO
!
END IF
!
END DO
!
CALL laxlib_dsqmsym( nbase+notcnv, dm, nx, idesc )
!
DEALLOCATE( vtmp )
RETURN
END SUBROUTINE update_distmat
!
!
!
SUBROUTINE set_e_from_h()
INTEGER :: nc, ic, i
e(1:nbase) = 0.0d0
IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) .AND. la_proc ) THEN
nc = idesc(LAX_DESC_NC)
ic = idesc(LAX_DESC_IC)
DO i = 1, nc
e( i + ic - 1 ) = hl( i, i )
END DO
END IF
CALL mp_sum( e(1:nbase), ortho_parent_comm )
RETURN
END SUBROUTINE set_e_from_h
!
SUBROUTINE set_h_from_e()
INTEGER :: nc, ic, i
IF( la_proc ) THEN
hl = 0.0d0
IF( idesc(LAX_DESC_MYC) == idesc(LAX_DESC_MYR) ) THEN
nc = idesc(LAX_DESC_NC)
ic = idesc(LAX_DESC_IC)
DO i = 1, nc
hl(i,i) = e( i + ic - 1 )
END DO
END IF
END IF
RETURN
END SUBROUTINE set_h_from_e
!
END SUBROUTINE pregterg