quantum-espresso/GWW/gww/do_polarization_lanczos.f90

964 lines
32 KiB
Fortran

!
! Copyright (C) 2001-2013 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!this subroutines performs the lanczos style calculation of the polarization
subroutine calculate_compact_pola_lanczos(options,ispin)
!this subroutine calculate the compact products for the lanczos calculation
!of the polarization
USE kinds, ONLY : DP
USE input_gw, ONLY : input_options
USE basic_structures, ONLY : wannier_u,vt_mat_lanczos,tt_mat_lanczos,initialize_memory,free_memory
USE lanczos, ONLY : compact_q_lanczos,initialize_compact_q_lanczos,&
&free_memory_compact_q_lanczos,do_compact_q_lanczos,write_compact_q_lanczos
USE mp_world, ONLY : nproc,mpime
USE io_global, ONLY : stdout
implicit none
TYPE(input_options), INTENT(in) :: options
INTEGER, INTENT(in) :: ispin!spin channel
TYPE(wannier_u) :: uu!U matrix
TYPE(vt_mat_lanczos) :: vtl
TYPE(tt_mat_lanczos) :: ttl
TYPE(compact_q_lanczos) :: cql
INTEGER :: l_blk,nbegin,nend,iv,iu
call initialize_memory(vtl)
call initialize_memory(ttl)
call initialize_compact_q_lanczos(cql)
write(stdout,*) 'Routine calculate_compact_pola_lanczos'
FLUSH(stdout)
!
!read U matrix
call read_data_pw_u(uu,options%prefix)
!KS valence state index v distributed on processors
l_blk= uu%nums_occ(ispin)/nproc
if(l_blk*nproc < uu%nums_occ(ispin)) l_blk = l_blk+1
nbegin=mpime*l_blk+1
nend=nbegin+l_blk-1
if(nend > uu%nums_occ(ispin)) nend = uu%nums_occ(ispin)
do iv=nbegin,nbegin+l_blk-1
if(iv <= uu%nums_occ(ispin)) then
!loop v' on wannier valence state
do iu=1,uu%nums_occ(ispin)
!read in V and T
call read_data_pw_vt_mat_lanczos(vtl, iu, options%prefix, .true.,ispin)
call read_data_pw_tt_mat_lanczos(ttl, iu, options%prefix, .true.,ispin)
!calculate contribution to Q
if(iu==1) then
allocate( cql%qlm(vtl%numpw,ttl%numt))
cql%qlm(:,:)=0.d0
endif
!sum up with factor Uvvi
call do_compact_q_lanczos(vtl,ttl,cql,dble(uu%umat(iv,iu,ispin)))
call free_memory(vtl)
call free_memory(ttl)
enddo
!write Q^v on disk
cql%ii=iv
call write_compact_q_lanczos(cql)
deallocate(cql%qlm)
else
!put here global routines involving MPI
do iu=1,uu%nums_occ(ispin)
call read_data_pw_vt_mat_lanczos(vtl, iu, options%prefix, .true.,ispin)
call read_data_pw_tt_mat_lanczos(ttl, iu, options%prefix, .true.,ispin)
call free_memory(vtl)
call free_memory(ttl)
enddo
endif
enddo
call free_memory(vtl)
call free_memory(ttl)
call free_memory_compact_q_lanczos(cql)
call free_memory(uu)
return
end subroutine calculate_compact_pola_lanczos
subroutine solve_lanczos(nbuf, alpha,e_mat,lc, l_verbose)
!this subroutine sums to the matrix E_{no}=<t_n|(H-alpha)^-1|t_o>
!the matrix lc%o_mat is distributed among the processors
USE kinds, ONLY : DP
USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory
USE io_global, ONLY : stdout
USE mp, ONLY : mp_sum, mp_bcast
USE mp_world, ONLY : nproc,mpime,world_comm
implicit none
INTEGER :: nbuf!number of matrices to treat
COMPLEX(kind=DP) :: alpha(nbuf)!constant for Ev+iw
TYPE(lanczos_chain) :: lc!lanczos chain descriptor
REAL(kind=DP) :: e_mat(lc%numt,lc%numt,nbuf)!matrix to be calculated
LOGICAL, INTENT(in) :: l_verbose!if true a lot of output
INTEGER :: io,info
COMPLEX(kind=DP), ALLOCATABLE :: dl(:),du(:),d(:),t(:)
REAL(kind=DP), ALLOCATABLE :: tr(:,:)
INTEGER :: l_blk,nbegin,nend
REAL(kind=DP), ALLOCATABLE :: o_mat(:,:,:)
INTEGER :: iv, iproc,nsize
allocate(dl(lc%num_steps-1),du(lc%num_steps-1),d(lc%num_steps),t(lc%num_steps))
e_mat(:,:,:)=0.d0
l_blk= (lc%numt)/nproc
if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1
allocate(o_mat(lc%numt,lc%num_steps,l_blk))
allocate(tr(lc%num_steps,l_blk))
!loop on procs
do iproc=0,nproc-1
l_blk= (lc%numt)/nproc
if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1
nbegin=iproc*l_blk+1
nend=min(nbegin+l_blk-1,lc%numt)
nsize=nend-nbegin+1
if(nbegin<=lc%numt) then
if(iproc==mpime) then
o_mat(:,:,:)=lc%o_mat(:,:,1:nsize)
endif
if(l_verbose) write(stdout,*) 'proc:', iproc, lc%numt,lc%num_steps,l_blk,nsize
if(l_verbose)FLUSH(stdout)
call mp_bcast(o_mat, iproc, world_comm)
if(l_verbose) write(stdout,*) 'mp_bcast done'
if(l_verbose)FLUSH(stdout)
do iv=1,nbuf
do io=nbegin,nend
!!set up vectors for lapack routine
dl(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0)
du(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0)
d(1:lc%num_steps)=cmplx(lc%d(1:lc%num_steps,io),0.d0)+alpha(iv)
t(:)=(0.d0,0.d0)
t(1)=(1.d0,0.d0)
!!call lapack
call zgtsv(lc%num_steps,1,dl,d,du,t,lc%num_steps,info)
if(info /= 0) then
write(stdout,*) 'ZGTSV info:', info
FLUSH(stdout)
stop
endif
!!calculate term
tr(:,io-nbegin+1)=dble(t(:))
enddo
if(l_verbose) write(stdout,*) 'zgtsv done'
if(l_verbose) FLUSH(stdout)
do io=nbegin,nend
call dgemv( 'N', lc%numt,lc%num_steps,1.d0, o_mat(1,1,io-nbegin+1), lc%numt,tr(1,io-nbegin+1),1,1.d0,e_mat(1,io,iv),1)
! call dgemm('N','N',lc%numt,1,lc%num_steps,1.d0,o_mat(1,1,io-nbegin+1),lc%numt,tr(1,io-nbegin+1),lc%num_steps,1.d0,e_mat(1,io,iv),lc%numt)
enddo
if(l_verbose) write(stdout,*) 'dgemv done'
if(l_verbose) FLUSH(stdout)
enddo!on iv
end if
end do
deallocate(tr)
deallocate(o_mat)
return
end subroutine solve_lanczos
subroutine solve_lanczos_fake(lc,l_verbose)
!this subroutine is a parallel fake routine for the solve lanczos routine
USE kinds, ONLY : DP
USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory
USE io_global, ONLY : stdout
USE mp, ONLY : mp_sum,mp_bcast
USE mp_world, ONLY : nproc,mpime,world_comm
implicit none
TYPE(lanczos_chain) :: lc!lanczos chain descriptor
LOGICAL, INTENT(in) :: l_verbose
INTEGER :: l_blk,nbegin,nend, iproc
REAL(kind=DP), ALLOCATABLE :: o_mat(:,:,:)
INTEGER :: io,nsize
l_blk= (lc%numt)/nproc
if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1
allocate(o_mat(lc%numt,lc%num_steps,l_blk))
do iproc=0,nproc-1
l_blk= (lc%numt)/nproc
if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1
nbegin=iproc*l_blk+1
nend=min(nbegin+l_blk-1,lc%numt)
nsize=nend-nbegin+1
if(nbegin<=lc%numt) then
if(iproc==mpime) then
o_mat(:,:,:)=lc%o_mat(:,:,1:nsize)
endif
if(l_verbose) write(stdout,*) 'proc:', iproc, lc%numt,lc%num_steps,l_blk,nsize
if(l_verbose) FLUSH(stdout)
call mp_bcast(o_mat, iproc,world_comm)
endif
enddo
deallocate(o_mat)
end subroutine solve_lanczos_fake
subroutine do_polarization_lanczos(tf,options,ispin)
!this subroutine create polarization descriptor in imaginary frequency using the lanczos scheme
USE kinds, ONLY : DP
USE input_gw, ONLY : input_options
USE lanczos, ONLY : compact_q_lanczos,initialize_compact_q_lanczos,free_memory_compact_q_lanczos,&
&read_compact_q_lanczos
USE mp_world, ONLY : world_comm, mpime, nproc
USE basic_structures, ONLY : wannier_u,lanczos_chain, initialize_memory,free_memory,&
&vt_mat_lanczos,tt_mat_lanczos,partial_occ
USE io_global, ONLY : stdout,ionode,ionode_id
USE times_gw, ONLY : times_freqs
USE polarization, ONLY : polaw,initialize_polaw,free_memory_polaw,write_polaw,read_polaw
USE mp, ONLY : mp_bcast
USE io_files, ONLY : prefix, tmp_dir
implicit none
INTEGER, EXTERNAL :: find_free_unit
TYPE(times_freqs) , INTENT(in) :: tf!for time grid
TYPE(input_options), INTENT(in) :: options! for imaginary time range and number of samples
INTEGER, INTENT(in) :: ispin!spin channel
TYPE(wannier_u) :: uu
TYPE(lanczos_chain) :: lc
TYPE(partial_occ) :: po
INTEGER :: l_blk,nbegin,nend,iv,iw, it,ii,jj,il
TYPE(polaw) :: pw
! REAL(kind=DP), ALLOCATABLE :: e_mat(:,:,:)
COMPLEX(kind=DP), ALLOCATABLE :: af(:)
TYPE(compact_q_lanczos) :: cql
REAL(kind=DP), ALLOCATABLE :: tmp_mat(:,:)
REAL(kind=DP), ALLOCATABLE :: tmp_mat1(:,:),tmp_mat2(:,:),tmp_mat3(:,:)
REAL(kind=DP) :: offset
INTEGER :: n_bufv!number of states which are bufferized
INTEGER :: n_dim
INTEGER :: io_tot,l_blk_io, nbegin_io,nend_io, iproc
REAL(kind=DP), ALLOCATABLE :: cql_save(:,:,:)
REAL(kind=DP), ALLOCATABLE :: vtl_save(:,:,:)
REAL(kind=DP), ALLOCATABLE :: ttl_save(:,:,:)
COMPLEX(kind=DP) :: afc
TYPE(vt_mat_lanczos) :: vtl
TYPE(tt_mat_lanczos) :: ttl
INTEGER :: numpw,numl,numt!to be passed to subroutine
LOGICAL :: l_reduce_memory=.true. !if true reduces the use of memory, only useful for nprocs==nfreqs
LOGICAL :: exst
INTEGER :: iw_nbegin, iv_begin, iunrestart,off_nbegin
LOGICAL :: l_do_restart
REAL(kind=DP), ALLOCATABLE :: occ(:)
#if defined(_OPENMP)
INTEGER :: omp_get_num_threads, omp_get_max_threads
EXTERNAL omp_set_num_threads, omp_get_num_threads, omp_get_max_threads
#endif
write(stdout,*) 'Routine: do_polarization_lanczos'
FLUSH(stdout)
call initialize_memory(vtl)
call initialize_memory(ttl)
call initialize_memory(po)
n_bufv=options%n_set_pola
! if(l_reduce_memory) n_bufv=1 OCIO ATTENZIONE
!read uu for energies and lanczos chain
call read_data_pw_u(uu,options%prefix)
if(uu%nums_occ(ispin)==0) then
call free_memory(uu)
return
endif
!call free_memory(uu) !i do not need the orthonormalization matrix
!if required read partial occupancis stuff
if(options%l_frac_occ) then
!NOT_TO_BE_INCLUDED_START
call read_data_pw_partial_occ(po,options%prefix,ispin)
!NOT_TO_BE_INCLUDED_END
endif
call initialize_memory(lc)
call read_data_pw_lanczos_chain(lc, 1, options%prefix, .true.,ispin)
write(stdout,*) 'Lanczos dimensions', lc%numt,lc%num_steps
do ii=1,lc%numt
! do jj=1,lc%num_steps
! write(stdout,*)' Lanczos elemets:', ii, jj, lc%d(jj,ii), lc%f(jj,ii)
! enddo
write(stdout,*)' Lanczos elemets:', ii, lc%d(1,ii), lc%f(1,ii)
enddo
! allocate(e_mat(lc%numt,lc%numt,n_bufv))
allocate(af(n_bufv),occ(n_bufv))
occ(:)=1.d0
call initialize_compact_q_lanczos(cql)
if(uu%nums > uu%nums_occ(ispin)) then
offset=-(uu%ene(uu%nums_occ(ispin)+1,ispin)+uu%ene(uu%nums_occ(ispin),ispin))/2.d0
else
offset=-uu%ene(uu%nums_occ(ispin),ispin)
endif
! l_blk= (2*tf%n+1)/nproc
! if(l_blk*nproc < (2*tf%n+1)) l_blk = l_blk+1
! nbegin=mpime*l_blk-tf%n
! nend=nbegin+l_blk-1
l_blk= (tf%n+1)/nproc
if(l_blk*nproc < (tf%n+1)) l_blk = l_blk+1
nbegin=mpime*l_blk
nend=nbegin+l_blk-1
!if reduce io and l_direct put compact_q_lanczos in memory
if(options%l_t_wannier) then
l_blk_io= (uu%nums_occ(ispin))/nproc
if(l_blk_io*nproc < (uu%nums_occ(ispin))) l_blk_io = l_blk_io+1
nbegin_io=mpime*l_blk_io+1
nend_io=nbegin_io+l_blk_io-1
if(nend_io > uu%nums_occ(ispin)) nend_io=uu%nums_occ(ispin)
io_tot=nend_io-nbegin_io+1
if(.not.l_reduce_memory) then
do iv=1,uu%nums_occ(ispin)
call read_compact_q_lanczos(cql, iv)
if(iv==1) allocate(cql_save(cql%numpw,cql%numt,l_blk_io))
if(iv >= nbegin_io .and. iv <= nend_io) then
cql_save(:,:,iv-nbegin_io+1)=cql%qlm(:,:)
endif
enddo
else
call read_compact_q_lanczos(cql, 1)!just for obtaining numpw,numt...poor man solution
allocate(cql_save(1,1,1))
endif
else
!put all matrices vtl and ttl in memory, distributed according to valence state
l_blk_io= (uu%nums_occ(ispin))/nproc
if(l_blk_io*nproc < (uu%nums_occ(ispin))) l_blk_io = l_blk_io+1
nbegin_io=mpime*l_blk_io+1
nend_io=nbegin_io+l_blk_io-1
if(nend_io > uu%nums_occ(ispin)) nend_io=uu%nums_occ(ispin)
io_tot=nend_io-nbegin_io+1
do iv=1,uu%nums_occ(ispin)
call read_data_pw_vt_mat_lanczos(vtl, iv, options%prefix, .true.,ispin)
call read_data_pw_tt_mat_lanczos(ttl, iv, options%prefix, .true.,ispin)
if(iv==1) then
allocate(vtl_save(vtl%numpw,vtl%numl,l_blk_io))
allocate(ttl_save(ttl%numt,ttl%numl,l_blk_io))
endif
if(iv >= nbegin_io .and. iv <= nend_io) then
vtl_save(:,:,iv-nbegin_io+1)= vtl%vt_mat(:,:)
ttl_save(:,:,iv-nbegin_io+1)= ttl%tt_mat(:,:)
endif
call free_memory(vtl)
call free_memory(ttl)
enddo
endif
!sutup parameters for solve_lanczos
if(options%l_t_wannier) then
numpw=cql%numpw
numt=cql%numt
pw%numpw=cql%numpw
else
numpw=vtl%numpw
numl=vtl%numl
numt=ttl%numt
pw%numpw=vtl%numpw
endif
!loop on imaginary frequency i\omega
!check for restart
if(ionode) then
inquire(file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', exist = exst)
if(.not. exst .or. ispin==2) then
off_nbegin=0
iv_begin=1
l_do_restart=.false.
else
iunrestart = find_free_unit()
open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='old')
read(iunrestart,*) off_nbegin
read(iunrestart,*) iv_begin
close(iunrestart)
if(off_nbegin<0) off_nbegin=0
if(iv_begin<1 ) then
iv_begin=1
l_do_restart=.false.
else
iv_begin=iv_begin+n_bufv
l_do_restart=.true.
endif
endif
endif
call mp_bcast(off_nbegin,ionode_id,world_comm)
iw_nbegin=nbegin+off_nbegin
call mp_bcast(iv_begin, ionode_id,world_comm)
call mp_bcast(l_do_restart, ionode_id,world_comm)
do iw=iw_nbegin,nbegin+l_blk-1
if(iw <= tf%n) then
write(stdout,*) 'do_polarization_lanczos1', iw, l_do_restart
FLUSH(stdout)
!!set up polaw descriptor
!if required read polaw from disk
if(l_do_restart.or.ispin==2) then
call initialize_polaw(pw)
call read_polaw(iw,pw,.false.,options%l_verbose)
l_do_restart=.false.
else
call initialize_polaw(pw)
pw%label=iw
pw%ontime=.false.
pw%time=tf%freqs(iw)
pw%factor=(1.d0,0.d0)
endif
!!loop on valence states v
do iv=iv_begin,uu%nums_occ(ispin),n_bufv
write(stdout,*) 'do_polarization_lanczos iv', iv
FLUSH(stdout)
!!!solve tridiagonal problem for -E_v+i\omega
n_dim=0
do il=1,n_bufv
if((iv+il-1) <= uu%nums_occ(ispin)) then
af(il) = -uu%ene(iv+il-1,ispin)+dcmplx(0.d0,tf%freqs(iw))
if(.not.options%l_frac_occ) then
occ(il)=1.d0
else
!NOT_TO_BE_INCLUDED_START
occ(il)=po%f_occ(iv+il-1)
!NOT_TO_BE_INCLUDED_END
endif
n_dim=n_dim+1
endif
enddo
if(options%l_verbose) write(stdout,*) 'Call solve_lanczos'
if(options%l_verbose) FLUSH(stdout)
if(iv==1.and.ispin/=2) then
! if(options%l_t_wannier) then
! pw%numpw=cql%numpw
! else
! pw%numpw=vtl%numpw
! endif
allocate(pw%pw(pw%numpw,pw%numpw))
pw%pw(:,:)=0.d0
endif
!NOT_TO_BE_INCLUDED_START
if(iv==1 .and. options%l_frac_occ) then
afc=dcmplx(0.d0,tf%freqs(iw))
call add_partial_pola(pw,po,uu,afc,options%nspin,ispin)
endif
!NOT_TO_BE_INCLUDED_END
call solve_lanczos_2(numpw,numt,numl,n_dim,n_bufv,af,lc, iv, uu%nums_occ(ispin),l_blk_io,&
&cql_save,pw,n_dim,options%l_t_wannier,vtl_save,ttl_save,l_reduce_memory,uu%nspin,occ,options%l_verbose)
if(options%l_verbose) write(stdout,*) 'Done'
if(options%l_verbose) FLUSH(stdout)
#if defined(_OPENMP)
! write(stdout,*) 'Call dgemms, ntids=', omp_get_max_threads()
#else
! write(stdout,*) 'Call dgemms'
#endif
!restart part, every 10
if(mod(iv-iv_begin,10)==0.or.n_bufv/=1) then
call write_polaw(pw,options%debug)
if(ionode) then
iunrestart = find_free_unit()
open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='unknown')
write(iunrestart,*) iw-nbegin
write(iunrestart,*) iv
close(iunrestart)
endif
endif
enddo
!!write polarization on disk
if(ionode) then
iunrestart = find_free_unit()
open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='unknown')
write(iunrestart,*) iw-nbegin
write(iunrestart,*) iv
close(iunrestart)
endif
call write_polaw(pw,options%debug)
call free_memory_polaw(pw)
else
!here just global parallel calls
do iv=iv_begin,uu%nums_occ(ispin),n_bufv
n_dim=0
do il=1,n_bufv
if((iv+il-1) <= uu%nums_occ(ispin)) then
n_dim=n_dim+1
endif
enddo
call solve_lanczos_2(numpw,numt,numl,0,n_bufv,af,lc, iv, uu%nums_occ(ispin),l_blk_io,&
&cql_save,pw,n_dim,options%l_t_wannier,vtl_save,ttl_save,l_reduce_memory,uu%nspin,occ,.false.)
enddo
endif
iv_begin=1
l_do_restart=.false.
enddo
if(ionode) then
iunrestart = find_free_unit()
open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='unknown')
write(iunrestart,*) -1
write(iunrestart,*) -1
close(iunrestart)
endif
if(options%l_t_wannier) then
deallocate(cql_save)
call free_memory_compact_q_lanczos(cql)
else
deallocate(vtl_save,ttl_save)
endif
! deallocate(e_mat)
deallocate(af,occ)
call free_memory(lc)
call free_memory(uu)
call free_memory(vtl)
call free_memory(ttl)
call free_memory(po)
return
end subroutine do_polarization_lanczos
subroutine solve_lanczos_2(numpw,numt,numl,nbuf,mbuf, alpha,lc, iv0,nbndv,&
&lblkio,cql_save,pw,n_dim,l_t_wannier, vtl_save, ttl_save, l_reduce_memory,nspin,occ,l_verbose)
!this subroutine sums to the matrix E_{no}=<t_n|(H-alpha)^-1|t_o>
!the matrix lc%o_mat is distributed among the processors
USE kinds, ONLY : DP
USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory
USE io_global, ONLY : stdout
USE mp, ONLY : mp_sum, mp_bcast
USE mp_world, ONLY : nproc,mpime,world_comm
USE parallel_include
USE lanczos, ONLY : compact_q_lanczos,initialize_compact_q_lanczos,&
free_memory_compact_q_lanczos, &
read_compact_q_lanczos
USE polarization, ONLY : polaw
implicit none
INTEGER, INTENT(in) :: numpw!dimension of polarizability basis
INTEGER, INTENT(in) :: numt!number of t vectors
INTEGER, INTENT(in) :: numl!number of tv vectors
INTEGER :: nbuf!number of matrices to treat
INTEGER :: mbuf
COMPLEX(kind=DP) :: alpha(mbuf)!constant for Ev+iw
TYPE(lanczos_chain) :: lc!lanczos chain descriptor
INTEGER, INTENT(in) :: iv0 !staring valence energy
INTEGER, INTENT(in) :: nbndv!number of valence functions
INTEGER, INTENT(in) :: lblkio!for the dimension of the following array
REAL(kind=DP), INTENT(in) :: cql_save(numpw,numt,lblkio)
TYPE(polaw), INTENT(inout) :: pw
INTEGER, INTENT(in) :: n_dim!max dimension of nbuf
LOGICAL, INTENT(in) :: l_t_wannier !if true t vectors from products with wannier
REAL(kind=DP) :: vtl_save(numpw,numl,lblkio)
REAL(kind=DP) :: ttl_save(numt,numl,lblkio)
LOGICAL, INTENT(in) :: l_reduce_memory!if true cql_save not used
INTEGER, INTENT(in) :: nspin!spin multiplicity
REAL(kind=DP), INTENT(in) :: occ(mbuf)!occupancies of KS valence states
LOGICAL, INTENT(in) :: l_verbose
INTEGER :: io,info
COMPLEX(kind=DP), ALLOCATABLE :: dl(:),du(:),d(:),t(:)
REAL(kind=DP), ALLOCATABLE :: tr(:,:)
INTEGER :: l_blk,nbegin,nend
REAL(kind=DP), ALLOCATABLE :: e_mat_ip(:,:)
INTEGER :: iv, iproc,nsize, iproc_cql
INTEGER :: nbuf_ip
COMPLEX(kind=DP), ALLOCATABLE :: alpha_ip(:)
INTEGER, ALLOCATABLE :: displ(:), recvcounts(:)
INTEGER :: l_blk_io, nbegin_io,nend_io
REAL(kind=DP), ALLOCATABLE :: tmp_mat(:,:),tmp_mat1(:,:),tmp_mat2(:,:),tmp_mat3(:,:)
REAL(kind=DP), ALLOCATABLE :: pw_ip(:,:),pw_dumm(:,:)
REAL(kind=DP), ALLOCATABLE :: qlm_tmp(:,:,:),vtl_tmp(:,:,:),ttl_tmp(:,:,:)
INTEGER :: ierr
TYPE(compact_q_lanczos) :: cql
LOGICAL :: l_qo=.true.!if true perform QO matrix multimplication first (convenient only if nprocs>nlanczos_steps)
REAL(kind=DP), ALLOCATABLE :: qo_mat(:,:,:,:)
LOGICAL :: l_single=.true.!if true saves qlm_tmp in single precision
REAL(kind=4), ALLOCATABLE :: qlm_tmp_single(:,:,:)
REAL(kind=DP) :: factor!global prefactor
if(nspin==1) then
factor=-4.d0
else
factor=-2.d0
endif
allocate(dl(lc%num_steps-1),du(lc%num_steps-1),d(lc%num_steps),t(lc%num_steps))
ierr=0
l_blk= (lc%numt)/nproc
if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1
nbegin=mpime*l_blk+1
nend=min(nbegin+l_blk-1,lc%numt)
nsize=nend-nbegin+1
l_blk_io= (nbndv)/nproc
if(l_blk_io*nproc < (nbndv)) l_blk_io = l_blk_io+1
nbegin_io=mpime*l_blk_io+1
nend_io=nbegin_io+l_blk_io-1
allocate(tr(lc%num_steps,l_blk))
allocate(displ(nproc),recvcounts(nproc))
allocate(e_mat_ip(lc%numt,l_blk))
if(l_t_wannier) then
if(.not.l_reduce_memory) then
allocate(qlm_tmp(numpw,numt,n_dim))
do iv=1,n_dim
!read compact matrix
iproc_cql=(iv0+iv-2)/l_blk_io
if(mpime==iproc_cql) qlm_tmp(:,:,iv) = cql_save(:,:,iv0+iv-1-nbegin_io+1)
call mp_bcast(qlm_tmp(:,:,iv), iproc_cql,world_comm)
enddo
else
call initialize_compact_q_lanczos(cql)
if(.not.l_single) then
allocate(qlm_tmp(numpw,numt,n_dim))
do iv=1,n_dim
call read_compact_q_lanczos(cql,iv0+iv-1)
qlm_tmp(1:numpw,1:numt,iv)=cql%qlm(1:numpw,1:numt)
enddo
else
allocate(qlm_tmp(numpw,numt,1))
allocate(qlm_tmp_single(numpw,numt,n_dim))
do iv=1,n_dim
call read_compact_q_lanczos(cql,iv0+iv-1)
qlm_tmp_single(1:numpw,1:numt,iv)=real(cql%qlm(1:numpw,1:numt))
enddo
endif
call free_memory_compact_q_lanczos(cql)
endif
if(l_qo) then
allocate(qo_mat(numpw,lc%num_steps,nsize,n_dim))
do iv=1,n_dim
!calculate qo_mat
qlm_tmp(1:numpw,1:numt,1)=dble(qlm_tmp_single(1:numpw,1:numt,iv))
do io=nbegin,nend
if(.not.l_single) then
call dgemm('N','N', numpw,lc%num_steps,numt,1.d0, qlm_tmp(1,1,iv),&
&numpw,lc%o_mat(1,1,io-nbegin+1),numt,0.d0,qo_mat(1,1,io-nbegin+1,iv),numpw)
else
!qlm_tmp(1:numpw,1:numt,1)=dble(qlm_tmp_single(1:numpw,1:numt,iv))
call dgemm('N','N', numpw,lc%num_steps,numt,1.d0, qlm_tmp(1,1,1),numpw,&
&lc%o_mat(1,1,io-nbegin+1),numt,0.d0,qo_mat(1,1,io-nbegin+1,iv),numpw)
endif
enddo
!call dgemv( 'N', lc%numt,lc%num_steps,1.d0, lc%o_mat(1,1,io-nbegin+1), lc%numt,tr(1,io-nbegin+1),1,1.d0,e_mat_ip(1,io-nbegin+1),1)
enddo
endif
else
allocate(vtl_tmp(numpw,numl,n_dim))
allocate(ttl_tmp(numt,numl,n_dim))
do iv=1,n_dim
!read compact matrix
iproc_cql=(iv0+iv-2)/l_blk_io
if(mpime==iproc_cql) then
vtl_tmp(:,:,iv) = vtl_save(:,:,iv0+iv-1-nbegin_io+1)
ttl_tmp(:,:,iv) = ttl_save(:,:,iv0+iv-1-nbegin_io+1)
endif
call mp_bcast(vtl_tmp(:,:,iv), iproc_cql,world_comm)
call mp_bcast(ttl_tmp(:,:,iv), iproc_cql,world_comm)
enddo
endif
do iproc=0,nproc-1
displ(iproc+1)=iproc*l_blk
if(displ(iproc+1)+l_blk > lc%numt) then
recvcounts(iproc+1)=(lc%numt-displ(iproc+1))
else
recvcounts(iproc+1)=l_blk
endif
enddo
displ(:)=displ(:)*lc%numt
recvcounts(:)=recvcounts(:)*lc%numt
if(l_t_wannier) then
allocate(tmp_mat(numpw,l_blk))
else
allocate(tmp_mat1(numl,l_blk))
allocate(tmp_mat2(numpw,l_blk))
allocate(tmp_mat3(numpw,numl))
endif
allocate(pw_ip(numpw,numpw),pw_dumm(numpw,numpw))
!loop on procs
do iproc=0,nproc-1
if(iproc==mpime) nbuf_ip=nbuf
!distribute number of nbuf
!distribute af
call mp_bcast(nbuf_ip, iproc,world_comm)
if(nbuf_ip >0) then
allocate(alpha_ip(nbuf_ip))
if(iproc==mpime) alpha_ip(1:nbuf_ip)=alpha(1:nbuf_ip)
call mp_bcast(alpha_ip,iproc,world_comm)
!loop on freuqency
pw_ip(:,:)=0.d0
do iv=1,nbuf_ip
if(l_single) then
qlm_tmp(1:numpw,1:numt,1)=dble(qlm_tmp_single(1:numpw,1:numt,iv))
endif
e_mat_ip(:,:)=0.d0
!calculate part
!$OMP PARALLEL SHARED(tr,lc,alpha_ip,iv) PRIVATE(dl,du,d,t,info,io)
! allocate(dl(lc%num_steps-1),du(lc%num_steps-1),d(lc%num_steps),t(lc%num_steps))
!$OMP DO
do io=nbegin,nend
!!set up vectors for lapack routine
dl(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0)
du(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0)
d(1:lc%num_steps)=cmplx(lc%d(1:lc%num_steps,io),0.d0)+alpha_ip(iv)
t(:)=(0.d0,0.d0)
t(1)=(1.d0,0.d0)
!!call lapack
call zgtsv(lc%num_steps,1,dl,d,du,t,lc%num_steps,info)
if(info /= 0) then
write(stdout,*) 'ZGTSV info:', info
FLUSH(stdout)
stop
endif
!!calculate term
tr(1:lc%num_steps,io-nbegin+1)=dble(t(1:lc%num_steps))
enddo
!$OMP END DO
! deallocate(dl,du,d,t)
!$OMP END PARALLEL
if(l_verbose) write(stdout,*) 'zgtsv done'
if(l_verbose) FLUSH(stdout)
if(.not.l_qo) then
do io=nbegin,nend
call dgemv( 'N', lc%numt,lc%num_steps,1.d0, lc%o_mat(1,1,io-nbegin+1), &
&lc%numt,tr(1,io-nbegin+1),1,1.d0,e_mat_ip(1,io-nbegin+1),1)
enddo
endif
if(l_verbose) write(stdout,*) 'dgemv done',nsize
if(l_verbose) FLUSH(stdout)
if(nbegin <= lc%numt) then
if(l_t_wannier) then
if(.not.l_qo) then
if(.not.l_single) then
call dgemm('N','N',numpw,nsize,numt,1.d0,qlm_tmp(1,1,iv),numpw,e_mat_ip,lc%numt,0.d0,tmp_mat,numpw)
else
call dgemm('N','N',numpw,nsize,numt,1.d0,qlm_tmp(1,1,1),numpw,e_mat_ip,lc%numt,0.d0,tmp_mat,numpw)
endif
if(l_verbose) write(stdout,*) 'dgemm1 done'
if(l_verbose) FLUSH(stdout)
else
do io=nbegin,nend
call dgemv( 'N', numpw,lc%num_steps,1.d0, qo_mat(1,1,io-nbegin+1,iv), &
&numpw,tr(1,io-nbegin+1),1,0.d0,tmp_mat(1,io-nbegin+1),1)
enddo
endif
if(.not.l_single) then
call dgemm('N','T',numpw,numpw,nsize,factor*occ(iv),tmp_mat,numpw,qlm_tmp(1,nbegin,iv),numpw,1.d0,pw_ip,numpw)
else
call dgemm('N','T',numpw,numpw,nsize,factor*occ(iv),tmp_mat,numpw,qlm_tmp(1,nbegin,1),numpw,1.d0,pw_ip,numpw)
endif
if(l_verbose) write(stdout,*) 'dgemm2 done', pw%numpw, numpw,ierr
if(l_verbose) FLUSH(stdout)
else
call dgemm('T','N',numl,nsize,numt,1.d0,ttl_tmp(1,1,iv),numt,e_mat_ip,numt,0.d0,tmp_mat1,numl)
if(l_verbose) write(stdout,*) 'dgemm1 done'
if(l_verbose) FLUSH(stdout)
call dgemm('N','N',numpw,nsize,numl,1.d0,vtl_tmp(1,1,iv),numpw,tmp_mat1,numl,0.d0,tmp_mat2,numpw)
if(l_verbose) write(stdout,*) 'dgemm2 done'
if(l_verbose) FLUSH(stdout)
call dgemm('N','N', numpw,numl,nsize,1.d0,tmp_mat2,numpw,ttl_tmp(nbegin,1,iv),numt,0.d0,tmp_mat3,numpw)
if(l_verbose)write(stdout,*) 'dgemm3 done'
if(l_verbose) FLUSH(stdout)
call dgemm('N','T', numpw,numpw,numl,factor*occ(iv),tmp_mat3,numpw,vtl_tmp(1,1,iv),numpw,1.d0,pw_ip,numpw)
! call dgemm('T','N', ttl%numl,ttl%numt,ttl%numt,1.d0,ttl%tt_mat,ttl%numt,e_mat(1,1,il),lc%numt,0.d0,tmp_mat1,ttl%numl)
! call dgemm('N','N', pw%numpw,ttl%numt,ttl%numl,1.d0,vtl%vt_mat,vtl%numpw,tmp_mat1,ttl%numl,0.d0,tmp_mat2,pw%numpw)
! call dgemm('N','N', pw%numpw,ttl%numl,ttl%numt,1.d0,tmp_mat2,pw%numpw,ttl%tt_mat,ttl%numt,0.d0,tmp_mat3,pw%numpw)
! call dgemm('N','T', pw%numpw,pw%numpw,ttl%numl,-4.d0,tmp_mat3,pw%numpw,vtl%vt_mat,pw%numpw,1.d0,pw%pw,pw%numpw)
endif
endif
!gather_collect result
! call mpi_gatherv(e_mat_ip, lc%numt*nsize,MPI_DOUBLE_PRECISION,e_mat(1,1,iv),recvcounts,displ, MPI_DOUBLE_PRECISION,iproc,world_comm)
enddo
#if defined(__MPI)
if(iproc==mpime) then
pw_ip(:,:)=pw_ip(:,:)+pw%pw(:,:)
CALL MPI_REDUCE(pw_ip, pw%pw, pw%numpw*pw%numpw,MPI_DOUBLE_PRECISION,MPI_SUM, iproc,world_comm,ierr)
else
!ATTENTION pw%numpw could not be initialized in THIS CASE
CALL MPI_REDUCE(pw_ip, pw_dumm, numpw*numpw,MPI_DOUBLE_PRECISION,MPI_SUM, iproc,world_comm,ierr)
endif
if(l_verbose) write(stdout,*) 'mpi_reduce done'
if(l_verbose) FLUSH(stdout)
#else
pw%pw(:,:)=pw%pw(:,:)+pw_ip(:,:)
#endif
deallocate(alpha_ip)
endif
enddo
if(l_qo) deallocate(qo_mat)
deallocate(displ,recvcounts)
deallocate(e_mat_ip)
deallocate(dl,du,d,t)
deallocate(tr)
if(l_t_wannier) then
deallocate(tmp_mat)
! if(l_reduce_memory) call free_memory_compact_q_lanczos(cql)
else
deallocate(tmp_mat1,tmp_mat2,tmp_mat3)
endif
deallocate(pw_ip,pw_dumm)
if(l_t_wannier) then
deallocate(qlm_tmp)
if(l_single) deallocate(qlm_tmp_single)
else
deallocate(vtl_tmp, ttl_tmp)
endif
return
end subroutine solve_lanczos_2
subroutine add_partial_pola(pw,po,uu,af,nspin,ispin)
!NOT_TO_BE_INCLUDED_START
!this subroutine adds the contribution to the irreducible polarizability due to
!fractionally occupied states
USE kinds, ONLY : DP
USE basic_structures, ONLY : partial_occ,wannier_u
USE io_global, ONLY : stdout,ionode,ionode_id
USE polarization, ONLY : polaw
implicit none
TYPE(polaw), INTENT(inout) :: pw!polarizability matrix to be modified
TYPE(partial_occ), INTENT(in) :: po!fractionary occupaction terms
TYPE(wannier_u), INTENT(in) :: uu!for KS energies
COMPLEX(kind=DP),INTENT(in) :: af!!complex frequency
INTEGER, INTENT(in) :: nspin!spin multiplicity
INTEGER, INTENT(in) :: ispin!spin channel
REAL(kind=DP) :: fact
INTEGER :: iv,jv
do iv=po%nums_occ_min+1,po%nums_occ
do jv=1,iv-1
fact=-(2.d0/dble(nspin))*(po%f_occ(jv)-po%f_occ(iv))/dble(uu%ene(iv,ispin)-uu%ene(jv,ispin)+af)
!invertiti per energie
call dgemm('N','T',pw%numpw,pw%numpw,1,fact,po%ppp_mat(1,jv,iv),po%numpw,&
&po%ppp_mat(1,jv,iv),po%numpw,1.d0,pw%pw,pw%numpw)
enddo
enddo
return
!NOT_TO_BE_INCLUDED_END
end subroutine add_partial_pola