quantum-espresso/GWW/gww/do_self_lanczos.f90

981 lines
35 KiB
Fortran

!
! Copyright (C) 2001-2013 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!this subroutines performs the lanczos style calculation of the self-energy
subroutine calculate_compact_self_lanczos(options)
!this subroutine calculate the compact products for the lanczos calculation
!of the self-energy
USE kinds, ONLY : DP
USE input_gw, ONLY : input_options
USE basic_structures, ONLY : wannier_u,vt_mat_lanczos,tt_mat_lanczos,initialize_memory,free_memory
USE lanczos, ONLY : compact_q_lanczos,initialize_compact_q_lanczos,&
&free_memory_compact_q_lanczos,do_compact_q_lanczos,write_compact_q_lanczos
USE mp_world, ONLY : nproc,mpime
USE io_global, ONLY : ionode
implicit none
TYPE(input_options), INTENT(in) :: options
TYPE(wannier_u) :: uu!U matrix
TYPE(vt_mat_lanczos) :: vtl
TYPE(tt_mat_lanczos) :: ttl
TYPE(compact_q_lanczos) :: cql
INTEGER :: l_blk,nbegin,nend,iv,iu
call initialize_memory(vtl)
call initialize_memory(ttl)
call initialize_compact_q_lanczos(cql)
!
!read U matrix
call read_data_pw_u(uu,options%prefix)
do iu=1,options%max_i
!read in V and T
call read_data_pw_vt_mat_lanczos(vtl, iu, options%prefix, .false.,1)
call read_data_pw_tt_mat_lanczos(ttl, iu, options%prefix, .false.,1)
!calculate contribution to Q
allocate( cql%qlm(vtl%numpw,ttl%numt))
cql%qlm(:,:)=0.d0
!sum up with factor Uvvi
call do_compact_q_lanczos(vtl,ttl,cql,1.d0)
!write Q^v on disk
cql%ii=iu
if(ionode) call write_compact_q_lanczos(cql)
deallocate(cql%qlm)
enddo
call free_memory(vtl)
call free_memory(ttl)
call free_memory_compact_q_lanczos(cql)
call free_memory(uu)
return
end subroutine calculate_compact_self_lanczos
subroutine solve_lanczos_complex(nbuf,alpha,e_mat,lc)
!this subroutine sums to the matrix E_{no}=<t_n|(H-alpha)^-1|t_o>
USE kinds, ONLY : DP
USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory
USE io_global, ONLY : stdout
USE mp, ONLY : mp_sum,mp_bcast
USE mp_world, ONLY : nproc, mpime, world_comm
implicit none
INTEGER :: nbuf!number of matrices to treat
COMPLEX(kind=DP) :: alpha(nbuf)!constant for Ev+iw
TYPE(lanczos_chain) :: lc!lanczos chain descriptor
COMPLEX(kind=DP) :: e_mat(lc%numt,lc%numt,nbuf)!matrix to be calculated
INTEGER :: io,info,ii,jj,il
COMPLEX(kind=DP), ALLOCATABLE :: dl(:),du(:),d(:),t(:)
COMPLEX(kind=DP), ALLOCATABLE :: omat(:,:)
REAL(kind=DP), ALLOCATABLE :: tmp_mat(:,:)
INTEGER :: l_blk,nbegin,nend, iproc
l_blk= (lc%numt)/nproc
if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1
nbegin=mpime*l_blk+1
nend=nbegin+l_blk-1
allocate(dl(lc%num_steps-1),du(lc%num_steps-1),d(lc%num_steps),t(lc%num_steps))
e_mat(:,:,:)=(0.d0,0.d0)
allocate(omat(lc%numt,lc%num_steps))
allocate(tmp_mat(lc%numt,lc%num_steps))
!loop on o
do io=1,lc%numt
!!set up vectors for lapack routine
!recover matrix from processor
tmp_mat(:,:)=0.d0
if(io >= nbegin .and. io <= nend) then
tmp_mat(:,:)=lc%o_mat(:,:,io-nbegin+1)
endif
!call mp_sum(tmp_mat(:,:),world_comm)
iproc=(io-1)/l_blk
call mp_bcast(tmp_mat(:,:), iproc, world_comm)
omat(:,:)=dcmplx(tmp_mat(:,:),0.d0)
do il=1,nbuf
dl(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0)
du(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0)
d(1:lc%num_steps)=cmplx(lc%d(1:lc%num_steps,io),0.d0)+alpha(il)
t(:)=(0.d0,0.d0)
t(1)=(1.d0,0.d0)
!!call lapack
call zgtsv(lc%num_steps,1,dl,d,du,t,lc%num_steps,info)
if(info /= 0) then
write(stdout,*) 'ZGTSV info:', info
FLUSH(stdout)
stop
endif
!ATTENZIONE
! if(io==1) then
! write(stdout,*) 'Solve lanczos first',t(1),t(lc%num_steps)
! write(stdout,*) 'omat', lc%o_mat(1,1,io), lc%o_mat(1,lc%num_steps,io),&
! & lc%o_mat(lc%num_steps,1,io), lc%o_mat(lc%num_steps,lc%num_steps,io)
! endif
! if(io==lc%numt) then
! write(stdout,*) 'Solve lanczos last',t(1),t(lc%num_steps)
! write(stdout,*) 'omat', lc%o_mat(1,1,io), lc%o_mat(1,lc%num_steps,io),&
! & lc%o_mat(lc%num_steps,1,io), lc%o_mat(lc%num_steps,lc%num_steps,io)
! endif
!!calculate term
!omat(:,:)=dcmplx(lc%o_mat(:,:,io),0.d0)
call zgemm('N','N',lc%numt,1,lc%num_steps,(1.d0,0.d0),omat,lc%numt,t,lc%num_steps,(1.d0,0.d0),e_mat(:,io,il),lc%numt)
enddo
enddo
do il=1,nbuf
do ii=1,lc%numt
do jj=ii+1,lc%numt
e_mat(ii,jj,il)=0.5d0*(e_mat(ii,jj,il)+e_mat(jj,ii,il))
e_mat(jj,ii,il)=e_mat(ii,jj,il)
enddo
enddo
enddo
deallocate(dl,du,d,t)
deallocate(omat,tmp_mat)
return
end subroutine solve_lanczos_complex
!routine do_self_lanczos
subroutine do_self_lanczos(ss, tf ,options)
!this subroutine calculte the self-energy on the frequency fit grid using the lanczos scheme
USE kinds, ONLY : DP
USE io_global, ONLY : stdout, ionode, ionode_id
USE input_gw, ONLY : input_options
USE basic_structures, ONLY : v_pot,wannier_u,free_memory, initialize_memory,lanczos_chain, vt_mat_lanczos,tt_mat_lanczos
USE green_function, ONLY : green, read_green, free_memory_green, initialize_green
USE polarization, ONLY : polaw, free_memory_polaw, read_polaw, write_polaw,invert_v_pot, initialize_polaw, &
& read_polaw_global
USE mp, ONLY : mp_sum, mp_bcast
USE mp_world, ONLY : nproc,mpime,world_comm
USE times_gw, ONLY : times_freqs
USE self_energy_storage, ONLY : self_storage,write_self_storage_ondisk,free_memory_self_storage
USE lanczos
USE constants, ONLY : tpi
USE start_end ! debug
implicit none
TYPE(times_freqs), INTENT(in) :: tf!for time frequency grids
TYPE(input_options) :: options
TYPE(self_storage) :: ss
TYPE(compact_q_lanczos), ALLOCATABLE :: cql(:)
TYPE(vt_mat_lanczos), ALLOCATABLE :: vtl(:)
TYPE(tt_mat_lanczos), ALLOCATABLE :: ttl(:)
TYPE(wannier_u) :: uu
TYPE(lanczos_matrix) :: lm
!1)
!loop on iw (parallelized)
!calculate screened polarization
!write back on polaw
TYPE(v_pot) :: vp,vpi
TYPE(polaw) :: ww!dressed polarization
TYPE(lanczos_chain) :: lc
INTEGER :: l_blk, nbegin,nend, l_blk_freq, nbegin_freq,nend_freq
INTEGER :: iw_tot,l_blk_iw, nbegin_iw,nend_iw
REAL(kind=DP), ALLOCATABLE:: wtemp(:,:), vtemp(:)
INTEGER :: iw,ii, jw, il, kw
COMPLEX(kind=DP), ALLOCATABLE :: af(:)
REAL(kind=DP), ALLOCATABLE :: re_e_mat(:,:),im_e_mat(:,:) ,tmp_mat(:,:), re_c_mat(:,:,:,:), im_c_mat(:,:,:,:)
REAL(kind=DP), ALLOCATABLE :: tmp_mat1(:,:),tmp_mat2(:,:)
INTEGER :: n_buf,iw_min,iw_max,iiw,ix,iy,it,ip,ixx
REAL(kind=DP), EXTERNAL :: ddot
REAL(kind=DP) :: offset
COMPLEX(kind=DP), ALLOCATABLE :: gw_tab(:,:), exp_tab(:,:,:), exp_iw(:,:),exp_ix(:,:)
LOGICAL :: l_conv!if true do just the convolution
LOGICAL :: l_half=.true.
REAL(kind=DP), ALLOCATABLE :: re_c_tmp(:,:), im_c_tmp(:,:)
INTEGER, ALLOCATABLE :: iw_proc(:)
INTEGER :: n_buf_ii, ii_buf, ii_begin,ii_end,ipos
LOGICAL :: l_direct=.true.!if true direct way of performing products for each state
INTEGER :: numpw
COMPLEX(kind=DP), ALLOCATABLE :: e_mat_tmp(:,:,:)
INTEGER :: n_dim
INTEGER :: iproc
COMPLEX(kind=DP) :: csca
REAL(kind=DP), ALLOCATABLE :: polaw_store(:,:,:) !to reduce io
INTEGER :: io_tot,l_blk_io, nbegin_io,nend_io
COMPLEX(kind=DP), ALLOCATABLE :: cterm(:)
REAL(kind=DP) :: inv_epsi,v_head
INTEGER :: iw1,jw1
INTEGER :: n_max
REAL(kind=DP), ALLOCATABLE :: diag_tmp(:)
LOGICAL :: l_iw_min
REAL(kind=DP), ALLOCATABLE :: mat_tmp(:,:)
INTEGER :: i, j
!calculate offset
!read in DFT energies
call read_data_pw_u(uu,options%prefix)
if(uu%nums > uu%nums_occ(1)) then
offset=-(uu%ene(uu%nums_occ(1)+1,1)+uu%ene(uu%nums_occ(1),1))/2.d0
else
offset=-uu%ene(uu%nums_occ(1),1)
endif
call free_memory(uu)
!initialize self_storage ss
if(options%l_lanczos_conv) then
l_conv=.true.
else
l_conv=.false.
endif
ss%ontime=.false.
ss%max_i=options%max_i
ss%i_min=options%i_min
ss%i_max=options%i_max
ss%n=tf%n_grid_fit
ss%tau=options%tau
ss%whole_s=options%whole_s
ss%n_grid_fit=tf%n_grid_fit
if(ss%whole_s) then
allocate(ss%whole(ss%max_i,ss%max_i,2*ss%n+1,1))
ss%whole(:,:,:,:)=(0.d0,0.d0)
allocate(ss%whole_freq_fit(ss%max_i,ss%max_i,2*ss%n+1,1))
ss%whole_freq_fit(:,:,:,:)=(0.d0,0.d0)
nullify(ss%diag)
nullify(ss%diag_freq_fit)
else
allocate(ss%diag(ss%max_i,2*ss%n+1,1))
ss%diag(:,:,:)=(0.d0,0.d0)
nullify(ss%whole)
allocate(ss%diag_freq_fit(ss%max_i,2*ss%n+1,1))
ss%diag_freq_fit(:,:,:)=(0.d0,0.d0)
nullify(ss%whole_freq_fit)
endif
!for compatibility
allocate(ss%ene_remainder(ss%max_i,1))
ss%ene_remainder(:,:)=0.d0
if(ss%whole_s) then
write(stdout,*) 'Whole s not implemented YET'
stop
endif
nullify(vp%vmat)
nullify(vpi%vmat)
call initialize_polaw(ww)
allocate(diag_tmp(options%n_set_ii))
!2) create compact_self_lanczos
if(.not.l_direct) call calculate_compact_self_lanczos(options)
!3)
!read in lanczos chain for self-energy
call initialize_memory(lc)
call read_data_pw_lanczos_chain(lc, 1, options%prefix, .false.,1)
write(stdout,*) 'Lanczos dimensions', lc%numt,lc%num_steps
write(stdout,*) 'Lanczos first diagonal', lc%d(1,1),lc%d(lc%num_steps,1)
write(stdout,*) 'Lanczos last diagonal', lc%d(1,lc%numt),lc%d(lc%num_steps,lc%numt)
write(stdout,*) 'Lanczos first tri', lc%f(1,1),lc%f(lc%num_steps-1,1)
write(stdout,*) 'Lanczos last tri', lc%f(1,lc%numt),lc%f(lc%num_steps-1,lc%numt)
FLUSH(stdout)
call initialize_lanczos_matrix(lm)
lm%numt=lc%numt
allocate(lm%e_mat(lm%numt,lm%numt))
allocate(re_e_mat(lc%numt,lc%numt))
allocate(im_e_mat(lc%numt,lc%numt))
!if required allocates table for products and exponents
if(.not.l_conv) then
allocate(gw_tab(-tf%n:tf%n,-tf%n:tf%n))
l_blk_freq= (2*ss%n+1)/nproc
if(l_blk_freq*nproc < (2*ss%n+1)) l_blk_freq = l_blk_freq+1
nbegin_freq=mpime*l_blk_freq-ss%n
nend_freq=nbegin_freq+l_blk_freq-1
allocate(exp_tab(-tf%n:tf%n,-tf%n:tf%n,l_blk_freq))
exp_tab(:,:,:)=(0.d0,0.d0)
! allocate(exp_iw(-tf%n:tf%n,-tf%n:tf%n))
! do iw=-tf%n,tf%n
! do it=-tf%n,tf%n
! exp_iw(it,iw)=exp((0.d0,1.d0)*tf%freqs(iw)*tf%times(it))
! enddo
! enddo
! allocate(exp_ix(-tf%n:tf%n,l_blk_freq))
! do ix=nbegin_freq,nbegin_freq+l_blk_freq-1
! if(ix<=ss%n) then
! do it=-tf%n,tf%n
! exp_ix(it,ix-nbegin_freq+1)=exp((0.d0,-1.d0)*tf%freqs_fit(ix)*tf%times(it))
! enddo
! endif
! enddo
do ix=nbegin_freq,nbegin_freq+l_blk_freq-1
if(ix<=ss%n) then
write(stdout,*) 'Exp table:', ix
FLUSH(stdout)
do jw=-tf%n,tf%n
do iw=-tf%n,tf%n
do it=-tf%n,tf%n
exp_tab(iw,jw,ix-nbegin_freq+1)=exp_tab(iw,jw,ix-nbegin_freq+1)+tf%weights_freq(iw)*&
&tf%weights_freq(jw)*tf%weights_time(it)*&
&exp((0.d0,1.d0)*(tf%freqs(iw)+tf%freqs(jw)-tf%freqs_fit(ix))*tf%times(it))
! *exp_iw(it,iw)*exp_iw(it,jw)*exp_ix(it,ix-nbegin_freq+1)
enddo
enddo
enddo
endif
enddo
!deallocate(exp_iw,exp_ix)
endif
if(.not.l_direct) then
allocate(cql(options%n_set_ii))
do ii=1,options%n_set_ii
call initialize_compact_q_lanczos(cql(ii))
enddo
else
allocate(vtl(options%n_set_ii))
allocate(ttl(options%n_set_ii))
do ii=1,options%n_set_ii
call initialize_memory(vtl(ii))
call initialize_memory(ttl(ii))
enddo
endif
!!calculate (H-iw)^-1 and write on disk (parallel)
l_blk= (2*tf%n+1)/nproc
if(l_blk*nproc < (2*tf%n+1)) l_blk = l_blk+1
nbegin=mpime*l_blk-tf%n
nend=nbegin+l_blk-1
if(nend > tf%n) nend=tf%n
allocate(e_mat_tmp(lm%numt,lm%numt,options%n_set_self))
allocate(af(options%n_set_self))
do iw=nbegin,nbegin+l_blk-1,options%n_set_self
n_dim=0
do il=1,options%n_set_self
if((iw+il-1) <= tf%n) then
n_dim=n_dim+1
af(il) = dcmplx(offset,-tf%freqs(iw+il-1))
endif
enddo
if(n_dim > 0) then
write(stdout,*) 'Doing solve_lanczos_complex',iw
FLUSH(stdout)
call solve_lanczos_complex(n_dim,af,e_mat_tmp,lc)
write(stdout,*) 'Done'
FLUSH(stdout)
do il=1,n_dim
lm%iw=iw+il-1
lm%e_mat(:,:)=e_mat_tmp(:,:,il)
call write_lanczos_matrix(lm)
enddo
else
call solve_lanczos_fake_complex(lc)
endif
enddo
deallocate(e_mat_tmp,af)
!if reduced I/O read all polaw in memory
if(options%l_reduce_io) then
if(.not. tf%grid_freq == 5) then
n_max=tf%n/2
else
n_max=(tf%n-(1+2*tf%second_grid_i)*tf%second_grid_n)/2+&
&(1+2*tf%second_grid_i)*tf%second_grid_n
endif
l_blk_io= (n_max+1)/nproc
if(l_blk_io*nproc < (n_max+1)) l_blk_io = l_blk_io+1
nbegin_io=mpime*l_blk_io
nend_io=nbegin_io+l_blk_io-1
if(nend_io > n_max) nend_io=n_max
io_tot=nend_io-nbegin_io+1
do jw=nbegin_io,nend_io
call read_polaw(jw, ww,options%debug,options%l_verbose)
if(jw==nbegin_io) then
allocate(polaw_store(ww%numpw,ww%numpw,io_tot))
endif
polaw_store(:,:,jw-nbegin_io+1)=ww%pw(:,:)
enddo
!distribute ww%numpw
if(.not.ionode) ww%numpw=0
!call mp_sum(ww%numpw,world_comm)
call mp_bcast(ww%numpw, ionode_id,world_comm)
if(.not.ionode) ww%factor=(0.d0,0.d0)
!call mp_sum(ww%factor,world_comm)
call mp_bcast(ww%factor, ionode_id,world_comm)
endif
!!loop on i KS states
!!loop/module n_set on iw'
n_buf_ii=(options%i_max-options%i_min+1)/options%n_set_ii
if(n_buf_ii*options%n_set_ii < (options%i_max-options%i_min+1)) n_buf_ii=n_buf_ii+1
allocate(cterm(options%n_set_ii))
do ii_buf=1,n_buf_ii
ii_begin=options%i_min+(ii_buf-1)*options%n_set_ii
ii_end=min(ii_begin+options%n_set_ii-1, options%i_max)
! do ii=options%i_min,options%i_max
do ii=ii_begin,ii_end
write(stdout,*) 'KS STATE', ii
FLUSH(stdout)
!!read in Q^i
write(stdout,*) 'Call read_compact_q_lanczos'
FLUSH(stdout)
if(.not.l_direct) then
call read_compact_q_lanczos(cql(ii-ii_begin+1), ii)
else
call read_data_pw_vt_mat_lanczos(vtl(ii-ii_begin+1), ii, options%prefix, .false.,1)
call read_data_pw_tt_mat_lanczos(ttl(ii-ii_begin+1), ii, options%prefix, .false.,1)
endif
write(stdout,*) 'Done'
FLUSH(stdout)
enddo
!!loop/module n_set on iw'
n_buf=(2*tf%n+1)/options%n_set
if(n_buf*options%n_set < 2*tf%n+1) n_buf=n_buf+1
do iiw=1,n_buf
!!build C matrix
iw_min=-tf%n+options%n_set*(iiw-1)
iw_max=min(iw_min+options%n_set-1,tf%n)
write(stdout,*) 'Solve'
FLUSH(stdout)
iw_tot=iw_max-iw_min+1
l_blk_iw= (iw_tot)/nproc
if(l_blk_iw*nproc < (iw_tot)) l_blk_iw = l_blk_iw+1
nbegin_iw=mpime*l_blk_iw+iw_min
nend_iw=nbegin_iw+l_blk_iw-1
if(nend_iw > iw_max) nend_iw = iw_max
if(.not.l_direct) then
numpw=cql(1)%numpw
else
numpw=vtl(1)%numpw
endif
allocate(re_c_mat(numpw,numpw,options%n_set_ii,l_blk_iw))
allocate(im_c_mat(numpw,numpw,options%n_set_ii,l_blk_iw))
re_c_mat(:,:,:,:)=0.d0
im_c_mat(:,:,:,:)=0.d0
if(.not.l_conv) then
allocate(re_c_tmp(numpw,numpw))
allocate(im_c_tmp(numpw,numpw))
endif
!allocate(iw_proc(iw_tot))
!iw_proc(:)=-1
!do ip=0,nproc-1
! iw_proc(ip*l_blk_iw+1:min(ip*l_blk_iw+l_blk_iw,iw_tot))=ip
!enddo
do iw=nbegin_iw,nend_iw
!!build C matrices
!!!calculate G matrix on {t} basis
!!!solve tridiagonal problem for +i\omega
!af = dcmplx(offset,-tf%freqs(iw))
!call solve_lanczos_complex(af,lm%e_mat,lc)
write(stdout,*) 'Reading lanczos matrix',iw
FLUSH(stdout)
call read_lanczos_matrix(lm,iw)
write(stdout,*) 'Done'
FLUSH(stdout)
!!calculate C^i_lm matrix
re_e_mat(:,:)=dble(lm%e_mat(:,:))
im_e_mat(:,:)=dimag(lm%e_mat(:,:))
if(.not.l_direct) then
allocate(tmp_mat(cql(1)%numpw,cql(1)%numt))
write(stdout,*) 'Doing dgemms'
write(stdout,*) 're_e_mat', re_e_mat(1,1),re_e_mat(1,cql(1)%numt),&
&re_e_mat(cql(1)%numt,1),re_e_mat(cql(1)%numt,cql(1)%numt)
do ii=ii_begin,ii_end
ipos=ii-ii_begin+1
FLUSH(stdout)
call dgemm('N','N',cql(ipos)%numpw,cql(ipos)%numt,cql(ipos)%numt,&
&1.d0,cql(ipos)%qlm,cql(ipos)%numpw,re_e_mat,lc%numt,0.d0,tmp_mat,cql(ipos)%numpw)
call dgemm('N','T',cql(ipos)%numpw,cql(ipos)%numpw,cql(ipos)%numt,&
&1.d0,tmp_mat,cql(ipos)%numpw,cql(ipos)%qlm,cql(ipos)%numpw,0.d0,re_c_mat(1,1,ipos,iw-nbegin_iw+1),cql(ipos)%numpw)
call dgemm('N','N',cql(ipos)%numpw,cql(ipos)%numt,cql(ipos)%numt,&
&1.d0,cql(ipos)%qlm,cql(ipos)%numpw,im_e_mat,lc%numt,0.d0,tmp_mat,cql(ipos)%numpw)
call dgemm('N','T',cql(ipos)%numpw,cql(ipos)%numpw,cql(ipos)%numt,&
&1.d0,tmp_mat,cql(ipos)%numpw,cql(ipos)%qlm,cql(ipos)%numpw,0.d0,im_c_mat(1,1,ipos,iw-nbegin_iw+1),cql(ipos)%numpw)
enddo
write(stdout,*) 'Done'
FLUSH(stdout)
deallocate(tmp_mat)
else
write(stdout,*) 'Doing dgemms',ttl(1)%numl,ttl(1)%numt,vtl(1)%numl,vtl(1)%numpw
FLUSH(stdout)
allocate(tmp_mat(ttl(1)%numl,ttl(1)%numt))
allocate(tmp_mat1(ttl(1)%numl,ttl(1)%numl))
allocate(tmp_mat2(numpw,ttl(1)%numl))
do ii=ii_begin,ii_end
ipos=ii-ii_begin+1
write(stdout,*) 'ATTENZIONE1', ttl(ipos)%numl,ttl(ipos)%numt,vtl(ipos)%numl,vtl(ipos)%numpw
FLUSH(stdout)
call dgemm('T','N',ttl(ipos)%numl,ttl(ipos)%numt,ttl(ipos)%numt,1.d0,ttl(ipos)%tt_mat,ttl(ipos)%numt,&
&re_e_mat,lc%numt,0.d0,tmp_mat,ttl(ipos)%numl)
write(stdout,*) 'ATTENZIONE2', ii
FLUSH(stdout)
call dgemm('N','N',ttl(ipos)%numl,ttl(ipos)%numl,ttl(ipos)%numt,1.d0,tmp_mat,ttl(ipos)%numl,&
ttl(ipos)%tt_mat,ttl(ipos)%numt,0.d0,tmp_mat1,ttl(ipos)%numl)
write(stdout,*) 'ATTENZIONE3', ii, vtl(ipos)%numpw,vtl(ipos)%numl,vtl(ipos)%numl
write(stdout,*) 'ATTENZIONE3', size(vtl(ipos)%vt_mat(:,1)), size(vtl(ipos)%vt_mat(1,:)), ttl(1)%numl, numpw
FLUSH(stdout)
call dgemm('N','N',vtl(ipos)%numpw,vtl(ipos)%numl,vtl(ipos)%numl,1.d0,vtl(ipos)%vt_mat,vtl(ipos)%numpw,&
&tmp_mat1,vtl(ipos)%numl,0.d0,tmp_mat2,numpw)
write(stdout,*) 'ATTENZIONE4', ii
FLUSH(stdout)
call dgemm('N','T',vtl(ipos)%numpw,vtl(ipos)%numpw,vtl(ipos)%numl,1.d0,tmp_mat2,numpw,vtl(ipos)%vt_mat,&
vtl(ipos)%numpw,0.d0,re_c_mat(:,:,ipos,iw-nbegin_iw+1),numpw)
write(stdout,*) 'ATTENZIONE5', ii
FLUSH(stdout)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1
call dgemm('T','N',ttl(ipos)%numl,ttl(ipos)%numt,ttl(ipos)%numt,1.d0,ttl(ipos)%tt_mat,ttl(ipos)%numt,&
&im_e_mat,lc%numt,0.d0,tmp_mat,ttl(ipos)%numl)
write(stdout,*) 'ATTENZIONE6', ii
FLUSH(stdout)
call dgemm('N','N',ttl(ipos)%numl,ttl(ipos)%numl,ttl(ipos)%numt,1.d0,tmp_mat,ttl(ipos)%numl,&
ttl(ipos)%tt_mat,ttl(ipos)%numt,0.d0,tmp_mat1,ttl(ipos)%numl)
write(stdout,*) 'ATTENZIONE7', ii
FLUSH(stdout)
call dgemm('N','N',vtl(ipos)%numpw,vtl(ipos)%numl,vtl(ipos)%numl,1.d0,vtl(ipos)%vt_mat,vtl(ipos)%numpw,&
&tmp_mat1,vtl(ipos)%numl,0.d0,tmp_mat2,numpw)
write(stdout,*) 'ATTENZIONE8', ii
FLUSH(stdout)
call dgemm('N','T',vtl(ipos)%numpw,vtl(ipos)%numpw,vtl(ipos)%numl,1.d0,tmp_mat2,numpw,vtl(ipos)%vt_mat,&
vtl(ipos)%numpw,0.d0,im_c_mat(:,:,ipos,iw-nbegin_iw+1),numpw)
write(stdout,*) 'ATTENZIONE9', ii
FLUSH(stdout)
enddo
write(stdout,*) 'Done'
FLUSH(stdout)
deallocate(tmp_mat,tmp_mat1,tmp_mat2)
endif
enddo
write(stdout,*) 'Done'
FLUSH(stdout)
!!!loop on w' parallelized
if(.not.l_half) then
l_blk= (tf%n+1)/nproc
if(l_blk*nproc < (tf%n+1)) l_blk = l_blk+1
nbegin=mpime*l_blk
nend=nbegin+l_blk-1
if(nend > tf%n) nend=tf%n
else
if(.not. tf%grid_freq == 5) then
l_blk= (tf%n/2+1)/nproc
if(l_blk*nproc < (tf%n/2+1)) l_blk = l_blk+1
nbegin=mpime*l_blk
nend=nbegin+l_blk-1
if(nend > tf%n/2) nend=tf%n/2
else
n_max=tf%n/2+1+(1+2*tf%second_grid_i)*tf%second_grid_n
l_blk= (n_max)/nproc
if(l_blk*nproc < (n_max)) l_blk = l_blk+1
nbegin=mpime*l_blk
nend=nbegin+l_blk-1
if(nend > n_max) nend=n_max
endif
endif
if(.not.l_conv) gw_tab(:,:)=(0.d0,0.d0)
if(.not. tf%grid_freq == 5) then
n_max=tf%n/2
else
n_max=(tf%n-(1+2*tf%second_grid_i)*tf%second_grid_n)/2+&
&(1+2*tf%second_grid_i)*tf%second_grid_n
endif
do jw=0,n_max
! do jw=nbegin,nbegin+l_blk-1
if(jw <= tf%n) then
write(stdout,*) 'FREQ', jw
FLUSH(stdout)
if(.not.options%l_reduce_io) then
call read_polaw(jw, ww,options%debug,options%l_verbose)
else
call free_memory_polaw(ww)
allocate(ww%pw(ww%numpw,ww%numpw))
if(jw>=nbegin_io .and. jw <= nend_io) then
!ww%pw(:,:)=polaw_store(:,:,jw-nbegin_io+1)
call dcopy( SIZE(ww%pw), polaw_store(1,1,jw-nbegin_io+1), 1, ww%pw(1,1), 1 )
else
!ww%pw(:,:)=0.d0
do j = 1, SIZE( ww%pw, 2 )
do i = 1, SIZE( ww%pw, 1 )
ww%pw(i,j)=0.d0
end do
end do
endif
iproc=jw/l_blk_io
do kw=1,ww%numpw
!call mp_sum(ww%pw(:,kw),world_comm)
call mp_bcast(ww%pw(:,kw),iproc,world_comm)
enddo
!call mp_bcast(ww%pw(:,:),iproc,world_comm)
endif
if(l_conv) then
!loop on iw
if(tf%grid_freq == 3) then
l_iw_min=.false.
do iw=nbegin_iw,nend_iw
ix=iw+jw
ixx=iw-jw
if((ix >= -tf%n .and. ix <= tf%n).or.(ixx >= -tf%n .and. ixx <= tf%n)) then
if(.not.l_iw_min) then
l_iw_min=.true.
iw_min=iw
endif
iw_max=iw
endif
enddo
if(l_iw_min) then
allocate(mat_tmp(options%n_set_ii,l_blk_iw))
!the weights are all the same for this kind of grid
call dgemv('T',ww%numpw*ww%numpw,options%n_set_ii*(iw_max-iw_min+1),&
&tf%weights_freq(1),re_c_mat(1,1,1,iw_min-nbegin_iw+1),&
&ww%numpw*ww%numpw,ww%pw,1,0.d0,mat_tmp,1)
do iw=iw_min,iw_max
ix=iw+jw
ixx=iw-jw
if(ix >= -tf%n .and. ix <= tf%n) then
ss%diag(ii_begin:ii_end,ix+tf%n+1,1)=ss%diag(ii_begin:ii_end,ix+tf%n+1,1)+&
&dcmplx(mat_tmp(:,iw-iw_min+1),0.d0)*ww%factor
endif
if(jw/=0) then
if(ixx >= -tf%n .and. ixx <= tf%n) then
ss%diag(ii_begin:ii_end,ixx+tf%n+1,1)=ss%diag(ii_begin:ii_end,ixx+tf%n+1,1)+&
&dcmplx(mat_tmp(:,iw-iw_min+1),0.d0)*ww%factor
endif
endif
enddo
call dgemv('T',ww%numpw*ww%numpw,options%n_set_ii*(iw_max-iw_min+1),tf%weights_freq(1),&
&im_c_mat(1,1,1,iw_min-nbegin_iw+1),ww%numpw*ww%numpw,ww%pw,1,0.d0,mat_tmp,1)
do iw=iw_min,iw_max
ix=iw+jw
ixx=iw-jw
if(ix >= -tf%n .and. ix <= tf%n) then
ss%diag(ii_begin:ii_end,ix+tf%n+1,1)=ss%diag(ii_begin:ii_end,ix+tf%n+1,1)+&
&dcmplx(0.d0,mat_tmp(:,iw-iw_min+1))*ww%factor
endif
if(jw/=0) then
if(ixx >= -tf%n .and. ixx <= tf%n) then
ss%diag(ii_begin:ii_end,ixx+tf%n+1,1)=ss%diag(ii_begin:ii_end,ixx+tf%n+1,1)+&
&dcmplx(0.d0,mat_tmp(:,iw-iw_min+1))*ww%factor
endif
endif
enddo
deallocate(mat_tmp)
endif
else if (tf%grid_freq == 5) then
l_iw_min=.false.
do iw=nbegin_iw,nend_iw
iw1=tf%whois_freq(iw)
jw1=tf%whois_freq(jw)
ix=iw1+jw1
jw1=tf%whois_freq(-jw)
ixx=iw1+jw1
if((ix >= -ss%n .and. ix <= ss%n).or.(ixx >= -ss%n .and. ixx <= ss%n)) then
if(.not.l_iw_min) then
l_iw_min=.true.
iw_min=iw
endif
iw_max=iw
endif
enddo
if(l_iw_min) then
allocate(mat_tmp(options%n_set_ii,l_blk_iw))
!the weights are all the same for this kind of grid
call dgemv('T',ww%numpw*ww%numpw,options%n_set_ii*(iw_max-iw_min+1),&
&1.d0,re_c_mat(1,1,1,iw_min-nbegin_iw+1),&
&ww%numpw*ww%numpw,ww%pw,1,0.d0,mat_tmp,1)
do iw=nbegin_iw,nend_iw
iw1=tf%whois_freq(iw)
jw1=tf%whois_freq(jw)
ix=iw1+jw1
if(ix >= -ss%n .and. ix <= ss%n) then
ss%diag(ii_begin:ii_end,ix+ss%n+1,1)=ss%diag(ii_begin:ii_end,ix+ss%n+1,1)+&
&dcmplx(mat_tmp(:,iw-iw_min+1),0.d0)*tf%weights_freq(jw)*ww%factor*tf%relative_weight(iw)*tf%relative_weight(jw)
endif
jw1=tf%whois_freq(-jw)
ix=iw1+jw1
if(ix >= -ss%n .and. ix <= ss%n) then
ss%diag(ii_begin:ii_end,ix+ss%n+1,1)=ss%diag(ii_begin:ii_end,ix+ss%n+1,1)+&
&dcmplx(mat_tmp(:,iw-iw_min+1),0.d0)*tf%weights_freq(-jw)*ww%factor*tf%relative_weight(iw)*tf%relative_weight(-jw)
endif
enddo
call dgemv('T',ww%numpw*ww%numpw,options%n_set_ii*(iw_max-iw_min+1),&
&1.d0,im_c_mat(1,1,1,iw_min-nbegin_iw+1),ww%numpw*ww%numpw,ww%pw,1,0.d0,mat_tmp,1)
do iw=nbegin_iw,nend_iw
iw1=tf%whois_freq(iw)
jw1=tf%whois_freq(jw)
ix=iw1+jw1
if(ix >= -ss%n .and. ix <= ss%n) then
ss%diag(ii_begin:ii_end,ix+ss%n+1,1)=ss%diag(ii_begin:ii_end,ix+ss%n+1,1)+&
&dcmplx(0.d0,mat_tmp(:,iw-iw_min+1))*tf%weights_freq(jw)*ww%factor*tf%relative_weight(iw)*tf%relative_weight(jw)
endif
jw1=tf%whois_freq(-jw)
ix=iw1+jw1
if(ix >= -ss%n .and. ix <= ss%n) then
ss%diag(ii_begin:ii_end,ix+ss%n+1,1)=ss%diag(ii_begin:ii_end,ix+ss%n+1,1)+&
&dcmplx(0.d0,mat_tmp(:,iw-iw_min+1))*tf%weights_freq(-jw)*ww%factor*tf%relative_weight(iw)*tf%relative_weight(-jw)
endif
enddo
deallocate(mat_tmp)
endif
else
endif
else
!here does a explicit convolution
do iw=nbegin_iw,nend_iw
! do iw=iw_min,iw_max
!distribute re_c_mat and im_c_mat
!if(iw_proc(iw-iw_min+1)==mpime) then
! re_c_tmp(:,:)=re_c_mat(:,:,iw-nbegin_iw+1)
! im_c_tmp(:,:)=im_c_mat(:,:,iw-nbegin_iw+1)
!endif
!call
!mp_bcast(re_c_tmp(:,:),iw_proc(iw-iw_min+1),world_comm)
!call
!mp_bcast(im_c_tmp(:,:),iw_proc(iw-iw_min+1),world_comm)
re_c_tmp(:,:)=re_c_mat(:,:,ipos,iw-nbegin_iw+1)
im_c_tmp(:,:)=im_c_mat(:,:,ipos,iw-nbegin_iw+1)!ATTENZIONE TUTTO DA FARE
gw_tab(iw,jw)=gw_tab(iw,jw)+&
&ddot(ww%numpw*ww%numpw,re_c_tmp(:,1),1,ww%pw(:,1),1)*ww%factor
gw_tab(iw,jw)=gw_tab(iw,jw)+(0.d0,1.d0)*&
&ddot(ww%numpw*ww%numpw,im_c_tmp(:,1),1,ww%pw,1)*ww%factor
if(jw/=0)then
gw_tab(iw,-jw)=gw_tab(iw,-jw)+&
&ddot(ww%numpw*ww%numpw,re_c_tmp(:,1),1,ww%pw(:,1),1)*ww%factor
gw_tab(iw,-jw)=gw_tab(iw,-jw)+(0.d0,1.d0)*&
&ddot(ww%numpw*ww%numpw,im_c_tmp(:,1),1,ww%pw,1)*ww%factor
endif
enddo
endif
else
!here global io and parallel routines
endif
enddo
if(.not.l_conv) then
call mp_sum(gw_tab(:,:),world_comm)
do ix=nbegin_freq,nbegin_freq+l_blk_freq-1
if(ix<=ss%n) then
do jw=-tf%n,tf%n
do iw=-tf%n,tf%n
ss%diag(ii,ix+ss%n+1,1)=ss%diag(ii,ix+ss%n+1,1)+gw_tab(iw,jw)*exp_tab(iw,jw,ix-nbegin_freq+1)
enddo
enddo
endif
enddo
endif
deallocate(re_c_mat,im_c_mat)
if(.not.l_conv) deallocate(re_c_tmp,im_c_tmp)
!deallocate(iw_proc)
enddo
if(.not.l_direct) then
do ii=1,options%n_set_ii
call free_memory_compact_q_lanczos(cql(ii))
enddo
else
do ii=1,options%n_set_ii
call free_memory(vtl(ii))
call free_memory(ttl(ii))
enddo
endif
call free_memory_polaw(ww)
enddo
deallocate(cterm)
if(options%l_reduce_io) then
if(nend_io >= nbegin_io) deallocate(polaw_store)
endif
if(.not.l_direct) then
deallocate(cql)
else
deallocate(vtl,ttl)
endif
!set global factor
call mp_sum(ss%diag(:,:,:),world_comm)
if(l_conv) then
ss%diag(:,:,:)=ss%diag(:,:,:)*(1.d0/tpi)
else
ss%diag(:,:,:)=ss%diag(:,:,:)*(1.d0/tpi**2.d0)
endif
ss%diag_freq_fit(:,:,:)=ss%diag(:,:,:)
call free_memory(lc)
deallocate(re_e_mat, im_e_mat)
call free_memory_lanczos_matrix(lm)
if(.not.l_conv) then
deallocate(gw_tab,exp_tab)
endif
deallocate(diag_tmp)
return
end subroutine do_self_lanczos
subroutine solve_lanczos_fake_complex(lc)
!this subroutine is a parallel fake routine for the solve lanczos routine
USE kinds, ONLY : DP
USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory
USE io_global, ONLY : stdout
USE mp, ONLY : mp_sum,mp_bcast
USE mp_world, ONLY : nproc,mpime,world_comm
implicit none
TYPE(lanczos_chain) :: lc!lanczos chain descriptor
INTEGER :: l_blk,nbegin,nend, iproc
REAL(kind=DP), ALLOCATABLE :: o_mat(:,:)
INTEGER :: io
allocate(o_mat(lc%numt,lc%num_steps))
l_blk= (lc%numt)/nproc
if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1
nbegin=mpime*l_blk+1
nend=nbegin+l_blk-1
!loop on io
do io=1,lc%numt
!recover matrix from processor
o_mat(:,:)=0.d0
if(io >= nbegin .and. io <= nend) then
o_mat(:,:)=lc%o_mat(:,:,io-nbegin+1)
endif
! call mp_sum(o_mat(:,:),world_comm)!this should be much faster than mp_bcast
iproc=(io-1)/l_blk
call mp_bcast(o_mat(:,:), iproc,world_comm)
enddo
deallocate(o_mat)
end subroutine solve_lanczos_fake_complex