! ! Copyright (C) 2001-2013 Quantum ESPRESSO group ! This file is distributed under the terms of the ! GNU General Public License. See the file `License' ! in the root directory of the present distribution, ! or http://www.gnu.org/copyleft/gpl.txt . ! ! !this subroutines performs the lanczos style calculation of the polarization subroutine calculate_compact_pola_lanczos(options,ispin) !this subroutine calculate the compact products for the lanczos calculation !of the polarization USE kinds, ONLY : DP USE input_gw, ONLY : input_options USE basic_structures, ONLY : wannier_u,vt_mat_lanczos,tt_mat_lanczos,initialize_memory,free_memory USE lanczos, ONLY : compact_q_lanczos,initialize_compact_q_lanczos,& &free_memory_compact_q_lanczos,do_compact_q_lanczos,write_compact_q_lanczos USE mp_world, ONLY : nproc,mpime USE io_global, ONLY : stdout implicit none TYPE(input_options), INTENT(in) :: options INTEGER, INTENT(in) :: ispin!spin channel TYPE(wannier_u) :: uu!U matrix TYPE(vt_mat_lanczos) :: vtl TYPE(tt_mat_lanczos) :: ttl TYPE(compact_q_lanczos) :: cql INTEGER :: l_blk,nbegin,nend,iv,iu call initialize_memory(vtl) call initialize_memory(ttl) call initialize_compact_q_lanczos(cql) write(stdout,*) 'Routine calculate_compact_pola_lanczos' FLUSH(stdout) ! !read U matrix call read_data_pw_u(uu,options%prefix) !KS valence state index v distributed on processors l_blk= uu%nums_occ(ispin)/nproc if(l_blk*nproc < uu%nums_occ(ispin)) l_blk = l_blk+1 nbegin=mpime*l_blk+1 nend=nbegin+l_blk-1 if(nend > uu%nums_occ(ispin)) nend = uu%nums_occ(ispin) do iv=nbegin,nbegin+l_blk-1 if(iv <= uu%nums_occ(ispin)) then !loop v' on wannier valence state do iu=1,uu%nums_occ(ispin) !read in V and T call read_data_pw_vt_mat_lanczos(vtl, iu, options%prefix, .true.,ispin) call read_data_pw_tt_mat_lanczos(ttl, iu, options%prefix, .true.,ispin) !calculate contribution to Q if(iu==1) then allocate( cql%qlm(vtl%numpw,ttl%numt)) cql%qlm(:,:)=0.d0 endif !sum up with factor Uvvi call do_compact_q_lanczos(vtl,ttl,cql,dble(uu%umat(iv,iu,ispin))) call free_memory(vtl) call free_memory(ttl) enddo !write Q^v on disk cql%ii=iv call write_compact_q_lanczos(cql) deallocate(cql%qlm) else !put here global routines involving MPI do iu=1,uu%nums_occ(ispin) call read_data_pw_vt_mat_lanczos(vtl, iu, options%prefix, .true.,ispin) call read_data_pw_tt_mat_lanczos(ttl, iu, options%prefix, .true.,ispin) call free_memory(vtl) call free_memory(ttl) enddo endif enddo call free_memory(vtl) call free_memory(ttl) call free_memory_compact_q_lanczos(cql) call free_memory(uu) return end subroutine calculate_compact_pola_lanczos subroutine solve_lanczos(nbuf, alpha,e_mat,lc, l_verbose) !this subroutine sums to the matrix E_{no}= !the matrix lc%o_mat is distributed among the processors USE kinds, ONLY : DP USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory USE io_global, ONLY : stdout USE mp, ONLY : mp_sum, mp_bcast USE mp_world, ONLY : nproc,mpime,world_comm implicit none INTEGER :: nbuf!number of matrices to treat COMPLEX(kind=DP) :: alpha(nbuf)!constant for Ev+iw TYPE(lanczos_chain) :: lc!lanczos chain descriptor REAL(kind=DP) :: e_mat(lc%numt,lc%numt,nbuf)!matrix to be calculated LOGICAL, INTENT(in) :: l_verbose!if true a lot of output INTEGER :: io,info COMPLEX(kind=DP), ALLOCATABLE :: dl(:),du(:),d(:),t(:) REAL(kind=DP), ALLOCATABLE :: tr(:,:) INTEGER :: l_blk,nbegin,nend REAL(kind=DP), ALLOCATABLE :: o_mat(:,:,:) INTEGER :: iv, iproc,nsize allocate(dl(lc%num_steps-1),du(lc%num_steps-1),d(lc%num_steps),t(lc%num_steps)) e_mat(:,:,:)=0.d0 l_blk= (lc%numt)/nproc if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1 allocate(o_mat(lc%numt,lc%num_steps,l_blk)) allocate(tr(lc%num_steps,l_blk)) !loop on procs do iproc=0,nproc-1 l_blk= (lc%numt)/nproc if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1 nbegin=iproc*l_blk+1 nend=min(nbegin+l_blk-1,lc%numt) nsize=nend-nbegin+1 if(nbegin<=lc%numt) then if(iproc==mpime) then o_mat(:,:,:)=lc%o_mat(:,:,1:nsize) endif if(l_verbose) write(stdout,*) 'proc:', iproc, lc%numt,lc%num_steps,l_blk,nsize if(l_verbose)FLUSH(stdout) call mp_bcast(o_mat, iproc, world_comm) if(l_verbose) write(stdout,*) 'mp_bcast done' if(l_verbose)FLUSH(stdout) do iv=1,nbuf do io=nbegin,nend !!set up vectors for lapack routine dl(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0) du(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0) d(1:lc%num_steps)=cmplx(lc%d(1:lc%num_steps,io),0.d0)+alpha(iv) t(:)=(0.d0,0.d0) t(1)=(1.d0,0.d0) !!call lapack call zgtsv(lc%num_steps,1,dl,d,du,t,lc%num_steps,info) if(info /= 0) then write(stdout,*) 'ZGTSV info:', info FLUSH(stdout) stop endif !!calculate term tr(:,io-nbegin+1)=dble(t(:)) enddo if(l_verbose) write(stdout,*) 'zgtsv done' if(l_verbose) FLUSH(stdout) do io=nbegin,nend call dgemv( 'N', lc%numt,lc%num_steps,1.d0, o_mat(1,1,io-nbegin+1), lc%numt,tr(1,io-nbegin+1),1,1.d0,e_mat(1,io,iv),1) ! call dgemm('N','N',lc%numt,1,lc%num_steps,1.d0,o_mat(1,1,io-nbegin+1),lc%numt,tr(1,io-nbegin+1),lc%num_steps,1.d0,e_mat(1,io,iv),lc%numt) enddo if(l_verbose) write(stdout,*) 'dgemv done' if(l_verbose) FLUSH(stdout) enddo!on iv end if end do deallocate(tr) deallocate(o_mat) return end subroutine solve_lanczos subroutine solve_lanczos_fake(lc,l_verbose) !this subroutine is a parallel fake routine for the solve lanczos routine USE kinds, ONLY : DP USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory USE io_global, ONLY : stdout USE mp, ONLY : mp_sum,mp_bcast USE mp_world, ONLY : nproc,mpime,world_comm implicit none TYPE(lanczos_chain) :: lc!lanczos chain descriptor LOGICAL, INTENT(in) :: l_verbose INTEGER :: l_blk,nbegin,nend, iproc REAL(kind=DP), ALLOCATABLE :: o_mat(:,:,:) INTEGER :: io,nsize l_blk= (lc%numt)/nproc if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1 allocate(o_mat(lc%numt,lc%num_steps,l_blk)) do iproc=0,nproc-1 l_blk= (lc%numt)/nproc if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1 nbegin=iproc*l_blk+1 nend=min(nbegin+l_blk-1,lc%numt) nsize=nend-nbegin+1 if(nbegin<=lc%numt) then if(iproc==mpime) then o_mat(:,:,:)=lc%o_mat(:,:,1:nsize) endif if(l_verbose) write(stdout,*) 'proc:', iproc, lc%numt,lc%num_steps,l_blk,nsize if(l_verbose) FLUSH(stdout) call mp_bcast(o_mat, iproc,world_comm) endif enddo deallocate(o_mat) end subroutine solve_lanczos_fake subroutine do_polarization_lanczos(tf,options,ispin) !this subroutine create polarization descriptor in imaginary frequency using the lanczos scheme USE kinds, ONLY : DP USE input_gw, ONLY : input_options USE lanczos, ONLY : compact_q_lanczos,initialize_compact_q_lanczos,free_memory_compact_q_lanczos,& &read_compact_q_lanczos USE mp_world, ONLY : world_comm, mpime, nproc USE basic_structures, ONLY : wannier_u,lanczos_chain, initialize_memory,free_memory,& &vt_mat_lanczos,tt_mat_lanczos,partial_occ USE io_global, ONLY : stdout,ionode,ionode_id USE times_gw, ONLY : times_freqs USE polarization, ONLY : polaw,initialize_polaw,free_memory_polaw,write_polaw,read_polaw USE mp, ONLY : mp_bcast USE io_files, ONLY : prefix, tmp_dir implicit none INTEGER, EXTERNAL :: find_free_unit TYPE(times_freqs) , INTENT(in) :: tf!for time grid TYPE(input_options), INTENT(in) :: options! for imaginary time range and number of samples INTEGER, INTENT(in) :: ispin!spin channel TYPE(wannier_u) :: uu TYPE(lanczos_chain) :: lc TYPE(partial_occ) :: po INTEGER :: l_blk,nbegin,nend,iv,iw, it,ii,jj,il TYPE(polaw) :: pw ! REAL(kind=DP), ALLOCATABLE :: e_mat(:,:,:) COMPLEX(kind=DP), ALLOCATABLE :: af(:) TYPE(compact_q_lanczos) :: cql REAL(kind=DP), ALLOCATABLE :: tmp_mat(:,:) REAL(kind=DP), ALLOCATABLE :: tmp_mat1(:,:),tmp_mat2(:,:),tmp_mat3(:,:) REAL(kind=DP) :: offset INTEGER :: n_bufv!number of states which are bufferized INTEGER :: n_dim INTEGER :: io_tot,l_blk_io, nbegin_io,nend_io, iproc REAL(kind=DP), ALLOCATABLE :: cql_save(:,:,:) REAL(kind=DP), ALLOCATABLE :: vtl_save(:,:,:) REAL(kind=DP), ALLOCATABLE :: ttl_save(:,:,:) COMPLEX(kind=DP) :: afc TYPE(vt_mat_lanczos) :: vtl TYPE(tt_mat_lanczos) :: ttl INTEGER :: numpw,numl,numt!to be passed to subroutine LOGICAL :: l_reduce_memory=.true. !if true reduces the use of memory, only useful for nprocs==nfreqs LOGICAL :: exst INTEGER :: iw_nbegin, iv_begin, iunrestart,off_nbegin LOGICAL :: l_do_restart REAL(kind=DP), ALLOCATABLE :: occ(:) #if defined(_OPENMP) INTEGER :: omp_get_num_threads, omp_get_max_threads EXTERNAL omp_set_num_threads, omp_get_num_threads, omp_get_max_threads #endif write(stdout,*) 'Routine: do_polarization_lanczos' FLUSH(stdout) call initialize_memory(vtl) call initialize_memory(ttl) call initialize_memory(po) n_bufv=options%n_set_pola ! if(l_reduce_memory) n_bufv=1 OCIO ATTENZIONE !read uu for energies and lanczos chain call read_data_pw_u(uu,options%prefix) if(uu%nums_occ(ispin)==0) then call free_memory(uu) return endif !call free_memory(uu) !i do not need the orthonormalization matrix !if required read partial occupancis stuff if(options%l_frac_occ) then !NOT_TO_BE_INCLUDED_START call read_data_pw_partial_occ(po,options%prefix,ispin) !NOT_TO_BE_INCLUDED_END endif call initialize_memory(lc) call read_data_pw_lanczos_chain(lc, 1, options%prefix, .true.,ispin) write(stdout,*) 'Lanczos dimensions', lc%numt,lc%num_steps do ii=1,lc%numt ! do jj=1,lc%num_steps ! write(stdout,*)' Lanczos elemets:', ii, jj, lc%d(jj,ii), lc%f(jj,ii) ! enddo write(stdout,*)' Lanczos elemets:', ii, lc%d(1,ii), lc%f(1,ii) enddo ! allocate(e_mat(lc%numt,lc%numt,n_bufv)) allocate(af(n_bufv),occ(n_bufv)) occ(:)=1.d0 call initialize_compact_q_lanczos(cql) if(uu%nums > uu%nums_occ(ispin)) then offset=-(uu%ene(uu%nums_occ(ispin)+1,ispin)+uu%ene(uu%nums_occ(ispin),ispin))/2.d0 else offset=-uu%ene(uu%nums_occ(ispin),ispin) endif ! l_blk= (2*tf%n+1)/nproc ! if(l_blk*nproc < (2*tf%n+1)) l_blk = l_blk+1 ! nbegin=mpime*l_blk-tf%n ! nend=nbegin+l_blk-1 l_blk= (tf%n+1)/nproc if(l_blk*nproc < (tf%n+1)) l_blk = l_blk+1 nbegin=mpime*l_blk nend=nbegin+l_blk-1 !if reduce io and l_direct put compact_q_lanczos in memory if(options%l_t_wannier) then l_blk_io= (uu%nums_occ(ispin))/nproc if(l_blk_io*nproc < (uu%nums_occ(ispin))) l_blk_io = l_blk_io+1 nbegin_io=mpime*l_blk_io+1 nend_io=nbegin_io+l_blk_io-1 if(nend_io > uu%nums_occ(ispin)) nend_io=uu%nums_occ(ispin) io_tot=nend_io-nbegin_io+1 if(.not.l_reduce_memory) then do iv=1,uu%nums_occ(ispin) call read_compact_q_lanczos(cql, iv) if(iv==1) allocate(cql_save(cql%numpw,cql%numt,l_blk_io)) if(iv >= nbegin_io .and. iv <= nend_io) then cql_save(:,:,iv-nbegin_io+1)=cql%qlm(:,:) endif enddo else call read_compact_q_lanczos(cql, 1)!just for obtaining numpw,numt...poor man solution allocate(cql_save(1,1,1)) endif else !put all matrices vtl and ttl in memory, distributed according to valence state l_blk_io= (uu%nums_occ(ispin))/nproc if(l_blk_io*nproc < (uu%nums_occ(ispin))) l_blk_io = l_blk_io+1 nbegin_io=mpime*l_blk_io+1 nend_io=nbegin_io+l_blk_io-1 if(nend_io > uu%nums_occ(ispin)) nend_io=uu%nums_occ(ispin) io_tot=nend_io-nbegin_io+1 do iv=1,uu%nums_occ(ispin) call read_data_pw_vt_mat_lanczos(vtl, iv, options%prefix, .true.,ispin) call read_data_pw_tt_mat_lanczos(ttl, iv, options%prefix, .true.,ispin) if(iv==1) then allocate(vtl_save(vtl%numpw,vtl%numl,l_blk_io)) allocate(ttl_save(ttl%numt,ttl%numl,l_blk_io)) endif if(iv >= nbegin_io .and. iv <= nend_io) then vtl_save(:,:,iv-nbegin_io+1)= vtl%vt_mat(:,:) ttl_save(:,:,iv-nbegin_io+1)= ttl%tt_mat(:,:) endif call free_memory(vtl) call free_memory(ttl) enddo endif !sutup parameters for solve_lanczos if(options%l_t_wannier) then numpw=cql%numpw numt=cql%numt pw%numpw=cql%numpw else numpw=vtl%numpw numl=vtl%numl numt=ttl%numt pw%numpw=vtl%numpw endif !loop on imaginary frequency i\omega !check for restart if(ionode) then inquire(file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', exist = exst) if(.not. exst .or. ispin==2) then off_nbegin=0 iv_begin=1 l_do_restart=.false. else iunrestart = find_free_unit() open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='old') read(iunrestart,*) off_nbegin read(iunrestart,*) iv_begin close(iunrestart) if(off_nbegin<0) off_nbegin=0 if(iv_begin<1 ) then iv_begin=1 l_do_restart=.false. else iv_begin=iv_begin+n_bufv l_do_restart=.true. endif endif endif call mp_bcast(off_nbegin,ionode_id,world_comm) iw_nbegin=nbegin+off_nbegin call mp_bcast(iv_begin, ionode_id,world_comm) call mp_bcast(l_do_restart, ionode_id,world_comm) do iw=iw_nbegin,nbegin+l_blk-1 if(iw <= tf%n) then write(stdout,*) 'do_polarization_lanczos1', iw, l_do_restart FLUSH(stdout) !!set up polaw descriptor !if required read polaw from disk if(l_do_restart.or.ispin==2) then call initialize_polaw(pw) call read_polaw(iw,pw,.false.,options%l_verbose) l_do_restart=.false. else call initialize_polaw(pw) pw%label=iw pw%ontime=.false. pw%time=tf%freqs(iw) pw%factor=(1.d0,0.d0) endif !!loop on valence states v do iv=iv_begin,uu%nums_occ(ispin),n_bufv write(stdout,*) 'do_polarization_lanczos iv', iv FLUSH(stdout) !!!solve tridiagonal problem for -E_v+i\omega n_dim=0 do il=1,n_bufv if((iv+il-1) <= uu%nums_occ(ispin)) then af(il) = -uu%ene(iv+il-1,ispin)+dcmplx(0.d0,tf%freqs(iw)) if(.not.options%l_frac_occ) then occ(il)=1.d0 else !NOT_TO_BE_INCLUDED_START occ(il)=po%f_occ(iv+il-1) !NOT_TO_BE_INCLUDED_END endif n_dim=n_dim+1 endif enddo if(options%l_verbose) write(stdout,*) 'Call solve_lanczos' if(options%l_verbose) FLUSH(stdout) if(iv==1.and.ispin/=2) then ! if(options%l_t_wannier) then ! pw%numpw=cql%numpw ! else ! pw%numpw=vtl%numpw ! endif allocate(pw%pw(pw%numpw,pw%numpw)) pw%pw(:,:)=0.d0 endif !NOT_TO_BE_INCLUDED_START if(iv==1 .and. options%l_frac_occ) then afc=dcmplx(0.d0,tf%freqs(iw)) call add_partial_pola(pw,po,uu,afc,options%nspin,ispin) endif !NOT_TO_BE_INCLUDED_END call solve_lanczos_2(numpw,numt,numl,n_dim,n_bufv,af,lc, iv, uu%nums_occ(ispin),l_blk_io,& &cql_save,pw,n_dim,options%l_t_wannier,vtl_save,ttl_save,l_reduce_memory,uu%nspin,occ,options%l_verbose) if(options%l_verbose) write(stdout,*) 'Done' if(options%l_verbose) FLUSH(stdout) #if defined(_OPENMP) ! write(stdout,*) 'Call dgemms, ntids=', omp_get_max_threads() #else ! write(stdout,*) 'Call dgemms' #endif !restart part, every 10 if(mod(iv-iv_begin,10)==0.or.n_bufv/=1) then call write_polaw(pw,options%debug) if(ionode) then iunrestart = find_free_unit() open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='unknown') write(iunrestart,*) iw-nbegin write(iunrestart,*) iv close(iunrestart) endif endif enddo !!write polarization on disk if(ionode) then iunrestart = find_free_unit() open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='unknown') write(iunrestart,*) iw-nbegin write(iunrestart,*) iv close(iunrestart) endif call write_polaw(pw,options%debug) call free_memory_polaw(pw) else !here just global parallel calls do iv=iv_begin,uu%nums_occ(ispin),n_bufv n_dim=0 do il=1,n_bufv if((iv+il-1) <= uu%nums_occ(ispin)) then n_dim=n_dim+1 endif enddo call solve_lanczos_2(numpw,numt,numl,0,n_bufv,af,lc, iv, uu%nums_occ(ispin),l_blk_io,& &cql_save,pw,n_dim,options%l_t_wannier,vtl_save,ttl_save,l_reduce_memory,uu%nspin,occ,.false.) enddo endif iv_begin=1 l_do_restart=.false. enddo if(ionode) then iunrestart = find_free_unit() open( unit= iunrestart, file=trim(tmp_dir)//trim(prefix)//'-'//'restart_polaw', status='unknown') write(iunrestart,*) -1 write(iunrestart,*) -1 close(iunrestart) endif if(options%l_t_wannier) then deallocate(cql_save) call free_memory_compact_q_lanczos(cql) else deallocate(vtl_save,ttl_save) endif ! deallocate(e_mat) deallocate(af,occ) call free_memory(lc) call free_memory(uu) call free_memory(vtl) call free_memory(ttl) call free_memory(po) return end subroutine do_polarization_lanczos subroutine solve_lanczos_2(numpw,numt,numl,nbuf,mbuf, alpha,lc, iv0,nbndv,& &lblkio,cql_save,pw,n_dim,l_t_wannier, vtl_save, ttl_save, l_reduce_memory,nspin,occ,l_verbose) !this subroutine sums to the matrix E_{no}= !the matrix lc%o_mat is distributed among the processors USE kinds, ONLY : DP USE basic_structures, ONLY : lanczos_chain, initialize_memory,free_memory USE io_global, ONLY : stdout USE mp, ONLY : mp_sum, mp_bcast USE mp_world, ONLY : nproc,mpime,world_comm USE parallel_include USE lanczos, ONLY : compact_q_lanczos,initialize_compact_q_lanczos,& free_memory_compact_q_lanczos, & read_compact_q_lanczos USE polarization, ONLY : polaw implicit none INTEGER, INTENT(in) :: numpw!dimension of polarizability basis INTEGER, INTENT(in) :: numt!number of t vectors INTEGER, INTENT(in) :: numl!number of tv vectors INTEGER :: nbuf!number of matrices to treat INTEGER :: mbuf COMPLEX(kind=DP) :: alpha(mbuf)!constant for Ev+iw TYPE(lanczos_chain) :: lc!lanczos chain descriptor INTEGER, INTENT(in) :: iv0 !staring valence energy INTEGER, INTENT(in) :: nbndv!number of valence functions INTEGER, INTENT(in) :: lblkio!for the dimension of the following array REAL(kind=DP), INTENT(in) :: cql_save(numpw,numt,lblkio) TYPE(polaw), INTENT(inout) :: pw INTEGER, INTENT(in) :: n_dim!max dimension of nbuf LOGICAL, INTENT(in) :: l_t_wannier !if true t vectors from products with wannier REAL(kind=DP) :: vtl_save(numpw,numl,lblkio) REAL(kind=DP) :: ttl_save(numt,numl,lblkio) LOGICAL, INTENT(in) :: l_reduce_memory!if true cql_save not used INTEGER, INTENT(in) :: nspin!spin multiplicity REAL(kind=DP), INTENT(in) :: occ(mbuf)!occupancies of KS valence states LOGICAL, INTENT(in) :: l_verbose INTEGER :: io,info COMPLEX(kind=DP), ALLOCATABLE :: dl(:),du(:),d(:),t(:) REAL(kind=DP), ALLOCATABLE :: tr(:,:) INTEGER :: l_blk,nbegin,nend REAL(kind=DP), ALLOCATABLE :: e_mat_ip(:,:) INTEGER :: iv, iproc,nsize, iproc_cql INTEGER :: nbuf_ip COMPLEX(kind=DP), ALLOCATABLE :: alpha_ip(:) INTEGER, ALLOCATABLE :: displ(:), recvcounts(:) INTEGER :: l_blk_io, nbegin_io,nend_io REAL(kind=DP), ALLOCATABLE :: tmp_mat(:,:),tmp_mat1(:,:),tmp_mat2(:,:),tmp_mat3(:,:) REAL(kind=DP), ALLOCATABLE :: pw_ip(:,:),pw_dumm(:,:) REAL(kind=DP), ALLOCATABLE :: qlm_tmp(:,:,:),vtl_tmp(:,:,:),ttl_tmp(:,:,:) INTEGER :: ierr TYPE(compact_q_lanczos) :: cql LOGICAL :: l_qo=.true.!if true perform QO matrix multimplication first (convenient only if nprocs>nlanczos_steps) REAL(kind=DP), ALLOCATABLE :: qo_mat(:,:,:,:) LOGICAL :: l_single=.true.!if true saves qlm_tmp in single precision REAL(kind=4), ALLOCATABLE :: qlm_tmp_single(:,:,:) REAL(kind=DP) :: factor!global prefactor if(nspin==1) then factor=-4.d0 else factor=-2.d0 endif allocate(dl(lc%num_steps-1),du(lc%num_steps-1),d(lc%num_steps),t(lc%num_steps)) ierr=0 l_blk= (lc%numt)/nproc if(l_blk*nproc < (lc%numt)) l_blk = l_blk+1 nbegin=mpime*l_blk+1 nend=min(nbegin+l_blk-1,lc%numt) nsize=nend-nbegin+1 l_blk_io= (nbndv)/nproc if(l_blk_io*nproc < (nbndv)) l_blk_io = l_blk_io+1 nbegin_io=mpime*l_blk_io+1 nend_io=nbegin_io+l_blk_io-1 allocate(tr(lc%num_steps,l_blk)) allocate(displ(nproc),recvcounts(nproc)) allocate(e_mat_ip(lc%numt,l_blk)) if(l_t_wannier) then if(.not.l_reduce_memory) then allocate(qlm_tmp(numpw,numt,n_dim)) do iv=1,n_dim !read compact matrix iproc_cql=(iv0+iv-2)/l_blk_io if(mpime==iproc_cql) qlm_tmp(:,:,iv) = cql_save(:,:,iv0+iv-1-nbegin_io+1) call mp_bcast(qlm_tmp(:,:,iv), iproc_cql,world_comm) enddo else call initialize_compact_q_lanczos(cql) if(.not.l_single) then allocate(qlm_tmp(numpw,numt,n_dim)) do iv=1,n_dim call read_compact_q_lanczos(cql,iv0+iv-1) qlm_tmp(1:numpw,1:numt,iv)=cql%qlm(1:numpw,1:numt) enddo else allocate(qlm_tmp(numpw,numt,1)) allocate(qlm_tmp_single(numpw,numt,n_dim)) do iv=1,n_dim call read_compact_q_lanczos(cql,iv0+iv-1) qlm_tmp_single(1:numpw,1:numt,iv)=real(cql%qlm(1:numpw,1:numt)) enddo endif call free_memory_compact_q_lanczos(cql) endif if(l_qo) then allocate(qo_mat(numpw,lc%num_steps,nsize,n_dim)) do iv=1,n_dim !calculate qo_mat qlm_tmp(1:numpw,1:numt,1)=dble(qlm_tmp_single(1:numpw,1:numt,iv)) do io=nbegin,nend if(.not.l_single) then call dgemm('N','N', numpw,lc%num_steps,numt,1.d0, qlm_tmp(1,1,iv),& &numpw,lc%o_mat(1,1,io-nbegin+1),numt,0.d0,qo_mat(1,1,io-nbegin+1,iv),numpw) else !qlm_tmp(1:numpw,1:numt,1)=dble(qlm_tmp_single(1:numpw,1:numt,iv)) call dgemm('N','N', numpw,lc%num_steps,numt,1.d0, qlm_tmp(1,1,1),numpw,& &lc%o_mat(1,1,io-nbegin+1),numt,0.d0,qo_mat(1,1,io-nbegin+1,iv),numpw) endif enddo !call dgemv( 'N', lc%numt,lc%num_steps,1.d0, lc%o_mat(1,1,io-nbegin+1), lc%numt,tr(1,io-nbegin+1),1,1.d0,e_mat_ip(1,io-nbegin+1),1) enddo endif else allocate(vtl_tmp(numpw,numl,n_dim)) allocate(ttl_tmp(numt,numl,n_dim)) do iv=1,n_dim !read compact matrix iproc_cql=(iv0+iv-2)/l_blk_io if(mpime==iproc_cql) then vtl_tmp(:,:,iv) = vtl_save(:,:,iv0+iv-1-nbegin_io+1) ttl_tmp(:,:,iv) = ttl_save(:,:,iv0+iv-1-nbegin_io+1) endif call mp_bcast(vtl_tmp(:,:,iv), iproc_cql,world_comm) call mp_bcast(ttl_tmp(:,:,iv), iproc_cql,world_comm) enddo endif do iproc=0,nproc-1 displ(iproc+1)=iproc*l_blk if(displ(iproc+1)+l_blk > lc%numt) then recvcounts(iproc+1)=(lc%numt-displ(iproc+1)) else recvcounts(iproc+1)=l_blk endif enddo displ(:)=displ(:)*lc%numt recvcounts(:)=recvcounts(:)*lc%numt if(l_t_wannier) then allocate(tmp_mat(numpw,l_blk)) else allocate(tmp_mat1(numl,l_blk)) allocate(tmp_mat2(numpw,l_blk)) allocate(tmp_mat3(numpw,numl)) endif allocate(pw_ip(numpw,numpw),pw_dumm(numpw,numpw)) !loop on procs do iproc=0,nproc-1 if(iproc==mpime) nbuf_ip=nbuf !distribute number of nbuf !distribute af call mp_bcast(nbuf_ip, iproc,world_comm) if(nbuf_ip >0) then allocate(alpha_ip(nbuf_ip)) if(iproc==mpime) alpha_ip(1:nbuf_ip)=alpha(1:nbuf_ip) call mp_bcast(alpha_ip,iproc,world_comm) !loop on freuqency pw_ip(:,:)=0.d0 do iv=1,nbuf_ip if(l_single) then qlm_tmp(1:numpw,1:numt,1)=dble(qlm_tmp_single(1:numpw,1:numt,iv)) endif e_mat_ip(:,:)=0.d0 !calculate part !$OMP PARALLEL SHARED(tr,lc,alpha_ip,iv) PRIVATE(dl,du,d,t,info,io) ! allocate(dl(lc%num_steps-1),du(lc%num_steps-1),d(lc%num_steps),t(lc%num_steps)) !$OMP DO do io=nbegin,nend !!set up vectors for lapack routine dl(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0) du(1:lc%num_steps-1)=cmplx(lc%f(1:lc%num_steps-1,io),0.d0) d(1:lc%num_steps)=cmplx(lc%d(1:lc%num_steps,io),0.d0)+alpha_ip(iv) t(:)=(0.d0,0.d0) t(1)=(1.d0,0.d0) !!call lapack call zgtsv(lc%num_steps,1,dl,d,du,t,lc%num_steps,info) if(info /= 0) then write(stdout,*) 'ZGTSV info:', info FLUSH(stdout) stop endif !!calculate term tr(1:lc%num_steps,io-nbegin+1)=dble(t(1:lc%num_steps)) enddo !$OMP END DO ! deallocate(dl,du,d,t) !$OMP END PARALLEL if(l_verbose) write(stdout,*) 'zgtsv done' if(l_verbose) FLUSH(stdout) if(.not.l_qo) then do io=nbegin,nend call dgemv( 'N', lc%numt,lc%num_steps,1.d0, lc%o_mat(1,1,io-nbegin+1), & &lc%numt,tr(1,io-nbegin+1),1,1.d0,e_mat_ip(1,io-nbegin+1),1) enddo endif if(l_verbose) write(stdout,*) 'dgemv done',nsize if(l_verbose) FLUSH(stdout) if(nbegin <= lc%numt) then if(l_t_wannier) then if(.not.l_qo) then if(.not.l_single) then call dgemm('N','N',numpw,nsize,numt,1.d0,qlm_tmp(1,1,iv),numpw,e_mat_ip,lc%numt,0.d0,tmp_mat,numpw) else call dgemm('N','N',numpw,nsize,numt,1.d0,qlm_tmp(1,1,1),numpw,e_mat_ip,lc%numt,0.d0,tmp_mat,numpw) endif if(l_verbose) write(stdout,*) 'dgemm1 done' if(l_verbose) FLUSH(stdout) else do io=nbegin,nend call dgemv( 'N', numpw,lc%num_steps,1.d0, qo_mat(1,1,io-nbegin+1,iv), & &numpw,tr(1,io-nbegin+1),1,0.d0,tmp_mat(1,io-nbegin+1),1) enddo endif if(.not.l_single) then call dgemm('N','T',numpw,numpw,nsize,factor*occ(iv),tmp_mat,numpw,qlm_tmp(1,nbegin,iv),numpw,1.d0,pw_ip,numpw) else call dgemm('N','T',numpw,numpw,nsize,factor*occ(iv),tmp_mat,numpw,qlm_tmp(1,nbegin,1),numpw,1.d0,pw_ip,numpw) endif if(l_verbose) write(stdout,*) 'dgemm2 done', pw%numpw, numpw,ierr if(l_verbose) FLUSH(stdout) else call dgemm('T','N',numl,nsize,numt,1.d0,ttl_tmp(1,1,iv),numt,e_mat_ip,numt,0.d0,tmp_mat1,numl) if(l_verbose) write(stdout,*) 'dgemm1 done' if(l_verbose) FLUSH(stdout) call dgemm('N','N',numpw,nsize,numl,1.d0,vtl_tmp(1,1,iv),numpw,tmp_mat1,numl,0.d0,tmp_mat2,numpw) if(l_verbose) write(stdout,*) 'dgemm2 done' if(l_verbose) FLUSH(stdout) call dgemm('N','N', numpw,numl,nsize,1.d0,tmp_mat2,numpw,ttl_tmp(nbegin,1,iv),numt,0.d0,tmp_mat3,numpw) if(l_verbose)write(stdout,*) 'dgemm3 done' if(l_verbose) FLUSH(stdout) call dgemm('N','T', numpw,numpw,numl,factor*occ(iv),tmp_mat3,numpw,vtl_tmp(1,1,iv),numpw,1.d0,pw_ip,numpw) ! call dgemm('T','N', ttl%numl,ttl%numt,ttl%numt,1.d0,ttl%tt_mat,ttl%numt,e_mat(1,1,il),lc%numt,0.d0,tmp_mat1,ttl%numl) ! call dgemm('N','N', pw%numpw,ttl%numt,ttl%numl,1.d0,vtl%vt_mat,vtl%numpw,tmp_mat1,ttl%numl,0.d0,tmp_mat2,pw%numpw) ! call dgemm('N','N', pw%numpw,ttl%numl,ttl%numt,1.d0,tmp_mat2,pw%numpw,ttl%tt_mat,ttl%numt,0.d0,tmp_mat3,pw%numpw) ! call dgemm('N','T', pw%numpw,pw%numpw,ttl%numl,-4.d0,tmp_mat3,pw%numpw,vtl%vt_mat,pw%numpw,1.d0,pw%pw,pw%numpw) endif endif !gather_collect result ! call mpi_gatherv(e_mat_ip, lc%numt*nsize,MPI_DOUBLE_PRECISION,e_mat(1,1,iv),recvcounts,displ, MPI_DOUBLE_PRECISION,iproc,world_comm) enddo #if defined(__MPI) if(iproc==mpime) then pw_ip(:,:)=pw_ip(:,:)+pw%pw(:,:) CALL MPI_REDUCE(pw_ip, pw%pw, pw%numpw*pw%numpw,MPI_DOUBLE_PRECISION,MPI_SUM, iproc,world_comm,ierr) else !ATTENTION pw%numpw could not be initialized in THIS CASE CALL MPI_REDUCE(pw_ip, pw_dumm, numpw*numpw,MPI_DOUBLE_PRECISION,MPI_SUM, iproc,world_comm,ierr) endif if(l_verbose) write(stdout,*) 'mpi_reduce done' if(l_verbose) FLUSH(stdout) #else pw%pw(:,:)=pw%pw(:,:)+pw_ip(:,:) #endif deallocate(alpha_ip) endif enddo if(l_qo) deallocate(qo_mat) deallocate(displ,recvcounts) deallocate(e_mat_ip) deallocate(dl,du,d,t) deallocate(tr) if(l_t_wannier) then deallocate(tmp_mat) ! if(l_reduce_memory) call free_memory_compact_q_lanczos(cql) else deallocate(tmp_mat1,tmp_mat2,tmp_mat3) endif deallocate(pw_ip,pw_dumm) if(l_t_wannier) then deallocate(qlm_tmp) if(l_single) deallocate(qlm_tmp_single) else deallocate(vtl_tmp, ttl_tmp) endif return end subroutine solve_lanczos_2 subroutine add_partial_pola(pw,po,uu,af,nspin,ispin) !NOT_TO_BE_INCLUDED_START !this subroutine adds the contribution to the irreducible polarizability due to !fractionally occupied states USE kinds, ONLY : DP USE basic_structures, ONLY : partial_occ,wannier_u USE io_global, ONLY : stdout,ionode,ionode_id USE polarization, ONLY : polaw implicit none TYPE(polaw), INTENT(inout) :: pw!polarizability matrix to be modified TYPE(partial_occ), INTENT(in) :: po!fractionary occupaction terms TYPE(wannier_u), INTENT(in) :: uu!for KS energies COMPLEX(kind=DP),INTENT(in) :: af!!complex frequency INTEGER, INTENT(in) :: nspin!spin multiplicity INTEGER, INTENT(in) :: ispin!spin channel REAL(kind=DP) :: fact INTEGER :: iv,jv do iv=po%nums_occ_min+1,po%nums_occ do jv=1,iv-1 fact=-(2.d0/dble(nspin))*(po%f_occ(jv)-po%f_occ(iv))/dble(uu%ene(iv,ispin)-uu%ene(jv,ispin)+af) !invertiti per energie call dgemm('N','T',pw%numpw,pw%numpw,1,fact,po%ppp_mat(1,jv,iv),po%numpw,& &po%ppp_mat(1,jv,iv),po%numpw,1.d0,pw%pw,pw%numpw) enddo enddo return !NOT_TO_BE_INCLUDED_END end subroutine add_partial_pola