quantum-espresso/GWW/simple/commutator.f90

! This subroutine calculates the commutator [r,V_nonloc]


subroutine gen_beta_simple (qk, npw_max, dvkb)
  !----------------------------------------------------------------------
  !
  !   Calculates the beta function pseudopotentials with
  !   the derivative of the Bessel functions
  !
  USE kinds,      ONLY : DP
  USE constants,  ONLY : tpi
  USE ions_base,  ONLY : nat, ntyp => nsp, ityp, tau
  USE cell_base,  ONLY : tpiba
  USE klist,      ONLY : ngk
  USE gvect,      ONLY : mill, eigts1, eigts2, eigts3, g
  USE uspp,       ONLY : nkb, indv, nhtol, nhtolm
  USE us,         ONLY : nqx, tab, tab_d2y, dq, spline_ps
  USE m_gth,      ONLY : mk_dffnl_gth
  USE splinelib
  USE uspp_param, ONLY : upf, lmaxkb, nbetam, nh
  USE io_global, ONLY : stdout
  !
  implicit none
  !
  real(kind=DP), dimension(3) :: qk
  integer :: npw_max
  complex(DP), intent(out) :: dvkb (npw_max, nkb)
  !
  ! local variables
  !
  integer :: ikb, nb, ih, ig, i0, i1, i2, i3 , nt
  ! counter on beta functions
  ! counter on beta functions
  ! counter on beta functions
  ! counter on G vectors
  ! index of the first nonzero point in the r
  ! counter on atomic type

  real(DP) :: arg, px, ux, vx, wx
  ! argument of the atomic phase factor

  complex(DP) :: phase, pref
  ! atomic phase factor
  ! prefactor

  integer :: na, l, iig, lm, iq
  real(DP), allocatable :: djl (:,:,:), ylm (:,:), q (:), gk (:,:)
  real(DP) ::  qt
  complex(DP), allocatable :: sk (:)
  real(DP), allocatable :: xdata(:)

  call start_clock('gen_beta1')

  if (nkb.eq.0) return

  call start_clock('stres_us31')

  allocate (djl( npw_max , nbetam , ntyp))
  allocate (ylm( npw_max ,(lmaxkb + 1) **2))
  allocate (gk( 3, npw_max))
  allocate (q( npw_max))
  do ig = 1, npw_max
     iig = ig
     gk (1,ig) = qk(1) + g(1, iig)
     gk (2,ig) = qk(2) + g(2, iig)
     gk (3,ig) = qk(3) + g(3, iig)
     q (ig) = gk(1, ig)**2 +  gk(2, ig)**2 + gk(3, ig)**2
  enddo

  call stop_clock('stres_us31')
  call start_clock('stres_us32')
  call ylmr2 ((lmaxkb+1)**2, npw_max, gk, q, ylm)
  call stop_clock('stres_us32')
  call start_clock('stres_us33')

  if (spline_ps) then
    allocate(xdata(nqx))
    do iq = 1, nqx
      xdata(iq) = (iq - 1) * dq
    enddo
  endif

  do nt = 1, ntyp
     do nb = 1, upf(nt)%nbeta
        if ( upf(nt)%is_gth ) then
           call mk_dffnl_gth( nt, nb, npw_max, q, djl(1,nb,nt) )
           cycle
        endif
        do ig = 1, npw_max
           qt = sqrt(q (ig)) * tpiba
           if (spline_ps) then
             djl(ig,nb,nt) = splint_deriv(xdata, tab(:,nb,nt), &
                                                 tab_d2y(:,nb,nt), qt)
           else
             px = qt / dq - int (qt / dq)
             ux = 1.d0 - px
             vx = 2.d0 - px
             wx = 3.d0 - px
             i0 = qt / dq + 1
             i1 = i0 + 1
             i2 = i0 + 2
             i3 = i0 + 3
             if (i3 <= nqx) then ! Approximation
                djl(ig,nb,nt) = ( tab (i0, nb, nt) * (-vx*wx-ux*wx-ux*vx)/6.d0 + &
                               tab (i1, nb, nt) * (+vx*wx-px*wx-px*vx)/2.d0 - &
                               tab (i2, nb, nt) * (+ux*wx-px*wx-px*ux)/2.d0 + &
                               tab (i3, nb, nt) * (+ux*vx-px*vx-px*ux)/6.d0 )/dq
             else
                djl(ig,nb,nt) = 0.d0  ! Approximation
             endif
           endif
        enddo
     enddo
  enddo
  call stop_clock('stres_us33')
  call start_clock('stres_us34')

  deallocate (q)
  deallocate (gk)

  allocate (sk( npw_max))
  ikb = 0
  do nt = 1, ntyp
     do na = 1, nat
        if (ityp (na) .eq.nt) then
           arg = (qk (1) * tau(1,na) + &
                  qk (2) * tau(2,na) + &
                  qk (3) * tau(3,na) ) * tpi
           phase = CMPLX(cos (arg), - sin (arg) ,kind=DP)
           do ig = 1, npw_max
              iig = ig
              sk (ig) = eigts1 (mill (1,iig), na) * &
                        eigts2 (mill (2,iig), na) * &
                        eigts3 (mill (3,iig), na) * phase
           enddo
           do ih = 1, nh (nt)
              nb = indv (ih, nt)
              l = nhtol (ih, nt)
              lm= nhtolm(ih, nt)
              ikb = ikb + 1
              pref = (0.d0, -1.d0) **l
              !
              do ig = 1, npw_max
                 dvkb (ig, ikb) = djl (ig, nb, nt) * sk (ig) * ylm (ig, lm) &
                      * pref
              enddo
           enddo
        endif
     enddo

  enddo
  call stop_clock('stres_us34')
  call stop_clock('gen_beta1')

  if (ikb.ne.nkb) call errore ('gen_us_dj', 'unexpected error', 1)
  deallocate (sk)
  deallocate (ylm)
  deallocate (djl)
  if (spline_ps) deallocate(xdata)
  return
end subroutine gen_beta_simple


subroutine gen_beta_simple_2 (qk, npw_max, u, dvkb)
  !----------------------------------------------------------------------
  !
  !  Calculates the kleinman-bylander pseudopotentials with the
  !  derivative of the spherical harmonics projected on vector u
  !
  USE kinds,      ONLY : DP
  USE io_global,  ONLY : stdout
  USE constants,  ONLY : tpi
  USE ions_base,  ONLY : nat, ntyp => nsp, ityp, tau
  USE cell_base,  ONLY : tpiba
  USE klist,      ONLY : ngk, igk_k
  USE gvect,      ONLY : mill, eigts1, eigts2, eigts3, g
  USE uspp,       ONLY : nkb, indv, nhtol, nhtolm
  USE us,         ONLY : nqx, tab, tab_d2y, dq, spline_ps
  USE splinelib
  USE uspp_param, ONLY : upf, lmaxkb, nbetam, nh
  !
  implicit none
  !
  real(kind=DP), dimension(3) :: qk
  integer :: npw_max
  real(DP) :: u (3)

  complex(DP) :: dvkb (npw_max, nkb)
  integer :: na, nt, nb, ih, l, lm, ikb, iig, ipol, i0, i1, i2, &
       i3, ig
  real(DP), allocatable :: gk(:,:), q (:)
  real(DP) :: px, ux, vx, wx, arg

  real(DP), allocatable :: vkb0 (:,:,:), dylm (:,:), dylm_u (:,:)
  ! dylm = d Y_lm/dr_i in cartesian axes
  ! dylm_u as above projected on u

  complex(DP), allocatable :: sk (:)
  complex(DP) :: phase, pref

  integer :: iq
  real(DP), allocatable :: xdata(:)
  !
  !
  call start_clock('gen_beta2')

  dvkb(:,:) = (0.d0, 0.d0)
  if (lmaxkb.le.0) return

  allocate ( vkb0(npw_max,nbetam,ntyp), dylm_u(npw_max,(lmaxkb+1)**2), gk(3,npw_max) )
  allocate ( q(npw_max) )

  do ig = 1, npw_max
     iig = ig
     gk (1, ig) = qk (1) + g (1,iig)
     gk (2, ig) = qk (2) + g (2,iig)
     gk (3, ig) = qk (3) + g (3,iig)
     q (ig) = gk(1, ig)**2 +  gk(2, ig)**2 + gk(3, ig)**2
  enddo

  allocate ( dylm(npw_max,(lmaxkb+1)**2) )
  dylm_u(:,:) = 0.d0
  do ipol = 1, 3
     call dylmr2  ((lmaxkb+1)**2, npw_max, gk, q, dylm, ipol)
     call daxpy (npw_max * (lmaxkb + 1) **2, u (ipol), dylm, 1, dylm_u, 1)
  enddo
  deallocate (dylm)

  do ig = 1, npw_max
     q (ig) = sqrt ( q(ig) ) * tpiba
  end do

  if (spline_ps) then
    allocate(xdata(nqx))
    do iq = 1, nqx
      xdata(iq) = (iq - 1) * dq
    enddo
  endif

  do nt = 1, ntyp
     ! calculate beta in G-space using an interpolation table
     do nb = 1, upf(nt)%nbeta
        do ig = 1, npw_max
           if (spline_ps) then
             vkb0(ig,nb,nt) = splint(xdata, tab(:,nb,nt), &
                                     tab_d2y(:,nb,nt), q(ig))
           else
             px = q (ig) / dq - int (q (ig) / dq)
             ux = 1.d0 - px
             vx = 2.d0 - px
             wx = 3.d0 - px
             i0 = q (ig) / dq + 1
             i1 = i0 + 1
             i2 = i0 + 2
             i3 = i0 + 3
             if (i3<=nqx) then ! DEBUG
                vkb0 (ig, nb, nt) = tab (i0, nb, nt) * ux * vx * wx / 6.d0 + &
                                    tab (i1, nb, nt) * px * vx * wx / 2.d0 - &
                                   tab (i2, nb, nt) * px * ux * wx / 2.d0 + &
                                    tab (i3, nb, nt) * px * ux * vx / 6.d0
             else
                vkb0 (ig, nb, nt) = 0.d0 ! DEBUG
             endif
           endif
        enddo
     enddo
  enddo

  deallocate (q)
  allocate ( sk(npw_max) )

  ikb = 0
  do nt = 1, ntyp
     do na = 1, nat
        if (ityp (na) .eq.nt) then
           arg = (qk (1) * tau (1, na) + qk (2) * tau (2, na) &
                + qk (3) * tau (3, na) ) * tpi
           phase = CMPLX(cos (arg), - sin (arg) ,kind=DP)
           do ig = 1, npw_max
              iig = ig
              sk (ig) = eigts1 (mill (1,iig), na) * &
                        eigts2 (mill (2,iig), na) * &
                        eigts3 (mill (3,iig), na) * phase
           enddo
           do ih = 1, nh (nt)
              nb = indv (ih, nt)
              l = nhtol (ih, nt)
              lm = nhtolm(ih, nt)
              ikb = ikb + 1
              pref = (0.d0, -1.d0) **l
              !
              do ig = 1, npw_max
                 dvkb (ig, ikb) = vkb0(ig, nb, nt) * sk(ig) * dylm_u(ig, lm) &
                      * pref / tpiba
              enddo
           enddo
        endif
     enddo
  enddo

  call stop_clock('gen_beta2')

  if (ikb.ne.nkb) then
     WRITE( stdout, * ) ikb, nkb
     call errore ('gen_us_dy', 'unexpected error', 1)
  endif

  deallocate ( sk )
  deallocate ( vkb0, dylm_u, gk )
  if (spline_ps) deallocate(xdata)

  return
end subroutine gen_beta_simple_2


subroutine commutator_Hx_psi_simple (qk, npw_max, nbnd_occ, becp1, becp2, ipol, dpsi, wfc_e, vkb_max)
  !----------------------------------------------------------------------
  !
  ! On output: dpsi contains [H,x_ipol] | psi_ik > in crystal axis
  !            (projected on at(*,ipol) )
  !
  ! vkb,evc,igk must be properly set for the appropriate k-point
  ! in addition becp1 must be set equal to becp1 = <vkb|evc>
  ! as it is done in PH/phq_init.f90 for the k-point ik
  ! NB: here the last index of becp1 is missing, hence it refers
  !     to a single k-point
  !
  !    CALL calbec (npw, vkb, evc, becp1(:,:) )
  !
  USE kinds,           ONLY : DP
  USE cell_base,       ONLY : tpiba
  USE ions_base,       ONLY : nat, ityp, ntyp => nsp
  USE io_global,       ONLY : stdout
  USE gvect,           ONLY : g
  USE wvfct,           ONLY : nbnd, et
  USE wavefunctions, ONLY: evc
  USE lsda_mod,        ONLY : nspin
  USE noncollin_module,ONLY : noncolin, npol
  USE becmod,          ONLY : becp, bec_type, calbec
  USE uspp,            ONLY : nkb
  USE uspp_param,      ONLY : nh, nhm
  USE control_flags,   ONLY : gamma_only
  USE uspp,          ONLY:  deeq, deeq_nc
  USE wvfct,           ONLY : npwx
  USE mp_world, ONLY : world_comm, mpime, nproc
  USE mp, ONLY : mp_sum , mp_bcast

  implicit none
  integer :: npw_max
  INTEGER, INTENT(IN) :: nbnd_occ, ipol
  COMPLEX(DP), INTENT(OUT)    :: dpsi(npw_max*npol,nbnd_occ)
  TYPE(bec_type), INTENT(IN)  :: becp1 ! dimensions ( nkb, nbnd )
  TYPE(bec_type), INTENT(INOUT) :: becp2 ! dimensions ( nkb, nbnd )
  COMPLEX(kind=DP), INTENT(IN)  :: vkb_max(npw_max, nbnd_occ)
  !
  COMPLEX(kind=DP), DIMENSION(npol*npw_max,nbnd_occ) :: wfc_e
  REAL(kind=DP), DIMENSION(3) :: qk
  REAL(kind=DP), DIMENSION(:), ALLOCATABLE :: g2kin
  REAL(kind=DP), dimension(3,3) :: att
  INTEGER :: nbegin, nend , nbnd_loc
  !
  ! Local variables
  !
  integer :: ig, na, ibnd, jbnd, ikb, jkb, nt, lter, ih, jh, ijkb0,  &
             nrec, is, js, ijs , ii
  ! counters

  real(DP), allocatable  :: gk (:,:)
  ! the derivative of |k+G|
  complex(DP), allocatable :: ps2(:,:,:), dvkb (:,:), dvkb1 (:,:),  &
       work (:,:), psc(:,:,:,:), deff_nc(:,:,:,:)
  REAL(DP), allocatable :: deff(:,:,:)
  !

  CALL start_clock ('commut_Hx_psi')
  dpsi=(0.d0, 0.d0)
  !
  allocate(g2kin(npw_max))
  allocate (gk ( 3, npw_max))
  do ig = 1, npw_max
     gk (1:3, ig) = (qk (1:3) + g (1:3, ig ) ) * tpiba
     g2kin (ig) = SUM(gk (1:3, ig) **2 )
  enddo
  !
  !
  ! the contribution from nonlocal pseudopotentials
  !
  if (nkb == 0) go to 111
  !
  allocate (work ( npw_max, nkb) )
  IF (noncolin) THEN
     allocate (deff_nc (nhm, nhm, nat, nspin))
  ELSE
     allocate (deff (nhm, nhm, nat ))
  END IF
  allocate (dvkb (npw_max, nkb), dvkb1(npw_max, nkb))
  dvkb (:,:) = (0.d0, 0.d0)
  dvkb1(:,:) = (0.d0, 0.d0)

  att = 0.0d0
  do ii = 1,3
    att(ii,ii) = 1.0d0
  enddo

  call gen_beta_simple (qk, npw_max, dvkb)
  call gen_beta_simple_2 (qk, npw_max, att(1,ipol), dvkb1)

  do ig = 1, npw_max
     if (g2kin (ig) < 1.0d-10) then
        gk (1, ig) = 0.d0
        gk (2, ig) = 0.d0
        gk (3, ig) = 0.d0
     else
        gk (1, ig) = gk (1, ig) / sqrt (g2kin (ig) )
        gk (2, ig) = gk (2, ig) / sqrt (g2kin (ig) )
        gk (3, ig) = gk (3, ig) / sqrt (g2kin (ig) )
     endif
  enddo

  jkb = 0
  work=(0.d0,0.d0)
  do nt = 1, ntyp
     do na = 1, nat
        if (nt == ityp (na)) then
           do ikb = 1, nh (nt)
              jkb = jkb + 1
              do ig = 1, npw_max
                 work (ig,jkb) = dvkb1 (ig, jkb) + dvkb (ig, jkb) * &
                      (att (1, ipol) * gk (1, ig) + &
                       att (2, ipol) * gk (2, ig) + &
                       att (3, ipol) * gk (3, ig) )
              enddo
           enddo
        endif
     enddo
  enddo
  deallocate (gk)


  ! In the case of gamma point systems becp2 is real
  ! so we have to include a factor of i before calling
  ! calbec otherwise we would be stuck with the wrong component
  ! of becp2 later on.
  IF (gamma_only) work=(0.0_DP,1.0_DP)*work
  work(npwx+1:npw_max,1:nkb) = 0.d0          ! DEBUG

  CALL calbec (npw_max, work, wfc_e, becp2)

  IF (noncolin) THEN
     allocate (psc ( nkb, npol, nbnd_occ, 2))
     psc=(0.d0,0.d0)
  ELSE
     allocate (ps2 ( nkb, nbnd_occ, 2))
     ps2=(0.d0,0.d0)
  END IF

  call start_clock('commut_nbnd')

!  if(nbnd_loc*nproc < (nbnd_occ)) nbnd_loc = nbnd_loc + 1  ! DEBUG
!  nbegin = mpime*nbnd_loc + 1
!  nend = nbegin+nbnd_loc - 1
!  if(nend > nbnd_occ) nend=nbnd_occ  ! DEBUG

  ! Parallelization over nbnd_occ
  nbnd_loc= (nbnd_occ)/nproc  ! nbnd_loc = num of bands per processor
  nbegin = mpime*nbnd_loc + 1
  nend = nbegin+nbnd_loc - 1
  if(mpime == (nproc-1)) nend=nbnd_occ  ! last processor takes all the remaining bands
  !
  DO ibnd = nbegin , nend
     IF (noncolin) THEN
        deff_nc=deeq_nc
     ELSE
        deff(:,:,:) = deeq(:,:,:,1)
     ENDIF
     ijkb0 = 0
     do nt = 1, ntyp
        do na = 1, nat
           if (nt == ityp (na)) then
              do ih = 1, nh (nt)
                 ikb = ijkb0 + ih
                 do jh = 1, nh (nt)
                    jkb = ijkb0 + jh
                    IF (noncolin) THEN
                       ijs=0
                       DO is=1, npol
                          DO js = 1, npol
                             ijs=ijs+1
                             psc(ikb,is,ibnd,1)=psc(ikb,is,ibnd,1)+  &
                                       (0.d0,-1.d0)*    &
                                  becp2%nc(jkb,js,ibnd)*deff_nc(ih,jh,na,ijs)
                             psc(ikb,is,ibnd,2)=psc(ikb,is,ibnd,2)+ &
                                     (0.d0,-1.d0)* &
                                 becp1%nc(jkb,js,ibnd)*deff_nc(ih,jh,na,ijs)
                          END DO
                       END DO
                    ELSEIF (gamma_only) THEN
                       ! Note the different prefactors due to the factor
                       ! of i introduced to work(:,:), as becp[1,2] are
                       ! real.
                       ps2(ikb,ibnd,1) = ps2(ikb,ibnd,1) + becp2%r(jkb,ibnd) * &
                            (1.0d0, 0.0d0)*deff(ih,jh,na)
                       ps2(ikb,ibnd,2) = ps2(ikb,ibnd,2) + becp1%r(jkb,ibnd)* &
                            (-1.0d0, 0.0d0)*deff(ih,jh,na)
                    ELSE
                       ps2(ikb,ibnd,1) = ps2(ikb,ibnd,1) + becp2%k(jkb,ibnd) * &
                            (0.0d0,-1.0d0)*deff(ih,jh,na)
                       ps2(ikb,ibnd,2) = ps2(ikb,ibnd,2) + becp1%k(jkb,ibnd)* &
                            (0.0d0,-1.0d0)*deff(ih,jh,na)
                    END IF
                 enddo
              enddo
              ijkb0=ijkb0+nh(nt)
           end if
        enddo  ! na
        !write(*,*) nt , na , ikb , jkb ! DEBUG
     end do  ! nt
  end do ! nbnd
  if (ikb /= nkb .OR. jkb /= nkb) call errore ('commutator_Hx_psi_simple', 'unexpected error',1)
  !
  IF (noncolin) THEN
     call mp_sum(psc,world_comm)
  ELSE
     call mp_sum(ps2,world_comm)
  ENDIF
  !
  call stop_clock('commut_nbnd')
  !
  call start_clock('commut_zgemm')
  IF (noncolin) THEN
     CALL zgemm( 'N', 'N', npw_max, nbnd_occ*npol, nkb, &
          (1.d0,0.d0), vkb_max(1,1), npw_max, psc(1,1,1,1), nkb, (1.d0,0.d0), &
          dpsi, npw_max )
     CALL zgemm( 'N', 'N', npw_max, nbnd_occ*npol, nkb, &
          (1.d0,0.d0),work(1,1), npw_max, psc(1,1,1,2), nkb, (1.d0,0.d0), &
          dpsi, npw_max )
  ELSE
     CALL zgemm( 'N', 'N', npw_max, nbnd_occ, nkb, &
          (1.d0,0.d0), vkb_max(1,1), npw_max, ps2(1,1,1), nkb, (1.d0,0.d0), &
          dpsi(1,1), npw_max )
     CALL zgemm( 'N', 'N', npw_max, nbnd_occ, nkb, &
          (1.d0,0.d0),work(1,1), npw_max, ps2(1,1,2), nkb, (1.d0,0.d0), &
          dpsi(1,1), npw_max )
  ENDIF
  call stop_clock('commut_zgemm')

  IF (noncolin) THEN
     deallocate (psc)
     deallocate (deff_nc)
  ELSE
     deallocate (ps2)
     deallocate (deff)
  END IF
  deallocate (work)

  IF (nkb > 0) THEN
     deallocate (dvkb1, dvkb)
  END IF

  111 continue

  deallocate(g2kin)
  call stop_clock ('commut_Hx_psi')
  return
end subroutine commutator_Hx_psi_simple