! quantum-espresso/KS_Solvers/PPCG/ppcg_k.f90
!
SUBROUTINE ppcg_k( h_psi, s_psi, overlap, precondition, &
npwx, npw, nbnd, npol, psi, e, btype, &
ethr, maxter, notconv, avg_iter, sbsize, rr_step, scf_iter)
!
!----------------------------------------------------------------------------
!
! E.V. Ignore btype, use ethr as threshold on the subspace residual
! SdG restore btype use in the eigenvalue locking procedure
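!
! PPCG (Projected Preconditioned Conjugate Gradient) eigensolver, complex
! (k-point) version. Outline, as implemented below: the bands are split into
! sub-blocks of at most sbsize columns; for each sub-block a small projected
! eigenproblem is solved in the subspace spanned by the current approximation
! psi, the preconditioned residual w and the previous search direction p.
! A full Rayleigh-Ritz step over all nbnd bands is performed every rr_step
! iterations, and converged bands are locked out of the active set.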
!
USE util_param, ONLY : DP, stdout
USE mp, ONLY : mp_bcast, mp_root_sum, mp_sum
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id
!
IMPLICIT NONE
include 'laxlib.fh'
COMPLEX (DP), PARAMETER :: C_ONE = (1.D0,0.D0), C_ZERO = (0.D0,0.D0)
!
! ... I/O variables
!
LOGICAL, INTENT(IN) :: overlap ! whether the eigenvalue problem is a generalized one or not
INTEGER, INTENT(IN) :: npwx, npw, nbnd, npol, maxter
! maximum number of PW for wavefunctions
! the number of plane waves
! number of bands
! number of independent spin components of the wfc.
! maximum number of iterations
COMPLEX (DP), INTENT(INOUT) :: psi(npwx*npol,nbnd)
INTEGER, INTENT(IN) :: btype(nbnd) ! one if the corresponding state has to be
! ...converged to full accuracy, zero otherwise (src/pwcom.f90)
REAL (DP), INTENT(INOUT) :: e(nbnd)
REAL (DP), INTENT(IN) :: precondition(npw), ethr
! the diagonal preconditioner
! the convergence threshold for eigenvalues
INTEGER, INTENT(OUT) :: notconv
REAL(DP), INTENT(OUT) :: avg_iter
! number of notconverged elements
! average number of iterations in PPCG
INTEGER, INTENT(IN) :: sbsize, rr_step ! sub-block size (number of vectors per sub-block)
! ...to be used in the PPCG block splitting; by default, sbsize = 1
! run the Rayleigh-Ritz procedure every rr_step iterations
INTEGER, INTENT(IN) :: scf_iter ! this variable should be removed in a future version; used for timing purposes
!
! ... local variables
!
COMPLEX(DP), ALLOCATABLE :: hpsi(:,:), spsi(:,:), w(:,:), hw(:,:), sw(:,:), p(:,:), hp(:,:), sp(:,:)
COMPLEX(DP) :: buffer(npwx*npol,nbnd), buffer1(npwx*npol,nbnd)
COMPLEX(DP), ALLOCATABLE :: K(:,:), K_store(:,:), M(:,:), M_store(:,:), cwork(:)
REAL(DP), ALLOCATABLE :: rwork(:)
INTEGER, ALLOCATABLE :: iwork(:)
REAL (DP) :: trG, trG1, trdif, trtol, lock_tol
COMPLEX(DP) :: coord_psi(sbsize,sbsize), coord_w(sbsize,sbsize), coord_p(sbsize,sbsize), &
G(nbnd,nbnd), G1(nbnd, nbnd)
REAL (DP) :: D(3*sbsize)
INTEGER :: nsb, sbsize_last, sbsize3, dimp, &
l, i, j, iter, total_iter, ierr = 0, info = 0, &
col_idx(sbsize), lcwork = -1, lrwork = -1, liwork = -1
INTEGER :: nact, nact_old, act_idx(nbnd), idx(nbnd)
! number of active columns
! number of active columns on previous iteration
! indices of active columns
! auxiliary indices
INTEGER :: n_start, n_end, my_n ! auxiliary indices for band group parallelization
INTEGER :: print_info ! If > 0 then iteration information is printed
INTEGER :: kdim, kdimx, ipol, ibnd
LOGICAL :: clean
REAL(DP), EXTERNAL :: ZLANGE
EXTERNAL h_psi, s_psi
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nvec,psi,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,nvec)
COMPLEX (DP), ALLOCATABLE :: Gl(:,:)
!
INTEGER :: idesc(LAX_DESC_SIZE)
! descriptor of the current distributed Gram matrix
LOGICAL :: la_proc
! flag to distinguish procs involved in linear algebra
INTEGER, ALLOCATABLE :: irc_ip( : )
INTEGER, ALLOCATABLE :: nrc_ip( : )
INTEGER, ALLOCATABLE :: rank_ip( :, : )
! matrix distribution descriptors
LOGICAL :: force_repmat ! = .TRUE. to force replication of the Gram matrices for Cholesky
! Needed if the sizes of these matrices become too small after locking
REAL(DP) :: res_array(maxter)
INTEGER, PARAMETER :: blocksz = 256 ! used to optimize some omp parallel do loops
INTEGER :: nblock
!
INTEGER :: np_ortho(2), ortho_parent_comm, ortho_cntx
LOGICAL :: do_distr_diag_inside_bgrp
!
!
res_array = 0.0
!
CALL start_clock( 'ppcg_k' )
!
! ... Initialization and validation
CALL laxlib_getval( np_ortho = np_ortho, ortho_cntx = ortho_cntx, &
ortho_parent_comm = ortho_parent_comm, &
do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp )
!
print_info = 0 ! 3
sbsize3 = sbsize*3
kdim = npwx*(npol-1) + npw
kdimx = npwx*npol
clean = (npw < npwx) .AND. ( npol == 2 )
if (npol> 2) CALL errore( 'ppcg ',' wrong npol value: npol > 2 ', npol )
if (npol<=0) CALL errore( 'ppcg ',' non positive npol value: errcode = 1+abs(npol) ', 1+abs(npol) )
!
nact = nbnd
nact_old = nbnd
lock_tol = SQRT( ethr ) ! 1.D-4*SQRT( ethr ) ! 1.0D-7
trdif = -1.D0
! trdif is used for stopping. It equals either
! the difference of traces of psi'*hpsi on two consecutive
! iterations or -1.D0 when this difference is undefined
! (e.g., initial iteration or the iteration following RR)
trG = 0.D0
trG1 = 0.D0
! trG and trG1 are used to evaluate the difference of traces
! of psi'*hpsi on two consecutive iterations
iter = 1
force_repmat = .FALSE.
!
CALL allocate_all
!
! ... Compute block residual w = hpsi - psi*(psi'hpsi) (psi is orthonormal on input)
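! (for the generalized problem, overlap = .true., the residual is computed
!  as w = hpsi - spsi*(psi'hpsi), see the branch below)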
!
call start_clock('ppcg:hpsi')
if (clean) psi(npw+1:npwx,:) = C_ZERO
CALL h_psi( npwx, npw, nbnd, psi, hpsi ) ; if (clean) hpsi(npw+1:npwx,:) = C_ZERO
if (overlap) CALL s_psi( npwx, npw, nbnd, psi, spsi) ; if (overlap .and. clean) spsi(npw+1:npwx,:) = C_ZERO
avg_iter = 1.d0
call stop_clock('ppcg:hpsi')
!
! G = psi'hpsi
call start_clock('ppcg:zgemm')
G = C_ZERO
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (n_start .le. n_end) &
CALL ZGEMM('C','N', nbnd, my_n, kdim, C_ONE, psi, kdimx, hpsi(1,n_start), kdimx, C_ZERO, G(1,n_start), nbnd)
CALL mp_sum( G, inter_bgrp_comm )
!
CALL mp_sum( G, intra_bgrp_comm )
call stop_clock('ppcg:zgemm')
!
! w = hpsi - spsi*G
call start_clock('ppcg:zgemm')
CALL threaded_assign( w, hpsi, kdimx, nact, bgrp_root_only=.true. )
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (overlap) then
if (n_start .le. n_end) &
CALL ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE,spsi(1,n_start), kdimx, G(n_start,1), nbnd, C_ONE, w, kdimx)
else
if (n_start .le. n_end) &
CALL ZGEMM('N','N',kdim, nbnd, my_n, -C_ONE, psi(1,n_start), kdimx, G(n_start,1), nbnd, C_ONE, w, kdimx)
end if
CALL mp_sum( w, inter_bgrp_comm )
call stop_clock('ppcg:zgemm')
!
!
! ... Lock converged eigenpairs (set up act_idx and nact and store current nact in nact_old)
call start_clock('ppcg:lock')
nact_old = nact;
CALL lock_epairs(kdim, nbnd, btype, w, kdimx, lock_tol, nact, act_idx)
call stop_clock('ppcg:lock')
!
! ... Set up iteration parameters after locking
CALL setup_param
!
G1(1:nact,1:nact) = G(act_idx(1:nact), act_idx(1:nact))
trG = get_trace( G1, nbnd, nact )
!
! Print initial info ...
IF (print_info >= 1) THEN
WRITE(stdout, '("Ethr: ",1pD9.2,", npw: ", I10, ", nbnd: ", I10, " , ", &
& "maxter: ",I5, ", sbsize: ", I10,", nsb: ", I10 ,", nact: ", &
& I10, ", trtol: ", 1pD9.2 )') ethr, npw, nbnd, maxter, sbsize, nsb, nact, trtol
IF (print_info == 3) THEN
CALL print_rnrm
WRITE(stdout,'("Res. norm: ", 1pD9.2)') res_array(iter)
END IF
FLUSH( stdout )
END IF
!
!---Begin the main loop
!
DO WHILE ( ((trdif > trtol) .OR. (trdif == -1.D0)) .AND. (iter <= maxter) .AND. (nact > 0) )
!
! ... apply the diagonal preconditioner
!
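! each active column of w is divided element-wise by the diagonal
! preconditioner; for npol = 2 the same preconditioner is applied to both
! spin components, stored with offset npwx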
nblock = (npw-1) / blocksz +1 ! used to optimize some omp parallel do loops
!$omp parallel do collapse(3)
DO j = 1, nact ; DO ipol=0,npol-1 ; DO i=1,nblock
w(1+(i-1)*blocksz+npwx*ipol:MIN(i*blocksz,npw)+npwx*ipol, act_idx(j)) = &
w(1+(i-1)*blocksz+npwx*ipol:MIN(i*blocksz,npw)+npwx*ipol, act_idx(j)) / &
precondition(1+(i-1)*blocksz:MIN(i*blocksz,npw))
END DO ; END DO ; END DO
!$omp end parallel do
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, w, kdimx, nact, act_idx )
G(1:nbnd,1:nact) = C_ZERO
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
if (overlap) then
if (n_start .le. n_end) &
CALL ZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi(1,n_start), kdimx, buffer, kdimx, C_ZERO, G(n_start,1), nbnd )
else
if (n_start .le. n_end) &
CALL ZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi(1,n_start), kdimx, buffer, kdimx, C_ZERO, G(n_start,1), nbnd )
end if
CALL mp_sum( G(1:nbnd,1:nact), inter_bgrp_comm )
!
CALL mp_sum( G(1:nbnd,1:nact), intra_bgrp_comm )
call stop_clock('ppcg:zgemm')
!
! w = w - psi*G
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, w, kdimx, nact, act_idx, bgrp_root_only=.true. )
if (n_start .le. n_end) &
CALL ZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi(1,n_start), kdimx, G(n_start,1), nbnd, C_ONE, buffer, kdimx)
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
w(:,act_idx(1:nact)) = buffer(:,1:nact)
call stop_clock('ppcg:zgemm')
!
! ... Compute h*w
call start_clock('ppcg:hpsi')
call threaded_assign( buffer1, w, kdimx, nact, act_idx )
CALL h_psi( npwx, npw, nact, buffer1, buffer ) ; if (clean) buffer (npw+1:npwx,1:nact) = C_ZERO
hw(:,act_idx(1:nact)) = buffer(:,1:nact)
if (overlap) then ! ... Compute s*w
CALL s_psi( npwx, npw, nact, buffer1, buffer ) ; if (clean) buffer(npw+1:npwx,1:nact) = C_ZERO
sw(:,act_idx(1:nact)) = buffer(:,1:nact)
end if
avg_iter = avg_iter + nact/dble(nbnd)
call stop_clock('ppcg:hpsi')
!
! ... orthogonalize p against psi (in the S inner product)
!ev IF ( MOD(iter, rr_step) /= 1 ) THEN ! In this case, P is skipped after each RR
IF ( iter /= 1 ) THEN
! G = spsi'p
call start_clock('ppcg:zgemm')
G(1:nact,1:nact) = C_ZERO
if (overlap) then
call threaded_assign( buffer, spsi, kdimx, nact, act_idx )
else
call threaded_assign( buffer, psi, kdimx, nact, act_idx )
end if
call threaded_assign( buffer1, p, kdimx, nact, act_idx )
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
if (n_start .le. n_end) &
CALL ZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer(1,n_start), kdimx, buffer1, kdimx, C_ZERO, G(n_start,1), nbnd)
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
!
CALL mp_sum( G(1:nact,1:nact), intra_bgrp_comm )
call stop_clock('ppcg:zgemm')
!
! p = p - psi*G, hp = hp - hpsi*G, sp = sp - spsi*G
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, p, kdimx, nact, act_idx, bgrp_root_only = .true. )
call threaded_assign( buffer1, psi, kdimx, nact, act_idx )
if (n_start .le. n_end) & ! could be done differently
CALL ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1(1,n_start), kdimx, G(n_start,1), nbnd, C_ONE, buffer, kdimx)
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
p(:,act_idx(1:nact)) = buffer(:,1:nact)
call stop_clock('ppcg:zgemm')
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, hp, kdimx, nact, act_idx, bgrp_root_only = .true. )
call threaded_assign( buffer1, hpsi, kdimx, nact, act_idx )
if (n_start .le. n_end) &
CALL ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1(1,n_start), kdimx, G(n_start,1), nbnd, C_ONE, buffer, kdimx)
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
hp(:,act_idx(1:nact)) = buffer(:,1:nact)
call stop_clock('ppcg:zgemm')
!
if (overlap) then
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, sp, kdimx, nact, act_idx, bgrp_root_only = .true. )
call threaded_assign( buffer1, spsi, kdimx, nact, act_idx )
if (n_start .le. n_end) &
CALL ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1(1,n_start), kdimx, G(n_start,1), nbnd, C_ONE, buffer, kdimx)
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
sp(:,act_idx(1:nact)) = buffer(:,1:nact)
call stop_clock('ppcg:zgemm')
end if
END IF
!
! ... for each sub-block construct the small projected matrices K and M
! and store in K_store and M_store
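! Per sub-block, K and M are the projections of H and S onto the trial
! subspace ordered as [psi w p]:
!    K = [psi w p]' * H * [psi w p] ,   M = [psi w p]' * S * [psi w p]
! (only the upper triangle referenced by ZHEGVD('U') is filled);
! each is held in an sbsize3-by-sbsize3 work array, with sbsize3 = 3*sbsize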
!
K_store = C_ZERO
M_store = C_ZERO
!
CALL divide(inter_bgrp_comm,nsb,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nsb,n_start,n_end
DO j = n_start, n_end
!
! Get size of the sub-block and define indices of the corresponding columns
IF ( j < nsb ) THEN
l = sbsize
ELSE
l = sbsize_last
END IF
col_idx(1:l) = act_idx( (/ (i, i = (j-1)*sbsize + 1, (j-1)*sbsize + l) /) )
!
! ... form the local Gram matrices (K,M)
K = C_ZERO
M = C_ZERO
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, psi, kdimx, l, col_idx )
call threaded_assign( buffer1, hpsi, kdimx, l, col_idx )
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, K, sbsize3)
!
if (overlap) then
call threaded_assign( buffer1, spsi, kdimx, l, col_idx ); if (clean) buffer1(npw+1:npwx,1:l) = C_ZERO
else
call threaded_assign( buffer1, buffer, kdimx, l )
end if
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, M, sbsize3)
!
! ---
call threaded_assign( buffer, w, kdimx, l, col_idx )
call threaded_assign( buffer1, hw, kdimx, l, col_idx )
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, K(l+1, l+1), sbsize3)
!
if (overlap) then
call threaded_assign( buffer1, sw, kdimx, l, col_idx )
else
call threaded_assign( buffer1, buffer, kdimx, l )
end if
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, M(l+1, l+1 ), sbsize3)
!
! ---
call threaded_assign( buffer, psi, kdimx, l, col_idx )
call threaded_assign( buffer1, hw, kdimx, l, col_idx ) ; if (clean) buffer1(npw+1:npwx,1:l) = C_ZERO
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, K(1, l+1), sbsize3)
!
if (overlap) then
call threaded_assign( buffer1, sw, kdimx, l, col_idx )
else
call threaded_assign( buffer1, w, kdimx, l, col_idx )
end if
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, M(1, l+1), sbsize3)
call stop_clock('ppcg:zgemm')
!
! ---
!
!ev IF ( MOD(iter,rr_step) /= 1 ) THEN ! In this case, P is skipped after each RR
IF ( iter /= 1 ) THEN
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, p, kdimx, l, col_idx )
call threaded_assign( buffer1, hp, kdimx, l, col_idx )
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, K(2*l + 1, 2*l+1), sbsize3)
!
if (overlap) then
call threaded_assign( buffer1, sp, kdimx, l, col_idx )
else
call threaded_assign( buffer1, buffer, kdimx, l )
end if
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, M(2*l + 1, 2*l+1), sbsize3)
!
! ---
call threaded_assign( buffer, psi, kdimx, l, col_idx )
call threaded_assign( buffer1, hp, kdimx, l, col_idx )
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, K(1, 2*l+1), sbsize3)
!
if (overlap) then
call threaded_assign( buffer1, sp, kdimx, l, col_idx )
else
call threaded_assign( buffer1, p, kdimx, l, col_idx )
end if
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, M(1, 2*l+1), sbsize3)
call stop_clock('ppcg:zgemm')
!
! ---
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer, w, kdimx, l, col_idx )
call threaded_assign( buffer1, hp, kdimx, l, col_idx )
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, K(l+1, 2*l+1), sbsize3)
!
if (overlap) then
call threaded_assign( buffer1, sp, kdimx, l, col_idx )
else
call threaded_assign( buffer1, p, kdimx, l, col_idx )
end if
CALL ZGEMM('C','N', l, l, kdim, C_ONE, buffer, kdimx, buffer1, kdimx, C_ZERO, M(l+1, 2*l+1), sbsize3)
call stop_clock('ppcg:zgemm')
!
END IF
!
! ... store the projected matrices
K_store(:, (j-1)*sbsize3 + 1 : j*sbsize3 ) = K
M_store(:, (j-1)*sbsize3 + 1 : j*sbsize3 ) = M
!
END DO
CALL mp_sum(K_store,inter_bgrp_comm)
CALL mp_sum(M_store,inter_bgrp_comm)
!
CALL mp_sum(K_store,intra_bgrp_comm)
CALL mp_sum(M_store,intra_bgrp_comm)
!
! ... perform nsb 'separate RQ minimizations' and update approximate subspace
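! For each sub-block the small generalized eigenproblem K*c = theta*M*c is
! solved with ZHEGVD; the eigenvectors of the l smallest eigenvalues give
! the coordinates coord_psi, coord_w (and coord_p) used to build the new
! search direction p and the updated sub-block of psi, together with the
! corresponding H- and S-products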
idx(:) = 0 ! find the inactive columns to be kept by root_bgrp_id only
if (my_bgrp_id == root_bgrp_id) then
idx(1:nbnd) = 1 ; idx(act_idx(1:nact)) = 0
end if
DO j = n_start, n_end
!
! Get size of the sub-block and define indices of the corresponding columns
IF ( j < nsb ) THEN
l = sbsize
ELSE
l = sbsize_last
END IF
col_idx(1:l) = act_idx( (/ (i, i = (j-1)*sbsize + 1, (j-1)*sbsize + l) /) )
!
K = K_store(:, (j-1)*sbsize3 + 1 : j*sbsize3)
M = M_store(:, (j-1)*sbsize3 + 1 : j*sbsize3)
!
lcwork = 6*sbsize + 9*sbsize**2
lrwork = 1 + 15*sbsize + 18*sbsize**2
liwork = 3 + 15*sbsize
!ev IF ( MOD(iter, rr_step) /= 1) THEN ! set the dimension of the separate projected eigenproblem
IF ( iter /= 1 ) THEN ! set the dimension of the separate projected eigenproblem
dimp = 3*l
ELSE
dimp = 2*l
END IF
!
CALL ZHEGVD(1, 'V','U', dimp, K, sbsize3, M, sbsize3, D, cwork, lcwork, rwork, lrwork, iwork, liwork, info)
IF (info /= 0) THEN
! reset the matrix and try again with psi and w only
K = K_store(:, (j-1)*sbsize3 + 1 : j*sbsize3)
M = M_store(:, (j-1)*sbsize3 + 1 : j*sbsize3)
dimp = 2*l
CALL ZHEGVD(1, 'V','U', dimp, K, sbsize3, M, sbsize3, D, cwork, lcwork, rwork, lrwork, iwork, liwork, info)
IF (info /= 0) THEN
CALL errore( 'ppcg ',' zhegvd failed ', info )
STOP
END IF
END IF
!
coord_psi(1 : l, 1 : l) = K(1 : l, 1 : l)
coord_w(1 : l, 1 : l) = K(l+1 : 2*l, 1 : l)
!
! ... update the sub-block of P and AP
!ev IF ( MOD(iter, rr_step) /= 1 ) THEN
!sdg IF ( iter /= 1 ) THEN
IF ( dimp == 3*l ) THEN
!
coord_p(1 : l, 1 : l) = K(2*l+1 : 3*l, 1 : l)
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, p, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_p, sbsize, C_ZERO, buffer, kdimx)
call threaded_assign( buffer1, w, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_w, sbsize, C_ONE, buffer, kdimx)
p(:,col_idx(1:l)) = buffer(:,1:l)
call stop_clock('ppcg:zgemm')
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, hp, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_p, sbsize, C_ZERO, buffer, kdimx)
call threaded_assign( buffer1, hw, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_w, sbsize, C_ONE, buffer, kdimx)
hp(:,col_idx(1:l)) = buffer(:,1:l)
call stop_clock('ppcg:zgemm')
!
if (overlap) then
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, sp, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_p, sbsize, C_ZERO, buffer, kdimx)
call threaded_assign( buffer1, sw, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_w, sbsize, C_ONE, buffer, kdimx)
sp(:,col_idx(1:l)) = buffer(:,1:l)
call stop_clock('ppcg:zgemm')
end if
ELSE
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, w, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_w, sbsize, C_ZERO, buffer, kdimx)
p(:,col_idx(1:l)) = buffer(:, 1:l)
call stop_clock('ppcg:zgemm')
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, hw, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_w, sbsize, C_ZERO, buffer, kdimx)
hp(:,col_idx(1:l)) = buffer(:, 1:l)
call stop_clock('ppcg:zgemm')
!
if (overlap) then
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, sw, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_w, sbsize, c_ZERO, buffer, kdimx)
sp(:,col_idx(1:l)) = buffer(:, 1:l)
call stop_clock('ppcg:zgemm')
end if
END IF
!
! Update the sub-blocks of psi and hpsi (and spsi)
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, psi, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_psi, sbsize, C_ZERO, buffer, kdimx)
psi(:, col_idx(1:l)) = buffer(:,1:l) + p(:,col_idx(1:l))
call stop_clock('ppcg:zgemm')
!
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, hpsi, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_psi, sbsize, C_ZERO, buffer, kdimx)
hpsi(:,col_idx(1:l)) = buffer(:,1:l) + hp(:,col_idx(1:l))
call stop_clock('ppcg:zgemm')
!
if (overlap) then
call start_clock('ppcg:zgemm')
call threaded_assign( buffer1, spsi, kdimx, l, col_idx )
CALL ZGEMM('N','N', kdim, l, l, C_ONE, buffer1, kdimx, coord_psi, sbsize, C_ZERO, buffer, kdimx)
spsi(:,col_idx(1:l)) = buffer(:,1:l) + sp(:,col_idx(1:l))
call stop_clock('ppcg:zgemm')
end if
!
idx(col_idx(1:l)) = 1 ! keep track of which columns this bgrp has acted on
END DO ! end 'separate RQ minimizations'
! set to zero the columns not assigned to this bgrp; inactive columns are assigned to root_bgrp
do j=1,nbnd
if (idx(j)==0) then
psi (:,j) = C_ZERO ; hpsi (:,j) = C_ZERO ; p(:,j) = C_ZERO ; hp(:,j) = C_ZERO
end if
end do
CALL mp_sum(psi ,inter_bgrp_comm)
CALL mp_sum(hpsi,inter_bgrp_comm)
CALL mp_sum(p ,inter_bgrp_comm)
CALL mp_sum(hp,inter_bgrp_comm)
if (overlap) then
do j=1,nbnd
if (idx(j)==0) then
spsi (:,j) = C_ZERO ; sp(:,j) = C_ZERO
end if
end do
CALL mp_sum(spsi,inter_bgrp_comm)
CALL mp_sum(sp,inter_bgrp_comm)
end if
!
!
! ... Perform the RR procedure every rr_step
! IF ( (MOD(iter, rr_step) == 0) .AND. (iter /= maxter) ) THEN
IF ( MOD(iter, rr_step) == 0 ) THEN
!
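! extract_epairs_dmat performs a full Rayleigh-Ritz step over all nbnd
! bands: it diagonalizes the distributed projected Hamiltonian (and overlap)
! and rotates psi, hpsi (and spsi) to the Ritz vectors, updating e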
call start_clock('ppcg:RR')
CALL extract_epairs_dmat(kdim, nbnd, kdimx, e, psi, hpsi, spsi )
call stop_clock('ppcg:RR')
!
IF (print_info >= 2) WRITE(stdout, *) 'RR has been invoked.' ; !CALL flush( stdout )
!
! ... Compute the new residual vector block by evaluating
! residuals for individual eigenpairs in psi and e
nblock = (kdim-1) / blocksz + 1 ! used to optimize some omp parallel do loops
if (overlap) then
!$omp parallel do collapse(2)
DO j = 1, nbnd ; DO i=1,nblock
w( 1+(i-1)*blocksz:MIN(i*blocksz,kdim) ,j ) = hpsi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j ) &
- spsi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j )*e( j )
END DO ; END DO
!$omp end parallel do
else
!$omp parallel do collapse(2)
DO j = 1, nbnd ; DO i=1,nblock
w( 1+(i-1)*blocksz:MIN(i*blocksz,kdim) ,j ) = hpsi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j ) &
- psi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j )*e( j )
END DO ; END DO
!$omp end parallel do
end if
!
! ... Lock converged eigenpairs (set up act_idx and nact)
call start_clock('ppcg:lock')
nact_old = nact;
CALL lock_epairs(kdim, nbnd, btype, w, kdimx, lock_tol, nact, act_idx)
call stop_clock('ppcg:lock')
!
! ... Set up iteration parameters after locking
CALL setup_param
!
!
trG1 = 0.D0
trdif = -1.D0
trG = SUM( e(act_idx(1:nact)) )
!
ELSE
!
!! EV begin
! ... orthogonalize psi and update hpsi accordingly
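! Cholesky QR: with R the Cholesky factor of psi'*S*psi (or psi'*psi when
! overlap = .false.), the active columns are replaced by psi*inv(R), which
! makes them S-orthonormal; hpsi and spsi are transformed by the same
! inv(R) so that the stored products stay consistent with psi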
IF ( .NOT. force_repmat ) THEN
!
call threaded_assign( buffer, psi, kdimx, nact, act_idx )
if (overlap) then
call threaded_assign( buffer1, spsi, kdimx, nact, act_idx )
else
call threaded_assign( buffer1, buffer, kdimx, nact )
end if
!
call start_clock('ppcg:cholQR')
CALL cholQR_dmat(kdim, nact, buffer, buffer1, kdimx, Gl, idesc)
call stop_clock('ppcg:cholQR')
!
psi(:,act_idx(1:nact)) = buffer(:,1:nact)
!
call threaded_assign( buffer1, hpsi, kdimx, nact, act_idx )
call start_clock('ppcg:ZTRSM')
CALL zgemm_dmat( kdim, nact, kdimx, idesc, C_ONE, buffer1, Gl, C_ZERO, buffer )
call stop_clock('ppcg:ZTRSM')
!
hpsi(:,act_idx(1:nact)) = buffer(:,1:nact)
!
if (overlap) then
call threaded_assign( buffer1, spsi, kdimx, nact, act_idx )
call start_clock('ppcg:ZTRSM')
CALL zgemm_dmat( kdim, nact, kdimx, idesc, C_ONE, buffer1, Gl, C_ZERO, buffer )
call stop_clock('ppcg:ZTRSM')
!
spsi(:,act_idx(1:nact)) = buffer(:,1:nact)
end if
ELSE
!
call threaded_assign( buffer, psi, kdimx, nact, act_idx )
if (overlap) then
call threaded_assign( buffer1, spsi, kdimx, nact, act_idx )
else
call threaded_assign( buffer1, buffer, kdimx, nact )
end if
!
call start_clock('ppcg:cholQR')
CALL cholQR(kdim, nact, buffer, buffer1, kdimx, G, nbnd)
call stop_clock('ppcg:cholQR')
!
psi(:,act_idx(1:nact)) = buffer(:,1:nact)
!
call threaded_assign( buffer, hpsi, kdimx, nact, act_idx )
!
call start_clock('ppcg:ZTRSM')
CALL ZTRSM('R', 'U', 'N', 'N', kdim, nact, C_ONE, G, nbnd, buffer, kdimx)
call stop_clock('ppcg:ZTRSM')
!
hpsi(:,act_idx(1:nact)) = buffer(:,1:nact)
!
if (overlap) then
call threaded_assign( buffer, spsi, kdimx, nact, act_idx )
!
call start_clock('ppcg:ZTRSM')
CALL ZTRSM('R', 'U', 'N', 'N', kdim, nact, C_ONE, G, nbnd, buffer, kdimx)
call stop_clock('ppcg:ZTRSM')
!
spsi(:,act_idx(1:nact)) = buffer(:,1:nact)
end if
END IF
!! EV end
!
! ... Compute the new subspace residual for active columns
!
! G = psi'hpsi
call threaded_assign( buffer, psi, kdimx, nact, act_idx )
call threaded_assign( buffer1, hpsi, kdimx, nact, act_idx )
call start_clock('ppcg:zgemm')
G = C_ZERO
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
if (n_start .le. n_end) &
CALL ZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer, kdimx, buffer1(1,n_start), kdimx, C_ZERO, G(1,n_start), nbnd)
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
!
CALL mp_sum(G(1:nact,1:nact), intra_bgrp_comm)
call stop_clock('ppcg:zgemm')
!
! w = hpsi - spsi*G
call threaded_assign( buffer, hpsi, kdimx, nact, act_idx, bgrp_root_only=.true. )
if (overlap) then
call threaded_assign( buffer1, spsi, kdimx, nact, act_idx )
else
call threaded_assign( buffer1, psi, kdimx, nact, act_idx )
end if
call start_clock('ppcg:zgemm')
if (n_start .le. n_end) &
CALL ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1(1,n_start), kdimx, G(n_start,1), nbnd, C_ONE, buffer, kdimx)
CALL mp_sum( buffer(:,1:nact), inter_bgrp_comm )
w(:,act_idx(1:nact)) = buffer(:,1:nact);
call stop_clock('ppcg:zgemm')
!
! ... Compute trace of the projected matrix on current iteration
! trG1 = get_trace( G(1:nact, 1:nact), nact )
trG1 = get_trace( G, nbnd, nact )
trdif = ABS(trG1 - trG)
trG = trG1
!
END IF
!
! Print iteration info ...
IF (print_info >= 1) THEN
WRITE(stdout, '("iter: ", I5, " nact = ", I5, ", trdif = ", 1pD9.2, ", trtol = ", 1pD9.2 )') iter, nact, trdif, trtol
IF (print_info == 3) THEN
CALL print_rnrm
WRITE(stdout,'("Res. norm: ", 1pD9.2)') res_array(iter)
END IF
FLUSH( stdout )
END IF
!
total_iter = iter
iter = iter + 1
!
!
END DO !---End the main loop
!
! IF (nact > 0) THEN
IF ( MOD(iter-1, rr_step) /= 0 ) THEN ! if RR has not just been performed
! if nact==0 then the RR has just been performed
! in the main loop
call start_clock('ppcg:RR')
CALL extract_epairs_dmat(kdim, nbnd, kdimx, e, psi, hpsi, spsi )
call stop_clock('ppcg:RR')
!
! ... Compute residuals
if (overlap) then
!$omp parallel do collapse(2)
DO j = 1, nbnd ; DO i=1,nblock
w( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j ) = hpsi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j ) &
- spsi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j )*e( j )
END DO ; END DO
!$omp end parallel do
else
!$omp parallel do collapse(2)
DO j = 1, nbnd ; DO i=1,nblock
w( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j ) = hpsi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j ) &
- psi( 1+(i-1)*blocksz:MIN(i*blocksz,kdim), j )*e( j )
END DO ; END DO
!$omp end parallel do
end if
!
! ... Get the number of converged eigenpairs and their indices
! Note: The tolerance is 10*lock_tol, i.e., weaker than lock_tol
! E.V. notconv issue should be addressed
call start_clock('ppcg:lock')
CALL lock_epairs(kdim, nbnd, btype, w, kdimx, 10*lock_tol, nact, act_idx)
call stop_clock('ppcg:lock')
!
END IF
!
! E.V. notconv issue comment
notconv = 0 ! nact
!
IF (print_info >= 1) THEN
WRITE(stdout, *) '-----------PPCG result summary ... ----------------'
WRITE(stdout, '("avg_iter: ", f6.2, ", notconv: ", I5)') avg_iter, notconv
FLUSH( stdout )
END IF
!
CALL deallocate_all
!
CALL stop_clock( 'ppcg_k' )
!
!!!EV-BANDS
if (print_info == 3) then
write (stdout,'(1pD9.2)') ( res_array(j), j=1,maxter )
end if
!
RETURN
!
CONTAINS
!
SUBROUTINE allocate_all
!
! This subroutine allocates memory for the eigensolver
!
INTEGER :: nx
! maximum local block dimension
!
ALLOCATE ( hpsi(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate hpsi ', ABS(ierr) )
if (overlap) ALLOCATE ( spsi(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate spsi ', ABS(ierr) )
ALLOCATE ( w(kdimx,nbnd), hw(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate w and hw ', ABS(ierr) )
if (overlap) ALLOCATE ( sw(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate sw ', ABS(ierr) )
ALLOCATE ( p(kdimx,nbnd), hp(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate p and hp ', ABS(ierr) )
if (overlap) ALLOCATE ( sp(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate sp ', ABS(ierr) )
ALLOCATE ( K(sbsize3, sbsize3), M(sbsize3,sbsize3), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate K and M ', ABS(ierr) )
ALLOCATE ( cwork( 1 + 18*sbsize + 18*sbsize**2 ), rwork( 1 + 18*sbsize + 18*sbsize**2 ), iwork(3 + 15*sbsize), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate lapack work arrays ', ABS(ierr) )
!
CALL desc_init( nbnd, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
!
IF ( la_proc ) THEN
ALLOCATE( Gl( nx, nx ), STAT=ierr )
ELSE
ALLOCATE( Gl( 1, 1 ), STAT=ierr )
END IF
IF( ierr /= 0 ) CALL errore( 'ppcg ',' cannot allocate Gl ', ABS(ierr) )
END SUBROUTINE allocate_all
!
!
!
SUBROUTINE cholQR(n, k, X, SX, ldx, R, ldr)
!
! This subroutine orthogonalizes X using the Cholesky decomposition.
! The matrix X'X and its Cholesky factor are replicated on all processors.
! TBD: if the Cholesky orthogonalization fails, fall back to a slower but
! more accurate Householder QR (not yet implemented).
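!
! Algorithm: form XTX = X'*SX (summed over intra_bgrp_comm), compute its
! Cholesky factor R with ZPOTRF, and overwrite X with X*inv(R) via a
! triangular solve (ZTRSM), so that the returned X satisfies X'*SX = I.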
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT (IN) :: n, k, ldx, ldr
COMPLEX(DP), INTENT (INOUT) :: X(ldx,*), SX(ldx,*)
COMPLEX(DP), INTENT(OUT) :: R(ldr,*)
!
! ... local variables
!
COMPLEX(DP):: XTX(k,k)
INTEGER :: ierr = 0, info = 0
!
! ... do Cholesky QR unless X is rank deficient
!
CALL ZGEMM('C','N', k, k, n, C_ONE, X, ldx, SX, ldx, C_ZERO, XTX, k)
!
CALL mp_sum( XTX, intra_bgrp_comm )
!
CALL ZPOTRF('U', k, XTX, k, info)
IF ( info == 0 ) THEN
!
! ... triangular solve
CALL ZTRSM('R', 'U', 'N', 'N', n, k, C_ONE, XTX, k, X, ldx)
!
ELSE
! TBD: QR
WRITE(stdout,*) '[Q, R] = qr(X, 0) failed'
STOP
! ![X, R] = qr(X, 0)
! ! QR without forming Q
! !
! call zgeqrf(nn,kk,X,nn,tau,wqr,lwqr,info)
! !
! if (info<0) then
! write(*,*), '[Q, R] = qr(X, 0) failed'
! stop
! endif
! ! QR: constructing Q
! !
! call zungqr(nn,kk,kk,X,nn,tau,wqr,lwqr,info)
! !
END IF
!
! ... also return R factor if needed
!
CALL ZLACPY('U', k, k, XTX, k, R, ldr)
!
RETURN
!
END SUBROUTINE cholQR
!
!
!
SUBROUTINE cholQR_dmat(kdim, k, X, SX, kdimx, Rl, idesc)
!
! Distributed version of cholQR
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT (IN) :: kdim, k, kdimx
COMPLEX(DP), INTENT (INOUT) :: X(kdimx,k), SX(kdimx,k)
INTEGER, INTENT (IN) :: idesc(:)
COMPLEX(DP), INTENT(OUT) :: Rl(:, :)
! inverse of the upper triangular Cholesky factor
!
! ... local variables
!
COMPLEX(DP) :: buffer(kdimx,k)
COMPLEX(DP), ALLOCATABLE :: XTXl(:,:)
INTEGER :: nx, ierr = 0
#ifdef __SCALAPACK
INTEGER :: desc_sca( 16 ), info
#endif
!
nx = idesc(LAX_DESC_NRCX)
!
IF ( la_proc ) THEN
!
ALLOCATE( XTXl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate XTXl ', ABS(ierr) )
!
ELSE
!
ALLOCATE( XTXl( 1, 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate XTXl ', ABS(ierr) )
!
END IF
!
! ... Perform Cholesky of X'X
!
CALL compute_distmat(XTXl, idesc, X, SX, k)
!
IF ( la_proc ) THEN
!
#ifdef __SCALAPACK
CALL descinit( desc_sca, k, k, nx, nx, 0, 0, ortho_cntx, SIZE( XTXl, 1 ), info )
IF( info /= 0 ) CALL errore( ' ppcg ', ' descinit ', ABS( info ) )
!
CALL PZPOTRF( 'U', k, XTXl, 1, 1, desc_sca, info )
! IF( info /= 0 ) CALL errore( ' ppcg ', ' problems computing cholesky ', ABS( info ) )
!
IF ( info == 0 ) THEN
!
! ! set the lower triangular part to zero
! CALL sqr_zsetmat( 'L', k, C_ZERO, XTXl, size(XTXl,1), desc )
!
! find inverse of the upper triangular Cholesky factor R
CALL PZTRTRI( 'U', 'N', k, XTXl, 1, 1, desc_sca, info )
IF( info /= 0 ) CALL errore( ' ppcg ', ' problems computing inverse ', ABS( info ) )
!
! set the lower triangular part to zero
CALL sqr_setmat( 'L', k, C_ZERO, XTXl, size(XTXl,1), idesc )
!
ELSE
! TBD: QR
WRITE(stdout,*) '[Q, R] = qr(X, 0) failed'
STOP
! ![X, R] = qr(X, 0)
! ! QR without forming Q
! !
! call zgeqrf(nn,kk,X,nn,tau,wqr,lwqr,info)
! !
! if (info<0) then
! write(*,*), '[Q, R] = qr(X, 0) failed'
! stop
! endif
! ! QR: constructing Q
! !
! call zungqr(nn,kk,kk,X,nn,tau,wqr,lwqr,info)
! !
END IF
#else
CALL laxlib_pzpotrf( XTXl, nx, k, idesc )
!
CALL laxlib_pztrtri ( XTXl, nx, k, idesc )
#endif
!
!
END IF
!
CALL zgemm_dmat( kdim, k, kdimx, idesc, C_ONE, X, XTXl, C_ZERO, buffer )
!
X = buffer
! ... also return R factor
Rl = XTXl
!
DEALLOCATE(XTXl)
!
RETURN
!
END SUBROUTINE cholQR_dmat
!
!
!
SUBROUTINE lock_epairs(kdim, nbnd, btype, w, kdimx, tol, nact, act_idx)
!
! Lock converged eigenpairs: detect "active" columns of w
! by checking if each column has norm greater than tol.
! Returns the number of active columns and their indices.
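! A column is considered converged ("locked") when its residual norm falls
! below the tolerance; for bands with btype = 0 (not to be converged to
! full accuracy) the weaker tolerance max(2.5*tol, 1.d-3) is used.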
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT (IN) :: kdim, kdimx, nbnd, btype(nbnd)
COMPLEX(DP), INTENT (IN) :: w(kdimx,nbnd)
REAL(DP), INTENT (IN) :: tol
INTEGER, INTENT(OUT) :: nact, act_idx(nbnd)
!
! ... local variables
!
INTEGER :: j
REAL(DP) :: rnrm_store(nbnd), band_tollerance
REAL(DP), EXTERNAL :: DDOT
!
nact = 0
! ... Compute the norm of each column of w
rnrm_store = 0.D0
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
DO j = n_start, n_end
!
rnrm_store(j) = DDOT(2*kdim, w(:,j), 1, w(:,j), 1)
!
END DO
CALL mp_sum( rnrm_store, inter_bgrp_comm )
!
CALL mp_sum( rnrm_store, intra_bgrp_comm )
!
DO j = 1, nbnd
!
if ( btype(j) == 0 ) then
band_tollerance = max(2.5*tol,1.d-3)
else
band_tollerance = tol
end if
!
rnrm_store(j) = SQRT( rnrm_store(j) )
!
IF ( (print_info >= 2) .AND. (iter > 1) ) THEN
write(stdout, '( "Eigenvalue ", I5, " = ", 1pe12.4, ". Residual norm = ", 1pe9.2)') &
j, e(j), rnrm_store(j)
END IF
!
IF ( rnrm_store(j) > band_tollerance ) THEN
nact = nact + 1
act_idx(nact) = j
END IF
!
END DO
!
!
END SUBROUTINE lock_epairs
!
!
SUBROUTINE setup_param
!
! Based on the information about active columns of psi,
! set up iteration parameters, such as:
! - number of sub-blocks
! - size of the last sub-block
! - tolerance level for the trace difference of reduced hamiltonians
! on consecutive iterations
! - replicate or re-distribute the Gram matrix G
!
IMPLICIT NONE
!
! ... I/O variables
!
! INTEGER, INTENT (IN) :: nbnd, nact, act_idx(nbnd), sbsize
! INTEGER, INTENT(OUT) :: nsb, sbsize_last
! REAL(DP), INTENT(OUT) :: tol
!
! ... local variables
!
INTEGER :: ierr
INTEGER :: nx, act_thresh
! maximum local block dimension
! Threshold on the number of active vectors that determines if
! Cholesky is distributed or replicated
!
!
! ... Compute the number of sub-blocks and the size of the last sub-block
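! e.g. nact = 10, sbsize = 4 gives nsb = 3 sub-blocks of sizes 4, 4 and 2
! (sbsize_last = 2)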
sbsize_last = sbsize
nsb = FLOOR( DBLE(nact)/DBLE(sbsize) )
IF ( MOD( nact, sbsize ) /= 0) THEN
sbsize_last = nact - sbsize*nsb
nsb = nsb + 1
END IF
!
! ... Compute the current tolerance level for nact pairs
trtol = ethr*SQRT(DBLE(nact)) ! MIN( ethr*SQRT(DBLE(nact)), 1.0D-2 )
!
! ... Redistribute matrices for Cholesky because number of active vectors has changed
!
! Don't want to run distributed Cholesky for matrices of size <= 100
act_thresh = MAX( 100, np_ortho(1) )
!
IF ( ( nact > act_thresh ) .AND. (nact /= nact_old) ) THEN
!
IF ( ALLOCATED(Gl) ) DEALLOCATE(Gl)
!
CALL desc_init( nact, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
!
IF ( la_proc ) THEN
!
ALLOCATE( Gl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate Gl ', ABS(ierr) )
!
ELSE
!
ALLOCATE( Gl( 1, 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate Gl ', ABS(ierr) )
!
END IF
!
force_repmat = .FALSE.
!
ELSE
! replicate Cholesky because number of active vectors is small
!
IF (nact <= act_thresh) THEN
!
force_repmat = .TRUE.
!
IF ( ALLOCATED(Gl) ) DEALLOCATE(Gl)
!
ELSE
!
force_repmat = .FALSE.
!
END IF
!
END IF
!
!
! ... Allocate storage for the small matrices in separate minimization loop
IF ( ALLOCATED(K_store) ) DEALLOCATE(K_store)
IF ( ALLOCATED(M_store) ) DEALLOCATE(M_store)
ALLOCATE ( K_store(sbsize3, sbsize3*nsb), M_store(sbsize3,sbsize3*nsb), stat = ierr )
IF (ierr /= 0) &
CALL errore( 'ppcg ',' cannot allocate K_store and M_store ', ABS(ierr) )
!
RETURN
!
END SUBROUTINE setup_param
!
!
!
!
SUBROUTINE extract_epairs_dmat(kdim, nbnd, kdimx, e, psi, hpsi, spsi)
!
! Perform the Rayleigh-Ritz procedure to "rotate" psi to the eigenvectors.
! The Gram matrices are distributed over the la processor group
!
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT (IN) :: kdim, kdimx, nbnd
REAL(DP), INTENT (OUT) :: e(nbnd)
COMPLEX(DP), INTENT (INOUT) :: psi(kdimx,nbnd), hpsi(kdimx,nbnd), spsi(kdimx,nbnd)
! LOGICAL, INTENT(IN), OPTIONAL :: ortho
! ortho = .true.(default) then orthogonalization of psi prior to the RR is enforced
!
! ... local variables
!
COMPLEX (DP), ALLOCATABLE :: Hl(:,:), Sl(:,:)
! local part of projected Hamiltonian and of the overlap matrix
INTEGER :: idesc(LAX_DESC_SIZE)
!
! Matrix distribution descriptors used to temporarily store the current "global" descriptor
LOGICAL :: la_proc_store
! flag to distinguish procs involved in linear algebra
INTEGER, ALLOCATABLE :: irc_ip_store( : )
INTEGER, ALLOCATABLE :: nrc_ip_store( : )
INTEGER, ALLOCATABLE :: rank_ip_store( :, : )
!
COMPLEX(DP), ALLOCATABLE :: psi_t(:, :), hpsi_t(:, :), spsi_t(:, :)
COMPLEX(DP), ALLOCATABLE :: vl(:,:)
COMPLEX(DP) :: buffer(kdimx,nbnd)
! REAL(DP) :: R(nbnd, nbnd)
INTEGER :: nx
! LOGICAL :: do_orth
!
ALLOCATE ( psi_t(kdimx,nbnd), hpsi_t(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate psi_t and hpsi_t ', ABS(ierr) )
if (overlap) ALLOCATE ( spsi_t(kdimx,nbnd), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate spsi_t ', ABS(ierr) )
!
! Store current distributed matrix descriptor information
ALLOCATE ( irc_ip_store( np_ortho(1) ), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate irc_ip_store ', ABS(ierr) )
!
ALLOCATE ( nrc_ip_store( np_ortho(1) ), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate nrc_ip_store ', ABS(ierr) )
!
ALLOCATE ( rank_ip_store( np_ortho(1), np_ortho(2) ), stat = ierr )
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate rank_ip_store ', ABS(ierr) )
!
irc_ip_store = irc_ip
nrc_ip_store = nrc_ip
rank_ip_store = rank_ip
!
CALL desc_init( nbnd, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
!
IF ( la_proc ) THEN
!
ALLOCATE( vl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate vl ', ABS(ierr) )
!
ALLOCATE( Sl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate Sl ', ABS(ierr) )
!
ALLOCATE( Hl( nx, nx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate Hl ', ABS(ierr) )
!
ELSE
!
ALLOCATE( vl( 1, 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate vl ', ABS(ierr) )
!
ALLOCATE( Sl( 1, 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate Sl ', ABS(ierr) )
!
ALLOCATE( Hl( 1, 1 ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( 'ppcg ',' cannot allocate Hl ', ABS(ierr) )
!
END IF
!
!
! G = psi'*hpsi
CALL compute_distmat(Hl, idesc, psi, hpsi, nbnd)
if (overlap) then
CALL compute_distmat(Sl, idesc, psi, spsi, nbnd)
else
CALL compute_distmat(Sl, idesc, psi, psi, nbnd)
end if
!
! ... diagonalize the reduced hamiltonian
! Calling block parallel algorithm
!
IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg e and vl are the same across ortho_parent_comm
! only the first bgrp performs the diagonalization
IF( my_bgrp_id == root_bgrp_id) CALL pdiaghg( nbnd, Hl, Sl, nx, e, vl, idesc )
IF( nbgrp > 1 ) THEN ! results must be brodcast to the other band groups
CALL mp_bcast( vl, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( e, root_bgrp_id, inter_bgrp_comm )
ENDIF
ELSE
CALL pdiaghg( nbnd, Hl, Sl, nx, e, vl, idesc )
END IF
!
! "Rotate" psi to eigenvectors
!
CALL zgemm_dmat( kdim, nbnd, kdimx, idesc, C_ONE, psi, vl, C_ZERO, psi_t )
CALL zgemm_dmat( kdim, nbnd, kdimx, idesc, C_ONE, hpsi, vl, C_ZERO, hpsi_t )
if (overlap) CALL zgemm_dmat( kdim, nbnd, kdimx, idesc, C_ONE, spsi, vl, C_ZERO, spsi_t )
!
psi = psi_t ; hpsi = hpsi_t ; if (overlap) spsi = spsi_t
!
! Restore current "global" distributed matrix descriptor
!
irc_ip = irc_ip_store
nrc_ip = nrc_ip_store
rank_ip = rank_ip_store
!
DEALLOCATE ( irc_ip_store, nrc_ip_store, rank_ip_store )
DEALLOCATE(psi_t, hpsi_t) ; if (overlap) DEALLOCATE(spsi_t)
DEALLOCATE(Hl, Sl)
DEALLOCATE(vl)
!
END SUBROUTINE extract_epairs_dmat
!
!
!
REAL(DP) FUNCTION get_trace(G, ld, k)
!
! This function returns the trace of the leading k-by-k block of the matrix G
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT (IN) :: ld, k
COMPLEX(DP), INTENT (IN) :: G(ld,*)
!
! ... local variables
!
INTEGER :: j
!
get_trace = 0.D0
DO j = 1, k
get_trace = get_trace + DBLE(G(j, j))
END DO
!
RETURN
!
END FUNCTION get_trace
!
!
!
SUBROUTINE deallocate_all
!
! This subroutine releases the allocated memory
!
IF ( ALLOCATED(hpsi) ) DEALLOCATE ( hpsi )
IF ( ALLOCATED(spsi) ) DEALLOCATE ( spsi )
IF ( ALLOCATED(w) ) DEALLOCATE ( w )
IF ( ALLOCATED(hw) ) DEALLOCATE ( hw )
IF ( ALLOCATED(sw) ) DEALLOCATE ( sw )
IF ( ALLOCATED(p) ) DEALLOCATE ( p )
IF ( ALLOCATED(hp) ) DEALLOCATE ( hp )
IF ( ALLOCATED(sp) ) DEALLOCATE ( sp )
IF ( ALLOCATED(K) ) DEALLOCATE ( K )
IF ( ALLOCATED(M) ) DEALLOCATE ( M )
IF ( ALLOCATED(K_store) ) DEALLOCATE ( K_store )
IF ( ALLOCATED(M_store) ) DEALLOCATE ( M_store )
IF ( ALLOCATED(cwork) ) DEALLOCATE ( cwork )
IF ( ALLOCATED(rwork) ) DEALLOCATE ( rwork )
IF ( ALLOCATED(iwork) ) DEALLOCATE ( iwork )
IF ( ALLOCATED(irc_ip) ) DEALLOCATE ( irc_ip )
IF ( ALLOCATED(nrc_ip) ) DEALLOCATE ( nrc_ip )
IF ( ALLOCATED(rank_ip) ) DEALLOCATE ( rank_ip )
IF ( ALLOCATED(Gl) ) DEALLOCATE ( Gl )
! IF ( ALLOCATED(G) ) DEALLOCATE( G )
! IF ( ALLOCATED(Sl) ) DEALLOCATE( Sl )
!
END SUBROUTINE deallocate_all
!
!
SUBROUTINE compute_distmat( dm, idesc, v, w, k)
! Copied from pcegterg, with the descriptor idesc added as a parameter
!
! This subroutine computes <vi|wj> and stores the
! result in the distributed matrix dm
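!
! Each (ipr,ipc) block of the Gram matrix is computed locally with ZGEMM
! and accumulated on the processor owning that block via mp_root_sum;
! only half of the blocks are computed explicitly and the full Hermitian
! matrix is completed afterwards by laxlib_zsqmher.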
!
IMPLICIT NONE
!
! ... I/O variables
!
COMPLEX(DP), INTENT(OUT) :: dm( :, : )
INTEGER, INTENT(IN) :: idesc(:)
COMPLEX(DP), INTENT(IN) :: v(:,:), w(:,:)
INTEGER, INTENT(IN) :: k
! global size of dm = number of vectors in v and w blocks
!
! ... local variables
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
COMPLEX(DP), ALLOCATABLE :: work( :, : )
INTEGER :: nx
! maximum local block dimension
!
nx = idesc(LAX_DESC_NRCX)
!
ALLOCATE( work( nx, nx ) )
!
work = C_ZERO
!
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
DO ipr = 1, ipc ! use symmetry for the loop on row procs
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
! rank of the processor to which this block (ipr,ipc) is destined
!
root = rank_ip( ipr, ipc )
! use blas subs. on the matrix block
CALL ZGEMM( 'C','N', nr, nc, kdim, C_ONE, v(1,ir), kdimx, w(1,ic), kdimx, C_ZERO, work, nx )
! accumulate result on dm of root proc.
CALL mp_root_sum( work, dm, root, ortho_parent_comm )
END DO
!
END DO
!
if (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) dm = dm/nbgrp
!
CALL laxlib_zsqmher( k, dm, nx, idesc )
!
DEALLOCATE( work )
!
RETURN
END SUBROUTINE compute_distmat
!
!
SUBROUTINE zgemm_dmat( n, k, ld, idesc, alpha, X, Gl, beta, Y )
! Copy-paste from refresh_evc in pregterg with some modifications
!
! Compute Y = alpha*(X*G) + beta*Y, where G is distributed across la processor group
! and is given by local block Gl.
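! Each locally owned block of G is broadcast from its owner (mp_bcast) and
! multiplied into the corresponding row-block of X with ZGEMM, accumulating
! the column-blocks of the result in Xtmp.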
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: n, k, ld
! number of rows of X and Y
! number of columns of X,Y and size/leading dimension of (global) G
! leading dimension of X and Y
INTEGER, INTENT(IN) :: idesc(:)
! descriptor of G
COMPLEX(DP), INTENT(IN) :: alpha, beta
COMPLEX(DP), INTENT (IN) :: X(ld, k)
COMPLEX(DP), INTENT (INOUT) :: Y(ld, k)
COMPLEX(DP), INTENT(IN) :: Gl( :, :)
!
! ... local variables
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
COMPLEX(DP), ALLOCATABLE :: Gltmp( :, : )
COMPLEX(DP), ALLOCATABLE :: Xtmp( :, : )
COMPLEX(DP) :: gamm
INTEGER :: nx
!
nx = idesc(LAX_DESC_NRCX)
!
ALLOCATE( Gltmp( nx, nx ) )
ALLOCATE( Xtmp( ld, k ) )
!
DO ipc = 1, idesc(LAX_DESC_NPC)
!
nc = nrc_ip( ipc )
ic = irc_ip( ipc )
!
IF( ic <= k ) THEN
!
nc = min( nc, k - ic + 1 )
!
gamm = C_ZERO
DO ipr = 1, idesc(LAX_DESC_NPR)
!
nr = nrc_ip( ipr )
ir = irc_ip( ipr )
!
root = rank_ip( ipr, ipc )
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends its block
!
CALL mp_bcast( Gl(:,1:nc), root, ortho_parent_comm )
CALL ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gl, nx, gamm, Xtmp(1,ic), ld )
ELSE
!
! all other procs receive
!
CALL mp_bcast( Gltmp(:,1:nc), root, ortho_parent_comm )
CALL ZGEMM( 'N','N', n, nc, nr, C_ONE, X(1,ir), ld, Gltmp, nx, gamm, Xtmp(1,ic), ld )
END IF
!
gamm = C_ONE
!
END DO
!
END IF
!
END DO
!
IF (beta /= C_ZERO) THEN
Y = alpha*Xtmp + beta*Y
ELSE
Y = alpha*Xtmp
END IF
!
DEALLOCATE( Gltmp )
DEALLOCATE( Xtmp )
!
RETURN
!
END SUBROUTINE zgemm_dmat
!
! dmat end
SUBROUTINE print_rnrm
!
! Compute the subspace residual
!
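! The squared Frobenius norm of hpsi - psi*(psi'hpsi) is accumulated over
! the first nbnd - nguard columns (via ZLANGE on the local G-vector block,
! then mp_sum) and its square root is stored in res_array(iter).
!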
COMPLEX(DP), ALLOCATABLE :: psi_t(:,:), hpsi_t(:,:), res(:,:)
COMPLEX(DP), ALLOCATABLE :: G(:,:), work(:)
REAL(DP) :: rnrm
REAL(DP), EXTERNAL :: ZLANGE
!ev new begin
integer :: nwanted, nguard
nguard = 0 ! 24 ! 50
!ev new end
rnrm = 0.D0
ALLOCATE(psi_t(kdimx,nbnd), hpsi_t(kdimx,nbnd), res(kdimx,nbnd))
ALLOCATE(G(nbnd, nbnd))
!
!!! EV begin comment
! psi_t(:,1:nbnd) = psi(:,1:nbnd)
! hpsi_t(:,1:nbnd) = hpsi(:,1:nbnd)
! !
! ! G = psi_t'hpsi_t
! CALL ZGEMM('C','N', nbnd, nbnd, npw, C_ONE, psi_t, npwx, hpsi_t, npwx, C_ZERO, G, nbnd)
! CALL mp_sum( G, intra_bgrp_comm )
! !
! ! res = hpsi_t - psi*G
! res = hpsi_t;
! CALL ZGEMM('N','N',npw, nbnd, nbnd, -C_ONE, psi_t, npwx, G, nbnd, C_ONE, res, npwx)
! !
! ! ... get the Frobenius norm of the residual
! rnrm = ZLANGE('F', npw, nbnd, res, npwx, work) ! work is not referenced for Frobenius norm
! rnrm = abs(rnrm)**2
!!! EV end comment
!!!
!!!
! EV begin new
! Instead of computing the norm of the whole residual, compute it for X(:,1:nbnd-nguard)
nwanted = nbnd - nguard
psi_t(:,1:nwanted) = psi(:,1:nwanted)
hpsi_t(:,1:nwanted) = hpsi(:,1:nwanted)
!
! G = psi_t'hpsi_t
CALL ZGEMM('C','N', nwanted, nwanted, kdim, C_ONE, psi_t, kdimx, hpsi_t, kdimx, C_ZERO, G, nbnd)
CALL mp_sum( G, intra_bgrp_comm )
!
! res = hpsi_t - psi*G
res(:,1:nwanted) = hpsi_t(:,1:nwanted);
CALL ZGEMM('N','N',kdim, nwanted, nwanted, -C_ONE, psi_t, kdimx, G, nbnd, C_ONE, res, kdimx)
!
! ... get the Frobenius norm of the residual
rnrm = ZLANGE('F', kdim, nwanted, res, kdimx, work) ! work is not referenced for Frobenius norm
rnrm = rnrm**2
! EV end new
!
CALL mp_sum( rnrm, intra_bgrp_comm )
!
rnrm = SQRT(ABS(rnrm))
res_array(iter) = rnrm
!
DEALLOCATE(psi_t, hpsi_t, res)
DEALLOCATE(G)
!
! !CALL flush( stdout )
!
END SUBROUTINE print_rnrm
!- routines to perform threaded assignments
SUBROUTINE threaded_assign(array_out, array_in, kdimx, nact, act_idx, bgrp_root_only)
!
! assign (copy) a complex array in a threaded way
!
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, 1:nact ) or
!
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, act_idx(1:nact) )
!
! if the index array act_idx is given
!
! if bgrp_root_only is present and .true., the assignment is made only by the
! MPI root process of the bgrp, and array_out is zeroed otherwise
!
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: kdimx, nact
COMPLEX(DP), INTENT(OUT) :: array_out( kdimx, nact )
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, * )
INTEGER, INTENT(IN), OPTIONAL :: act_idx( * )
LOGICAL, INTENT(IN), OPTIONAL :: bgrp_root_only
!
INTEGER, PARAMETER :: blocksz = 256
INTEGER :: nblock
INTEGER :: i, j
!
IF (kdimx <=0 .OR. nact<= 0) RETURN
!
IF (present(bgrp_root_only) ) THEN
IF (bgrp_root_only .AND. ( my_bgrp_id /= root_bgrp_id ) ) THEN
call threaded_memset( array_out, 0.d0, 2*kdimx*nact )
RETURN
END IF
END IF
nblock = (kdimx - 1)/blocksz + 1
IF (present(act_idx) ) THEN
!$omp parallel do collapse(2)
DO i=1, nact ; DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), act_idx( i ) )
ENDDO ; ENDDO
!$omp end parallel do
ELSE
!$omp parallel do collapse(2)
DO i=1, nact ; DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
ENDDO ; ENDDO
!$omp end parallel do
END IF
!
END SUBROUTINE threaded_assign
END SUBROUTINE ppcg_k