- a lot of changes for band parallelization

- old variables substituted by transition variables, eventually to be renamed as the old one, when transition will be over - All array now are distributed across one or more processor groups Note that ** NOW CPV IS almost MEMORY BOTTLENECK FREE ** Warning, not all functionalites have been tested, CPV with band distribution should be considered in ALPHA testing. git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@7376 c92efa57-630b-4861-b058-cf58834340f0
2011-01-03 09:37:27 +00:00 · 2011-01-03 09:37:27 +00:00 · 9b0453df25
parent d0a77c260a
commit 9b0453df25
20 changed files with 1093 additions and 866 deletions
--- a/CPV/bforceion.f90
+++ b/CPV/bforceion.f90
@ -29,15 +29,15 @@ subroutine bforceion(fion,tfor,ipol,qmatinv,bec0,becdr,gqq,evalue)
  use cell_base, only: a1, a2, a3
  use uspp_param, only: nh, nhm
  use uspp, only : nhsa=> nkb
-  use electrons_base, only: n => nbsp, nx => nbspx, nspin
-  use cp_main_variables,  only : nlax, descla, collect_bec
+  use electrons_base, only: nbsp, nbspx, nspin, nbspx_bgrp
+  use mp_global, only: nbgrp


  implicit none

  real(dp) evalue
-  complex(dp) qmatinv(nx,nx),gqq(nhm,nhm,nas,nsp)
-  real(dp) bec0(nhsa,n),becdr(nhsa,nspin*nlax,3)
+  complex(dp) qmatinv(nbspx,nbspx),gqq(nhm,nhm,nas,nsp)
+  real(dp) bec0(nhsa,nbspx),becdr(nhsa,nbspx,3)
  real(dp) fion(3,*)
  integer ipol
  logical tfor
@ -47,17 +47,14 @@ subroutine bforceion(fion,tfor,ipol,qmatinv,bec0,becdr,gqq,evalue)
  complex(dp) ci, temp, temp1,temp2,temp3
  real(dp) gmes
  integer iv,jv,ia,is,k,i,j,isa,ilm,jlm,inl,jnl,ism
-  real(dp), allocatable :: becdr_repl(:,:,:)
      
  if(.not. tfor) return

+  if( nbgrp > 1 ) &
+     call errore(' bforceion ', ' parallelization over bands not yet implemented ', 1 )
+
  ci = (0.d0,1.d0)
     
-  ALLOCATE( becdr_repl( nhsa,n,3 ) )
-  CALL collect_bec( becdr_repl(:,:,1), becdr(:,:,1), descla, nspin )
-  CALL collect_bec( becdr_repl(:,:,2), becdr(:,:,2), descla, nspin )
-  CALL collect_bec( becdr_repl(:,:,3), becdr(:,:,3), descla, nspin )
-
  if(ipol.eq.1) then
     gmes=a1(1)**2+a1(2)**2+a1(3)**2
     gmes=2*pi/SQRT(gmes)
@ -86,20 +83,20 @@ subroutine bforceion(fion,tfor,ipol,qmatinv,bec0,becdr,gqq,evalue)
              temp1=(0.d0,0.d0)
              temp2=(0.d0,0.d0)
              temp3=(0.d0,0.d0)
-              do i=1,n
-                 do j=1,n
+              do i=1,nbsp
+                 do j=1,nbsp

                    temp = temp + ci*gmes*gqq(iv,jv,ia,is)* &!TAKECARE: sign + due to exp(+iGr) in gqq
                         &        bec0(inl,i)*bec0(jnl,j)*qmatinv(j,i)

                    temp1 = temp1 + gqq(iv,jv,ia,is)*&
-     &  (  becdr_repl(inl,i,1)*bec0(jnl,j)+bec0(inl,i)*becdr_repl(jnl,j,1))*qmatinv(j,i)
+     &  (  becdr(inl,i,1)*bec0(jnl,j)+bec0(inl,i)*becdr(jnl,j,1))*qmatinv(j,i)

                    temp2 = temp2 + gqq(iv,jv,ia,is)*&
-     &  (  becdr_repl(inl,i,2)*bec0(jnl,j)+bec0(inl,i)*becdr_repl(jnl,j,2))*qmatinv(j,i)
+     &  (  becdr(inl,i,2)*bec0(jnl,j)+bec0(inl,i)*becdr(jnl,j,2))*qmatinv(j,i)

                    temp3 = temp3 + gqq(iv,jv,ia,is)*&
-     &  (  becdr_repl(inl,i,3)*bec0(jnl,j)+bec0(inl,i)*becdr_repl(jnl,j,3))*qmatinv(j,i)
+     &  (  becdr(inl,i,3)*bec0(jnl,j)+bec0(inl,i)*becdr(jnl,j,3))*qmatinv(j,i)


                 enddo
@ -114,7 +111,5 @@ subroutine bforceion(fion,tfor,ipol,qmatinv,bec0,becdr,gqq,evalue)
     end do
  end do

-  DEALLOCATE( becdr_repl )
-
  return
 end subroutine bforceion
--- a/CPV/cg_sub.f90
+++ b/CPV/cg_sub.f90
@ -10,7 +10,7 @@
 !
   subroutine runcg_uspp( nfi, tfirst, tlast, eigr, bec, irb, eigrb, &
      rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac, fion, ema0bg, becdr, &
-      lambdap, lambda, vpot  )
+      lambdap, lambda, vpot, c0, cm, phi, dbec  )

      use kinds, only: dp
      use control_flags, only: iprint, thdyn, tpre, iprsta, &
@ -23,7 +23,7 @@
      use energies, only: eht, epseu, exc, etot, eself, enl, ekin,          &
     &                    atot, entropy, egrand
      use electrons_base, only: f, nspin, nel, iupdwn, nupdwn, nudx, nelt, &
-                                nx => nbspx, n => nbsp, ispin
+                                nbspx, nbsp, ispin

      use ensemble_dft, only: tens,   ef,  z0t, c0diag,  &
                      becdiag, fmat0, e0,  id_matrix_init
@ -53,18 +53,17 @@
      use cg_module,                only : ene_ok,  maxiter,niter_cg_restart, &
                                           conv_thr, passop, enever, itercg
      use ions_positions,           only : tau0
-      use wavefunctions_module,     only : c0, cm, phi => cp
      use efield_module,            only : tefield, evalue, ctable, qmat, detq, ipolp, &
                                           berry_energy, ctabin, gqq, gqqm, df, pberryel, &
                                           tefield2, evalue2, ctable2, qmat2, detq2, ipolp2, &
                                           berry_energy2, ctabin2, gqq2, gqqm2, pberryel2
      use mp,                       only : mp_sum, mp_bcast
      use cp_electronic_mass,       ONLY : emass_cutoff
-      use orthogonalize_base,       ONLY : calphi
+      use orthogonalize_base,       ONLY : calphi_bgrp
      use cp_interfaces,            ONLY : rhoofr, dforce, compute_stress
      USE cp_main_variables,        ONLY : nlax, collect_lambda, distribute_lambda, descla, nrlx, nlam
      USE descriptors,              ONLY : la_npc_ , la_npr_ , la_comm_ , la_me_ , la_nrl_ , ldim_cyclic
-      USE mp_global, ONLY:  me_image, my_image_id
+      USE mp_global, ONLY:  me_image, my_image_id, nbgrp


 !
@ -75,8 +74,8 @@
      integer :: nfi
      logical :: tfirst , tlast
      complex(dp) :: eigr(ngw,nat)
-      real(dp) :: bec(nhsa,n)
-      real(dp) :: becdr(nhsa,nspin*nlax,3)
+      real(dp) :: bec(nhsa,nbspx)
+      real(dp) :: becdr(nhsa,nbspx,3)
      integer irb(3,nat)
      complex(dp) :: eigrb(ngb,nat)
      real(dp) :: rhor(nrxx,nspin)
@ -92,6 +91,10 @@
      real(dp) :: ema0bg(ngw)
      real(dp) :: lambdap(nlam,nlam,nspin)
      real(dp) :: lambda(nlam,nlam,nspin)
+      complex(dp) :: c0( ngw, nbspx )
+      complex(dp) :: cm( ngw, nbspx )
+      complex(dp) :: phi( ngw, nbspx )
+      real(dp) :: dbec(nhsa,nbspx,3,3)
 !
 !
      integer :: i, j, ig, k, is, iss,ia, iv, jv, il, ii, jj, kk, ip
@ -128,12 +131,16 @@
      real(DP)  ene0,ene1,dene0,enesti !energy terms for linear minimization along hi
   

-      allocate(bec0(nhsa,n),becm(nhsa,n), becdrdiag(nhsa,nspin*nlax,3))
-      allocate (ave_ene(n))
+      allocate(bec0(nhsa,nbspx),becm(nhsa,nbspx), becdrdiag(nhsa,nbspx,3))
+      allocate (ave_ene(nbspx))
      allocate(c2(ngw),c3(ngw))


      call start_clock('runcg_uspp')
+
+      if( nbgrp > 1 ) &
+         call errore(' runcg_uspp ', ' parallelization over bands not yet implemented ', 1 )
+
      newscheme=.false.
      firstiter=.true.

@ -167,16 +174,13 @@

      !orthonormalize c0

-      call calbec(1,nsp,eigr,c0,bec)
-
-      call gram(betae,bec,nhsa,c0,ngw,n)
-
-      !call calbec(1,nsp,eigr,c0,bec)
+      DO iss = 1, nspin
+         CALL gram_bgrp( betae, bec, nhsa, c0, ngw, iss )
+      END DO

      !calculates phi for pcdaga

-      ! call calphiid(c0,bec,betae,phi)
-      CALL calphi( c0, SIZE(c0,1), bec, nhsa, betae, phi, n )
+      CALL calphi_bgrp( c0, SIZE(c0,1), bec, nhsa, betae, phi, nbsp )

      !calculates the factors for S and K inversion in US case
      if(nvb.gt.0) then
@ -193,7 +197,7 @@

      numok = 0

-      allocate(hpsi(ngw,n),hpsi0(ngw,n),gi(ngw,n),hi(ngw,n))
+      allocate(hpsi(ngw,nbspx),hpsi0(ngw,nbspx),gi(ngw,nbspx),hi(ngw,nbspx))
      do while ( itercg .lt. maxiter .and. (.not.ltresh) )


@ -299,8 +303,8 @@

        call prefor(eigr,betae)!ATTENZIONE

-        do i=1,n,2
-          call dforce( i, bec, betae, c0,c2,c3,rhos, nrxxs, ispin,f,n,nspin)
+        do i=1,nbsp,2
+          call dforce( i, bec, betae, c0,c2,c3,rhos, nrxxs, ispin,f,nbsp,nspin)
          if(tefield .and. (evalue.ne.0.d0)) then
            call dforceb(c0, i, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
            c2(1:ngw)=c2(1:ngw)+evalue*df(1:ngw)
@ -315,24 +319,24 @@
          endif

          hpsi(1:ngw,  i)=c2(1:ngw)
-          if(i+1 <= n) then
+          if(i+1 <= nbsp) then
            hpsi(1:ngw,i+1)=c3(1:ngw)
          endif
          if (gstart==2) then
            hpsi(1,  i)=CMPLX(DBLE(hpsi(1,  i)), 0.d0,kind=DP)
-            if(i+1 <= n) then
+            if(i+1 <= nbsp) then
              hpsi(1,i+1)=CMPLX(DBLE(hpsi(1,i+1)), 0.d0,kind=DP)
            endif
          end if
        enddo

-        if(pre_state) call ave_kin(c0,SIZE(c0,1),n,ave_ene)
+        if(pre_state) call ave_kin(c0,SIZE(c0,1),nbsp,ave_ene)

               
        call pcdaga2(c0,phi,hpsi)
               
-        hpsi0(1:ngw,1:n)=hpsi(1:ngw,1:n)
-        gi(1:ngw,1:n) = hpsi(1:ngw,1:n)
+        hpsi0=hpsi
+        gi = hpsi
        
        call calbec(1,nsp,eigr,hpsi,becm)
        call xminus1(hpsi,betae,dumm,becm,s_minus1,.false.)
@ -362,7 +366,7 @@
        gamma=0.d0
        
        if(.not.tens) then
-           do i=1,n
+           do i=1,nbsp
              do ig=1,ngw
                 gamma=gamma+2.d0*DBLE(CONJG(gi(ig,i))*hpsi(ig,i))
              enddo
@ -374,7 +378,7 @@
           call mp_sum( gamma, intra_bgrp_comm )
           
           if (nvb.gt.0) then
-              do i=1,n
+              do i=1,nbsp
                 do is=1,nvb
                    do iv=1,nh(is)
                       do jv=1,nh(is)
@ -465,7 +469,7 @@

          restartcg=.false.
          passof=passop
-          hi(1:ngw,1:n)=gi(1:ngw,1:n)!hi is the search direction
+          hi=gi!hi is the search direction
          esse=gamma


@ -478,7 +482,7 @@
          gamma=gamma/esse
          esse=essenew

-          hi(1:ngw,1:n)=gi(1:ngw,1:n)+gamma*hi(1:ngw,1:n)
+          hi=gi+gamma*hi

        endif
 !note that hi, is saved  on gi, because we need it before projection on conduction states
@ -497,7 +501,7 @@

        dene0=0.
        if(.not.tens) then
-          do i=1,n               
+          do i=1,nbsp
            do ig=1,ngw
              dene0=dene0-4.d0*DBLE(CONJG(hi(ig,i))*hpsi0(ig,i))
            enddo
@ -551,14 +555,14 @@

        !calculates wave-functions on a point on direction hi

-      cm(1:ngw,1:n)=c0(1:ngw,1:n)+spasso*passof*hi(1:ngw,1:n)
+      cm=c0+spasso*passof*hi


        !orthonormalize

-      call calbec(1,nsp,eigr,cm,becm)
-      call gram(betae,becm,nhsa,cm,ngw,n)
-        !call calbec(1,nsp,eigr,cm,becm)
+      DO iss = 1, nspin
+         CALL gram_bgrp( betae, becm, nhsa, cm, ngw, iss )
+      END DO
               
        !calculate energy
        if(.not.tens) then
@ -614,13 +618,14 @@
              
        !calculates wave-functions at minimum

-        cm(1:ngw,1:n)=c0(1:ngw,1:n)+spasso*passo*hi(1:ngw,1:n)
+        cm=c0+spasso*passo*hi
        if(gstart==2) then
          cm(1,:)=0.5d0*(cm(1,:)+CONJG(cm(1,:)))
        endif

-        call calbec(1,nsp,eigr,cm,becm)
-        call gram(betae,becm,nhsa,cm,ngw,n)
+      DO iss = 1, nspin
+         CALL gram_bgrp( betae, becm, nhsa, cm, ngw, iss )
+      END DO

        !test on energy: check the energy has really diminished

@ -687,20 +692,22 @@
          if(ionode) then
             write(stdout,*) 'cg_sub: missed minimum, case 1, iteration',itercg
          endif
-          c0(1:ngw,1:n)=c0(1:ngw,1:n)+spasso*passov*hi(1:ngw,1:n)
+          c0=c0+spasso*passov*hi
          restartcg=.true.
-          call calbec(1,nsp,eigr,c0,bec)
-          call gram(betae,bec,nhsa,c0,ngw,n)
+      DO iss = 1, nspin
+         CALL gram_bgrp( betae, bec, nhsa, c0, ngw, iss )
+      END DO
          ene_ok=.false.
          !if  ene1 << energy <  ene0; go to  ene1
        else if( (enever.ge.ene0).and.(ene0.gt.ene1)) then
          if(ionode) then
             write(stdout,*) 'cg_sub: missed minimum, case 2, iteration',itercg
          endif  
-          c0(1:ngw,1:n)=c0(1:ngw,1:n)+spasso*passov*hi(1:ngw,1:n)
+          c0=c0+spasso*passov*hi
          restartcg=.true.!ATTENZIONE
-          call calbec(1,nsp,eigr,c0,bec)
-          call gram(betae,bec,nhsa,c0,ngw,n)
+      DO iss = 1, nspin
+         CALL gram_bgrp( betae, bec, nhsa, c0, ngw, iss )
+      END DO
          !if ene > ene0,en1 do a steepest descent step
          ene_ok=.false.
        else if((enever.ge.ene0).and.(ene0.le.ene1)) then
@ -712,11 +719,12 @@
          do while(enever.gt.ene0 .and. iter3.lt.maxiter3)
            iter3=iter3+1
            passov=passov*0.5d0
-            cm(1:ngw,1:n)=c0(1:ngw,1:n)+spasso*passov*hi(1:ngw,1:n)
+            cm=c0+spasso*passov*hi
            ! chenge the searching direction
            spasso=spasso*(-1.d0)
-            call calbec(1,nsp,eigr,cm,becm)
-            call gram(betae,bec,nhsa,cm,ngw,n)
+      DO iss = 1, nspin
+         CALL gram_bgrp( betae, bec, nhsa, cm, ngw, iss )
+      END DO
            call calbec(1,nsp,eigr,cm,becm)
            if(.not.tens) then
              call rhoofr(nfi,cm(:,:),irb,eigrb,becm,rhovan,rhor,rhog,rhos,enl,denl,ekin,dekin6)
@ -768,7 +776,7 @@
        if(.not. ene_ok) call calbec (1,nsp,eigr,c0,bec)

        !calculates phi for pc_daga
-        CALL calphi( c0, SIZE(c0,1), bec, nhsa, betae, phi, n )
+        CALL calphi_bgrp( c0, SIZE(c0,1), bec, nhsa, betae, phi, nbsp )
  
        !=======================================================================
        !
@ -800,16 +808,15 @@
       if(tpre) then!if pressure is need the following is written because of caldbec
          call  calbec(1,nsp,eigr,c0,bec)
          if(.not.tens) then
-            call  caldbec( ngw, nhsa, n, 1, nsp, eigr, c0, dbec )
+            call caldbec_bgrp( eigr, c0, dbec )
            call rhoofr(nfi,c0(:,:),irb,eigrb,bec,rhovan,rhor,rhog,rhos,enl,denl,ekin,dekin6)
          else

            !     calculation of the rotated quantities
            call rotate( z0t, c0(:,:), bec, c0diag, becdiag, .false. )
            !     calculation of rho corresponding to the rotated wavefunctions
-            call  caldbec( ngw, nhsa, n, 1, nsp, eigr, c0diag, dbec )
-            call rhoofr(nfi,c0diag,irb,eigrb,becdiag                         &
-                     &                    ,rhovan,rhor,rhog,rhos,enl,denl,ekin,dekin6)
+            call caldbec_bgrp( eigr, c0diag, dbec )
+            call rhoofr(nfi,c0diag,irb,eigrb,becdiag,rhovan,rhor,rhog,rhos,enl,denl,ekin,dekin6)
          endif

          !calculates the potential
@ -835,14 +842,14 @@

      call newd(vpot,irb,eigrb,rhovan,fion)
      if (.not.tens) then
-        if (tfor .or. tprnfor) call nlfq(c0,eigr,bec,becdr,fion)
+        if (tfor .or. tprnfor) call nlfq_bgrp( c0, eigr, bec, becdr, fion ) ! call nlfq(c0,eigr,bec,becdr,fion)
      else
-        if (tfor .or. tprnfor) call nlfq(c0diag,eigr,becdiag,becdrdiag,fion)
+        if (tfor .or. tprnfor) call nlfq_bgrp( c0diag, eigr, becdiag, becdrdiag, fion ) ! call nlfq(c0diag,eigr,becdiag,becdrdiag,fion)
      endif
  
        call prefor(eigr,betae)
-        do i=1,n,2
-          call dforce(i,bec,betae,c0,c2,c3,rhos,nrxxs,ispin,f,n,nspin)
+        do i=1,nbsp,2
+          call dforce(i,bec,betae,c0,c2,c3,rhos,nrxxs,ispin,f,nbsp,nspin)
          if(tefield.and.(evalue .ne. 0.d0)) then
            call dforceb &
               (c0, i, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
@ -870,13 +877,13 @@

          do ig=1,ngw
            gi(ig,  i)=c2(ig)
-            if(i+1 <= n) then
+            if(i+1 <= nbsp) then
              gi(ig,i+1)=c3(ig)
            endif
          end do
          if (gstart==2) then
            gi(1,  i)=CMPLX(DBLE(gi(1,  i)),0.d0,kind=DP)
-            if(i+1 <= n) then
+            if(i+1 <= nbsp) then
              gi(1,i+1)=CMPLX(DBLE(gi(1,i+1)),0.d0,kind=DP)
            endif
          end if
@ -942,13 +949,13 @@
           !
           DEALLOCATE( lambda_dist )
           !
-           call nlsm2(ngw,nhsa,n,nspin,eigr,c0(:,:),becdr)
+           call nlsm2_bgrp( ngw, nhsa, eigr, c0, becdr, nbspx, nbsp )
           !
        endif
        !
  
        !
-        call nlfl(bec,becdr,lambda,fion)
+        CALL nlfl_bgrp( bec, becdr, lambda, fion ) 
          
        ! bforceion adds the force term due to electronic berry phase
        ! only in US-case
--- a/CPV/chargedensity.f90
+++ b/CPV/chargedensity.f90
@ -65,7 +65,7 @@

 !-----------------------------------------------------------------------
   SUBROUTINE rhoofr_cp &
-      ( nfi, c_bgrp, irb, eigrb, bec, rhovan, rhor, rhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
+      ( nfi, c_bgrp, irb, eigrb, bec_bgrp, rhovan, rhor, rhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
 !-----------------------------------------------------------------------
 !
 !  this routine computes:
@ -123,14 +123,14 @@
      USE fft_interfaces,     ONLY: fwfft, invfft
      USE fft_base,           ONLY: dffts, dfftp
      USE cp_interfaces,      ONLY: checkrho
-      USE cdvan,              ONLY: dbec, drhovan
-      USE cp_main_variables,  ONLY: iprint_stdout, drhor, drhog
+      USE cdvan,              ONLY: drhovan
+      USE cp_main_variables,  ONLY: iprint_stdout, drhor, drhog, dbec
      USE wannier_base,       ONLY: iwf
      USE cell_base,          ONLY: a1, a2, a3
 !
      IMPLICIT NONE
      INTEGER nfi
-      REAL(DP) bec(:,:)
+      REAL(DP) bec_bgrp(:,:)
      REAL(DP) rhovan(:, :, : )
      REAL(DP) rhor(:,:)
      REAL(DP) rhos(:,:)
@ -190,13 +190,18 @@
         !
         !     called from WF, compute only of rhovan
         !
-         CALL calrhovan( rhovan, bec, iwf )
+         CALL calrhovan( rhovan, bec_bgrp, iwf )
         !
      ELSE
         !
         !     calculation of non-local energy
         !
-         enl = ennl( rhovan, bec )
+         enl = ennl( rhovan, bec_bgrp )
+         !
+         IF( nbgrp > 1 ) THEN
+            CALL mp_sum( enl, inter_bgrp_comm )
+            CALL mp_sum( rhovan, inter_bgrp_comm )
+         END IF
         !
      END IF
      !
@ -205,7 +210,12 @@
         IF( .NOT. ALLOCATED( drhovan ) ) &
            CALL errore( ' rhoofr ', ' drhovan not allocated ', 1 )
         !
-         CALL dennl( bec, dbec, drhovan, denl ) 
+         CALL dennl( bec_bgrp, dbec, drhovan, denl ) 
+         !
+         IF( nbgrp > 1 ) THEN
+            CALL mp_sum( denl, inter_bgrp_comm )
+            CALL mp_sum( drhovan, inter_bgrp_comm )
+         END IF
         !
      END IF
      !    
@ -790,6 +800,8 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
      USE dqgb_mod,                 ONLY: dqgb
      USE fft_interfaces,           ONLY: fwfft, invfft
      USE fft_base,                 ONLY: dfftb, dfftp
+      USE mp_global,                ONLY: my_bgrp_id, nbgrp, inter_bgrp_comm
+      USE mp,                       ONLY: mp_sum

      IMPLICIT NONE
 ! input
@ -846,7 +858,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)

 !$omp parallel default(none) &
 !$omp          shared(nvb, na, nnrbx, ngb, nh, eigrb, dfftb, irb, v, &
-!$omp                 nmb, ci, npb, i, j, dqgb, qgb, nhm, rhovan, drhovan ) &
+!$omp                 nmb, ci, npb, i, j, dqgb, qgb, nhm, rhovan, drhovan, my_bgrp_id, nbgrp ) &
 !$omp          private(mytid, ntids, is, ia, nfft, ifft, iv, jv, ijv, ig, iss, isa, &
 !$omp                  qv, itid, dqgbt, dsumt, asumt )

@ -866,7 +878,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
 #ifdef __PARA
                  DO ia=1,na(is)
                     nfft=1
-                     IF ( dfftb%np3( isa ) <= 0 ) then
+                     IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
                        isa = isa + nfft
                        CYCLE
                     END IF
@ -946,6 +958,8 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
 !
 !$omp end parallel

+               CALL mp_sum( v, inter_bgrp_comm )
+
               iss = 1

               DO ir=1,nrxx
--- a/CPV/cp_interfaces.f90
+++ b/CPV/cp_interfaces.f90
@ -93,6 +93,8 @@
   PUBLIC :: compute_stress

   PUBLIC :: protate
+   PUBLIC :: c_bgrp_expand
+   PUBLIC :: c_bgrp_pack

   ! ------------------------------------ !

@ -274,7 +276,7 @@
   END INTERFACE

   INTERFACE readfile
-      SUBROUTINE readfile_cp                                         &
+      SUBROUTINE readfile_x                                         &
      &     ( flag, h,hold,nfi,c0,cm,taus,tausm,vels,velsm,acc,    &
      &       lambda,lambdam,xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,nhpcl,nhpdim,ekincm,&
      &       xnhh0,xnhhm,vnhh,velh,fion, tps, mat_z, occ_f )
@ -294,12 +296,12 @@
         REAL(DP) :: xnhh0(3,3),xnhhm(3,3),vnhh(3,3),velh(3,3)
         REAL(DP), INTENT(OUT) :: tps
         REAL(DP), INTENT(INOUT) :: mat_z(:,:,:), occ_f(:)
-      END SUBROUTINE readfile_cp
+      END SUBROUTINE readfile_x
   END INTERFACE


   INTERFACE writefile
-      SUBROUTINE writefile_cp &
+      SUBROUTINE writefile_x &
      &     ( h,hold,nfi,c0,cm,taus,tausm,vels,velsm,acc,           &
      &       lambda,lambdam,xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,nhpcl,nhpdim,ekincm,&
      &       xnhh0,xnhhm,vnhh,velh, fion, tps, mat_z, occ_f, rho )
@ -319,27 +321,28 @@
         REAL(DP), INTENT(in) :: rho(:,:)
         REAL(DP), INTENT(in) :: occ_f(:)
         REAL(DP), INTENT(in) :: mat_z(:,:,:)
-      END SUBROUTINE writefile_cp
+      END SUBROUTINE writefile_x
   END INTERFACE
 

   INTERFACE runcp_uspp
      SUBROUTINE runcp_uspp_x &
-         ( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec, c0, cm, fromscra, restart )
+         ( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp, fromscra, restart )
         USE kinds,             ONLY: DP
         IMPLICIT NONE
         integer, intent(in) :: nfi
         real(DP) :: fccc, ccc
         real(DP) :: ema0bg(:), dt2bye
         real(DP) :: rhos(:,:)
-         real(DP) :: bec(:,:)
-         complex(DP) :: c0(:,:), cm(:,:)
+         real(DP) :: bec_bgrp(:,:)
+         complex(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
         logical, optional, intent(in) :: fromscra
         logical, optional, intent(in) :: restart
      END SUBROUTINE
   END INTERFACE


+
   INTERFACE runcp_uspp_force_pairing
      SUBROUTINE runcp_uspp_force_pairing_x  &
         ( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec, c0, cm, intermed, fromscra, &
@ -391,7 +394,7 @@

   INTERFACE ortho
      SUBROUTINE ortho_cp &
-         ( eigr, cp, phi, ngwx, x0, descla, diff, iter, ccc, bephi, becp_dist, nbsp, nspin, nupdwn, iupdwn)
+         ( eigr, cp_bgrp, phi_bgrp, ngwx, x0, descla, diff, iter, ccc, bephi, becp_dist, nbsp, nspin, nupdwn, iupdwn)
         USE kinds,          ONLY: DP
         USE ions_base,      ONLY: nat
         USE uspp,           ONLY: nkb
@ -400,7 +403,8 @@
         INTEGER,    INTENT(IN)     :: ngwx, nbsp, nspin
         INTEGER,    INTENT(IN)     :: nupdwn( nspin ), iupdwn( nspin )
         INTEGER,     INTENT(IN)    :: descla( descla_siz_ , nspin )
-         COMPLEX(DP) :: cp(ngwx,nbsp), phi(ngwx,nbsp), eigr(ngwx,nat)
+         COMPLEX(DP) :: eigr(ngwx,nat)
+         COMPLEX(DP) :: cp_bgrp(:,:), phi_bgrp(:,:)
         REAL(DP)    :: x0( :, :, : ), diff, ccc
         INTEGER     :: iter
         REAL(DP)    :: bephi(:,:)
@ -547,10 +551,9 @@


   INTERFACE wave_rand_init
-      SUBROUTINE wave_rand_init_x( cm, n, noff )
+      SUBROUTINE wave_rand_init_x( cm )
         USE kinds,              ONLY: DP
         IMPLICIT NONE
-         INTEGER,     INTENT(IN)  :: n, noff
         COMPLEX(DP), INTENT(OUT) :: cm(:,:)
      END SUBROUTINE
   END INTERFACE
@ -785,13 +788,14 @@

   INTERFACE move_electrons
      SUBROUTINE move_electrons_x( &
-         nfi, tfirst, tlast, b1, b2, b3, fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress )
+         nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb, enbi, fccc, ccc, dt2bye, stress )
         USE kinds,         ONLY: DP
         IMPLICIT NONE
         INTEGER,  INTENT(IN)    :: nfi
         LOGICAL,  INTENT(IN)    :: tfirst, tlast
         REAL(DP), INTENT(IN)    :: b1(3), b2(3), b3(3)
         REAL(DP)                :: fion(:,:)
+         COMPLEX(DP)             :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
         REAL(DP), INTENT(IN)    :: dt2bye
         REAL(DP)                :: fccc, ccc
         REAL(DP)                :: enb, enbi
@ -844,9 +848,24 @@
         REAL(DP), INTENT(IN) :: bec(:,:)
         REAL(DP), INTENT(OUT) :: becrot(:,:)
      END SUBROUTINE
-
   END INTERFACE

+   INTERFACE c_bgrp_expand
+    SUBROUTINE c_bgrp_expand_x( c_bgrp )
+      USE kinds,              ONLY: DP
+      IMPLICIT NONE
+      COMPLEX(DP) :: c_bgrp(:,:)
+    END SUBROUTINE c_bgrp_expand_x
+   END INTERFACE
+   INTERFACE c_bgrp_pack
+    SUBROUTINE c_bgrp_pack_x( c_bgrp )
+      USE kinds,              ONLY: DP
+      IMPLICIT NONE
+      COMPLEX(DP) :: c_bgrp(:,:)
+    END SUBROUTINE c_bgrp_pack_x
+   END INTERFACE
+
+
 !=----------------------------------------------------------------------------=!
   END MODULE
 !=----------------------------------------------------------------------------=!
--- a/CPV/cplib.f90
+++ b/CPV/cplib.f90
@ -187,6 +187,7 @@ END FUNCTION
      END FUNCTION cscnorm
 !
 !
+!
 !-----------------------------------------------------------------------
      SUBROUTINE denlcc( nnr, nspin, vxcr, sfac, drhocg, dcc )
 !-----------------------------------------------------------------------
@ -279,7 +280,7 @@ END FUNCTION
      USE uspp,               ONLY: nkb, qq
      USE uspp_param,         ONLY: nh
      USE mp,                 ONLY: mp_sum
-      USE mp_global,          ONLY: intra_bgrp_comm
+      USE mp_global,          ONLY: intra_bgrp_comm, nbgrp
 !
      IMPLICIT NONE
 !
@ -291,6 +292,9 @@ END FUNCTION
 
      REAL(DP), ALLOCATABLE::  becp(:,:)
      INTEGER i,kmax,nnn,k,ig,is,ia,iv,jv,inl,jnl
+!
+      IF( nbgrp > 1 ) &
+         CALL errore( ' dotcsc ', ' parallelization over bands not yet implemented ', 1 )
 !
      ALLOCATE(becp(nkb,n))
 !
@ -454,6 +458,7 @@ END FUNCTION
      RETURN
   END FUNCTION enkin
 !            
+
 !-------------------------------------------------------------------------
      SUBROUTINE gracsc( bec, nkbx, betae, cp, ngwx, i, csc, n )
 !-----------------------------------------------------------------------
@ -562,6 +567,158 @@ END FUNCTION
      END SUBROUTINE gracsc


+!-------------------------------------------------------------------------
+      SUBROUTINE gracsc_bgrp( bec_bgrp, nkbx, betae, cp_bgrp, ngwx, i, csc, iss )
+!-----------------------------------------------------------------------
+!     requires in input the updated bec(k) for k<i
+!     on output: bec(i) is recalculated
+!
+      USE ions_base,      ONLY: na
+      USE cvan,           ONLY :nvb, ish
+      USE uspp,           ONLY : nkb, nhsavb=>nkbus, qq
+      USE uspp_param,     ONLY:  nh
+      USE electrons_base, ONLY: ispin, ispin_bgrp, nbspx_bgrp, ibgrp_g2l, iupdwn, nupdwn, nbspx
+      USE gvecw,          ONLY: ngw
+      USE mp,             ONLY: mp_sum
+      USE mp_global,      ONLY: intra_bgrp_comm, inter_bgrp_comm
+      USE kinds,          ONLY: DP
+      USE gvect, ONLY: gstart
+!
+      IMPLICIT NONE
+!
+      INTEGER, INTENT(IN) :: i, nkbx, ngwx, iss
+      COMPLEX(DP) :: betae( ngwx, nkb )
+      REAL(DP)    :: bec_bgrp( nkbx, nbspx_bgrp ), cp_bgrp( 2, ngwx, nbspx_bgrp )
+      REAL(DP)    :: csc( nbspx )
+      INTEGER     :: k, kmax,ig, is, iv, jv, ia, inl, jnl, ibgrp_k, ibgrp_i
+      REAL(DP)    :: rsum
+      REAL(DP), ALLOCATABLE :: temp(:) 
+      REAL(DP), ALLOCATABLE :: cp_tmp(:,:) 
+      REAL(DP), ALLOCATABLE :: bec_tmp(:) 
+      REAL(DP), ALLOCATABLE :: csc2( : )
+      !
+      !     calculate csc(k)=<cp(i)|cp(k)>,  k<i
+      !
+      kmax = i - 1
+      !
+      ALLOCATE( cp_tmp( 2, ngwx ) )
+      ALLOCATE( bec_tmp( nkbx ) )
+      ALLOCATE( csc2( SIZE( csc ) ) )
+
+      cp_tmp = 0.0d0
+      csc    = 0.0d0
+
+      ibgrp_i = ibgrp_g2l( i )
+      IF( ibgrp_i > 0 ) cp_tmp = cp_bgrp(:,:, ibgrp_i )
+
+      CALL mp_sum( cp_tmp, inter_bgrp_comm )
+
+!$omp parallel default(none), &
+!$omp          shared(iupdwn,kmax,ispin,ibgrp_g2l,ngw,cp_bgrp,cp_tmp,csc,nhsavb,betae,bec_bgrp,i,iss,gstart), &
+!$omp          private( temp, k, ig, inl, ibgrp_k, ibgrp_i )
+      ALLOCATE( temp( ngw ) )
+!$omp do
+      DO k = iupdwn( iss ), kmax
+         IF ( ispin(i) .EQ. ispin(k) ) THEN
+            ibgrp_k = ibgrp_g2l( k )
+            IF( ibgrp_k > 0 ) THEN
+               DO ig = 1, ngw
+                  temp(ig) = cp_bgrp(1,ig,ibgrp_k) * cp_tmp(1,ig) + cp_bgrp(2,ig,ibgrp_k) * cp_tmp(2,ig)
+               END DO
+               csc(k) = 2.0d0 * SUM(temp)
+               IF (gstart == 2) csc(k) = csc(k) - temp(1)
+            END IF
+         ENDIF
+      END DO
+!$omp end do
+      !
+      !
+      !     calculate bec(i)=<cp(i)|beta>
+      !
+      ibgrp_i = ibgrp_g2l( i )
+      !
+      IF(  ibgrp_i > 0 ) THEN
+!$omp do
+         DO inl=1,nhsavb
+            DO ig=1,ngw
+               temp(ig)=cp_bgrp(1,ig,ibgrp_i)* DBLE(betae(ig,inl))+             &
+     &                  cp_bgrp(2,ig,ibgrp_i)*AIMAG(betae(ig,inl))
+            END DO
+            bec_bgrp(inl,ibgrp_i)=2.d0*SUM(temp)
+            IF (gstart == 2) bec_bgrp(inl,ibgrp_i)= bec_bgrp(inl,ibgrp_i)-temp(1)
+         END DO
+!$omp end do
+      END IF
+      DEALLOCATE( temp )
+!$omp end parallel
+
+      CALL mp_sum( csc, intra_bgrp_comm )
+      CALL mp_sum( csc, inter_bgrp_comm )
+
+      IF(  ibgrp_i > 0 ) THEN
+         CALL mp_sum( bec_bgrp( 1:nhsavb, ibgrp_i ), intra_bgrp_comm )
+      END IF
+
+      bec_tmp = 0.0d0
+      IF( ibgrp_i > 0 ) bec_tmp = bec_bgrp(:,ibgrp_i )
+
+      CALL mp_sum( bec_tmp, inter_bgrp_comm )
+!
+!     calculate csc(k)=<cp(i)|S|cp(k)>,  k<i
+!
+      csc2    = 0.0d0
+!$omp parallel do default(shared), private( k, is, iv, jv, ia, inl, jnl, rsum, ibgrp_k )
+      DO k=iupdwn(iss), kmax
+         IF (ispin(i).EQ.ispin(k)) THEN
+            rsum=0.d0
+            ibgrp_k = ibgrp_g2l( k )
+            IF( ibgrp_k > 0 ) THEN
+            DO is=1,nvb
+               DO iv=1,nh(is)
+                  DO jv=1,nh(is)
+                     IF(ABS(qq(iv,jv,is)).GT.1.e-5) THEN 
+                        DO ia=1,na(is)
+                           inl=ish(is)+(iv-1)*na(is)+ia
+                           jnl=ish(is)+(jv-1)*na(is)+ia
+                           rsum = rsum + qq(iv,jv,is)*bec_tmp(inl)*bec_bgrp(jnl,ibgrp_k)
+                        END DO
+                     ENDIF
+                  END DO
+               END DO
+            END DO
+            END IF
+            csc2(k)=csc2(k)+rsum
+         ENDIF
+      END DO
+!$omp end parallel do
+!
+!     orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
+!
+!     corresponing bec:  bec(i)=<cp(i)|beta>-csc(k)<cp(k)|beta>
+!
+      CALL mp_sum( csc2, inter_bgrp_comm )
+      csc = csc + csc2
+
+      bec_tmp = 0.0d0
+      DO k = iupdwn(iss), kmax
+         ibgrp_k = ibgrp_g2l( k )
+         IF( ibgrp_k > 0 ) THEN
+            DO inl=1,nkbx
+               bec_tmp(inl)=bec_tmp(inl)-csc(k)*bec_bgrp(inl,ibgrp_k)
+            END DO
+         END IF
+      END DO
+      CALL mp_sum( bec_tmp, inter_bgrp_comm )
+      IF( ibgrp_i > 0 ) bec_bgrp(:,ibgrp_i ) = bec_bgrp(:,ibgrp_i ) + bec_tmp
+
+      DEALLOCATE( csc2 )
+      DEALLOCATE( bec_tmp )
+      DEALLOCATE( cp_tmp )
+!
+      RETURN
+      END SUBROUTINE gracsc_bgrp
+
+
 !-------------------------------------------------------------------------
      SUBROUTINE smooth_csv( c, v, ngwx, csv, n )
 !-----------------------------------------------------------------------
@ -681,57 +838,80 @@ END FUNCTION
      END SUBROUTINE bec_csv


-
 !-------------------------------------------------------------------------
-      SUBROUTINE gram( betae, bec, nkbx, cp, ngwx, n )
+      SUBROUTINE gram_bgrp( betae, bec_bgrp, nkbx, cp_bgrp, ngwx, iss )
 !-----------------------------------------------------------------------
 !     gram-schmidt orthogonalization of the set of wavefunctions cp
 !
      USE uspp,           ONLY : nkb, nhsavb=> nkbus
      USE gvecw,          ONLY : ngw
+      USE electrons_base, ONLY : nbspx_bgrp, ibgrp_g2l, nupdwn, iupdwn, nbspx
      USE kinds,          ONLY : DP
+      USE mp_global,      ONLY : inter_bgrp_comm, mpime
+      USE mp,             ONLY : mp_sum
 !
      IMPLICIT NONE
 !
-      INTEGER, INTENT(IN) :: nkbx, ngwx, n
-      REAL(DP)      :: bec( nkbx, n )
-      COMPLEX(DP)   :: cp( ngwx, n ), betae( ngwx, nkb )
+      INTEGER, INTENT(IN) :: nkbx, ngwx, iss
+      REAL(DP)      :: bec_bgrp( nkbx, nbspx_bgrp )
+      COMPLEX(DP)   :: cp_bgrp( ngwx, nbspx_bgrp ), betae( ngwx, nkb )
 !
      REAL(DP) :: anorm, cscnorm
      REAL(DP), ALLOCATABLE :: csc( : )
-      INTEGER :: i,k
+      COMPLEX(DP), ALLOCATABLE :: ctmp( : )
+      INTEGER :: i,k,j, ig, ibgrp_k, ibgrp_i
      EXTERNAL :: cscnorm
      REAL(DP), PARAMETER :: one  =  1.d0
      REAL(DP), PARAMETER :: mone = -1.d0
-
 !
      CALL start_clock( 'gram' )

-      ALLOCATE( csc( n ) )
+      ALLOCATE( csc( nbspx ) )
+      ALLOCATE( ctmp( ngwx ) )
 !
-      DO i = 1, n
+      DO i = iupdwn(iss), iupdwn(iss) + nupdwn(iss) - 1 
         !
-         CALL gracsc( bec, nkbx, betae, cp, ngwx, i, csc, n )
+         ibgrp_i = ibgrp_g2l( i )
+         !
+         CALL gracsc_bgrp( bec_bgrp, nkbx, betae, cp_bgrp, ngwx, i, csc, iss )
         !
         ! calculate orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
         !
-         IF( i > 1 ) &
-            CALL dgemv( 'N', 2*ngw, i-1, mone, cp(1,1), 2*ngwx, csc(1), 1, one, cp(1,i), 1 )
+         IF( ibgrp_i > 0 ) THEN
+            ctmp = cp_bgrp( :, ibgrp_i )
+         ELSE
+            ctmp = 0.0d0
+         END IF
+         !
+         IF( i > iupdwn(iss) ) THEN
+           DO k = iupdwn(iss), i - 1
+              ibgrp_k = ibgrp_g2l( k )
+              IF( ibgrp_k > 0 ) THEN
+                 DO ig = 1, ngw
+                    ctmp( ig ) = ctmp( ig ) - cp_bgrp( ig, ibgrp_k ) * csc( k )
+                 END DO
+              END IF
+           END DO
+         END IF
+         CALL mp_sum( ctmp, inter_bgrp_comm )
+         !IF( i > 1 ) &
+         !   CALL dgemv( 'N', 2*ngw, i-1, mone, cp(1,1), 2*ngwx, csc(1), 1, one, cp(1,i), 1 )

-         anorm = cscnorm( bec, nkbx, cp, ngwx, i, n )
-         CALL dscal( 2*ngw, 1.0d0/anorm, cp(1,i), 1 )
-         !
-         !         these are the final bec's
-         !
-         CALL dscal( nkbx, 1.0d0/anorm, bec(1,i), 1 )
+         IF( ibgrp_i > 0 ) THEN
+            cp_bgrp( :, ibgrp_i ) = ctmp
+            anorm = cscnorm( bec_bgrp, nkbx, cp_bgrp, ngwx, ibgrp_i, nbspx_bgrp )
+            CALL dscal( 2*ngw, 1.0d0/anorm, cp_bgrp(1,ibgrp_i), 1 )
+            CALL dscal( nkbx, 1.0d0/anorm, bec_bgrp(1,ibgrp_i), 1 )
+         END IF
      END DO
 !
+      DEALLOCATE( ctmp )
      DEALLOCATE( csc )

      CALL stop_clock( 'gram' )
 !
      RETURN
-      END SUBROUTINE gram
+      END SUBROUTINE gram_bgrp
 !
 !-----------------------------------------------------------------------
      SUBROUTINE initbox ( tau0, taub, irb, ainv, a1, a2, a3 )
@ -891,7 +1071,7 @@ END FUNCTION
      USE electrons_base,           ONLY: nspin
      USE control_flags,            ONLY: iprint, thdyn, tfor, tprnfor
      USE mp,                       ONLY: mp_sum
-      USE mp_global,                ONLY: intra_bgrp_comm
+      USE mp_global,                ONLY: intra_bgrp_comm, inter_bgrp_comm, distribute_over_bgrp, my_bgrp_id, nbgrp 
      USE fft_interfaces,           ONLY: invfft
      USE fft_base,                 ONLY: dfftb
 !
@ -908,6 +1088,7 @@ END FUNCTION
      REAL(DP)  fvan(3,nat,nvb), fac, fac1, fac2, boxdotgrid, res
      COMPLEX(DP) ci, facg1, facg2
      COMPLEX(DP), ALLOCATABLE :: qv(:)
+      INTEGER :: na_bgrp, ia_bgrp
      EXTERNAL boxdotgrid

 #ifdef __OPENMP
@ -926,7 +1107,7 @@ END FUNCTION

 !$omp parallel default(none) &
 !$omp          shared(nvb, na, nnrbx, ngb, nh, qgb, eigrb, dfftb, irb, vr, nmb, npb, ci, deeq, &
-!$omp                 fac, nspin ) &
+!$omp                 fac, nspin, my_bgrp_id, nbgrp ) &
 !$omp          private(mytid, ntids, is, ia, nfft, iv, jv, ijv, ig, isa, qv, itid, res, iss )

      isa = 1
@ -946,9 +1127,10 @@ END FUNCTION
      DO is = 1, nvb

 #ifdef __PARA
+
         DO ia=1,na(is)
             nfft = 1
-             IF ( dfftb%np3( isa ) <= 0 ) THEN
+             IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
                isa = isa + nfft
                CYCLE
             END IF
@ -1017,6 +1199,7 @@ END FUNCTION
 !$omp end parallel

      CALL mp_sum( deeq, intra_bgrp_comm )
+      CALL mp_sum( deeq, inter_bgrp_comm )

      IF (.NOT.( tfor .OR. thdyn .OR. tprnfor ) ) go to 10
 !
@ -1031,7 +1214,7 @@ END FUNCTION

 !$omp parallel default(none) &
 !$omp          shared(nvb, na, nnrbx, ngb, nh, qgb, eigrb, dfftb, irb, vr, nmb, npb, ci, deeq, &
-!$omp                 fac, nspin, rhovan, tpibab, gxb, fvan ) &
+!$omp                 fac, nspin, rhovan, tpibab, gxb, fvan, my_bgrp_id, nbgrp ) &
 !$omp          private(mytid, ntids, is, ia, ik, nfft, iv, jv, ijv, ig, isa, qv, itid, res, iss, &
 !$omp                  fac1, fac2, facg1, facg2 )

@ -1052,7 +1235,7 @@ END FUNCTION
 #ifdef __PARA
            DO ia=1,na(is)
               nfft=1
-               IF ( dfftb%np3( isa ) <= 0 ) THEN
+               IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
                  isa = isa + nfft
                  CYCLE
               END IF
@ -1185,6 +1368,7 @@ END FUNCTION
      END IF

      CALL mp_sum( fvan, intra_bgrp_comm )
+      CALL mp_sum( fvan, inter_bgrp_comm )

      isa = 0
      DO is = 1, nvb
@ -1203,7 +1387,7 @@ END FUNCTION


 !-------------------------------------------------------------------------
-      SUBROUTINE nlfl(bec,becdr,lambda,fion)
+      SUBROUTINE nlfl_bgrp( bec_bgrp, becdr_bgrp, lambda, fion )
 !-----------------------------------------------------------------------
 !     contribution to fion due to the orthonormality constraint
 ! 
@ -1214,20 +1398,26 @@ END FUNCTION
      USE uspp,              ONLY: nhsa=>nkb, qq
      USE uspp_param,        ONLY: nhm, nh
      USE cvan,              ONLY: ish, nvb
-      USE electrons_base,    ONLY: nbspx, nbsp, nudx, nspin, iupdwn, nupdwn
+      USE electrons_base,    ONLY: nspin, iupdwn, nupdwn, nbspx_bgrp, ibgrp_g2l, i2gupdwn_bgrp, nbspx, &
+                                   iupdwn_bgrp, nupdwn_bgrp
      USE constants,         ONLY: pi, fpi
      USE cp_main_variables, ONLY: nlam, nlax, descla, la_proc
-      USE descriptors,       ONLY: nlar_ , nlac_ , ilar_ , ilac_ 
+      USE descriptors,       ONLY: nlar_ , nlac_ , ilar_ , ilac_ , la_myr_ , la_myc_ 
      USE mp,                ONLY: mp_sum
-      USE mp_global,         ONLY: intra_bgrp_comm
+      USE mp_global,         ONLY: intra_bgrp_comm, inter_bgrp_comm
 !
      IMPLICIT NONE
-      REAL(DP) bec(nhsa,nbsp), becdr(nhsa,nspin*nlax,3), lambda(nlam,nlam,nspin)
+      REAL(DP) bec_bgrp(nhsa,nbspx_bgrp), becdr_bgrp(nhsa,nbspx_bgrp,3), lambda(nlam,nlam,nspin)
      REAL(DP) fion(3,nat)
 !
-      INTEGER :: k, is, ia, iv, jv, i, j, inl, isa, iss, nss, istart, ir, ic, nr, nc
+      INTEGER :: k, is, ia, iv, jv, i, j, inl, isa, iss, nss, istart, ir, ic, nr, nc, ibgrp_i
+      INTEGER :: n1, n2, m1, m2
      REAL(DP), ALLOCATABLE :: temp(:,:), tmpbec(:,:),tmpdr(:,:) 
      REAL(DP), ALLOCATABLE :: fion_tmp(:,:)
+      REAL(DP), ALLOCATABLE :: bec(:,:,:)
+      REAL(DP), ALLOCATABLE :: becdr(:,:,:,:)
+      REAL(DP), ALLOCATABLE :: bec_g(:,:)
+      REAL(DP), ALLOCATABLE :: becdr_g(:,:,:)
      !
      CALL start_clock( 'nlfl' )
      !
@ -1235,9 +1425,49 @@ END FUNCTION
      !
      fion_tmp = 0.0d0
      !
-
      ALLOCATE( temp( nlax, nlax ), tmpbec( nhm, nlax ), tmpdr( nlax, nhm ) )
+      ALLOCATE( bec( nhsa, nlax, nspin ), becdr( nhsa, nlax, nspin, 3 ) )

+      ! redistribute bec, becdr according to the ortho subgroup
+      ! this is required because they are combined with "lambda" matrixes
+      
+      IF( la_proc ) THEN
+         DO iss = 1, nspin
+            nss = nupdwn( iss )
+            istart = iupdwn( iss )
+            ic = descla( ilac_ , iss )
+            nc = descla( nlac_ , iss )
+            DO i=1,nc
+               ibgrp_i = ibgrp_g2l( i+istart-1+ic-1 )
+               IF( ibgrp_i > 0 ) THEN
+                  bec( :, i, iss ) = bec_bgrp( :, ibgrp_i )
+               ELSE
+                  bec( :, i, iss ) = 0.0d0
+               END IF
+            END DO
+            ir = descla( ilar_ , iss )
+            nr = descla( nlar_ , iss )
+            DO i=1,nr
+               ibgrp_i = ibgrp_g2l( i+istart-1+ir-1 )
+               IF( ibgrp_i > 0 ) THEN
+                  becdr(:,i,iss,1) = becdr_bgrp( :, ibgrp_i, 1 )
+                  becdr(:,i,iss,2) = becdr_bgrp( :, ibgrp_i, 2 )
+                  becdr(:,i,iss,3) = becdr_bgrp( :, ibgrp_i, 3 )
+               ELSE
+                  becdr(:,i,iss,1) = 0.0d0
+                  becdr(:,i,iss,2) = 0.0d0
+                  becdr(:,i,iss,3) = 0.0d0
+               END IF
+            END DO
+         END DO
+      ELSE
+         bec   = 0.0d0
+         becdr = 0.0d0
+      END IF
+
+      CALL mp_sum( bec, inter_bgrp_comm )
+      CALL mp_sum( becdr, inter_bgrp_comm )
+      !
      DO k=1,3
         isa = 0
         DO is=1,nvb
@ -1251,7 +1481,7 @@ END FUNCTION
                  !
                  tmpbec = 0.d0
                  tmpdr  = 0.d0
-!
+                  !
                  IF( la_proc ) THEN
                     ! tmpbec distributed by columns
                     ic = descla( ilac_ , iss )
@ -1261,7 +1491,7 @@ END FUNCTION
                           inl=ish(is)+(jv-1)*na(is)+ia
                           IF(ABS(qq(iv,jv,is)).GT.1.e-5) THEN
                              DO i=1,nc
-                                 tmpbec(iv,i)=tmpbec(iv,i) + qq(iv,jv,is)*bec(inl,i+istart-1+ic-1)
+                                 tmpbec(iv,i)=tmpbec(iv,i) + qq(iv,jv,is)*bec(inl,i,iss)
                              END DO
                           ENDIF
                        END DO
@ -1272,11 +1502,11 @@ END FUNCTION
                     DO iv=1,nh(is)
                        inl=ish(is)+(iv-1)*na(is)+ia
                        DO i=1,nr
-                           tmpdr(i,iv)=becdr(inl,i+(iss-1)*nlax,k)
+                           tmpdr(i,iv) = becdr( inl, i, iss, k )
                        END DO
                     END DO
                  END IF
-!
+                  !
                  IF(nh(is).GT.0)THEN
                     !
                     IF( la_proc ) THEN
@ -1300,6 +1530,7 @@ END FUNCTION
         END DO
      END DO
      !
+      DEALLOCATE( bec, becdr )
      DEALLOCATE( temp, tmpbec, tmpdr )
      !
      CALL mp_sum( fion_tmp, intra_bgrp_comm )
@ -1312,10 +1543,7 @@ END FUNCTION
      !
      RETURN

-      END SUBROUTINE nlfl
-
-
-
+      END SUBROUTINE nlfl_bgrp


 !
@ -2353,6 +2581,7 @@ end function set_Hubbard_l
      USE step_penalty,       ONLY: E_pen, A_pen, sigma_pen, alpha_pen
      USE step_penalty,       ONLY: step_pen
      USE dspev_module,       only: dspev_drv
+      USE mp_global,       only: nbgrp
 !
      implicit none
 #ifdef __PARA
@ -2380,6 +2609,8 @@ end function set_Hubbard_l
      integer, allocatable ::  offset (:,:)
      complex(DP) :: tempsi
 !
+      if( nbgrp > 1 ) &
+         call errore(' new_ns ', ' parallelization over bands not yet implemented ', 1 )
 !
      allocate(wfc(ngw,n_atomic_wfc))
      allocate(ftemp1(ldmx))
@ -2545,8 +2776,8 @@ end function set_Hubbard_l
 !
        call nlsm1 (n,1,nsp,eigr,c,bp)
        call s_wfc(n,bp,betae,c,spsi)
-        call nlsm2_repl(ngw,nhsa,n,eigr,c,dbp)
-        call nlsm2_repl(ngw,nhsa,n_atomic_wfc,eigr,wfc,wdb)
+        call nlsm2_bgrp( ngw, nhsa, eigr, c, dbp, nx, n )
+        call nlsm2_bgrp( ngw, nhsa, eigr, wfc, wdb, n_atomic_wfc, n_atomic_wfc )
 !
        alpha=0
        do alpha_s = 1, nsp
--- a/CPV/cpr.f90
+++ b/CPV/cpr.f90
@ -15,8 +15,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
  USE control_flags,            ONLY : iprint, isave, thdyn, tpre, iprsta,     &
                                       tfor, remove_rigid_rot, taurdr,         &
                                       tprnfor, tsdc, lconstrain, lwf, lneb,   &
-!                                       lcoarsegrained, ndr, ndw, nomore, tsde, &
-                                        ndr, ndw, nomore, tsde, &
+                                       ndr, ndw, nomore, tsde, &
                                       tortho, tnosee, tnosep, trane, tranp,   &
                                       tsdp, tcp, tcap, ampre, amprp, tnoseh,  &
                                       tolp, ortho_eps, ortho_max, printwfc,   &
@ -28,8 +27,9 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
  USE energies,                 ONLY : eht, epseu, exc, etot, eself, enl, &
                                       ekin, atot, entropy, egrand, enthal, &
                                       ekincm, print_energies
-  USE electrons_base,           ONLY : nbspx, nbsp, ispin, f, nspin
+  USE electrons_base,           ONLY : nbspx, nbsp, ispin, f, nspin, nbsp_bgrp
  USE electrons_base,           ONLY : nel, iupdwn, nupdwn, nudx, nelt
+  USE electrons_module,         ONLY : distribute_c, collect_c
  USE efield_module,            ONLY : efield, epol, tefield, allocate_efield, &
                                       efield_update, ipolp, qmat, gqq, evalue,&
                                       berry_energy, pberryel, pberryion,      &
@ -61,7 +61,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
  USE io_global,                ONLY : io_global_start, &
                                       stdout, ionode, ionode_id
  USE dener,                    ONLY : detot
-  USE cdvan,                    ONLY : dbec, drhovan
+  USE cdvan,                    ONLY : drhovan
  USE gvecw,                    ONLY : ggp
  USE constants,                ONLY : pi, k_boltzmann_au, au_ps
  USE io_files,                 ONLY : psfile, pseudo_dir
@ -87,7 +87,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
                                       electrons_nosevel, electrons_noseupd
  USE pres_ai_mod,              ONLY : P_ext, P_in, P_fin, pvar, volclu, &
                                       surfclu, Surf_t, abivol, abisur
-  USE wavefunctions_module,     ONLY : c0, cm, phi => cp
+  USE wavefunctions_module,     ONLY : c0_bgrp, cm_bgrp, phi_bgrp
  USE wannier_module,           ONLY : allocate_wannier
  USE cp_interfaces,            ONLY : printout_new, move_electrons
  USE printout_base,            ONLY : printout_base_open, &
@ -103,11 +103,11 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
  USE gvect,                    ONLY : ecutrho
  USE time_step,                ONLY : delt, tps, dt2,  twodelt
  USE cp_interfaces,            ONLY : cp_print_rho, nlfh, print_lambda
-  USE cp_main_variables,        ONLY : acc, bec, lambda, lambdam, lambdap, &
+  USE cp_main_variables,        ONLY : acc, lambda, lambdam, lambdap, &
                                       ema0bg, sfac, eigr, iprint_stdout,  &
-                                       irb, becdr, taub, eigrb, rhog, rhos, &
+                                       irb, taub, eigrb, rhog, rhos, &
                                       rhor, bephi, becp_dist, nfi, descla, &
-                                       drhor, drhog, nlax
+                                       drhor, drhog, nlax, bec_bgrp, dbec
  USE autopilot,                ONLY : event_step, event_index, &
                                       max_event_step, restart_p
  USE cell_base,                ONLY : s_to_r, r_to_s
@ -121,9 +121,9 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
  USE ions_nose,                ONLY : ions_nose_allocate, ions_nose_shiftvar
  USE orthogonalize_base,       ONLY : updatc
  USE control_flags,            ONLY : force_pairing
-  USE mp,                       ONLY : mp_bcast
+  USE mp,                       ONLY : mp_bcast, mp_sum
  USE mp_global,                ONLY : root_bgrp, intra_bgrp_comm, np_ortho, me_ortho, ortho_comm, &
-                                       me_bgrp
+                                       me_bgrp, inter_bgrp_comm
  USE ldaU_cp,                  ONLY : lda_plus_u, vupsi
  USE step_penalty,             ONLY : vpsi_pen, step_pen, E_pen
  USE small_box,                ONLY : ainvb
@ -196,6 +196,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
  !
  IF ( lda_plus_u ) ALLOCATE( forceh( 3, nat ) )
  !
+  !
  !======================================================================
  !
  !           basic loop for molecular dynamics starts here
@ -285,7 +286,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
     !
     IF ( tefield .or. tefield2 ) THEN
        !
-        CALL calbec( 1, nsp, eigr, c0, bec ) ! ATTENZIONE  
+        CALL calbec( 1, nsp, eigr, c0_bgrp, bec_bgrp ) ! ATTENZIONE  
        !
     END IF
     !
@ -303,7 +304,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
        vupsi=(0.0d0,0.0d0)
        ! vpsi_pen  ! potentials on electrons due to occupation constraints 
        vpsi_pen=(0.0d0,0.0d0)
-        CALL new_ns(c0,eigr,vkb,vupsi,vpsi_pen,forceh)
+        CALL new_ns(c0_bgrp,eigr,vkb,vupsi,vpsi_pen,forceh)
        vupsi = vupsi + vpsi_pen
        if ( mod(nfi,iprint).eq.0 ) call write_ns
     endif
@ -315,9 +316,9 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
     !=======================================================================
     !
     IF( force_pairing ) THEN
-          c0(:,iupdwn(2):nbsp)       =     c0(:,1:nupdwn(2))
-          cm(:,iupdwn(2):nbsp)       =     cm(:,1:nupdwn(2))
-         phi(:,iupdwn(2):nbsp)       =    phi(:,1:nupdwn(2))
+          c0_bgrp(:,iupdwn(2):nbsp)       =     c0_bgrp(:,1:nupdwn(2))
+          cm_bgrp(:,iupdwn(2):nbsp)       =     cm_bgrp(:,1:nupdwn(2))
+         phi_bgrp(:,iupdwn(2):nbsp)       =    phi_bgrp(:,1:nupdwn(2))
      lambda(:,:, 2) = lambda(:,:, 1)
     ENDIF
     !
@ -327,19 +328,18 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
        !
        ekincf = 0.0d0

-        CALL elec_fakekine( ekincf, ema0bg, emass, cm, c0, ngw, nbsp, 1, delt )
+        CALL elec_fakekine( ekincf, ema0bg, emass, cm_bgrp, c0_bgrp, ngw, nbsp_bgrp, 1, delt )
        !
     END IF
     !
-     !
-     CALL move_electrons( nfi, tfirst, tlast, b1, b2, b3, fion, &
+     CALL move_electrons( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, cm_bgrp, phi_bgrp, &
                          enthal, enb, enbi, fccc, ccc, dt2bye, stress )
     !
     IF (lda_plus_u) fion = fion + forceh
     !
     IF ( tpre ) THEN
        !
-        CALL nlfh( stress, bec, dbec, lambda )
+        CALL nlfh( stress, bec_bgrp, dbec, lambda )
        !
        CALL ions_thermal_stress( stress, pmass, omega, h, vels, nsp, na )
        !
@ -508,14 +508,16 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
        !
        IF ( tortho ) THEN
           !
-           CALL ortho( eigr, cm, phi, ngw, lambda, descla, &
+           CALL ortho( eigr, cm_bgrp, phi_bgrp, ngw, lambda, descla, &
                       bigr, iter, ccc, bephi, becp_dist, nbsp, nspin, nupdwn, iupdwn )
           !
        ELSE
           !
-           CALL gram( vkb, bec, nkb, cm, ngw, nbsp )
+           DO iss = 1, nspin
+              CALL gram_bgrp( vkb, bec_bgrp, nkb, cm_bgrp, ngw, iss )
+           END DO
           !
-           IF ( iprsta > 4 ) CALL dotcsc( eigr, cm, ngw, nbsp )
+           IF ( iprsta > 4 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
           !
        END IF
        !
@ -527,26 +529,26 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
           DO iss = 1, nspin_sub
              i1 = (iss-1)*nlax+1
              i2 = iss*nlax
-              CALL updatc( ccc, nbsp, lambda(:,:,iss), SIZE(lambda,1), phi, SIZE(phi,1), &
-                        bephi(:,i1:i2), SIZE(bephi,1), becp_dist(:,i1:i2), bec, cm, nupdwn(iss), iupdwn(iss), &
+              CALL updatc( ccc, nbsp, lambda(:,:,iss), SIZE(lambda,1), phi_bgrp, SIZE(phi_bgrp,1), &
+                        bephi(:,i1:i2), SIZE(bephi,1), becp_dist(:,i1:i2), bec_bgrp, cm_bgrp, nupdwn(iss), iupdwn(iss), &
                        descla(:,iss) )
           END DO
        END IF
        !
        IF( force_pairing ) THEN
-              c0(:,iupdwn(2):nbsp)       =     c0(:,1:nupdwn(2))
-              cm(:,iupdwn(2):nbsp)       =     cm(:,1:nupdwn(2))
-             phi(:,iupdwn(2):nbsp)       =    phi(:,1:nupdwn(2))
+              c0_bgrp(:,iupdwn(2):nbsp)       =     c0_bgrp(:,1:nupdwn(2))
+              cm_bgrp(:,iupdwn(2):nbsp)       =     cm_bgrp(:,1:nupdwn(2))
+             phi_bgrp(:,iupdwn(2):nbsp)       =    phi_bgrp(:,1:nupdwn(2))
          lambda(:,:, 2) = lambda(:,:, 1)
        ENDIF
        !
-        CALL calbec( nvb+1, nsp, eigr, cm, bec )
+        CALL calbec_bgrp( nvb+1, nsp, eigr, cm_bgrp, bec_bgrp )
        !
        IF ( tpre ) THEN
-           CALL caldbec( ngw, nkb, nbsp, 1, nsp, eigr, cm, dbec )
+           CALL caldbec_bgrp( eigr, cm_bgrp, dbec )
        END IF
        !
-        IF ( iprsta >= 3 ) CALL dotcsc( eigr, cm, ngw, nbsp )
+        IF ( iprsta >= 3 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
        !
     END IF
     !
@ -579,7 +581,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
     !
     IF ( .NOT. tcg ) THEN
        !
-        CALL elec_fakekine( ekinc0, ema0bg, emass, c0, cm, ngw, nbsp, 1, delt )
+        CALL elec_fakekine( ekinc0, ema0bg, emass, c0_bgrp, cm_bgrp, ngw, nbsp_bgrp, 1, delt )
        !
        ekinc0 = (ekinc0 + ekincf)*0.5d0
        !
@ -738,11 +740,11 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
     !
     IF( .NOT. tcg ) THEN
        !
-        CALL dswap( 2*ngw*nbsp, c0, 1, cm, 1 )
+        CALL dswap( 2*SIZE( c0_bgrp ), c0_bgrp, 1, cm_bgrp, 1 )
        !
     ELSE
        !
-        CALL cg_update( tfirst, nfi, c0 )
+        CALL cg_update( tfirst, nfi, c0_bgrp )
        !
        IF ( tfor .AND. .NOT. tens .AND. &
             ( ( MOD( nfi, isave ) == 0 ) .OR. tlast ) ) THEN
@ -765,7 +767,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
           lambdam = lambda
           !
           CALL move_electrons( nfi, tfirst, tlast, b1, b2, b3, &
-                                fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress )
+                                fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb, enbi, fccc, ccc, dt2bye, stress )
           !
        END IF
        !
@ -781,14 +783,14 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
        !
        IF ( tcg ) THEN
          !
-          CALL writefile( h, hold ,nfi, c0, c0old, taus, tausm,  &
+          CALL writefile( h, hold ,nfi, c0_bgrp, c0old, taus, tausm,  &
                          vels, velsm, acc, lambda, lambdam, xnhe0, xnhem,     &
                          vnhe, xnhp0, xnhpm, vnhp, nhpcl,nhpdim,ekincm, xnhh0,&
                          xnhhm, vnhh, velh, fion, tps, z0t, f, rhor )
           !
        ELSE
           !
-           CALL writefile( h, hold, nfi, c0, cm, taus,  &
+           CALL writefile( h, hold, nfi, c0_bgrp, cm_bgrp, taus,  &
                           tausm, vels, velsm, acc,  lambda, lambdam, xnhe0,   &
                           xnhem, vnhe, xnhp0, xnhpm, vnhp, nhpcl, nhpdim, ekincm,&
                           xnhh0, xnhhm, vnhh, velh, fion, tps, z0t, f, rhor )
@ -857,7 +859,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
     END IF
     !
     IF ( lwf ) &
-        CALL wf_closing_options( nfi, c0, cm, bec, eigr, eigrb, taub, &
+        CALL wf_closing_options( nfi, c0_bgrp, cm_bgrp, bec_bgrp, eigr, eigrb, taub, &
                                 irb, ibrav, b1, b2, b3, taus, tausm, vels,  &
                                 velsm, acc, lambda, lambdam, xnhe0, xnhem,  &
                                 vnhe, xnhp0, xnhpm, vnhp, nhpcl, nhpdim,    &
@ -896,9 +898,9 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
  !
  conv_elec = .TRUE.
  !
-  IF ( tcg ) cm = c0old
+  IF ( tcg ) cm_bgrp = c0old
  !
-  CALL writefile( h, hold, nfi, c0, cm, taus, tausm, &
+  CALL writefile( h, hold, nfi, c0_bgrp, cm_bgrp, taus, tausm, &
                  vels, velsm, acc, lambda, lambdam, xnhe0, xnhem, vnhe,    &
                  xnhp0, xnhpm, vnhp, nhpcl,nhpdim,ekincm, xnhh0, xnhhm,    &
                  vnhh, velh, fion, tps, z0t, f, rhor )
--- a/CPV/cprsub.f90
+++ b/CPV/cprsub.f90
@ -188,7 +188,7 @@ SUBROUTINE newnlinit()
 END SUBROUTINE newnlinit
 !
 !-----------------------------------------------------------------------
-subroutine nlfh_x( stress, bec, dbec, lambda )
+subroutine nlfh_x( stress, bec_bgrp, dbec, lambda )
  !-----------------------------------------------------------------------
  !
  !     contribution to the internal stress tensor due to the constraints
@ -198,29 +198,51 @@ subroutine nlfh_x( stress, bec, dbec, lambda )
  use uspp,              ONLY : nkb, qq
  use uspp_param,        ONLY : nh, nhm
  use ions_base,         ONLY : na
-  use electrons_base,    ONLY : nbspx, nbsp, nudx, nspin, nupdwn, iupdwn
+  use electrons_base,    ONLY : nbspx, nbsp, nudx, nspin, nupdwn, iupdwn, ibgrp_g2l
  use cell_base,         ONLY : omega, h
  use constants,         ONLY : pi, fpi, au_gpa
  use io_global,         ONLY : stdout
  use control_flags,     ONLY : iprsta
-  USE cp_main_variables, ONLY : descla, la_proc, nlam
+  USE cp_main_variables, ONLY : descla, la_proc, nlam, nlax
  USE descriptors,       ONLY : nlar_ , nlac_ , ilar_ , ilac_ , nlax_
  USE mp,                ONLY : mp_sum
-  USE mp_global,         ONLY : intra_bgrp_comm
+  USE mp_global,         ONLY : intra_bgrp_comm, inter_bgrp_comm

 !
  implicit none

  REAL(DP), INTENT(INOUT) :: stress(3,3) 
-  REAL(DP), INTENT(IN)    :: bec( :, : ), dbec( :, :, :, : )
+  REAL(DP), INTENT(IN)    :: bec_bgrp( :, : ), dbec( :, :, :, : )
  REAL(DP), INTENT(IN)    :: lambda( :, :, : )
 !
  INTEGER  :: i, j, ii, jj, inl, iv, jv, ia, is, iss, nss, istart
-  INTEGER  :: jnl, ir, ic, nr, nc, nx
+  INTEGER  :: jnl, ir, ic, nr, nc, nx, ibgrp_i
  REAL(DP) :: fpre(3,3), TT, T1, T2
  !
-  REAL(DP), ALLOCATABLE :: tmpbec(:,:), tmpdh(:,:), temp(:,:)
+  REAL(DP), ALLOCATABLE :: tmpbec(:,:), tmpdh(:,:), temp(:,:), bec(:,:,:)
  !
+  ALLOCATE( bec( nkb, nlax, nspin ) )
+  !
+  IF( la_proc ) THEN
+     DO iss = 1, nspin
+        nss = nupdwn( iss )
+        istart = iupdwn( iss )
+        ic = descla( ilac_ , iss )
+        nc = descla( nlac_ , iss )
+        DO i=1,nc
+           ibgrp_i = ibgrp_g2l( i+istart-1+ic-1 )
+           IF( ibgrp_i > 0 ) THEN
+              bec( :, i, iss ) = bec_bgrp( :, ibgrp_i )
+           ELSE
+              bec( :, i, iss ) = 0.0d0
+           END IF
+        END DO
+     END DO
+  ELSE
+     bec   = 0.0d0
+  END IF
+
+  CALL mp_sum( bec, inter_bgrp_comm )
  !
  IF( la_proc ) THEN
     nx=descla( nlax_ , 1 ) 
@ -258,7 +280,8 @@ subroutine nlfh_x( stress, bec, dbec, lambda )
                          inl=ish(is)+(jv-1)*na(is)+ia
                          if(abs(qq(iv,jv,is)).gt.1.e-5) then
                             do i = 1, nc
-                                tmpbec(iv,i) = tmpbec(iv,i) +  qq(iv,jv,is) * bec(inl, i + istart - 1 + ic - 1 )
+                                !tmpbec(iv,i) = tmpbec(iv,i) +  qq(iv,jv,is) * bec(inl, i + istart - 1 + ic - 1 )
+                                tmpbec(iv,i) = tmpbec(iv,i) +  qq(iv,jv,is) * bec( inl, i, iss  )
                             end do
                          endif
                       end do
@ -267,7 +290,7 @@ subroutine nlfh_x( stress, bec, dbec, lambda )
                    do iv=1,nh(is)
                       inl=ish(is)+(iv-1)*na(is)+ia
                       do i = 1, nr
-                          tmpdh(i,iv) = dbec( inl, i + (iss-1)*nlam, ii, jj )
+                          tmpdh(i,iv) = dbec( inl, i + (iss-1)*nlax, ii, jj )
                       end do
                    end do

@ -308,6 +331,8 @@ subroutine nlfh_x( stress, bec, dbec, lambda )
     DEALLOCATE ( tmpbec, tmpdh, temp )
  END IF

+  DEALLOCATE( bec )
+

  IF( iprsta > 2 ) THEN
     WRITE( stdout,*) 
--- a/CPV/electrons.f90
+++ b/CPV/electrons.f90
@ -12,7 +12,9 @@
        USE dspev_module,       ONLY: pdspev_drv, dspev_drv
        USE electrons_base,     ONLY: nbnd, nbndx, nbsp, nbspx, nspin, nel, nelt, &
                                      nupdwn, iupdwn, telectrons_base_initval, f, &
-                                      nudx
+                                      nudx, nupdwn_bgrp, iupdwn_bgrp, nudx_bgrp, &
+                                      nbsp_bgrp, nbspx_bgrp, i2gupdwn_bgrp
+
        USE cp_electronic_mass, ONLY: ecutmass => emass_cutoff, emass, emass_precond


@ -43,8 +45,8 @@
        PUBLIC :: ib_owner, ib_local, nb_l
        PUBLIC :: ei
        PUBLIC :: print_eigenvalues
- 
-
+        PUBLIC :: distribute_c, collect_c
+        PUBLIC :: distribute_b, collect_b
 !
 !  end of module-scope declarations
 !
@ -79,6 +81,76 @@
     RETURN
   END SUBROUTINE occn_info

+!  ----------------------------------------------
+    SUBROUTINE distribute_b( b, b_bgrp )
+      REAL(DP), INTENT(IN) :: b(:,:)
+      REAL(DP), INTENT(OUT) :: b_bgrp(:,:)
+      INTEGER :: iss, n1, n2, m1, m2
+      DO iss = 1, nspin
+         n1 = iupdwn_bgrp(iss)
+         n2 = n1 + nupdwn_bgrp(iss) - 1
+         m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+         m2 = m1 + nupdwn_bgrp(iss) - 1
+         b_bgrp(:,n1:n2) = b(:,m1:m2)
+      END DO
+      RETURN
+    END SUBROUTINE distribute_b
+!
+    SUBROUTINE collect_b( b, b_bgrp )
+      USE mp_global, ONLY : inter_bgrp_comm, mpime
+      USE mp,        ONLY : mp_sum
+      REAL(DP), INTENT(OUT) :: b(:,:)
+      REAL(DP), INTENT(IN)  :: b_bgrp(:,:)
+      INTEGER :: iss, n1, n2, m1, m2
+      b = 0.0d0
+      DO iss = 1, nspin
+         n1 = iupdwn_bgrp(iss)
+         n2 = n1 + nupdwn_bgrp(iss) - 1
+         m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+         m2 = m1 + nupdwn_bgrp(iss) - 1
+         b(:,m1:m2) = b_bgrp(:,n1:n2)
+         !write(1000+mpime,*) 'n1, n2 = ', n1, n2 ! debug
+         !write(1000+mpime,*) 'm1, m2 = ', m1, m2 ! debug
+      END DO
+      CALL mp_sum( b, inter_bgrp_comm )
+      RETURN
+    END SUBROUTINE collect_b
+
+
+    SUBROUTINE distribute_c( c, c_bgrp )
+      COMPLEX(DP), INTENT(IN) :: c(:,:)
+      COMPLEX(DP), INTENT(OUT) :: c_bgrp(:,:)
+      INTEGER :: iss, n1, n2, m1, m2
+      DO iss = 1, nspin
+         n1 = iupdwn_bgrp(iss)
+         n2 = n1 + nupdwn_bgrp(iss) - 1
+         m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+         m2 = m1 + nupdwn_bgrp(iss) - 1
+         c_bgrp(:,n1:n2) = c(:,m1:m2)
+      END DO
+      RETURN
+    END SUBROUTINE distribute_c
+!
+    SUBROUTINE collect_c( c, c_bgrp )
+      USE mp_global, ONLY : inter_bgrp_comm, mpime
+      USE mp,        ONLY : mp_sum
+      COMPLEX(DP), INTENT(OUT) :: c(:,:)
+      COMPLEX(DP), INTENT(IN)  :: c_bgrp(:,:)
+      INTEGER :: iss, n1, n2, m1, m2
+      c = 0.0d0
+      DO iss = 1, nspin
+         n1 = iupdwn_bgrp(iss)
+         n2 = n1 + nupdwn_bgrp(iss) - 1
+         m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+         m2 = m1 + nupdwn_bgrp(iss) - 1
+         c(:,m1:m2) = c_bgrp(:,n1:n2)
+         !write(1000+mpime,*) 'n1, n2 = ', n1, n2 ! debug
+         !write(1000+mpime,*) 'm1, m2 = ', m1, m2 ! debug
+      END DO
+      CALL mp_sum( c, inter_bgrp_comm )
+      RETURN
+    END SUBROUTINE collect_c
+
 !  ----------------------------------------------
 !  ----------------------------------------------

--- a/CPV/fromscra.f90
+++ b/CPV/fromscra.f90
@ -24,8 +24,8 @@ SUBROUTINE from_scratch( )
                                     a2, a3, b1, b2, b3
    USE cell_nose,            ONLY : xnhh0, xnhhm, vnhh
    USE electrons_nose,       ONLY : xnhe0, xnhem, vnhe
-    use electrons_base,       ONLY : nbsp, f, nspin, nupdwn, iupdwn, distribute_c
-    USE electrons_module,     ONLY : occn_info
+    use electrons_base,       ONLY : nbsp, f, nspin, nupdwn, iupdwn, nbsp_bgrp, nbspx_bgrp, nbspx
+    USE electrons_module,     ONLY : occn_info, distribute_c, collect_c, distribute_b, collect_b
    USE energies,             ONLY : entropy, eself, enl, ekin, enthal, etot, ekincm
    USE energies,             ONLY : dft_energy_type, debug_energies
    USE dener,                ONLY : denl, denl6, dekin6, detot
@ -48,24 +48,23 @@ SUBROUTINE from_scratch( )
    USE cp_interfaces,        ONLY : compute_stress
    USE cp_interfaces,        ONLY : print_lambda
    USE printout_base,        ONLY : printout_pos
-    USE orthogonalize_base,   ONLY : updatc, calphi
+    USE orthogonalize_base,   ONLY : updatc, calphi_bgrp
    USE atoms_type_module,    ONLY : atoms_type
    USE wave_base,            ONLY : wave_steepest
-    USE wavefunctions_module, ONLY : c0, cm, phi => cp, c0_bgrp, cm_bgrp, cp_bgrp
+    USE wavefunctions_module, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
    USE grid_dimensions,      ONLY : nr1, nr2, nr3
    USE time_step,            ONLY : delt
-    USE cp_main_variables,    ONLY : setval_lambda, descla, bephi, becp_dist, becdr, nfi, &
-                                     sfac, eigr, bec, taub, irb, eigrb, &
+    USE cp_main_variables,    ONLY : setval_lambda, descla, bephi, becp_dist, nfi, &
+                                     sfac, eigr, taub, irb, eigrb, bec_bgrp, &
                                     lambda, lambdam, lambdap, ema0bg, rhog, rhor, rhos, &
-                                     vpot, ht0, edft, nlax
-    USE mp_global,            ONLY : np_ortho, me_ortho, ortho_comm
+                                     vpot, ht0, edft, nlax, becdr_bgrp, dbec
+    USE mp_global,            ONLY : np_ortho, me_ortho, ortho_comm, mpime, inter_bgrp_comm
    USE small_box,            ONLY : ainvb
-    USE cdvan,                ONLY : dbec
+    USE mp,                   ONLY : mp_sum
    !
    IMPLICIT NONE
    !
    REAL(DP),    ALLOCATABLE :: emadt2(:), emaver(:)
-    COMPLEX(DP), ALLOCATABLE :: c2(:), c3(:)
    REAL(DP)                 :: verl1, verl2
    REAL(DP)                 :: bigr, dum
    INTEGER                  :: i, j, iter, iss, ierr, nspin_wfc
@ -109,8 +108,7 @@ SUBROUTINE from_scratch( )
       !
    END IF
    !
-    CALL phfacs( eigts1, eigts2, eigts3, eigr, mill, atoms0%taus, &
-                 nr1, nr2, nr3, atoms0%nat )
+    CALL phfacs( eigts1, eigts2, eigts3, eigr, mill, atoms0%taus, nr1, nr2, nr3, atoms0%nat )
    !
    CALL strucf( sfac, eigts1, eigts2, eigts3, mill, ngms )
    !     
@ -121,7 +119,7 @@ SUBROUTINE from_scratch( )
    !
    !     wfc initialization with random numbers
    !     
-    CALL wave_rand_init( cm, nbsp, 1 )
+    CALL wave_rand_init( cm_bgrp )
    !
    IF ( ionode ) &
       WRITE( stdout, fmt = '(//,3X, "Wave Initialization: random initial wave-functions" )' )
@ -135,13 +133,13 @@ SUBROUTINE from_scratch( )

    DO iss = 1, nspin_wfc
       !
-       CALL gram( vkb, bec, nkb, cm(1,iupdwn(iss)), ngw, nupdwn(iss) )
+       CALL gram_bgrp( vkb, bec_bgrp, nkb, cm_bgrp, ngw, iss )
       !
    END DO

-    IF( force_pairing ) cm(:,iupdwn(2):iupdwn(2)+nupdwn(2)-1) = cm(:,1:nupdwn(2))
+    IF( force_pairing ) cm_bgrp(:,iupdwn(2):iupdwn(2)+nupdwn(2)-1) = cm_bgrp(:,1:nupdwn(2))
    !
-    if( iprsta >= 3 ) CALL dotcsc( eigr, cm, ngw, nbsp )
+    if( iprsta >= 3 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp )
    !
    ! ... initialize bands
    !
@ -169,13 +167,11 @@ SUBROUTINE from_scratch( )
    !
    IF( .NOT. tcg ) THEN
       !
-       CALL calbec ( 1, nsp, eigr, cm, bec )
+       CALL calbec_bgrp ( 1, nsp, eigr, cm_bgrp, bec_bgrp )
       !
-       if ( tstress ) CALL caldbec( ngw, nkb, nbsp, 1, nsp, eigr, cm, dbec )
+       if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec )
       !
-       CALL distribute_c( cm, cm_bgrp )
-       !
-       CALL rhoofr ( nfi, cm_bgrp, irb, eigrb, bec, becsum, rhor, rhog, rhos, enl, denl, ekin, dekin6 )
+       CALL rhoofr ( nfi, cm_bgrp, irb, eigrb, bec_bgrp, becsum, rhor, rhog, rhos, enl, denl, ekin, dekin6 )
       !
       edft%enl  = enl
       edft%ekin = ekin
@ -200,11 +196,11 @@ SUBROUTINE from_scratch( )
        &  eigts1, eigts2, eigts3, irb, eigrb, sfac, tau0, fion )

         IF( tefield ) THEN
-           CALL berry_energy( enb, enbi, bec, cm(:,:), fion ) 
+           CALL berry_energy( enb, enbi, bec_bgrp, cm_bgrp, fion ) 
           etot = etot + enb + enbi
         END IF
         IF( tefield2 ) THEN
-           CALL berry_energy2( enb, enbi, bec, cm(:,:), fion )
+           CALL berry_energy2( enb, enbi, bec_bgrp, cm_bgrp, fion )
           etot = etot + enb + enbi
         END IF

@ -217,67 +213,73 @@ SUBROUTINE from_scratch( )
         !
         IF( force_pairing ) THEN
            !
-            CALL runcp_uspp_force_pairing( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec, cm, &
-        &                 c0, ei_unp, fromscra = .TRUE. )
+            CALL runcp_uspp_force_pairing( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, &
+        &                 c0_bgrp, ei_unp, fromscra = .TRUE. )
            !
            CALL setval_lambda( lambda(:,:,2), nupdwn(1), nupdwn(1), 0.d0, descla(:,1) )
            !
         ELSE
            !
-            CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec, cm, c0, fromscra = .TRUE. )
+            CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, c0_bgrp, fromscra = .TRUE. )
            !
         ENDIF
         !
         !     nlfq needs deeq bec
         !
-         if( ttforce ) CALL nlfq( cm, eigr, bec, becdr, fion )
+         IF( ttforce ) THEN
+            CALL nlfq_bgrp( cm_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
+         END IF
         !
         !     calphi calculates phi
         !     the electron mass rises with g**2
         !
-         CALL calphi( cm, ngw, bec, nkb, vkb, phi, nbsp, ema0bg )
+         CALL calphi_bgrp( cm_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
         !
         IF( force_pairing ) &
-         &   phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) =    phi( :, 1:nupdwn(2))
-
+         &   phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) =    phi_bgrp( :, 1:nupdwn(2))

         if( tortho ) then
-            CALL ortho( eigr, c0, phi, ngw, lambda, descla, &
+            CALL ortho( eigr, c0_bgrp, phi_bgrp, ngw, lambda, descla, &
                        bigr, iter, ccc, bephi, becp_dist, nbsp, nspin, nupdwn, iupdwn )
         else
-            CALL gram( vkb, bec, nkb, c0, ngw, nbsp )
+            DO iss = 1, nspin
+               CALL gram_bgrp( vkb, bec_bgrp, nkb, c0_bgrp, ngw, iss )
+            END DO
         endif
         !
-         !
-         if ( ttforce ) CALL nlfl( bec, becdr, lambda, fion )
+         IF ( ttforce ) THEN
+            CALL nlfl_bgrp( bec_bgrp, becdr_bgrp, lambda, fion )
+         END IF

         if ( iprsta >= 3 ) CALL print_lambda( lambda, nbsp, 9, ccc )

-         if ( tstress ) CALL nlfh( stress, bec, dbec, lambda )
+         !
+         if ( tstress ) CALL nlfh( stress, bec_bgrp, dbec, lambda )
         !
         IF ( tortho ) THEN
            DO iss = 1, nspin_wfc
               i1 = (iss-1)*nlax+1
               i2 = iss*nlax
-               CALL updatc( ccc, nbsp, lambda(:,:,iss), SIZE(lambda,1), phi, SIZE(phi,1), &
-                            bephi(:,i1:i2), SIZE(bephi,1), becp_dist(:,i1:i2), bec, c0, nupdwn(iss), iupdwn(iss), &
+               CALL updatc( ccc, nbsp, lambda(:,:,iss), SIZE(lambda,1), phi_bgrp, SIZE(phi_bgrp,1), &
+                            bephi(:,i1:i2), SIZE(bephi,1), becp_dist(:,i1:i2), bec_bgrp, c0_bgrp, nupdwn(iss), iupdwn(iss), &
                            descla(:,iss) )
            END DO
         END IF
         !
         IF( force_pairing ) THEN
            !
-            c0 ( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) =  c0( :, 1:nupdwn(2))
-            phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi( :, 1:nupdwn(2))
+            c0_bgrp ( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = c0_bgrp( :, 1:nupdwn(2))
+            phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi_bgrp( :, 1:nupdwn(2))
            lambda(:,:,2) = lambda(:,:,1)
            !
         ENDIF
         !
-         CALL calbec ( nvb+1, nsp, eigr, c0, bec )
+         !
+         CALL calbec_bgrp ( nvb+1, nsp, eigr, c0_bgrp, bec_bgrp )
+         !
+         if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec )

-         if ( tstress ) CALL caldbec( ngw, nkb, nbsp, 1, nsp, eigr, cm, dbec )
-
-         if ( iprsta >= 3 ) CALL dotcsc( eigr, c0, ngw, nbsp )
+         if ( iprsta >= 3 ) CALL dotcsc( eigr, c0_bgrp, ngw, nbsp_bgrp )
         !
         xnhp0 = 0.0d0
         xnhpm = 0.0d0
@ -291,7 +293,7 @@ SUBROUTINE from_scratch( )
         vnhh (:,:) = 0.0d0
         velh (:,:) = ( h(:,:) - hold(:,:) ) / delt
         !
-         CALL elec_fakekine( ekincm, ema0bg, emass, c0, cm, ngw, nbsp, 1, delt )
+         CALL elec_fakekine( ekincm, ema0bg, emass, c0_bgrp, cm_bgrp, ngw, nbsp_bgrp, 1, delt )

         xnhe0 = 0.0d0
         xnhem = 0.0d0
@ -301,7 +303,7 @@ SUBROUTINE from_scratch( )
         !
       ELSE 
          !
-          c0 = cm
+          c0_bgrp = cm_bgrp
          !
       END IF
    !
--- a/CPV/init_run.f90
+++ b/CPV/init_run.f90
@ -33,8 +33,8 @@ SUBROUTINE init_run()
  USE uspp,                     ONLY : nkb, vkb, deeq, becsum,nkbus
  USE core,                     ONLY : rhoc
  USE smooth_grid_dimensions,   ONLY : nrxxs
-  USE wavefunctions_module,     ONLY : c0, cm, cp, c0_bgrp, cm_bgrp, cp_bgrp
-  USE cdvan,                    ONLY : dbec, drhovan
+  USE wavefunctions_module,     ONLY : c0_bgrp, cm_bgrp, phi_bgrp
+  USE cdvan,                    ONLY : drhovan
  USE ensemble_dft,             ONLY : tens, z0t
  USE cg_module,                ONLY : tcg
  USE electrons_base,           ONLY : nudx, nbnd
@ -43,7 +43,7 @@ SUBROUTINE init_run()
  USE ions_nose,                ONLY : xnhp0, xnhpm, vnhp, nhpcl, nhpdim
  USE cell_base,                ONLY : h, hold, hnew, velh, tpiba2, ibrav, &
                                       alat, celldm, a1, a2, a3, b1, b2, b3
-  USE cp_main_variables,        ONLY : lambda, lambdam, lambdap, ema0bg, bec,  &
+  USE cp_main_variables,        ONLY : lambda, lambdam, lambdap, ema0bg, &
                                       sfac, eigr, taub, &
                                       irb, eigrb, rhog, rhos, rhor,     &
                                       acc, acc_this_run, wfill, &
@ -98,6 +98,8 @@ SUBROUTINE init_run()
  !   CALL create_directory( tmp_dir )
  !   !
  !END IF
+  IF( nbgrp > 1 .AND. force_pairing ) &
+     CALL errore( ' init_run ', ' force_pairing with parallelization over bands not implemented yet ', 1 )
  !
  CALL printout_base_init( tmp_dir, prefix )
  !
@ -137,19 +139,15 @@ SUBROUTINE init_run()
  CALL allocate_mainvar( ngw, ngw_g, ngb, ngms, ngm, nr1,nr2,nr3, dfftp%nr1x, &
                         dfftp%nr2x, dfftp%npl, nrxx, nrxxs, nat, nax, nsp,   &
                         nspin, nbsp, nbspx, nupdwn, nkb, gstart, nudx, &
-                         tpre )
+                         tpre, nbspx_bgrp )
  !
  CALL allocate_local_pseudo( ngms, nsp )
  !
  !  initialize wave functions descriptors and allocate wf
  !
-  ALLOCATE( c0( ngw, nbspx ) )
-  ALLOCATE( cm( ngw, nbspx ) )
-  ALLOCATE( cp( ngw, nbspx ) )
-  !
-  ALLOCATE( c0_bgrp( ngw, nbspx_bgrp ) )
-  ALLOCATE( cm_bgrp( ngw, nbspx_bgrp ) )
-  ALLOCATE( cp_bgrp( ngw, nbspx_bgrp ) )
+  ALLOCATE( c0_bgrp( ngw, nbspx ) )
+  ALLOCATE( cm_bgrp( ngw, nbspx ) )
+  ALLOCATE( phi_bgrp( ngw, nbspx ) )
  !
  IF ( iprsta > 2 ) THEN
     !
@ -177,7 +175,6 @@ SUBROUTINE init_run()
  ALLOCATE( becsum(  nhm*(nhm+1)/2, nat, nspin ) )
  ALLOCATE( deeq( nhm, nhm, nat, nspin ) )
  IF ( tpre ) THEN
-     ALLOCATE( dbec( nkb, 2*nlam, 3, 3 ) )
     ALLOCATE( drhovan( nhm*(nhm+1)/2, nat, nspin, 3, 3 ) )
  END IF
  !
@ -241,9 +238,9 @@ SUBROUTINE init_run()
  !
  hnew = h
  !
-  cm = ( 0.D0, 0.D0 )
-  c0 = ( 0.D0, 0.D0 )
-  cp = ( 0.D0, 0.D0 )
+  cm_bgrp  = ( 0.D0, 0.D0 )
+  c0_bgrp  = ( 0.D0, 0.D0 )
+  phi_bgrp = ( 0.D0, 0.D0 )
  !
  IF ( tens ) then
     CALL id_matrix_init( descla, nspin )
@ -279,7 +276,7 @@ SUBROUTINE init_run()
     !======================================================================
     !
     i = 1  
-     CALL readfile( i, h, hold, nfi, c0, cm, taus,   &
+     CALL readfile( i, h, hold, nfi, c0_bgrp, cm_bgrp, taus,   &
                    tausm, vels, velsm, acc, lambda, lambdam, xnhe0, xnhem, &
                    vnhe, xnhp0, xnhpm, vnhp,nhpcl,nhpdim,ekincm, xnhh0, xnhhm,&
                    vnhh, velh, fion, tps, z0t, f )
--- a/CPV/mainvar.f90
+++ b/CPV/mainvar.f90
@ -50,9 +50,11 @@ MODULE cp_main_variables
  ! ...    rhovan= \sum_i f(i) <psi(i)|beta_l><beta_m|psi(i)>
  ! ...    deeq  = \int V_eff(r) q_lm(r) dr
  !
-  REAL(DP), ALLOCATABLE :: bec(:,:), becdr(:,:,:)
-  REAL(DP), ALLOCATABLE :: bephi(:,:)
-  REAL(DP), ALLOCATABLE :: becp_dist(:,:)  ! distributed becp
+  REAL(DP), ALLOCATABLE :: bephi(:,:)      ! distributed (orhto group)
+  REAL(DP), ALLOCATABLE :: becp_dist(:,:)  ! distributed becp (ortho group)
+  REAL(DP), ALLOCATABLE :: bec_bgrp(:,:)  ! distributed bec (band group)
+  REAL(DP), ALLOCATABLE :: becdr_bgrp(:,:,:)  ! distributed becdr (band group)
+  REAL(DP), ALLOCATABLE :: dbec(:,:,:,:)    ! derivative of bec distributed(ortho group) 
  !
  ! ... mass preconditioning
  !
@ -112,7 +114,7 @@ MODULE cp_main_variables
    SUBROUTINE allocate_mainvar( ngw, ngw_g, ngb, ngs, ng, nr1, nr2, nr3, &
                                 nr1x, nr2x, npl, nnr, nrxxs, nat, nax,  &
                                 nsp, nspin, n, nx, nupdwn, nhsa, &
-                                 gstart, nudx, tpre )
+                                 gstart, nudx, tpre, nbspx_bgrp )
      !------------------------------------------------------------------------
      !
      USE mp_global,   ONLY: np_ortho, me_ortho, intra_bgrp_comm, ortho_comm, &
@ -126,6 +128,7 @@ MODULE cp_main_variables
      INTEGER,           INTENT(IN) :: nupdwn(:)
      INTEGER,           INTENT(IN) :: gstart, nudx
      LOGICAL,           INTENT(IN) :: tpre
+      INTEGER,           INTENT(IN) :: nbspx_bgrp
      !
      INTEGER  :: iss
      LOGICAL  :: gzero
@ -202,16 +205,17 @@ MODULE cp_main_variables
      !
      ! becdr, distributed over row processors of the ortho group
      !
-      ALLOCATE( becdr( nhsa, nspin*nlax, 3 ) )  
+      ALLOCATE( becdr_bgrp( nhsa, nbspx_bgrp, 3 ) )  
      !
-      ALLOCATE( bec( nhsa, n ) )
-      !ALLOCATE( bec_dist(  nhsa, 2*nlax*nspin ) )  
-      ! factor 2 in the second dim is required because each task need 
-      ! at the same time row and colum component of becp
+      ALLOCATE( bec_bgrp( nhsa, nbspx_bgrp ) )
      !
      ALLOCATE( bephi( nhsa, nspin*nlax ) )
      ALLOCATE( becp_dist( nhsa, nlax*nspin ) )  
      !
+      IF ( tpre ) THEN
+        ALLOCATE( dbec( nhsa, 2*nlax, 3, 3 ) )
+      END IF
+
      gzero =  (gstart == 2)
      !
      CALL wave_descriptor_init( wfill, ngw, ngw_g, nupdwn,  nupdwn, &
@ -234,10 +238,11 @@ MODULE cp_main_variables
      IF( ALLOCATED( rhog ) )    DEALLOCATE( rhog )
      IF( ALLOCATED( drhog ) )   DEALLOCATE( drhog )
      IF( ALLOCATED( drhor ) )   DEALLOCATE( drhor )
-      IF( ALLOCATED( bec ) )     DEALLOCATE( bec )
-      IF( ALLOCATED( becdr ) )   DEALLOCATE( becdr )
+      IF( ALLOCATED( bec_bgrp ) )     DEALLOCATE( bec_bgrp )
+      IF( ALLOCATED( becdr_bgrp ) )   DEALLOCATE( becdr_bgrp )
      IF( ALLOCATED( bephi ) )   DEALLOCATE( bephi )
      IF( ALLOCATED( becp_dist ) )    DEALLOCATE( becp_dist )
+      IF( ALLOCATED( dbec ) )    DEALLOCATE( dbec )
      IF( ALLOCATED( ema0bg ) )  DEALLOCATE( ema0bg )
      IF( ALLOCATED( lambda ) )  DEALLOCATE( lambda )
      IF( ALLOCATED( lambdam ) ) DEALLOCATE( lambdam )
--- a/CPV/modules.f90
+++ b/CPV/modules.f90
@ -165,13 +165,10 @@ MODULE cdvan
  IMPLICIT NONE
  SAVE
  REAL(DP), ALLOCATABLE :: dbeta(:,:,:,:,:)
-  REAL(DP), ALLOCATABLE :: dbec(:,:,:,:)     
-    ! Warning dbec is distributed over row and column processors of the ortho group
  REAL(DP), ALLOCATABLE :: drhovan(:,:,:,:,:)
 CONTAINS
  SUBROUTINE deallocate_cdvan
      IF( ALLOCATED( dbeta ) ) DEALLOCATE( dbeta )
-      IF( ALLOCATED( dbec ) ) DEALLOCATE( dbec )
      IF( ALLOCATED( drhovan ) ) DEALLOCATE( drhovan )
  END SUBROUTINE deallocate_cdvan
 END MODULE cdvan
--- a/CPV/move_electrons.f90
+++ b/CPV/move_electrons.f90
@ -7,7 +7,7 @@
 !
 !
 !----------------------------------------------------------------------------
-SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
+SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, cm_bgrp, phi_bgrp, &
                           enthal, enb, enbi, fccc, ccc, dt2bye, stress )
  !----------------------------------------------------------------------------
  !
@ -16,15 +16,14 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
  USE kinds,                ONLY : DP
  USE control_flags,        ONLY : lwf, tfor, tprnfor, thdyn
  USE cg_module,            ONLY : tcg
-  USE cp_main_variables,    ONLY : eigr, bec, irb, eigrb, rhog, rhos, rhor, &
-                                   sfac, ema0bg, becdr, &
-                                   taub, lambda, lambdam, lambdap, vpot
-  USE wavefunctions_module, ONLY : c0, cm, phi => cp, c0_bgrp, cm_bgrp, cp_bgrp
+  USE cp_main_variables,    ONLY : eigr, irb, eigrb, rhog, rhos, rhor, &
+                                   sfac, ema0bg, bec_bgrp, becdr_bgrp, &
+                                   taub, lambda, lambdam, lambdap, vpot, dbec
  USE cell_base,            ONLY : omega, ibrav, h, press
  USE uspp,                 ONLY : becsum, vkb, nkb
  USE energies,             ONLY : ekin, enl, entropy, etot
  USE grid_dimensions,      ONLY : nrxx
-  USE electrons_base,       ONLY : nbsp, nspin, f, nudx, distribute_c
+  USE electrons_base,       ONLY : nbsp, nspin, f, nudx, nupdwn, nbspx_bgrp
  USE core,                 ONLY : nlcc_any, rhoc
  USE ions_positions,       ONLY : tau0
  USE ions_base,            ONLY : nat
@ -39,17 +38,19 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
  USE cp_interfaces,        ONLY : runcp_uspp, runcp_uspp_force_pairing, &
                                   interpolate_lambda
  USE gvecw,                ONLY : ngw
-  USE orthogonalize_base,   ONLY : calphi
+  USE orthogonalize_base,   ONLY : calphi_bgrp
  USE control_flags,        ONLY : force_pairing
  USE cp_interfaces,        ONLY : rhoofr, compute_stress
-  USE electrons_base,       ONLY : nupdwn 
+  USE electrons_module,     ONLY : distribute_c, collect_c, distribute_b
  USE gvect,   ONLY : eigts1, eigts2, eigts3 
+  USE mp_global,   ONLY : mpime
  IMPLICIT NONE
  !
  INTEGER,  INTENT(IN)    :: nfi
  LOGICAL,  INTENT(IN)    :: tfirst, tlast
  REAL(DP), INTENT(IN)    :: b1(3), b2(3), b3(3)
  REAL(DP)                :: fion(:,:)
+  COMPLEX(DP)             :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
  REAL(DP), INTENT(IN)    :: dt2bye
  REAL(DP)                :: fccc, ccc
  REAL(DP)                :: enb, enbi
@ -59,24 +60,22 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
  !
  INTEGER :: i, j, is, n2
  !
-  !
  electron_dynamic: IF ( tcg ) THEN
     !
-     CALL runcg_uspp( nfi, tfirst, tlast, eigr, bec, irb, eigrb, &
+     CALL runcg_uspp( nfi, tfirst, tlast, eigr, bec_bgrp, irb, eigrb, &
                      rhor, rhog, rhos, rhoc, eigts1, eigts2, eigts3, sfac, &
-                      fion, ema0bg, becdr, lambdap, lambda, vpot  )
+                      fion, ema0bg, becdr_bgrp, lambdap, lambda, vpot, c0_bgrp, &
+                      cm_bgrp, phi_bgrp, dbec  )
     !
     CALL compute_stress( stress, detot, h, omega )
     !
  ELSE
     !
     IF ( lwf ) &
-          CALL get_wannier_center( tfirst, cm, bec, eigr, &
+          CALL get_wannier_center( tfirst, cm_bgrp, bec_bgrp, eigr, &
                                   eigrb, taub, irb, ibrav, b1, b2, b3 )
     !
-     CALL distribute_c( c0, c0_bgrp )
-     !
-     CALL rhoofr( nfi, c0_bgrp, irb, eigrb, bec, &
+     CALL rhoofr( nfi, c0_bgrp, irb, eigrb, bec_bgrp, &
                     becsum, rhor, rhog, rhos, enl, denl, ekin, dekin6 )
     !
     ! ... put core charge (if present) in rhoc(r)
@ -96,7 +95,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
                     eigts1, eigts2, eigts3, irb(1,1), eigrb(1,1), sfac(1,1), &
                     tau0(1,1), fion(1,1) )
     !
-     IF ( lwf ) CALL wf_options( tfirst, nfi, cm, becsum, bec, &
+     IF ( lwf ) CALL wf_options( tfirst, nfi, cm_bgrp, becsum, bec_bgrp, &
                                 eigr, eigrb, taub, irb, ibrav, b1,   &
                                 b2, b3, vpot, rhog, rhos, enl, ekin  )
     !
@ -106,14 +105,14 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
     !
     IF( tefield )  THEN
        !
-        CALL berry_energy( enb, enbi, bec, c0, fion )
+        CALL berry_energy( enb, enbi, bec_bgrp, c0_bgrp, fion )
        !
        etot = etot + enb + enbi
        !
     END IF
     IF( tefield2 )  THEN
        !
-        CALL berry_energy2( enb, enbi, bec, c0, fion )
+        CALL berry_energy2( enb, enbi, bec_bgrp, c0_bgrp, fion )
        !
        etot = etot + enb + enbi
        !
@ -137,11 +136,11 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
     IF( force_pairing ) THEN
        !
        CALL runcp_uspp_force_pairing( nfi, fccc, ccc, ema0bg, dt2bye, &
-                      rhos, bec, c0, cm, ei_unp )
+                      rhos, bec_bgrp, c0_bgrp, cm_bgrp, ei_unp )
        !
     ELSE
        !
-        CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec, c0, cm )
+        CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp )
        !
     ENDIF
     !
@ -151,12 +150,14 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
     !
     ! ... nlfq needs deeq bec
     !
-     IF ( tfor .OR. tprnfor ) CALL nlfq( c0, eigr, bec, becdr, fion )
+     IF ( tfor .OR. tprnfor ) THEN
+        CALL nlfq_bgrp( c0_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
+     END IF
     !
     IF ( (tfor.or.tprnfor) .AND. tefield ) &
-        CALL bforceion( fion, .TRUE. , ipolp, qmat, bec, becdr, gqq, evalue )
+        CALL bforceion( fion, .TRUE. , ipolp, qmat, bec_bgrp, becdr_bgrp, gqq, evalue )
     IF ( (tfor.or.tprnfor) .AND. tefield2 ) &
-        CALL bforceion( fion, .TRUE. , ipolp2, qmat2, bec, becdr, gqq2, evalue2 )
+        CALL bforceion( fion, .TRUE. , ipolp2, qmat2, bec_bgrp, becdr_bgrp, gqq2, evalue2 )
     !
     IF( force_pairing ) THEN
        lambda( :, :, 2 ) =  lambda(:, :, 1 )
@ -173,13 +174,15 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, &
     ! ... calphi calculates phi
     ! ... the electron mass rises with g**2
     !
-     CALL calphi( c0, ngw, bec, nkb, vkb, phi, nbsp, ema0bg )
+     CALL calphi_bgrp( c0_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
     !
     ! ... begin try and error loop (only one step!)
     !
     ! ... nlfl and nlfh need: lambda (guessed) becdr
     !
-     IF ( tfor .OR. tprnfor ) CALL nlfl( bec, becdr, lambda, fion )
+     IF ( tfor .OR. tprnfor ) THEN
+        CALL nlfl_bgrp( bec_bgrp, becdr_bgrp, lambda, fion )
+     END IF
     !
  END IF electron_dynamic
  !
--- a/CPV/nl_base.f90
+++ b/CPV/nl_base.f90
@ -154,181 +154,9 @@
   end subroutine nlsm1
 !-----------------------------------------------------------------------

-!
-!-----------------------------------------------------------------------
-   subroutine nlsm1_dist ( n, nspmn, nspmx, eigr, c, becp, nlax, nspin, desc )
-!-----------------------------------------------------------------------
-      !  
-      ! This version is for becp distributed over procs  
-      !  
-      !     computes: the array becp
-      !     becp(ia,n,iv,is)=
-      !         = sum_g [(-i)**l beta(g,iv,is) e^(-ig.r_ia)]^* c(g,n)
-      !         = delta_l0 beta(g=0,iv,is) c(g=0,n)
-      !          +sum_g> beta(g,iv,is) 2 re[(i)**l e^(ig.r_ia) c(g,n)]
-      !
-      !     routine makes use of c*(g)=c(-g)  (g> see routine ggen)
-      !     input : beta(ig,l,is), eigr, c
-      !     output: becp as parameter
-      !
-      USE kinds,      ONLY : DP
-      USE mp,         ONLY : mp_sum
-      USE mp_global,  ONLY : nproc_bgrp, intra_bgrp_comm
-      USE ions_base,  only : na, nat
-      USE gvecw,      only : ngw
-      USE uspp,       only : nkb, nhtol, beta
-      USE cvan,       only : ish
-      USE uspp_param, only : nh
-      !
-      USE gvect, ONLY : gstart
-      USE descriptors,        ONLY : descla_siz_ , lambda_node_ , nlar_ , ilar_ , la_n_
-!
-      implicit none
-
-      integer,  intent(in)  :: n, nspmn, nspmx, nlax, nspin
-      integer,  intent(in)  :: desc( descla_siz_ , nspin )
-      real(DP), intent(in)  :: eigr( 2, ngw, nat ), c( 2, ngw, n )
-      real(DP), intent(out) :: becp( nkb, nlax*nspin )
-      !
-      integer   :: isa, ig, is, iv, ia, l, ixr, ixi, inl, i, nhx
-      integer   :: nr, ir, nup
-      real(DP)  :: signre, signim, arg
-      real(DP), allocatable :: becps( :, : )
-      real(DP), allocatable :: wrk2( :, :, : )
-      !
-      call start_clock( 'nlsm1' )
-
-      isa = 0
-      do is = 1, nspmn - 1
-        isa = isa + na(is)
-      end do
-
-      do is = nspmn, nspmx
-         !
-         IF( nh( is ) < 1 ) THEN
-            isa = isa + na(is)
-            CYCLE
-         END IF
-         !
-         allocate( wrk2( 2, ngw, na( is ) ) )
-         !
-         IF( nproc_bgrp > 1 ) THEN
-            nhx = nh( is ) * na( is )
-            IF( MOD( nhx, 2 ) /= 0 ) nhx = nhx + 1
-            ALLOCATE( becps( nhx, n ) )
-            becps = 0.0d0
-         END IF
-         !
-         do iv = 1, nh( is )
-            !
-!$omp parallel default(shared), private(l,ixr,ixi,signre,signim,ig,arg,ia)
-            l = nhtol( iv, is )
-            !
-            if (l == 0) then
-               ixr = 1
-               ixi = 2
-               signre =  1.0d0
-               signim =  1.0d0
-            else if (l == 1) then
-               ixr = 2
-               ixi = 1
-               signre =  1.0d0
-               signim = -1.0d0
-            else if (l == 2) then
-               ixr = 1
-               ixi = 2
-               signre = -1.0d0
-               signim = -1.0d0
-            else if (l == 3) then
-               ixr = 2
-               ixi = 1
-               signre = -1.0d0
-               signim =  1.0d0
-            endif
-!
-!$omp do
-            do ia=1,na(is)
-               !
-               !  q = 0   component (with weight 1.0)
-               !
-               if (gstart == 2) then
-                  wrk2( 1, 1, ia ) = signre*beta(1,iv,is)*eigr(ixr,1,ia+isa)
-                  wrk2( 2, 1, ia ) = signim*beta(1,iv,is)*eigr(ixi,1,ia+isa)
-               end if
-               !
-               !   q > 0   components (with weight 2.0)
-               !
-               do ig = gstart, ngw
-                  arg = 2.0d0 * beta(ig,iv,is)
-                  wrk2( 1, ig, ia ) = signre*arg*eigr(ixr,ig,ia+isa)
-                  wrk2( 2, ig, ia ) = signim*arg*eigr(ixi,ig,ia+isa)
-               end do
-               !
-            end do
-!$omp end do
-            
-!$omp end parallel
-            
-            !
-            IF( nproc_bgrp > 1 ) THEN
-               inl=(iv-1)*na(is)+1
-               CALL dgemm( 'T', 'N', na(is), n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, becps( inl, 1 ), nhx )
-            ELSE
-               inl=ish(is)+(iv-1)*na(is)+1
-               CALL dgemm( 'T', 'N', na(is), n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, becp( inl, 1 ), nkb )
-            END IF
-
-         end do
-
-         deallocate( wrk2 )
-
-
-         IF( nproc_bgrp > 1 ) THEN
-            !
-            inl = ish(is) + 1
-            !
-            CALL mp_sum( becps, intra_bgrp_comm )
-
-            IF( desc( lambda_node_ , 1 ) > 0 ) THEN
-               ir = desc( ilar_ , 1 )
-               nr = desc( nlar_ , 1 )
-               do i = 1, nr
-                  do iv = inl , ( inl + na(is) * nh(is) - 1 )
-                     becp( iv, i ) = becps( iv - inl + 1, i + ir - 1 )
-                  end do
-               end do
-            END IF
-            !
-            IF( nspin == 2 ) THEN
-               IF( desc( lambda_node_ , 2 ) > 0 ) THEN
-                  nup = desc( la_n_ , 1 )
-                  ir = desc( ilar_ , 2 )
-                  nr = desc( nlar_ , 2 )
-                  do i = 1, nr
-                     do iv = inl , ( inl + na(is) * nh(is) - 1 )
-                        becp( iv, i + nlax ) = becps( iv - inl + 1, i + ir - 1 + nup )
-                     end do
-                  end do
-               END IF
-            END IF
-
-            DEALLOCATE( becps )
-
-         END IF
-
-         isa = isa + na(is)
-
-      end do
-
-      call stop_clock( 'nlsm1' )
-
-      return
-   end subroutine nlsm1_dist
-!-----------------------------------------------------------------------
-

 !-------------------------------------------------------------------------
-   subroutine nlsm2( ngw, nkb, n, nspin, eigr, c, becdr )
+   subroutine nlsm2_bgrp( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
 !-----------------------------------------------------------------------

      !     computes: the array becdr
@ -348,140 +176,13 @@
      use cell_base,  only : tpiba
      use mp,         only : mp_sum
      use mp_global,  only : nproc_bgrp, intra_bgrp_comm
-      use cp_main_variables,  only : nlax, descla, distribute_bec
-      use gvect, only : g, gstart
+      use gvect,      only : g, gstart
 !
      implicit none
    
-      integer,  intent(in)  :: ngw, nkb, n, nspin
-      real(DP), intent(in)  :: eigr(2,ngw,nat), c(2,ngw,n)
-      real(DP), intent(out) :: becdr(nkb,nspin*nlax,3)
-      !
-      real(DP), allocatable :: gk(:)
-      real(DP), allocatable :: wrk2(:,:,:)
-      real(DP), allocatable :: becdr_repl(:,:)
-      !
-      integer   :: ig, is, iv, ia, k, l, ixr, ixi, inl, isa, i
-      real(DP) :: signre, signim, arg
-!
-      call start_clock( 'nlsm2' )
-
-      allocate( gk( ngw ) )
-      allocate( becdr_repl( nkb, n ) )
-
-      becdr = 0.d0
-!
-      do k = 1, 3
-
-         becdr_repl = 0.d0
-
-         do ig=1,ngw
-            gk(ig)=g(k,ig)*tpiba
-         end do
-!
-         isa = 0
-
-         do is=1,nsp
-
-            allocate( wrk2( 2, ngw, na( is ) ) )
-
-            do iv=1,nh(is)
-               !
-               !     order of states:  s_1  p_x1  p_z1  p_y1  s_2  p_x2  p_z2  p_y2
-               !
-!$omp parallel default(shared), private(l,ixr,ixi,signre,signim,ig,arg,ia)
-               l=nhtol(iv,is)
-               if (l.eq.0) then
-                  ixr = 2
-                  ixi = 1
-                  signre =  1.0d0
-                  signim = -1.0d0
-               else if (l.eq.1) then
-                  ixr = 1
-                  ixi = 2
-                  signre = -1.0d0
-                  signim = -1.0d0
-               else if (l.eq.2) then
-                  ixr = 2
-                  ixi = 1
-                  signre = -1.0d0
-                  signim =  1.0d0
-               else if (l == 3) then
-                  ixr = 1
-                  ixi = 2
-                  signre =  1.0d0
-                  signim =  1.0d0
-               endif
-!    
-!$omp do
-               do ia=1,na(is)
-                  !    q = 0   component (with weight 1.0)
-                  if (gstart == 2) then
-                     wrk2(1,1,ia) = signre*gk(1)*beta(1,iv,is)*eigr(ixr,1,ia+isa)
-                     wrk2(2,1,ia) = signim*gk(1)*beta(1,iv,is)*eigr(ixi,1,ia+isa)
-                  end if
-                  !    q > 0   components (with weight 2.0)
-                  do ig=gstart,ngw
-                     arg = 2.0d0*gk(ig)*beta(ig,iv,is)
-                     wrk2(1,ig,ia) = signre*arg*eigr(ixr,ig,ia+isa)
-                     wrk2(2,ig,ia) = signim*arg*eigr(ixi,ig,ia+isa)
-                  end do
-               end do
-!$omp end do
-!$omp end parallel 
-               inl=ish(is)+(iv-1)*na(is)+1
-               CALL dgemm( 'T', 'N', na(is), n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, becdr_repl( inl, 1 ), nkb )
-            end do
-
-            deallocate( wrk2 )
-
-            isa = isa + na(is)
-
-         end do
-
-         IF( nproc_bgrp > 1 ) THEN
-            CALL mp_sum( becdr_repl(:,:), intra_bgrp_comm )
-         END IF
-         CALL distribute_bec( becdr_repl, becdr(:,:,k), descla, nspin )
-      end do
-
-      deallocate( gk )
-      deallocate( becdr_repl )
-
-      call stop_clock( 'nlsm2' )
-!
-      return
-   end subroutine nlsm2
-!-----------------------------------------------------------------------
-
-!-------------------------------------------------------------------------
-   subroutine nlsm2_repl( ngw, nkb, n, eigr, c, becdr )
-!-----------------------------------------------------------------------
-
-      !     computes: the array becdr
-      !     becdr(ia,n,iv,is,k)
-      !      =2.0 sum_g> g_k beta(g,iv,is) re[ (i)**(l+1) e^(ig.r_ia) c(g,n)]
-      !
-      !     routine makes use of  c*(g)=c(-g)  (g> see routine ggen)
-      !     input : eigr, c
-      !     output: becdr
-      !
- 
-      USE kinds,      ONLY : DP
-      use ions_base,  only : nsp, na, nat
-      use uspp,       only : nhtol, beta  !, nkb
-      use cvan,       only : ish
-      use uspp_param, only : nh
-      use cell_base,  only : tpiba
-      use mp,         only : mp_sum
-      use mp_global,  only : nproc_bgrp, intra_bgrp_comm
-      use gvect, only : g, gstart
-!
-      implicit none
-    
-      integer,  intent(in)  :: ngw, nkb, n
-      real(DP), intent(in)  :: eigr(2,ngw,nat), c(2,ngw,n)
-      real(DP), intent(out) :: becdr(nkb,n,3)
+      integer,  intent(in)  :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
+      real(DP), intent(in)  :: eigr(2,ngw,nat), c_bgrp(2,ngw,nbspx_bgrp)
+      real(DP), intent(out) :: becdr_bgrp(nkb,nbspx_bgrp,3)
      !
      real(DP), allocatable :: gk(:)
      real(DP), allocatable :: wrk2(:,:,:)
@ -493,7 +194,7 @@

      allocate( gk( ngw ) )

-      becdr = 0.d0
+      becdr_bgrp = 0.d0
 !
      do k = 1, 3

@ -552,7 +253,8 @@
 !$omp end do
 !$omp end parallel 
               inl=ish(is)+(iv-1)*na(is)+1
-               CALL dgemm( 'T', 'N', na(is), n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, becdr( inl, 1, k ), nkb )
+               CALL dgemm( 'T', 'N', na(is), nbsp_bgrp, 2*ngw, 1.0d0, wrk2, 2*ngw, &
+                           c_bgrp, 2*ngw, 0.0d0, becdr_bgrp( inl, 1, k ), nkb )
            end do

            deallocate( wrk2 )
@ -561,23 +263,24 @@

         end do

-         IF( nproc_bgrp > 1 ) THEN
-            CALL mp_sum( becdr(:,:,k), intra_bgrp_comm )
-         END IF
      end do

      deallocate( gk )

+      IF( nproc_bgrp > 1 ) THEN
+         CALL mp_sum( becdr_bgrp, intra_bgrp_comm )
+      END IF
+
      call stop_clock( 'nlsm2' )
 !
      return
-   end subroutine nlsm2_repl
+   end subroutine nlsm2_bgrp
 !-----------------------------------------------------------------------



 !-----------------------------------------------------------------------
-   real(8) function ennl( rhovan, bec )
+   real(8) function ennl( rhovan, bec_bgrp )
 !-----------------------------------------------------------------------
      !
      ! calculation of nonlocal potential energy term and array rhovan
@ -586,14 +289,14 @@
      use cvan,           only : ish
      use uspp_param,     only : nhm, nh
      use uspp,           only : nkb, dvan
-      use electrons_base, only : n => nbsp, nspin, ispin, f
+      use electrons_base, only : nbsp_bgrp, nspin, ispin_bgrp, f_bgrp, nbspx_bgrp
      use ions_base,      only : nsp, nat, na
      !
      implicit none
      !
      ! input
      !
-      real(DP) :: bec( nkb, n )
+      real(DP) :: bec_bgrp( nkb, nbspx_bgrp )
      real(DP) :: rhovan( nhm*(nhm+1)/2, nat, nspin )
      !
      ! local
@ -621,9 +324,9 @@
                  jnl = ish(is)+(jv-1)*na(is)+ia
                  isat = isa+ia
                  sums = 0.d0
-                  do i = 1, n
-                     iss = ispin(i)
-                     sums(iss) = sums(iss) + f(i) * bec(inl,i) * bec(jnl,i)
+                  do i = 1, nbsp_bgrp
+                     iss = ispin_bgrp(i)
+                     sums(iss) = sums(iss) + f_bgrp(i) * bec_bgrp(inl,i) * bec_bgrp(jnl,i)
                  end do
                  sumt = 0.d0
                  do iss = 1, nspin
@ -749,9 +452,44 @@
   end subroutine calbec
 !-----------------------------------------------------------------------

+!-----------------------------------------------------------------------
+   subroutine calbec_bgrp ( nspmn, nspmx, eigr, c_bgrp, bec_bgrp )
+!-----------------------------------------------------------------------
+
+      !     this routine calculates array bec
+      !
+      !        < psi_n | beta_i,i > = c_n(0) beta_i,i(0) +
+      !                 2 sum_g> re(c_n*(g) (-i)**l beta_i,i(g) e^-ig.r_i)
+      !
+      !     routine makes use of c(-g)=c*(g)  and  beta(-g)=beta*(g)
+      !
+
+      USE kinds,          ONLY : DP
+      use ions_base,      only : na, nat
+      use cvan,           only : ish
+      use electrons_base, only : nbsp_bgrp, nbspx_bgrp
+      use gvecw,          only : ngw
+      use uspp_param,     only : nh
+      use uspp,           only : nkb
+!
+      implicit none
+      !
+      integer,     intent(in)  :: nspmn, nspmx
+      real(DP),    intent(out) :: bec_bgrp( nkb, nbspx_bgrp )
+      complex(DP), intent(in)  :: c_bgrp( ngw, nbspx_bgrp ), eigr( ngw,nat )
+!
+      call start_clock( 'calbec' )
+      !
+      call nlsm1( nbsp_bgrp, nspmn, nspmx, eigr, c_bgrp, bec_bgrp )
+      !
+      call stop_clock( 'calbec' )
+!
+      return
+   end subroutine calbec_bgrp
+

 !-----------------------------------------------------------------------
-SUBROUTINE caldbec( ngw, nkb, n, nspmn, nspmx, eigr, c, dbec )
+SUBROUTINE caldbec_bgrp( eigr, c_bgrp, dbec )
  !-----------------------------------------------------------------------
  !
  !     this routine calculates array dbec, derivative of bec:
@ -765,44 +503,43 @@ SUBROUTINE caldbec( ngw, nkb, n, nspmn, nspmx, eigr, c, dbec )
  !
  USE kinds,      ONLY : DP
  use mp,         only : mp_sum
-  use mp_global,  only : nproc_bgrp, intra_bgrp_comm
-  use ions_base,  only : na, nat
+  use mp_global,  only : nproc_bgrp, intra_bgrp_comm, inter_bgrp_comm, nbgrp
+  use ions_base,  only : na, nat, nsp
  use cvan,       only : ish
  use cdvan,      only : dbeta
-  use uspp,       only : nhtol
+  use uspp,       only : nhtol, nkb
  use uspp_param, only : nh, nhm
-  use gvect, only : gstart
+  use gvect,      only : gstart
+  use gvecw,      only : ngw
  USE cp_main_variables,  ONLY : descla, la_proc, nlax, nlam
  USE descriptors,        ONLY : nlar_ , nlac_ , ilar_ , ilac_ , nlax_ , la_myr_ , la_myc_
-  use electrons_base,     only : nspin, iupdwn, nupdwn
+  use electrons_base,     only : nspin, iupdwn, nupdwn, nbspx_bgrp, iupdwn_bgrp, nupdwn_bgrp, &
+                                 ibgrp_g2l, i2gupdwn_bgrp, nbspx, nbsp_bgrp
  !
  implicit none
  !
-  integer,      intent(in)  :: ngw, nkb, n
-  integer,      intent(in)  :: nspmn, nspmx
-  complex(DP), intent(in)  :: c(ngw,n)
+  complex(DP), intent(in)  :: c_bgrp( ngw, nbspx_bgrp )
  real(DP),    intent(in)  :: eigr(2,ngw,nat)
-  real(DP),    intent(out) :: dbec( nkb, 2*nlam, 3, 3 )
+  real(DP),    intent(out) :: dbec( nkb, 2*nlax, 3, 3 )
  !
-  real(DP), allocatable :: wrk2(:,:,:), dwrk(:,:)
+  real(DP), allocatable :: wrk2(:,:,:), dwrk_bgrp(:,:)
  !
  integer   :: ig, is, iv, ia, l, ixr, ixi, inl, i, j, ii, isa, nanh, iw, iss, nr, ir, istart, nss
+  integer   :: n1, n2, m1, m2, ibgrp_i
  real(DP) :: signre, signim, arg
  !
  !
+  dbec = 0.0d0
  !
  do j=1,3
     do i=1,3

        isa = 0
-        do is = 1, nspmn - 1
-          isa = isa + na(is)
-        end do

-        do is=nspmn,nspmx
+        do is=1,nsp
           allocate( wrk2( 2, ngw, na(is) ) )
           nanh = na(is)*nh(is)
-           allocate( dwrk( nanh, n ) )
+           allocate( dwrk_bgrp( nanh, nbspx_bgrp ) )
           do iv=1,nh(is)
              l=nhtol(iv,is)
              if (l == 0) then
@ -843,12 +580,14 @@ SUBROUTINE caldbec( ngw, nkb, n, nspmn, nspmx, eigr, c, dbec )
                 end do
              end do
              inl=(iv-1)*na(is)+1
-              CALL dgemm( 'T', 'N', na(is), n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, dwrk(inl,1), nanh )
+              CALL dgemm( 'T', 'N', na(is), nbsp_bgrp, 2*ngw, 1.0d0, wrk2, 2*ngw, c_bgrp, 2*ngw, 0.0d0, dwrk_bgrp(inl,1), nanh )
           end do
           deallocate( wrk2 )
+
           if( nproc_bgrp > 1 ) then
-              call mp_sum( dwrk, intra_bgrp_comm )
+              call mp_sum( dwrk_bgrp, intra_bgrp_comm )
           end if
+
           inl=ish(is)+1
           do iss=1,nspin
              IF( la_proc ) THEN
@ -857,26 +596,32 @@ SUBROUTINE caldbec( ngw, nkb, n, nspmn, nspmx, eigr, c, dbec )
                 istart = iupdwn( iss )
                 nss    = nupdwn( iss )
                 do ii = 1, nr
-                    do iw = 1, nanh
-                       dbec( iw + inl - 1, ii + (iss-1)*nlam, i, j ) = dwrk( iw, ii + ir - 1 + istart - 1 )
-                       !dbec( iw + inl - 1, ii + (iss-1)*nspin, i, j ) = dwrk( iw, ii + ir - 1 + istart - 1 )
-                    end do
+                    ibgrp_i = ibgrp_g2l( ii + ir - 1 + istart - 1 )
+                    IF( ibgrp_i > 0 ) THEN
+                       do iw = 1, nanh
+                          dbec( iw + inl - 1, ii + (iss-1)*nlax, i, j ) = dwrk_bgrp( iw, ibgrp_i )
+                       end do
+                    END IF
                 end do
              END IF
           end do
-           deallocate( dwrk )
+           deallocate( dwrk_bgrp )
           isa = isa + na(is)
        end do
     end do
  end do

+  if( nbgrp > 1 ) then
+     CALL mp_sum( dbec, inter_bgrp_comm )
+  end if
  !
  return
-end subroutine caldbec
+end subroutine caldbec_bgrp
 !-----------------------------------------------------------------------

+
 !-----------------------------------------------------------------------
-subroutine dennl( bec, dbec, drhovan, denl )
+subroutine dennl( bec_bgrp, dbec, drhovan, denl )
  !-----------------------------------------------------------------------
  !
  !  compute the contribution of the non local part of the
@ -893,19 +638,19 @@ subroutine dennl( bec, dbec, drhovan, denl )
  use mp_global,  only : intra_bgrp_comm
  USE cp_main_variables,  ONLY : descla, la_proc, nlax, nlam
  USE descriptors,        ONLY : nlar_ , nlac_ , ilar_ , ilac_ , nlax_ , la_myr_ , la_myc_
-  use electrons_base,     only : n => nbsp, ispin, f, nspin, iupdwn, nupdwn
+  use electrons_base,     only : nbspx_bgrp, nbsp_bgrp, ispin_bgrp, f_bgrp, nspin, iupdwn, nupdwn, ibgrp_g2l
  use gvect, only : gstart

  implicit none

-  real(DP), intent(in)  :: dbec( nkb, 2*nlam, 3, 3 )
-  real(DP), intent(in)  :: bec( nkb, n )
+  real(DP), intent(in)  :: dbec( nkb, 2*nlax, 3, 3 )
+  real(DP), intent(in)  :: bec_bgrp( nkb, nbspx_bgrp )
  real(DP), intent(out) :: drhovan( nhm*(nhm+1)/2, nat, nspin, 3, 3 )
  real(DP), intent(out) :: denl( 3, 3 )

  real(DP) :: dsum(3,3),dsums(2,3,3), detmp(3,3)
  integer   :: is, iv, jv, ijv, inl, jnl, isa, ism, ia, iss, i,j,k
-  integer   :: istart, nss, ii, ir, nr
+  integer   :: istart, nss, ii, ir, nr, ibgrp
  !
  denl=0.d0
  drhovan=0.0d0
@ -928,20 +673,23 @@ subroutine dennl( bec, dbec, drhovan, denl )
              dsums=0.d0
              do iss=1,nspin
                 IF( descla( la_myr_ , iss ) == descla( la_myc_ , iss ) ) THEN
-                 nr = descla( nlar_ , iss )
-                 ir = descla( ilar_ , iss )
-                 istart = iupdwn( iss )
-                 nss    = nupdwn( iss )
-                 do i=1,nr
-                    ii = i+istart-1+ir-1
-                    do k=1,3
-                       do j=1,3
-                          dsums(iss,k,j)=dsums(iss,k,j)+f(ii)*       &
- &                          (dbec(inl,i+(iss-1)*nlam,k,j)*bec(jnl,ii)          &
- &                          + bec(inl,ii)*dbec(jnl,i+(iss-1)*nlam,k,j))
-                       enddo
-                    enddo
-                 end do
+                    nr = descla( nlar_ , iss )
+                    ir = descla( ilar_ , iss )
+                    istart = iupdwn( iss )
+                    nss    = nupdwn( iss )
+                    do i=1,nr
+                       ii = i+istart-1+ir-1
+                       ibgrp = ibgrp_g2l( ii )
+                       IF( ibgrp > 0 ) THEN
+                          do k=1,3
+                             do j=1,3
+                                dsums(iss,k,j)=dsums(iss,k,j)+f_bgrp(ibgrp)*       &
+ &                          (dbec(inl,i+(iss-1)*nlax,k,j)*bec_bgrp(jnl,ibgrp)          &
+ &                          + bec_bgrp(inl,ibgrp)*dbec(jnl,i+(iss-1)*nlax,k,j))
+                             enddo
+                          enddo
+                       END IF
+                    end do
                 END IF
              end do
              !
@ -966,11 +714,7 @@ subroutine dennl( bec, dbec, drhovan, denl )
  END IF

  CALL mp_sum( denl,    intra_bgrp_comm )
-  do k=1,3
-     do j=1,3
-        CALL mp_sum( drhovan(:,:,:,j,k), intra_bgrp_comm )
-     end do
-  end do
+  CALL mp_sum( drhovan, intra_bgrp_comm )

 !  WRITE(6,*) 'DEBUG enl (CP) = '
 !  detmp = denl
@ -987,8 +731,9 @@ end subroutine dennl



+
 !-----------------------------------------------------------------------
-subroutine nlfq( c, eigr, bec, becdr, fion )
+subroutine nlfq_bgrp( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
  !-----------------------------------------------------------------------
  !
  !     contribution to fion due to nonlocal part
@ -998,26 +743,20 @@ subroutine nlfq( c, eigr, bec, becdr, fion )
  use uspp_param,     only : nhm, nh
  use cvan,           only : ish, nvb
  use ions_base,      only : nax, nat, nsp, na
-  use electrons_base, only : n => nbsp, ispin, f, nspin, iupdwn, nupdwn
+  use electrons_base, only : nbsp_bgrp, f_bgrp, nbspx_bgrp, ispin_bgrp
  use gvecw,          only : ngw
  use constants,      only : pi, fpi
-  use mp_global,      only : me_bgrp, intra_bgrp_comm, nbgrp, inter_bgrp_comm, my_bgrp_id
+  use mp_global,      only : intra_bgrp_comm, nbgrp, inter_bgrp_comm
  use mp,             only : mp_sum
-  USE cp_main_variables, ONLY: nlax, descla, la_proc
-  USE descriptors,       ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , &
-                               la_myr_ , la_myc_
  !
  implicit none
  !
-  real(DP),    intent(in)  :: bec( nkb, n ), c( 2, ngw, n )
-  real(DP),    intent(out) :: becdr( nkb, nspin*nlax, 3 )
+  real(DP),    intent(in)  :: bec_bgrp( nkb, nbspx_bgrp ), c_bgrp( 2, ngw, nbspx_bgrp )
+  real(DP),    intent(out) :: becdr_bgrp( nkb, nbspx_bgrp, 3 )
  complex(DP), intent(in)  :: eigr( ngw, nat )
  real(DP),    intent(out) :: fion( 3, nat )
  !
-  integer   :: k, is, ia, isa, iss, inl, iv, jv, i, ir, nr, nss, istart, ioff
-  INTEGER  :: ibgrp_start( nspin ), ibgrp_end( nspin ), nr_bgrp( nspin )
-  LOGICAL  :: compute( nspin )
-  INTEGER, EXTERNAL :: ldim_block, gind_block
+  integer  :: k, is, ia, isa, inl, iv, jv, i
  real(DP) :: temp
  real(DP) :: sum_tmpdr
  !
@ -1029,43 +768,26 @@ subroutine nlfq( c, eigr, bec, becdr, fion )
  !
  call start_clock( 'nlfq' )
  !
-  !
  !     nlsm2 fills becdr
  !
-  call nlsm2( ngw, nkb, n, nspin, eigr, c, becdr )
+  call nlsm2_bgrp( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
  !
  allocate ( fion_loc( 3, nat ) )
  !
  fion_loc = 0.0d0
  !
-  ! distribute bands, remember here "becdr" coeff is already distributed on the ortho group
-  !
-  DO iss = 1, nspin
-     compute( iss ) = la_proc .AND. ( descla( la_myr_ , iss ) == descla( la_myc_ , iss ) )
-     IF( compute( iss ) ) THEN
-        nr = descla( nlar_ , iss )
-        IF( nbgrp > 1 ) THEN
-           nr_bgrp( iss ) = ldim_block( nr, nbgrp, my_bgrp_id)
-           ibgrp_start( iss ) = gind_block( 1, nr, nbgrp, my_bgrp_id )
-        ELSE 
-           nr_bgrp( iss ) = nr
-           ibgrp_start( iss ) = 1
-        END IF
-        ibgrp_end( iss )   = ibgrp_start( iss ) + nr_bgrp( iss ) - 1
-     END IF
-  END DO
-
  DO k = 1, 3

-!$omp parallel default(shared), &
-!$omp private(tmpbec,tmpdr,isa,is,ia,iss,nss,istart,ir,nr,ioff,iv,jv,inl,temp,i,mytid,ntids)
+!$omp parallel default(none), &
+!$omp shared(becdr_bgrp,bec_bgrp,fion_loc,k,f_bgrp,deeq,dvan,nbsp_bgrp,ish,nh,na,nsp,nhm,nbspx_bgrp,ispin_bgrp), &
+!$omp private(tmpbec,tmpdr,isa,is,ia,iv,jv,inl,temp,i,mytid,ntids,sum_tmpdr)

 #ifdef __OPENMP
     mytid = omp_get_thread_num()  ! take the thread ID
     ntids = omp_get_num_threads() ! take the number of threads
 #endif

-     allocate ( tmpbec( nhm, nlax ), tmpdr( nhm, nlax ) )
+     allocate ( tmpbec( nhm, nbspx_bgrp ), tmpdr( nhm, nbspx_bgrp ) )

     isa = 0
     !
@ -1079,58 +801,36 @@ subroutine nlfq( c, eigr, bec, becdr, fion )
           !
           IF( MOD( isa, ntids ) /= mytid ) CYCLE
 #endif  
-           DO iss = 1, nspin
-
-              nss = nupdwn( iss )
-              istart = iupdwn( iss )
-
-              IF( compute( iss ) ) THEN
-
-                 ! only processors on the diagonal of the square proc grid enter here.
-                 ! This is to distribute the load among different multi-core nodes,
-                 ! and maximize the memory bandwith per core.

                 tmpbec = 0.d0
                 tmpdr  = 0.d0

-                 ir = descla( ilar_ , iss )
-                 nr = descla( nlar_ , iss )
-
-                 ioff = istart-1+ir-1
-
                 do iv=1,nh(is)
                    do jv=1,nh(is)
                       inl=ish(is)+(jv-1)*na(is)+ia
-                       temp=dvan(iv,jv,is)+deeq(jv,iv,isa,iss)
-                       ! do i=1,nr
-                       do i = ibgrp_start( iss ), ibgrp_end( iss )  
-                          tmpbec(iv,i)=tmpbec(iv,i)+temp*bec(inl,i+ioff)
+                       do i = 1, nbsp_bgrp
+                          temp = dvan(iv,jv,is) + deeq(jv,iv,isa,ispin_bgrp( i ) )
+                          tmpbec(iv,i) = tmpbec(iv,i) + temp * bec_bgrp(inl,i)
                       end do
                    end do
                 end do

                 do iv=1,nh(is)
                    inl=ish(is)+(iv-1)*na(is)+ia
-                    ! do i=1,nr
-                    do i = ibgrp_start( iss ), ibgrp_end( iss )  
-                       tmpdr(iv,i)=f(i+ioff)*becdr( inl, i+(iss-1)*nlax, k )
+                    do i = 1, nbsp_bgrp
+                       tmpdr(iv,i) = f_bgrp( i ) * becdr_bgrp( inl, i, k )
                    end do
                 end do

                 sum_tmpdr = 0.0d0
-                 ! do i=1,nr
-                 do i = ibgrp_start( iss ), ibgrp_end( iss )  
-                    do iv=1,nh(is)
+                 do i = 1, nbsp_bgrp
+                    do iv = 1, nh(is)
                       sum_tmpdr = sum_tmpdr + tmpdr(iv,i)*tmpbec(iv,i)
-                       ! tmpdr(iv,i)=tmpdr(iv,i)*tmpbec(iv,i)
                    end do
                 end do

-                 !fion_loc(k,isa) = fion_loc(k,isa)-2.d0*SUM(tmpdr)
                 fion_loc(k,isa) = fion_loc(k,isa)-2.d0*sum_tmpdr

-              END IF
-           END DO
        END DO
     END DO
     deallocate ( tmpbec, tmpdr )
@ -1138,7 +838,6 @@ subroutine nlfq( c, eigr, bec, becdr, fion )

  END DO
  !
-  CALL mp_sum( fion_loc, intra_bgrp_comm )
  IF( nbgrp > 1 ) THEN
     CALL mp_sum( fion_loc, inter_bgrp_comm )
  END IF
@ -1152,5 +851,4 @@ subroutine nlfq( c, eigr, bec, becdr, fion )
  call stop_clock( 'nlfq' )
  !
  return
-end subroutine nlfq
-
+end subroutine nlfq_bgrp
--- a/CPV/ortho.f90
+++ b/CPV/ortho.f90
@ -232,7 +232,7 @@


 !=----------------------------------------------------------------------------=!
-   SUBROUTINE ortho_cp( eigr, cp, phi, ngwx, x0, descla, diff, iter, ccc, &
+   SUBROUTINE ortho_cp( eigr, cp_bgrp, phi_bgrp, ngwx, x0, descla, diff, iter, ccc, &
                        bephi, becp_dist, nbsp, nspin, nupdwn, iupdwn )
 !=----------------------------------------------------------------------------=!
      !
@ -252,32 +252,36 @@
      USE cvan,           ONLY: ish, nvb
      USE uspp,           ONLY: nkb, qq
      USE uspp_param,     ONLY: nh
-      USE electrons_base, ONLY: f
+      USE electrons_base, ONLY: f, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, i2gupdwn_bgrp
      USE gvecw,          ONLY: ngw
      USE control_flags,  ONLY: iprint, iprsta, ortho_max
      USE control_flags,  ONLY: force_pairing
      USE io_global,      ONLY: stdout, ionode
-      USE cp_interfaces,  ONLY: ortho_gamma
+      USE cp_interfaces,  ONLY: ortho_gamma, c_bgrp_expand, c_bgrp_pack
      USE descriptors,    ONLY: nlac_ , ilac_ , descla_siz_ , nlar_ , ilar_
      USE cp_main_variables,  ONLY: nlam, la_proc, nlax, collect_bec
-      USE mp_global,          ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm  ! DEBUG
+      USE mp_global,          ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, inter_bgrp_comm  ! DEBUG
+      USE orthogonalize_base, ONLY: bec_bgrp2ortho
+      USE mp,                 ONLY : mp_sum
      !
      IMPLICIT NONE
      !
      INTEGER,    INTENT(IN) :: ngwx, nbsp, nspin
      INTEGER,    INTENT(IN) :: nupdwn( nspin ), iupdwn( nspin )
      INTEGER,    INTENT(IN) :: descla(descla_siz_,nspin)
-      COMPLEX(DP) :: cp(ngwx,nbsp), phi(ngwx,nbsp), eigr(ngwx,nat)
+      COMPLEX(DP) :: eigr(ngwx,nat)
+      COMPLEX(DP) :: cp_bgrp(:,:), phi_bgrp(:,:)
      REAL(DP)    :: x0(:,:,:), diff, ccc
      INTEGER     :: iter
      REAL(DP)    :: bephi(:,:)
      REAL(DP)    :: becp_dist(:,:)
      !
-      REAL(DP), ALLOCATABLE :: xloc(:,:)
+      REAL(DP), ALLOCATABLE :: xloc(:,:), bec_bgrp(:,:)
      REAL(DP), ALLOCATABLE :: qbephi(:,:,:), qbecp(:,:,:), bec_col(:,:)

      INTEGER :: nkbx
      INTEGER :: istart, nss, ifail, i, j, iss, iv, jv, ia, is, inl, jnl
+      INTEGER :: n1, n2, m1, m2
      INTEGER :: nspin_sub, nx0, nc, ic, icc, nr, ir
      REAL(DP) :: qqf
      !
@ -293,12 +297,20 @@
      !
      CALL start_clock( 'ortho' )

-      !CALL nlsm1( nbsp, 1, nvb, eigr,  cp,  becp )

-      CALL nlsm1_dist ( nbsp, 1, nvb, eigr, cp, becp_dist, nlax, nspin, descla )
-      !CALL collect_bec( becp, becp_dist, descla, nspin )
-
-      CALL nlsm1_dist ( nbsp, 1, nvb, eigr, phi, bephi, nlax, nspin, descla )
+      IF( nvb > 0 ) THEN
+         ALLOCATE( bec_bgrp( SIZE( becp_dist, 1 ), SIZE( cp_bgrp, 2 ) ) )
+         !
+         bec_bgrp = 0.0d0
+         !
+         CALL nlsm1 ( nbsp_bgrp, 1, nvb, eigr, cp_bgrp, bec_bgrp )
+         CALL bec_bgrp2ortho( bec_bgrp, becp_dist, nlax, descla )
+   
+         CALL nlsm1 ( nbsp_bgrp, 1, nvb, eigr, phi_bgrp, bec_bgrp )
+         CALL bec_bgrp2ortho( bec_bgrp, bephi, nlax, descla )
+         !
+         DEALLOCATE( bec_bgrp )
+      END IF
      !
      !     calculation of qbephi and qbecp
      !
@ -373,6 +385,11 @@
      !
      IF( nvb > 0 ) DEALLOCATE( bec_col )
      !
+      ! Expand cp and phi to contain all electronic band
+      !
+      CALL c_bgrp_expand( cp_bgrp )
+      CALL c_bgrp_expand( phi_bgrp )
+      !
      ALLOCATE( xloc( nx0, nx0 ) )
      !
      nspin_sub = nspin 
@ -385,7 +402,7 @@

         IF( la_proc ) xloc = x0(:,:,iss) * ccc

-         CALL ortho_gamma( 0, cp, ngwx, phi, becp_dist(:,(iss-1)*nlax+1:iss*nlax), qbecp(:,:,iss), nkbx, &
+         CALL ortho_gamma( 0, cp_bgrp, ngwx, phi_bgrp, becp_dist(:,(iss-1)*nlax+1:iss*nlax), qbecp(:,:,iss), nkbx, &
                           bephi(:,((iss-1)*nlax+1):iss*nlax), &
                           qbephi(:,:,iss), xloc, nx0, descla(:,iss), diff, iter, nbsp, nss, istart )

@ -402,12 +419,16 @@
         !
      END DO

-      IF( force_pairing ) cp(:, iupdwn(2):iupdwn(2)+nupdwn(2)-1 ) = cp(:,1:nupdwn(2))
+      IF( force_pairing ) cp_bgrp(:, iupdwn(2):iupdwn(2)+nupdwn(2)-1 ) = cp_bgrp(:,1:nupdwn(2))
      !
      DEALLOCATE( xloc )
      DEALLOCATE( qbecp )
      DEALLOCATE( qbephi )
      !
+      ! pack cp so that it contains only the bands in the band subgroup
+      !
+      CALL c_bgrp_pack( cp_bgrp )
+      !
      CALL stop_clock( 'ortho' )
      !
      RETURN
--- a/CPV/ortho_base.f90
+++ b/CPV/ortho_base.f90
@ -33,12 +33,13 @@ MODULE orthogonalize_base
      PUBLIC :: rhoset
      PUBLIC :: ortho_iterate
      PUBLIC :: ortho_alt_iterate
-      PUBLIC :: updatc, calphi
+      PUBLIC :: updatc, calphi_bgrp
      PUBLIC :: mesure_diag_perf
      PUBLIC :: mesure_mmul_perf
      PUBLIC :: diagonalize_parallel
      PUBLIC :: diagonalize_serial
      PUBLIC :: use_parallel_diag
+      PUBLIC :: bec_bgrp2ortho

 CONTAINS

@ -1069,7 +1070,7 @@ END SUBROUTINE diagonalize_parallel

 !
 !-------------------------------------------------------------------------
-   SUBROUTINE updatc( ccc, n, x0, nx0, phi, ngwx, bephi, nkbx, becp_dist, bec, cp, nss, istart, desc )
+   SUBROUTINE updatc( ccc, n, x0, nx0, phi, ngwx, bephi, nkbx, becp_dist, bec_bgrp, cp_bgrp, nss, istart, desc )
 !-----------------------------------------------------------------------
 !
      !     input ccc : dt**2/emass OR 1.0d0 demending on ortho
@ -1089,7 +1090,8 @@ END SUBROUTINE diagonalize_parallel
      USE gvecw,             ONLY: ngw
      USE control_flags,     ONLY: iprint, iprsta
      USE mp,                ONLY: mp_sum, mp_bcast
-      USE mp_global,         ONLY: intra_bgrp_comm, leg_ortho, me_bgrp
+      USE mp_global,         ONLY: intra_bgrp_comm, leg_ortho, me_bgrp, inter_bgrp_comm
+      USE electrons_base,    ONLY: nbspx_bgrp, ibgrp_g2l
      USE descriptors,       ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , descla_siz_ , la_comm_ , &
                                   la_npc_ , la_npr_ , nlax_ , la_n_ , la_nx_ , la_myr_ , la_myc_ , &
                                   descla_init
@ -1098,9 +1100,9 @@ END SUBROUTINE diagonalize_parallel
 !
      INTEGER, INTENT(IN) :: n, nx0, ngwx, nkbx, istart, nss
      INTEGER, INTENT(IN) :: desc( descla_siz_ )
-      COMPLEX(DP) :: cp( ngwx, n ), phi( ngwx, n )
+      COMPLEX(DP) :: cp_bgrp( ngwx, nbspx_bgrp ), phi( ngwx, n )
      REAL(DP), INTENT(IN) :: ccc
-      REAL(DP)    :: bec( nkbx, n ), x0( nx0, nx0 )
+      REAL(DP)    :: bec_bgrp( nkbx, nbspx_bgrp ), x0( nx0, nx0 )
      REAL(DP)    :: bephi( :, : )
      REAL(DP)    :: becp_dist( :, : )

@ -1111,10 +1113,12 @@ END SUBROUTINE diagonalize_parallel
      REAL(DP),    ALLOCATABLE :: xd(:,:) 
      REAL(DP),    ALLOCATABLE :: bephi_tmp(:,:) 
      REAL(DP),    ALLOCATABLE :: becp_tmp(:,:) 
+      COMPLEX(DP), ALLOCATABLE :: cp_tmp(:,:) 
      REAL(DP) :: beta
      INTEGER :: ipr, ipc, nx, root
      INTEGER :: np( 2 ), coor_ip( 2 )
      INTEGER :: desc_ip( descla_siz_ )
+      INTEGER :: ibgrp_i
      !
      !     lagrange multipliers
      !
@ -1135,15 +1139,19 @@ END SUBROUTINE diagonalize_parallel
      CALL start_clock( 'updatc' )

      ALLOCATE( xd( nx, nx ) )
+      ALLOCATE( cp_tmp( SIZE( cp_bgrp, 1 ), nx ) )

      IF( nvb > 0 )THEN
         ALLOCATE( wtemp( nx, nkb ) )
         ALLOCATE( bephi_tmp( nkbx, nx ) )
         ALLOCATE( becp_tmp( nkbx, nx ) )
         DO i = 1, nss
-            DO inl = 1, nkbus
-               bec( inl, i + istart - 1 ) = 0.0d0
-            END DO
+            ibgrp_i = ibgrp_g2l( i + istart - 1 )
+            IF( ibgrp_i > 0 ) THEN
+               DO inl = 1, nkbus
+                  bec_bgrp( inl, ibgrp_i ) = 0.0d0
+               END DO
+            END IF
         END DO
      END IF

@ -1193,8 +1201,17 @@ END SUBROUTINE diagonalize_parallel

            CALL mp_bcast( xd, root, intra_bgrp_comm )

+            !CALL dgemm( 'N', 'N', 2*ngw, nc, nr, 1.0d0, phi(1,istart+ir-1), 2*ngwx, &
+            !            xd, nx, 1.0d0, cp(1,istart+ic-1), 2*ngwx )
            CALL dgemm( 'N', 'N', 2*ngw, nc, nr, 1.0d0, phi(1,istart+ir-1), 2*ngwx, &
-                        xd, nx, 1.0d0, cp(1,istart+ic-1), 2*ngwx )
+                        xd, nx, 0.0d0, cp_tmp, 2*ngwx )
+
+            DO i = 1, nc
+               ibgrp_i = ibgrp_g2l( i + istart + ic - 2 )
+               IF( ibgrp_i > 0 ) THEN
+                  cp_bgrp( : , ibgrp_i ) = cp_bgrp( : , ibgrp_i ) + cp_tmp( : , i )
+               END IF
+            END DO

            IF( nvb > 0 )THEN

@ -1207,15 +1224,21 @@ END SUBROUTINE diagonalize_parallel
               ! here nr and ir are still valid, since they are the same for all procs in the same row
               !
               DO i = 1, nr
-                  DO inl = 1, nkbus
-                     bec( inl, i + istart + ir - 2 ) = bec( inl, i + istart + ir - 2 ) + wtemp( i, inl ) 
-                  END DO
+                  ibgrp_i = ibgrp_g2l( i + istart + ir - 2 )
+                  IF( ibgrp_i > 0 ) THEN
+                     DO inl = 1, nkbus
+                        bec_bgrp( inl, ibgrp_i ) = bec_bgrp( inl, ibgrp_i ) + wtemp( i, inl ) 
+                     END DO
+                  END IF
               END DO
               IF( ipr == ipc )THEN
                  DO i = 1, nr
-                     DO inl = 1, nkbus
-                        bec( inl, i + istart + ir - 2 ) = bec( inl, i + istart + ir - 2 ) + becp_tmp( inl, i ) 
-                     END DO
+                     ibgrp_i = ibgrp_g2l( i + istart + ir - 2 )
+                     IF( ibgrp_i > 0 ) THEN
+                        DO inl = 1, nkbus
+                           bec_bgrp( inl, ibgrp_i ) = bec_bgrp( inl, ibgrp_i ) + becp_tmp( inl, i ) 
+                        END DO
+                     END IF
                  END DO
               END IF
               !
@ -1229,11 +1252,6 @@ END SUBROUTINE diagonalize_parallel
         DEALLOCATE( wtemp )
         DEALLOCATE( bephi_tmp )
         DEALLOCATE( becp_tmp )
-         !DO i = istart, istart + nss - 1
-         !   DO inl = 1, nkbus
-         !      bec( inl, i ) = bec( inl, i ) + becp( inl, i ) 
-         !   END DO
-         !END DO
      END IF
 !
      IF ( iprsta > 2 ) THEN
@ -1242,18 +1260,19 @@ END SUBROUTINE diagonalize_parallel
            IF( nvb > 1 ) THEN
               WRITE( stdout,'(33x,a,i4)') ' updatc: bec (is)',is
               WRITE( stdout,'(8f9.4)')                                       &
-     &            ((bec(ish(is)+(iv-1)*na(is)+1,i+istart-1),iv=1,nh(is)),i=1,nss)
+     &            ((bec_bgrp(ish(is)+(iv-1)*na(is)+1,i+istart-1),iv=1,nh(is)),i=1,nss)
            ELSE
               DO ia=1,na(is)
                  WRITE( stdout,'(33x,a,i4)') ' updatc: bec (ia)',ia
                  WRITE( stdout,'(8f9.4)')                                    &
-     &            ((bec(ish(is)+(iv-1)*na(is)+ia,i+istart-1),iv=1,nh(is)),i=1,nss)
+     &            ((bec_bgrp(ish(is)+(iv-1)*na(is)+ia,i+istart-1),iv=1,nh(is)),i=1,nss)
               END DO
            END IF
            WRITE( stdout,*)
         END DO
      ENDIF
      !
+      DEALLOCATE( cp_tmp )
      DEALLOCATE( xd )
      !
      CALL stop_clock( 'updatc' )
@ -1263,9 +1282,8 @@ END SUBROUTINE diagonalize_parallel



-
 !-------------------------------------------------------------------------
-      SUBROUTINE calphi( c0, ngwx, bec, nkbx, betae, phi, n, ema0bg )
+      SUBROUTINE calphi_bgrp( c0_bgrp, ngwx, bec_bgrp, nkbx, betae, phi_bgrp, nbspx_bgrp, ema0bg )
 !-----------------------------------------------------------------------
 !     input: c0 (orthonormal with s(r(t)), bec=<c0|beta>, betae=|beta>
 !     computes the matrix phi (with the old positions)
@ -1275,20 +1293,21 @@ END SUBROUTINE diagonalize_parallel
      USE kinds,          ONLY: DP
      USE ions_base,      ONLY: na, nsp
      USE io_global,      ONLY: stdout
-      USE mp_global,      ONLY: intra_bgrp_comm
+      USE mp_global,      ONLY: intra_bgrp_comm, inter_bgrp_comm
      USE cvan,           ONLY: ish, nvb
      USE uspp_param,     ONLY: nh
      USE uspp,           ONLY: nkbus, qq
      USE gvecw,          ONLY: ngw
+      USE electrons_base, ONLY: nbsp_bgrp, nbsp
      USE constants,      ONLY: pi, fpi
      USE control_flags,  ONLY: iprint, iprsta
      USE mp,             ONLY: mp_sum
 !
      IMPLICIT NONE
      
-      INTEGER, INTENT(IN) :: ngwx, nkbx, n
-      COMPLEX(DP)         :: c0( ngwx, n ), phi( ngwx, n ), betae( ngwx, nkbx )
-      REAL(DP)            :: bec( nkbx, n ), emtot
+      INTEGER, INTENT(IN) :: ngwx, nkbx, nbspx_bgrp
+      COMPLEX(DP)         :: c0_bgrp( ngwx, nbspx_bgrp ), phi_bgrp( ngwx, nbspx_bgrp ), betae( ngwx, nkbx )
+      REAL(DP)            :: bec_bgrp( nkbx, nbspx_bgrp ), emtot
      REAL(DP), OPTIONAL  :: ema0bg( ngwx )

      ! local variables
@ -1297,14 +1316,15 @@ END SUBROUTINE diagonalize_parallel
      REAL(DP), ALLOCATABLE :: qtemp( : , : )
      REAL(DP) :: qqf
 !
-      IF( n < 1 ) RETURN
+      IF( nbsp_bgrp < 1 ) RETURN
      !
      CALL start_clock( 'calphi' )
-
+      !
+      ! Note that phi here is computed only for my band group
      !
      IF ( nvb > 0 ) THEN

-         ALLOCATE( qtemp( nkbus, n ) )
+         ALLOCATE( qtemp( nkbus, nbspx_bgrp ) )

         qtemp (:,:) = 0.d0
         DO is=1,nvb
@ -1314,38 +1334,38 @@ END SUBROUTINE diagonalize_parallel
                  jnl = ish(is)+(jv-1)*na(is)
                  IF(ABS(qq(iv,jv,is)) > 1.d-5) THEN
                     qqf = qq(iv,jv,is)
-                     DO i=1,n
-                        CALL daxpy( na(is), qqf, bec(jnl+1,i),1,qtemp(inl+1,i), 1 )
+                     DO i=1,nbsp_bgrp
+                        CALL daxpy( na(is), qqf, bec_bgrp(jnl+1,i),1,qtemp(inl+1,i), 1 )
                     END DO
                  ENDIF
               END DO
            END DO
         END DO
 !
-         CALL dgemm ( 'N', 'N', 2*ngw, n, nkbus, 1.0d0, betae, &
-                       2*ngwx, qtemp, nkbus, 0.0d0, phi, 2*ngwx )
+         CALL dgemm ( 'N', 'N', 2*ngw, nbsp_bgrp, nkbus, 1.0d0, betae, &
+                       2*ngwx, qtemp, nkbus, 0.0d0, phi_bgrp, 2*ngwx )

         DEALLOCATE( qtemp )

      ELSE

-         phi = (0.d0, 0.d0)
+         phi_bgrp = (0.d0, 0.d0)

      END IF
 !
      IF( PRESENT( ema0bg ) ) THEN
 !$omp parallel do default(shared), private(i)
-         DO j=1,n
+         DO j=1,nbsp_bgrp
            DO i=1,ngw
-               phi(i,j)=(phi(i,j)+c0(i,j))*ema0bg(i)
+               phi_bgrp(i,j)=(phi_bgrp(i,j)+c0_bgrp(i,j))*ema0bg(i)
            END DO
         END DO
 !$omp end parallel do
      ELSE
 !$omp parallel do default(shared), private(i)
-         DO j=1,n
+         DO j=1,nbsp_bgrp
            DO i=1,ngw
-               phi(i,j)=phi(i,j)+c0(i,j)
+               phi_bgrp(i,j)=phi_bgrp(i,j)+c0_bgrp(i,j)
            END DO
         END DO
 !$omp end parallel do
@ -1356,21 +1376,22 @@ END SUBROUTINE diagonalize_parallel
      IF(iprsta > 2) THEN
         emtot=0.0d0
         IF( PRESENT( ema0bg ) ) THEN
-            DO j=1,n
+            DO j=1,nbsp_bgrp
               DO i=1,ngw
-                  emtot=emtot +2.0d0*DBLE(phi(i,j)*CONJG(c0(i,j)))*ema0bg(i)**(-2.0d0)
+                  emtot=emtot +2.0d0*DBLE(phi_bgrp(i,j)*CONJG(c0_bgrp(i,j)))*ema0bg(i)**(-2.0d0)
               END DO
            END DO
         ELSE
-            DO j=1,n
+            DO j=1,nbsp_bgrp
               DO i=1,ngw
-                  emtot=emtot +2.0d0*DBLE(phi(i,j)*CONJG(c0(i,j)))
+                  emtot=emtot +2.0d0*DBLE(phi_bgrp(i,j)*CONJG(c0_bgrp(i,j)))
               END DO
            END DO
         END IF
-         emtot=emtot/n
+         emtot=emtot/nbsp

         CALL mp_sum( emtot, intra_bgrp_comm )
+         CALL mp_sum( emtot, inter_bgrp_comm )

         WRITE( stdout,*) 'in calphi sqrt(emtot)=',SQRT(emtot)
         WRITE( stdout,*)
@ -1378,12 +1399,12 @@ END SUBROUTINE diagonalize_parallel
            IF( nvb > 1 ) THEN
               WRITE( stdout,'(33x,a,i4)') ' calphi: bec (is)',is
               WRITE( stdout,'(8f9.4)')                                       &
-     &            ((bec(ish(is)+(iv-1)*na(is)+1,i),iv=1,nh(is)),i=1,n)
+     &            ((bec_bgrp(ish(is)+(iv-1)*na(is)+1,i),iv=1,nh(is)),i=1,nbsp_bgrp)
            ELSE
               DO ia=1,na(is)
                  WRITE( stdout,'(33x,a,i4)') ' calphi: bec (ia)',ia
                  WRITE( stdout,'(8f9.4)')                                    &
-     &               ((bec(ish(is)+(iv-1)*na(is)+ia,i),iv=1,nh(is)),i=1,n)
+     &               ((bec_bgrp(ish(is)+(iv-1)*na(is)+ia,i),iv=1,nh(is)),i=1,nbsp_bgrp)
               END DO
            END IF
         END DO
@ -1393,6 +1414,58 @@ END SUBROUTINE diagonalize_parallel
      CALL stop_clock( 'calphi' )
 !
      RETURN
-      END SUBROUTINE calphi
+      END SUBROUTINE calphi_bgrp

-   END MODULE orthogonalize_base
+
+   SUBROUTINE bec_bgrp2ortho( bec_bgrp, bec_ortho, nlax, desc )
+      USE kinds,             ONLY: DP
+      USE uspp,              ONLY: nkb, nkbus
+      USE mp,                ONLY: mp_sum
+      USE mp_global,         ONLY: intra_bgrp_comm, leg_ortho, me_bgrp, inter_bgrp_comm
+      USE electrons_base,    ONLY: nbspx_bgrp, ibgrp_g2l, nspin
+      USE descriptors,       ONLY: nlar_ , nlac_ , ilar_ , ilac_ , lambda_node_ , descla_siz_ , la_comm_ , &
+                                   la_npc_ , la_npr_ , nlax_ , la_n_ , la_nx_ , la_myr_ , la_myc_ , &
+                                   descla_init
+      !
+      IMPLICIT NONE
+      !
+      INTEGER, INTENT(IN) :: nlax
+      INTEGER, INTENT(IN) :: desc(:,:)
+      REAL(DP), INTENT(IN)  :: bec_bgrp(:,:)
+      REAL(DP), INTENT(OUT) :: bec_ortho(:,:)
+      !
+      INTEGER :: ir, nr, i, ibgrp_i, nup
+      !
+      bec_ortho = 0.0d0
+      !
+      IF( desc( lambda_node_ , 1 ) > 0 ) THEN
+         ir = desc( ilar_ , 1 )
+         nr = desc( nlar_ , 1 )
+         do i = 1, nr
+            ibgrp_i = ibgrp_g2l( i + ir - 1 )
+            IF( ibgrp_i > 0 ) THEN
+               bec_ortho( :, i ) = bec_bgrp( :, ibgrp_i )
+            END IF
+         end do
+      END IF
+      !
+      IF( nspin == 2 ) THEN
+         IF( desc( lambda_node_ , 2 ) > 0 ) THEN
+            nup = desc( la_n_ , 1 )
+            ir = desc( ilar_ , 2 )
+            nr = desc( nlar_ , 2 )
+            do i = 1, nr
+               ibgrp_i = ibgrp_g2l( i + ir - 1 + nup )
+               IF( ibgrp_i > 0 ) THEN
+                  bec_ortho( :, i + nlax ) = bec_bgrp( :, ibgrp_i )
+               END IF
+            end do
+         END IF
+      END IF
+      !
+      CALL mp_sum( bec_ortho, inter_bgrp_comm )
+      !
+      RETURN
+   END SUBROUTINE bec_bgrp2ortho
+
+END MODULE orthogonalize_base
--- a/CPV/restart.f90
+++ b/CPV/restart.f90
@ -9,7 +9,7 @@

 !-----------------------------------------------------------------------

-   SUBROUTINE writefile_cp                                         &
+   SUBROUTINE writefile_x                                         &
     &     ( h,hold,nfi,c0,cm,taus,tausm,vels,velsm,acc,           &
     &       lambda,lambdam,xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,nhpcl,nhpdim,ekincm,&
     &       xnhh0,xnhhm,vnhh,velh, fion, tps, mat_z, occ_f, rho )
@ -19,14 +19,16 @@
      USE ions_base,        ONLY: nsp, na, cdmi, taui
      USE cell_base,        ONLY: s_to_r
      USE cp_restart,       ONLY: cp_writefile
-      USE cp_interfaces,    ONLY: set_evtot, set_eitot
-      USE electrons_base,   ONLY: nspin, nbnd, nbsp, iupdwn, nupdwn
+      USE cp_interfaces,    ONLY: set_evtot, set_eitot, c_bgrp_expand, c_bgrp_pack
+      USE electrons_base,   ONLY: nspin, nbnd, nbsp, iupdwn, nupdwn, nbspx
      USE electrons_module, ONLY: ei
      USE io_files,         ONLY: tmp_dir
      USE ensemble_dft,     ONLY: tens
      USE mp,               ONLY: mp_bcast
      USE control_flags,    ONLY: tksw, ndw, io_level, twfcollect
      USE xml_io_base,      ONLY: restart_dir, kpoint_dir
+      USE electrons_module, ONLY: collect_c
+      USE gvecw,            ONLY: ngw

 !
      implicit none
@ -62,6 +64,9 @@
         !
      end if

+      CALL c_bgrp_expand( c0 )
+      CALL c_bgrp_expand( cm )
+
      ht     = TRANSPOSE( h ) 
      htm    = TRANSPOSE( hold ) 
      htvel  = TRANSPOSE( velh ) 
@ -104,12 +109,15 @@
      DEALLOCATE( eitot )
      !
      IF( tksw ) DEALLOCATE( ctot )
+      !
+      CALL c_bgrp_pack( c0 )
+      CALL c_bgrp_pack( cm )

      return
-      end subroutine writefile_cp
+      end subroutine writefile_x

 !-----------------------------------------------------------------------
-      subroutine readfile_cp                                        &
+      subroutine readfile_x                                        &
     &     ( flag, h,hold,nfi,c0,cm,taus,tausm,vels,velsm,acc,    &
     &       lambda,lambdam,xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,nhpcl,nhpdim,ekincm,&
     &       xnhh0,xnhhm,vnhh,velh,&
@ -120,7 +128,7 @@
 !
      USE kinds,          ONLY : DP
      USE io_files,       ONLY : tmp_dir
-      USE electrons_base, ONLY : nbnd, nbsp, nspin, nupdwn, iupdwn, keep_occ
+      USE electrons_base, ONLY : nbnd, nbsp, nspin, nupdwn, iupdwn, keep_occ, nbspx
      USE gvecw,          ONLY : ngw
      USE ions_base,      ONLY : nsp, na, cdmi, taui
      USE cp_restart,     ONLY : cp_readfile, cp_read_cell, cp_read_wfc
@ -128,6 +136,7 @@
      USE autopilot,      ONLY : event_step, event_index, max_event_step
      USE cp_autopilot,   ONLY : employ_rules
      USE control_flags,  ONLY : ndr
+      USE cp_interfaces,  ONLY : c_bgrp_pack
 !
      implicit none
      INTEGER, INTENT(in) :: flag
@ -151,16 +160,20 @@
      REAL(DP), ALLOCATABLE :: occ_ ( : )
      REAL(DP) :: htm1(3,3), b1(3) , b2(3), b3(3), omega

+
      IF( flag == -1 ) THEN
        CALL cp_read_cell( ndr, tmp_dir, .TRUE., ht, htm, htvel, gvel, xnhh0, xnhhm, vnhh )
        h     = TRANSPOSE( ht )
        hold  = TRANSPOSE( htm )
        velh  = TRANSPOSE( htvel )
        RETURN
-      ELSE IF ( flag == 0 ) THEN
+      END IF
+ 
+      IF ( flag == 0 ) THEN
        DO ispin = 1, nspin
          CALL cp_read_wfc( ndr, tmp_dir, 1, 1, ispin, nspin, c2 = cm(:,:), tag = 'm' )
        END DO
+        CALL c_bgrp_pack( cm )
        RETURN
      END IF

@ -198,10 +211,13 @@
         occ_f( : ) = occ_ ( : )
      END IF

+      CALL c_bgrp_pack( cm )
+      CALL c_bgrp_pack( c0 )
+      !
      DEALLOCATE( occ_ )

      return
-      end subroutine readfile_cp
+      end subroutine readfile_x


 !------------------------------------------------------------------------------!
--- a/CPV/restart_sub.f90
+++ b/CPV/restart_sub.f90
@ -12,9 +12,9 @@ SUBROUTINE from_restart( )
   USE control_flags,         ONLY : tbeg, taurdr, tfor, tsdp, tv0rd, &
                                     iprsta, tsde, tzeroe, tzerop, nbeg, tranp, amprp, thdyn, &
                                     tzeroc, force_pairing, trhor, ampre, trane, tpre, dt_old
-   USE wavefunctions_module,  ONLY : c0, cm, phi => cp
+   USE wavefunctions_module,  ONLY : c0_bgrp, cm_bgrp
   USE electrons_module,      ONLY : occn_info
-   USE electrons_base,        ONLY : nspin, iupdwn, nupdwn, f, nbsp
+   USE electrons_base,        ONLY : nspin, iupdwn, nupdwn, f, nbsp, nbsp_bgrp
   USE io_global,             ONLY : ionode, ionode_id, stdout
   USE cell_base,             ONLY : ainv, h, hold, deth, r_to_s, s_to_r, boxdimensions, &
                                     velh, a1, a2, a3
@ -36,12 +36,13 @@ SUBROUTINE from_restart( )
   USE uspp,                  ONLY : okvan, vkb, nkb
   USE core,                  ONLY : nlcc_any
   USE cp_main_variables,     ONLY : ht0, htm, lambdap, lambda, lambdam, eigr, &
-                                     sfac, bec, taub, irb, eigrb, edft
-   USE cdvan,                 ONLY : dbec
+                                     sfac, taub, irb, eigrb, edft, bec_bgrp, dbec
   USE time_step,             ONLY : delt
   USE atoms_type_module,     ONLY : atoms_type
   !
   IMPLICIT NONE
+   
+   INTEGER :: iss
   !
   ! ... We are restarting from file recompute ainv
   !
@ -95,8 +96,8 @@ SUBROUTINE from_restart( )
   fion = 0.D0
   !
   IF( force_pairing ) THEN
-      cm(:,iupdwn(2):nbsp) = cm(:,1:nupdwn(2))
-      c0(:,iupdwn(2):nbsp) = c0(:,1:nupdwn(2))
+      cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
+      c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
      lambdap( :, :, 2) =  lambdap( :, :, 1)
      lambda( :, :, 2) =  lambda( :, :, 1)
      lambdam( :, :, 2) = lambdam( :, :, 1)
@ -106,7 +107,7 @@ SUBROUTINE from_restart( )
      !
      lambdam = lambda
      !
-      cm = c0
+      cm_bgrp = c0_bgrp
      !
      WRITE( stdout, '(" Electronic velocities set to zero")' )
      !
@ -135,19 +136,21 @@ SUBROUTINE from_restart( )
 515   FORMAT(   3X,'Initial random displacement of el. coordinates',/ &
                3X,'Amplitude = ',F10.6 )
      !
-      CALL rande_base( c0, ampre )
-
-      CALL gram( vkb, bec, nkb, c0, ngw, nbsp )
+      CALL rande_base( c0_bgrp, ampre )
      !
-      IF( force_pairing ) c0(:,iupdwn(2):nbsp) = c0(:,1:nupdwn(2))
+      DO iss = 1, nspin
+         CALL gram_bgrp( vkb, bec_bgrp, nkb, c0_bgrp, ngw, iss )
+      END DO
      !
-      cm = c0
+      IF( force_pairing ) c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
+      !
+      cm_bgrp = c0_bgrp
      !
   END IF
   !
-   CALL calbec( 1, nsp, eigr, c0, bec )
+   CALL calbec_bgrp( 1, nsp, eigr, c0_bgrp, bec_bgrp )
   !
-   IF ( tpre     ) CALL caldbec( ngw, nkb, nbsp, 1, nsp, eigr, c0, dbec )
+   IF ( tpre     ) CALL caldbec_bgrp( eigr, c0_bgrp, dbec )
   !
   IF ( tefield  ) CALL efield_berry_setup( eigr, tau0 )
   IF ( tefield2 ) CALL efield_berry_setup2( eigr, tau0 )
@ -163,7 +166,6 @@ SUBROUTINE from_restart( )
      END IF
   END IF
   !
-   !
   ! dt_old should be -1.0 here if untouched ...
   !
   if ( dt_old > 0.0d0 ) then
--- a/CPV/runcp.f90
+++ b/CPV/runcp.f90
@ -12,17 +12,17 @@


   SUBROUTINE runcp_uspp_x &
-      ( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec, c0, cm, fromscra, restart )
+      ( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp, fromscra, restart )
      !
      !  This subroutine performs a Car-Parrinello or Steepest-Descent step
      !  on the electronic variables, computing forces on electrons
      ! 
      !  on input:
-      !  c0  wave functions at time t
-      !  cm  wave functions at time t - dt 
+      !  c0_bgrp  wave functions at time t
+      !  cm_bgrp  wave functions at time t - dt 
      !
      !  on output:
-      !  cm  wave functions at time t + dt, not yet othogonalized 
+      !  cm_bgrp  wave functions at time t + dt, not yet othogonalized 
      !
      USE parallel_include
      USE kinds,               ONLY : DP
@ -34,7 +34,7 @@
      use control_flags,       only : lwf, tsde
      use uspp,                only : deeq, vkb
      use gvect,  only : gstart
-      use electrons_base,      only : n=>nbsp, ispin, f, nspin, nupdwn, iupdwn
+      use electrons_base,      only : nbsp_bgrp, ispin_bgrp, f_bgrp, nspin, nupdwn_bgrp, iupdwn_bgrp
      use wannier_subroutines, only : ef_potential
      use efield_module,       only : dforce_efield, tefield, dforce_efield2, tefield2
      use gvecw,               only : ngw, ngwx
@ -48,8 +48,8 @@
      REAL(DP) :: fccc, ccc
      REAL(DP) :: ema0bg(:), dt2bye
      REAL(DP) :: rhos(:,:)
-      REAL(DP) :: bec(:,:)
-      COMPLEX(DP) :: c0(:,:), cm(:,:)
+      REAL(DP) :: bec_bgrp(:,:)
+      COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
      LOGICAL, OPTIONAL, INTENT(IN) :: fromscra
      LOGICAL, OPTIONAL, INTENT(IN) :: restart
      !
@ -59,7 +59,7 @@
     real(DP),    allocatable :: emaver(:)
     complex(DP), allocatable :: c2(:), c3(:)
     REAL(DP),    ALLOCATABLE :: tg_rhos(:,:)
-     integer :: i, nsiz, incr, idx, idx_in, ierr, icnt
+     integer :: i, nsiz, incr, idx, idx_in, ierr
     integer :: iwfc, nwfc, is, ii, tg_rhos_siz, c2_siz
     integer :: iflag
     logical :: ttsde
@ -107,7 +107,7 @@

     IF( lwf ) THEN

-        call ef_potential( nfi, rhos, bec, deeq, vkb, c0, cm, emadt2, emaver, verl1, verl2 )
+        call ef_potential( nfi, rhos, bec_bgrp, deeq, vkb, c0_bgrp, cm_bgrp, emadt2, emaver, verl1, verl2 )

     ELSE

@ -135,17 +135,14 @@

        END IF

-        icnt = 0 

-        DO i = 1, n, incr
-
-           IF( icnt == my_bgrp_id ) THEN
+        DO i = 1, nbsp_bgrp, incr

           IF( use_task_groups ) THEN
              !
              !The input coefficients to dforce cover eigenstates i:i+2*NOGRP-1
-              !Thus, in dforce the dummy arguments for c0(1,i) and
-              !c0(1,i+1) hold coefficients for eigenstates i,i+2*NOGRP-2,2
+              !Thus, in dforce the dummy arguments for c0_bgrp(1,i) and
+              !c0_bgrp(1,i+1) hold coefficients for eigenstates i,i+2*NOGRP-2,2
              !and i+1,i+2*NOGRP...for example if NOGRP is 4 then we would have
              !1-3-5-7 and 2-4-6-8
              !
@ -157,11 +154,11 @@
                 CALL errore( ' runcp_uspp ', ' lda_plus_u with task group not implemented yet ', 1 )
              end if

-              CALL dforce( i, bec, vkb, c0, c2, c3, tg_rhos, tg_rhos_siz, ispin, f, n, nspin )
+              CALL dforce( i, bec_bgrp, vkb, c0_bgrp, c2, c3, tg_rhos, tg_rhos_siz, ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )

           ELSE

-              CALL dforce( i, bec, vkb, c0, c2, c3, rhos, SIZE(rhos,1), ispin, f, n, nspin )
+              CALL dforce( i, bec_bgrp, vkb, c0_bgrp, c2, c3, rhos, SIZE(rhos,1), ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )

           END IF

@ -172,35 +169,35 @@
           END IF

           IF( tefield ) THEN
-             CALL dforce_efield ( bec, i, c0, c2, c3, rhos)
+             CALL dforce_efield ( bec_bgrp, i, c0_bgrp, c2, c3, rhos)
           END IF

           IF( tefield2 ) THEN
-             CALL dforce_efield2 ( bec, i, c0, c2, c3, rhos)
+             CALL dforce_efield2 ( bec_bgrp, i, c0_bgrp, c2, c3, rhos)
           END IF

           IF( iflag == 2 ) THEN
              DO idx = 1, incr, 2
-                 IF( i + idx - 1 <= n ) THEN
-                    cm( :, i+idx-1) = c0(:,i+idx-1)
-                    cm( :, i+idx  ) = c0(:,i+idx  )
+                 IF( i + idx - 1 <= nbsp_bgrp ) THEN
+                    cm_bgrp( :, i+idx-1) = c0_bgrp(:,i+idx-1)
+                    cm_bgrp( :, i+idx  ) = c0_bgrp(:,i+idx  )
                 END IF
              ENDDO
           END IF

           idx_in = 1
           DO idx = 1, incr, 2
-              IF( i + idx - 1 <= n ) THEN
+              IF( i + idx - 1 <= nbsp_bgrp ) THEN
                 IF (tsde) THEN
-                    CALL wave_steepest( cm(:, i+idx-1 ), c0(:, i+idx-1 ), emaver, c2, ngw, idx_in )
-                    CALL wave_steepest( cm(:, i+idx   ), c0(:, i+idx   ), emaver, c3, ngw, idx_in )
+                    CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2, ngw, idx_in )
+                    CALL wave_steepest( cm_bgrp(:, i+idx   ), c0_bgrp(:, i+idx   ), emaver, c3, ngw, idx_in )
                 ELSE
-                    CALL wave_verlet( cm(:, i+idx-1 ), c0(:, i+idx-1 ), verl1, verl2, emaver, c2, ngw, idx_in )
-                    CALL wave_verlet( cm(:, i+idx   ), c0(:, i+idx   ), verl1, verl2, emaver, c3, ngw, idx_in )
+                    CALL wave_verlet( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), verl1, verl2, emaver, c2, ngw, idx_in )
+                    CALL wave_verlet( cm_bgrp(:, i+idx   ), c0_bgrp(:, i+idx   ), verl1, verl2, emaver, c3, ngw, idx_in )
                 ENDIF
                 IF ( gstart == 2 ) THEN
-                    cm(1,i+idx-1) = CMPLX(real(cm(1,i+idx-1)),0.0d0,kind=dp)
-                    cm(1,i+idx  ) = CMPLX(real(cm(1,i+idx  )),0.0d0,kind=dp)
+                    cm_bgrp(1,i+idx-1) = CMPLX(real(cm_bgrp(1,i+idx-1)),0.0d0,kind=dp)
+                    cm_bgrp(1,i+idx  ) = CMPLX(real(cm_bgrp(1,i+idx  )),0.0d0,kind=dp)
                 END IF
              END IF
              !
@ -208,25 +205,8 @@
              !
           END DO

-           ELSE
-
-              DO idx = 1, incr, 2
-                 IF( i + idx - 1 <= n ) THEN
-                    cm( :, i+idx-1) = 0.0d0
-                    cm( :, i+idx  ) = 0.0d0
-                 END IF
-              ENDDO
-  
-           END IF
-
-           icnt = MOD( icnt + 1 , nbgrp )
-
        end do

-        IF( nbgrp > 1 ) THEN
-           CALL mp_sum( cm, inter_bgrp_comm )
-        END IF
-
        DEALLOCATE( c2 )
        DEALLOCATE( c3 )
        DEALLOCATE( tg_rhos )
@ -237,7 +217,7 @@
     DEALLOCATE( emaver )
 !
   END SUBROUTINE runcp_uspp_x
-!
+
 !
 !=----------------------------------------------------------------------------=!
 !
@ -454,4 +434,3 @@
      DEALLOCATE(c3, c5)

   END SUBROUTINE runcp_uspp_force_pairing_x
-
--- a/CPV/wave.f90
+++ b/CPV/wave.f90
@ -71,7 +71,7 @@
    
    USE kinds,              only : DP
    use mp,                 only : mp_sum
-    use mp_global,          only : intra_bgrp_comm
+    use mp_global,          only : intra_bgrp_comm, nbgrp, inter_bgrp_comm
    use gvect, only : gstart
    use wave_base,          only : wave_speed2
    !
@ -100,6 +100,8 @@
    ekincm = ekincm * emass / ( delt * delt )

    CALL mp_sum( ekincm, intra_bgrp_comm )
+    IF( nbgrp > 1 ) &
+       CALL mp_sum( ekincm, inter_bgrp_comm )
    DEALLOCATE( emainv )

    return
@ -354,7 +356,7 @@


 !=----------------------------------------------------------------------------=!
-   SUBROUTINE wave_rand_init_x( cm, n, noff )
+   SUBROUTINE wave_rand_init_x( cm_bgrp )
 !=----------------------------------------------------------------------------=!

      !  this routine sets the initial wavefunctions at random
@ -368,27 +370,27 @@
      USE gvecw,              ONLY: ngw, ngw_g
      USE io_global,          ONLY: stdout
      USE random_numbers,     ONLY: randy
+      USE electrons_base,     ONLY: nbsp, nbsp_bgrp, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
      
      IMPLICIT NONE

      ! ... declare subroutine arguments 
-      INTEGER,     INTENT(IN)  :: n, noff
-      COMPLEX(DP), INTENT(OUT) :: cm(:,:)
+      COMPLEX(DP), INTENT(OUT) :: cm_bgrp(:,:)

      ! ... declare other variables
-      INTEGER :: ntest, ig, ib
+      INTEGER :: ntest, ig, ib, iss, n1, n2, m1, m2
      REAL(DP) ::  rranf1, rranf2, ampre
      COMPLEX(DP), ALLOCATABLE :: pwt( : )

      ! ... Check array dimensions

-      IF( SIZE( cm, 1 ) < ngw ) THEN 
+      IF( SIZE( cm_bgrp, 1 ) < ngw ) THEN 
        CALL errore(' wave_rand_init ', ' wrong dimensions ', 3)
      END IF

      ! ... Reset them to zero
 
-      cm( :, noff : noff + n - 1 ) = 0.0d0
+      cm_bgrp = 0.0d0

      ! ... initialize the wave functions in such a way that the values
      ! ... of the components are independent on the number of processors
@ -397,25 +399,36 @@
      ALLOCATE( pwt( ngw_g ) )

      ntest = ngw_g / 4
-      IF( ntest < SIZE( cm, 2 ) ) THEN
+      IF( ntest < SIZE( cm_bgrp, 2 ) ) THEN
         ntest = ngw_g
      END IF
      !
      ! ... assign random values to wave functions
      !
-      DO ib = noff, noff + n - 1
+      DO ib = 1, nbsp
        pwt( : ) = 0.0d0
        DO ig = 3, ntest
          rranf1 = 0.5d0 - randy()
          rranf2 = randy()
          pwt( ig ) = ampre * CMPLX(rranf1, rranf2,kind=DP)
        END DO
-        DO ig = 1, ngw
-          cm( ig, ib ) = pwt( ig_l2g( ig ) )
-        END DO
+        !
+        iss = 1
+        IF( ib > nupdwn( 1 ) ) iss = 2
+        n1 = iupdwn_bgrp(iss)
+        n2 = n1 + nupdwn_bgrp(iss) - 1
+        m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+        m2 = m1 + nupdwn_bgrp(iss) - 1
+        !
+        IF( ib >= m1 .AND. ib <= m2 ) THEN
+          DO ig = 1, ngw
+            cm_bgrp( ig, ib - m1 + n1 ) = pwt( ig_l2g( ig ) )
+          END DO
+        END IF
+        !
      END DO
      IF ( gstart == 2 ) THEN
-        cm( 1, noff : noff + n - 1 ) = (0.0d0, 0.0d0)
+        cm_bgrp( 1, : ) = (0.0d0, 0.0d0)
      END IF

      DEALLOCATE( pwt )
@ -423,3 +436,59 @@
      RETURN
    END SUBROUTINE wave_rand_init_x

+
+    SUBROUTINE c_bgrp_expand_x( c_bgrp )
+      USE kinds,              ONLY: DP
+      USE mp,                 ONLY: mp_sum
+      USE electrons_base,     ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
+      USE mp_global,          ONLY: nbgrp, inter_bgrp_comm
+      IMPLICIT NONE
+      COMPLEX(DP) :: c_bgrp(:,:)
+      INTEGER :: iss, n1, n2, m1, m2, i
+      IF( nbgrp < 2 ) &
+         RETURN
+      DO iss = nspin, 1, -1
+         n1 = iupdwn_bgrp(iss)
+         n2 = n1 + nupdwn_bgrp(iss) - 1
+         m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+         m2 = m1 + nupdwn_bgrp(iss) - 1
+         DO i = m2, m1, -1
+            c_bgrp(:,i) = c_bgrp(:,i-m1+n1)
+         END DO
+      END DO
+      DO iss = 1, nspin
+         m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+         m2 = m1 + nupdwn_bgrp(iss) - 1
+         DO i = 1, m1-1
+            c_bgrp(:,i) = 0.0d0
+         END DO
+         DO i = m2+1, nupdwn(iss)
+            c_bgrp(:,i) = 0.0d0
+         END DO
+      END DO
+      CALL mp_sum( c_bgrp, inter_bgrp_comm )
+      RETURN
+    END SUBROUTINE c_bgrp_expand_x
+
+
+    SUBROUTINE c_bgrp_pack_x( c_bgrp )
+      USE kinds,              ONLY: DP
+      USE electrons_base,     ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
+      USE mp_global,          ONLY: nbgrp
+      IMPLICIT NONE
+      COMPLEX(DP) :: c_bgrp(:,:)
+      INTEGER :: iss, n1, n2, m1, m2, i
+      IF( nbgrp < 2 ) &
+         RETURN
+      DO iss = 1, nspin
+         n1 = iupdwn_bgrp(iss)
+         n2 = n1 + nupdwn_bgrp(iss) - 1
+         m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
+         m2 = m1 + nupdwn_bgrp(iss) - 1
+         DO i = n1, n2
+            c_bgrp(:,i) = c_bgrp(:,i-n1+m1)
+         END DO
+      END DO
+      RETURN
+    END SUBROUTINE c_bgrp_pack_x
+