Merge branch 'merge_qegpu' into HEAD

This commit is contained in:
Pietro Delugas 2021-01-22 17:20:50 +01:00
commit db0da8b0d9
277 changed files with 43022 additions and 2109 deletions

View File

@ -1,104 +1,19 @@
# ====================
# UtilXlib UnitTesting
# ====================
build:cudampiomp:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced pgi/17.10 cuda/8.0.61
- ./configure --enable-openmp
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -smcn
build:intelmpiomp:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced intel intelmpi
- ./configure --enable-openmp
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -sm
build:cudampi:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced pgi/17.10 cuda/8.0.61
- ./configure
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -smcn
build:intelmpi:
tags: [galileo]
only:
- /utilxlib/i
script:
- module load profile/advanced intel intelmpi
- ./configure
- cd UtilXlib/tests
- bash compile_and_run_tests.sh -sm
build:pw:
image: ubuntu:latest
tags: [docker]
image: espressofoundation/ubuntu:latest
script:
- apt-get update
- apt-get install --yes build-essential gfortran wget python
- apt-get install --yes libopenmpi-dev openmpi-bin
- apt-get install --yes libblas-dev liblapack-dev fftw3 fftw3-dev pkg-config
- ./configure
- make pw
# - export OMP_NUM_THREADS=1
# - cd test-suite
# - mkdir /tmp/save
# - make run-tests-pw-serial
# - rm -r /tmp/save
build:cp:
image: ubuntu:latest
script:
- apt-get update
- apt-get install --yes build-essential gfortran wget python
- apt-get install --yes libopenmpi-dev openmpi-bin
- apt-get install --yes libblas-dev liblapack-dev fftw3 fftw3-dev pkg-config
- ./configure
- make cp
# - export OMP_NUM_THREADS=1
# - cd test-suite
# - mkdir /tmp/save
# - make run-tests-cp-serial
# - rm -r /tmp/save
#### BUILDS ON GALILEO ####
build:intel:
tags: [galileo]
intel parallel:
tags: [intel]
script:
- module purge
- module load intel/pe-xe-2018--binary intelmpi/2018--binary mkl/2018--binary
- module list
- module load intel/pe-xe-2018--binary intelmpi/2018--binary mkl/2018--binary python
- ./configure --enable-openmp
- make pw cp
- export OMP_NUM_THREADS=2
- cd test-suite
- mkdir -p /tmp/save
- make run-tests-pw-parallel
- make run-tests-cp-parallel
- rm -r /tmp/save
pgi cuda mpi:
tags: [k80]
script:
- module purge
- module load profile/global pgi/17.10 mkl/2018--binary cuda/8.0.61
- ./configure --enable-openmp --with-cuda-runtime=8.0 --with-cuda-cc=35 --with-cuda=$CUDA_HOME --with-scalapack=no
- make -j pw
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
- cd test-suite
@ -107,3 +22,163 @@ pgi cuda mpi:
- make run-tests-pw-parallel
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
- make clean && make run-tests-pw-parallel
pgi parallel k80:
tags: [k80]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
- ./configure --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=35 --with-cuda=yes --enable-cuda-env-check=no --with-scalapack=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
- cd test-suite
- sed -i "s/TESTCODE_NPROCS=4/TESTCODE_NPROCS=$SLURM_NTASKS/" ENVIRONMENT
- make clean
- make run-tests-pw-parallel
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
- make clean && make run-tests-pw-parallel
- cd .. && cp PW/src/pw.x ./pwgpu-mpi-cuda8-cc35-${CI_COMMIT_SHA:0:8}.x
pgi parallel v100:
tags: [galileo,v100]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary cuda/10.1 python
- ./configure --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=70 --with-cuda=$CUDA_HOME --with-scalapack=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $SLURM_NTASKS procs and $OMP_NUM_THREADS threads"
- cd test-suite
- sed -i "s/TESTCODE_NPROCS=4/TESTCODE_NPROCS=$SLURM_NTASKS/" ENVIRONMENT
- make clean
- make run-tests-pw-parallel
- sed -i 's/export PARA_POSTFIX=" "/export PARA_POSTFIX=" -pd=.true."/g' run-pw.sh
- make clean && make run-tests-pw-parallel
pgi serial k80:
tags: [k80]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
- ./configure --disable-parallel --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=35 --with-cuda=yes --enable-cuda-env-check=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $OMP_NUM_THREADS threads"
- cd test-suite
- make clean
- make run-tests-pw-serial
- make run-tests-cp-serial
pgi serial v100:
tags: [galileo,v100]
script:
- module purge
- module load profile/global hpc-sdk/20.9--binary mkl/2019--binary python
- ./configure --disable-parallel --enable-openmp --with-cuda-runtime=10.1 --with-cuda-cc=70 --with-cuda=yes --enable-cuda-env-check=no
- make -j pw cp
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
- echo "Using $OMP_NUM_THREADS threads"
- cd test-suite
- make clean
- make run-tests-pw-serial
- make run-tests-cp-serial
pgi207 power v100:
tags: [marconi100]
script:
- module purge
- module load profile/global hpc-sdk/2020--binary cuda/10.1 spectrum_mpi/10.3.1--binary python/3.8.2
- pwd
- cd ..
- ./configure CC=pgcc F77=pgf90 FC=pgf90 F90=pgf90 MPIF90=mpipgifort --enable-openmp --with-cuda=$CUDA_ROOT --with-cuda-runtime=10.1 --with-cuda-cc=70
- make -j pw cp
- salloc --nodes=1 --ntasks-per-node=4 --ntasks-per-socket=2 --cpus-per-task=32 --gres=gpu:4 --mem=230000MB --time 01:00:00 -A $CINECA_QE_ACCOUNT -p m100_usr_prod
- if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=8; fi
- echo "Using $OMP_NUM_THREADS threads"
- cd test-suite
- make clean
- make run-tests-pw-parallel
- make run-tests-cp-parallel
- exit
#pgi cuda9.2 p100 nollvm:
# tags: [p100]
# script:
# - module purge
# - module load profile/global pgi/19.10--binary cuda/10.0
# - ./configure --enable-openmp --with-cuda-runtime=10.0 --with-cuda-cc=60 --with-cuda=$CUDA_HOME --with-scalapack=no
# - sed -i 's/traditional/traditional -Uvector/' make.inc # works around a PGI compiler problem in parsing comments.
# - sed -i 's/cuda10.0/cuda10.0,nollvm/' make.inc # this is needed for some reason yet to be clarified.
# - make -j pw
# - if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
# - echo "Using $OMP_NUM_THREADS threads"
# - cd test-suite
# - make clean
# - make run-tests-pw-parallel
#
#pgi cuda9.2 p100:
# tags: [p100]
# script:
# - module purge
# - module load profile/global pgi/19.10--binary cuda/10.0
# - ./configure --enable-openmp --with-cuda-runtime=10.0 --with-cuda-cc=60 --with-cuda=$CUDA_HOME --with-scalapack=no
# - sed -i 's/traditional/traditional -Uvector/' make.inc # works around a PGI compiler problem in parsing comments.
# - make -j pw
# - if [ -z ${SLURM_CPUS_PER_TASK} ]; then export OMP_NUM_THREADS=1; else export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK; fi
# - echo "Using $OMP_NUM_THREADS threads"
# - cd test-suite
# - make clean
# - make run-tests-pw-parallel
#
## UtilXlib UnitTesting
#build:cudampiomp:
# tags: [galileo]
# script:
# - module load profile/advanced pgi/17.10 cuda/8.0.61
# - ./configure --enable-openmp
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -smcn
#
#build:intelmpiomp:
# tags: [galileo]
# script:
# - module load profile/advanced intel intelmpi
# - ./configure --enable-openmp
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -sm
#
#build:cudampi:
# tags: [galileo]
# script:
# - module load profile/advanced pgi/17.10 cuda/8.0.61
# - ./configure
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -smcn
#
#build:intelmpi:
# tags: [galileo]
# script:
# - module load profile/advanced intel intelmpi
# - ./configure
# - cd UtilXlib/tests
# - bash compile_and_run_tests.sh -sm
#
##### BUILDS ON GALILEO ####
#
#build:laxlib-unittest:
# tags: [galileo]
# script:
# - module load profile/advanced pgi/17.10 cuda/8.0.61
# - ./configure && make pw
# - cd LAXlib/tests
# - make clean && make
# - for file in ./*.x; do mpirun -np 4 ./$file; done
# - cd ../../
# - ./configure --disable-parallel && make clean && make pw
# - cd LAXlib/tests
# - make clean && make
# - for file in ./*.x; do ./$file; done

View File

@ -22,12 +22,12 @@ build:cp:
# - make run-tests-cp-serial
# - rm -r /tmp/save
build:ph:
tags: [docker]
image: espressofoundation/ubuntu:latest
script:
- ./configure
- make ph
#build:ph:
# tags: [docker]
# image: espressofoundation/ubuntu:latest
# script:
# - ./configure
# - make ph
# - export OMP_NUM_THREADS=1
# - cd test-suite
# - mkdir /tmp/save

View File

@ -591,6 +591,12 @@ add_custom_target(gwl
qe_gww_util_grap_exe
qe_gww_util_abcoefftoeps_exe
qe_gww_util_memorypw4gww_exe
qe_gww_bse_bse_main_exe
qe_gww_gww_exe
qe_gww_gww_fit_exe
qe_gww_head_exe
qe_gww_simple_bse_exe
qe_gww_simple_ip_exe
COMMENT
"GW with Lanczos chains")

View File

@ -175,12 +175,16 @@
!orthonormalize c0
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
!calculates phi for pcdaga
#if defined (__CUDA)
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
#else
CALL calphi_bgrp( c0, SIZE(c0,1), bec, nkb, betae, phi, nbsp )
#endif
!calculates the factors for S and K inversion in US case
if(nkbus>0) then
@ -205,13 +209,13 @@
ENERGY_CHECK: if(.not. ene_ok ) then
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
if(.not.tens) then
call rhoofr(nfi,c0(:,:),irb,eigrb,bec,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme.or.firstiter) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,c0,bec,dbec,firstiter,vpot)
firstiter=.false.
endif
@ -238,7 +242,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
!---ensemble-DFT
@ -301,7 +305,7 @@
!update d
call newd(vpot,irb,eigrb,rhovan,fion)
call newd(vpot,rhovan,fion,.true.)
call prefor(eigr,betae)!ATTENTION
@ -341,13 +345,13 @@
hpsi0=hpsi
gi = hpsi
call calbec(1,nsp,eigr,hpsi,becm)
call calbec(nbsp, betae,hpsi,becm)
call xminus1(hpsi,betae,dumm,becm,s_minus1,.false.)
! call sminus1(hpsi,becm,betae)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!check whether the following two lines are really needed
call calbec(1,nsp,eigr,hpsi,becm)
call calbec(nbsp, betae,hpsi,becm)
call pc2(c0,bec,hpsi,becm)
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@ -357,13 +361,13 @@
else
call xminus1_state(gi,betae,ema0bg,becm,k_minus1,.true.,ave_ene)
endif
call calbec(1,nsp,eigr,gi,becm)
call calbec(nbsp, betae,gi,becm)
call pc2(c0,bec,gi,becm)
if(tens) call calcmt( nrlx, f, z0t, fmat0 )
call calbec(1,nsp,eigr,hpsi,bec0)
call calbec(nbsp, betae,hpsi,bec0)
! calculates gamma
gamma=0.d0
@ -496,7 +500,7 @@
!project hi on conduction sub-space
call calbec(1,nsp,eigr,hi,bec0)
call calbec(nbsp, betae,hi,bec0)
call pc2(c0,bec,hi,bec0)
@ -565,7 +569,7 @@
!orthonormalize
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
CALL gram_bgrp( betae, becm, nkb, cm, ngw )
!calculate energy
@ -573,7 +577,7 @@
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
endif
@ -588,7 +592,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
vpot = rhor
!
@ -628,7 +632,7 @@
cm(1,:)=0.5d0*(cm(1,:)+CONJG(cm(1,:)))
endif
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
CALL gram_bgrp( betae, becm, nkb, cm, ngw )
!test on energy: check the energy has really diminished
@ -638,7 +642,7 @@
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
endif
! calculation of the rotated quantities
@ -652,7 +656,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
vpot = rhor
!
@ -699,7 +703,7 @@
endif
c0=c0+spasso*passov*hi
restartcg=.true.
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
ene_ok=.false.
!if ene1 << energy < ene0; go to ene1
@ -709,7 +713,7 @@
endif
c0=c0+spasso*passov*hi
restartcg=.true.!ATTENTION
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
!if ene > ene0, ene1 do a steepest descent step
ene_ok=.false.
@ -725,14 +729,14 @@
cm=c0+spasso*passov*hi
! change the search direction
spasso=spasso*(-1.d0)
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
CALL gram_bgrp( betae, bec, nkb, cm, ngw )
call calbec(1,nsp,eigr,cm,becm)
call calbec(nbsp, betae,cm,becm)
if(.not.tens) then
call rhoofr(nfi,cm(:,:),irb,eigrb,becm,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
else
if(newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,cm,becm,dbec,.false., vpot )
endif
! calculation of the rotated quantities
@ -746,7 +750,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
vpot = rhor
!
@ -776,10 +780,14 @@
if(tens.and.newscheme) enever=enever-entropy
if(.not. ene_ok) call calbec (1,nsp,eigr,c0,bec)
if(.not. ene_ok) call calbec (nbsp, betae,c0,bec)
#if defined (__CUDA)
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
#else
!calculates phi for pc_daga
CALL calphi_bgrp( c0, SIZE(c0,1), bec, nkb, betae, phi, nbsp )
#endif
!=======================================================================
!
@ -788,7 +796,7 @@
!
!=======================================================================
if(tens.and. .not.newscheme) then
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
call inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac,c0,bec,dbec,firstiter, vpot )
!the following sets up the new energy
enever=etot
@ -809,7 +817,7 @@
!calculates atomic forces and lambda
if(tpre) then !if pressure is needed, the following is written because of caldbec
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
if(.not.tens) then
call caldbec_bgrp( eigr, c0, dbec, idesc )
call rhoofr(nfi,c0(:,:),irb,eigrb,bec,dbec,rhovan,rhor,drhor,rhog,drhog,rhos,enl,denl,ekin,dekin6)
@ -827,7 +835,7 @@
!
! put core charge (if present) in rhoc(r)
!
if (nlcc_any) call set_cc(irb,eigrb,rhoc)
if (nlcc_any) call set_cc(rhoc)
!
!---ensemble-DFT
@ -844,12 +852,16 @@
call calcmt( nrlx, f, z0t, fmat0 )
call newd(vpot,irb,eigrb,rhovan,fion)
call newd(vpot,rhovan,fion,.true.)
#if defined (__CUDA)
CALL errore(' runcg_uspp ', ' GPU version not yet implemented', 1 )
#else
if (.not.tens) then
if (tfor .or. tprnfor) call nlfq_bgrp( c0, eigr, bec, becdr, fion ) ! call nlfq(c0,eigr,bec,becdr,fion)
if (tfor .or. tprnfor) call nlfq_bgrp( c0, betae, bec, becdr, fion )
else
if (tfor .or. tprnfor) call nlfq_bgrp( c0diag, eigr, becdiag, becdrdiag, fion ) ! call nlfq(c0diag,eigr,becdiag,becdrdiag,fion)
if (tfor .or. tprnfor) call nlfq_bgrp( c0diag, betae, becdiag, becdrdiag, fion )
endif
#endif
call prefor(eigr,betae)
do i=1,nbsp,2
@ -955,9 +967,9 @@
enddo
enddo
call calbec (1,nsp,eigr,c0,bec)
call calbec (nbsp,betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
call calbec(1,nsp,eigr,c0,bec)
call calbec(nbsp, betae,c0,bec)
@ -998,7 +1010,7 @@
CALL mp_sum( lambda_repl, intra_bgrp_comm )
CALL distribute_lambda( lambda_repl, lambda( :, :, 1 ), idesc( :, 1 ) )
cm(:,:)=c0(:,:)
call calbec (1,nsp,eigr,cm,becm)
call calbec (nbsp, betae,cm,becm)
endif
DEALLOCATE( lambda_repl )
@ -1031,7 +1043,7 @@
!
DEALLOCATE( lambda_dist )
!
call nlsm2_bgrp( ngw, nkb, eigr, c0, becdr, nbspx, nbsp )
call nlsm2_bgrp( ngw, nkb, betae, c0, becdr, nbspx, nbsp )
!
endif
!

View File

@ -6,9 +6,16 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
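!
! Aside (an illustrative sketch, not part of this file): the DEVICEATTR
! macro above lets one declaration serve both builds, expanding to
! ",DEVICE" under CUDA Fortran and to nothing otherwise. The subroutine
! name "zero_last_band" is hypothetical.
!
SUBROUTINE zero_last_band( c_d, n )
   USE kinds, ONLY: DP
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: n
   COMPLEX(DP) DEVICEATTR :: c_d( :, : )   ! device array iff __CUDA is defined
   c_d( :, n ) = ( 0.d0, 0.d0 )            ! runs on the GPU or the host accordingly
END SUBROUTINE zero_last_band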
!-----------------------------------------------------------------------
SUBROUTINE rhoofr_cp &
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
( nfi, c_bgrp, c_d, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
enl, denl, ekin, dekin, tstress, ndwwf )
!-----------------------------------------------------------------------
!
! this routine computes:
@ -48,7 +55,7 @@
USE uspp, ONLY: nkb
USE uspp_param, ONLY: nh, nhm
USE cell_base, ONLY: omega
USE electrons_base, ONLY: nspin, nbsp_bgrp, ispin_bgrp, f_bgrp
USE electrons_base, ONLY: nspin, nbsp_bgrp, ispin_bgrp, f_bgrp, f_d
USE constants, ONLY: pi, fpi
USE mp, ONLY: mp_sum
USE io_global, ONLY: stdout, ionode
@ -60,7 +67,7 @@
USE fft_interfaces, ONLY: fwfft, invfft
USE fft_base, ONLY: dffts, dfftp
USE cp_interfaces, ONLY: checkrho, ennl, calrhovan, dennl
USE cp_main_variables, ONLY: iprint_stdout, idesc
USE cp_main_variables, ONLY: iprint_stdout, idesc, irb, eigrb
USE wannier_base, ONLY: iwf
USE exx_module, ONLY: rhopr
USE input_parameters, ONLY: tcpbo ! BS
@ -68,6 +75,8 @@
USE io_files, ONLY: restart_dir
USE fft_rho
USE fft_helper_subroutines, ONLY: c2psi_gamma
USE mp, ONLY: mp_barrier
USE mp_world, ONLY: mpime, world_comm
!
IMPLICIT NONE
INTEGER nfi
@ -79,11 +88,10 @@
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
COMPLEX(DP) eigrb( :, : )
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
COMPLEX(DP) c_bgrp( :, : )
INTEGER irb( :, : )
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
@ -119,7 +127,11 @@
!
! calculation of kinetic energy ekin
!
#if defined (__CUDA)
ekin = enkin( c_d, f_d, nbsp_bgrp )
#else
ekin = enkin( c_bgrp, f_bgrp, nbsp_bgrp )
#endif
!
IF( nbgrp > 1 ) &
CALL mp_sum( ekin, inter_bgrp_comm )
@ -217,6 +229,9 @@
CALL errore( ' rhoofr ', ' c second dimension too small ', SIZE( c_bgrp, 2 ) )
!
c_bgrp( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
#if defined (__CUDA)
c_d( :, nbsp_bgrp + 1 ) = ( 0.d0, 0.d0 )
#endif
!
ENDIF
!
@ -236,7 +251,11 @@
!
ELSE
!
#if defined (__CUDA)
CALL loop_over_states_gpu()
#else
CALL loop_over_states()
#endif
!
END IF
!
@ -260,7 +279,7 @@
DEALLOCATE( drhovan )
END IF
!
CALL rhov( irb, eigrb, rhovan, rhog, rhor )
CALL rhov( rhovan, rhog, rhor )
ENDIF COMPUTE_CHARGE
!
@ -359,6 +378,10 @@
#endif
REAL(DP), ALLOCATABLE :: tmp_rhos(:,:)
IF( fftx_ntgrp(dffts) > 1 ) THEN
CALL errore('rhoofr','Task group not supported',1)
END IF
ALLOCATE( psis( dffts%nnr_tg ) )
!
CALL tg_get_group_nr3( dffts, tg_nr3 )
@ -369,17 +392,9 @@
do i = 1, nbsp_bgrp, 2 * fftx_ntgrp(dffts)
#if defined(__MPI)
!
CALL c2psi_gamma_tg(dffts, psis, c_bgrp, i, nbsp_bgrp )
CALL invfft ('tgWave', psis, dffts )
#else
CALL c2psi_gamma( dffts, psis, c_bgrp(:,i), c_bgrp(:,i+1) )
CALL invfft('Wave', psis, dffts )
#endif
!
! Now the first proc of the group holds the first two bands
! of the 2*nogrp bands that we are processing at the same time,
@ -446,6 +461,103 @@
RETURN
END SUBROUTINE loop_over_states
#if defined (__CUDA)
SUBROUTINE loop_over_states_gpu
!
USE parallel_include
USE fft_helper_subroutines
USE control_flags, ONLY : many_fft
USE cudafor
!
! MAIN LOOP OVER THE EIGENSTATES
! - This loop is also parallelized within the task-groups framework
! - Each group works on a number of eigenstates in parallel
!
IMPLICIT NONE
!
INTEGER :: from, i, ig, eig_index, eig_offset, ii, tg_nr3, ioff
!
REAL(DP), DEVICE, ALLOCATABLE :: rhos_d(:,:)
COMPLEX(DP), DEVICE, ALLOCATABLE :: psis(:)
COMPLEX(DP), DEVICE, ALLOCATABLE :: ptmp(:,:)
INTEGER, DEVICE, POINTER :: nl_d(:), nlm_d(:)
ALLOCATE( psis( dffts%nnr * many_fft ) ) ! dffts%nnr * many_fft
ALLOCATE( rhos_d ( SIZE(rhos,1), SIZE(rhos,2) ) )
!
rhos_d = 0.0_DP
nl_d => dffts%nl_d
nlm_d => dffts%nlm_d
do i = 1, nbsp_bgrp, 2 * many_fft
psis = 0.0d0
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < nbsp_bgrp ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) ) + ci * conjg( c_d( ig, ii+1 ))
psis( nl_d( ig ) + ioff) = c_d( ig, ii ) + ci * c_d( ig, ii+1 )
end do
ELSE IF( ii == nbsp_bgrp ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psis( nlm_d( ig ) + ioff) = CONJG( c_d( ig, ii ) )
psis( nl_d( ig ) + ioff) = c_d( ig, ii )
end do
END IF
! CALL c2psi_gamma( dffts, psis, c_bgrp(:,ii), c_bgrp(:,ii+1) )
ioff = ioff + dffts%nnr
END DO
CALL invfft('Wave', psis, dffts, many_fft )
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < nbsp_bgrp ) THEN
iss1=ispin_bgrp( ii )
sa1 =f_bgrp( ii )/omega
iss2=ispin_bgrp( ii + 1 )
sa2 =f_bgrp( ii + 1 )/omega
!$cuf kernel do(1)
do ir = 1, dffts%nnr
rhos_d(ir,iss1) = rhos_d(ir,iss1) + sa1*( real(psis(ir + ioff)))**2
rhos_d(ir,iss2) = rhos_d(ir,iss2) + sa2*(aimag(psis(ir + ioff)))**2
end do
ELSE IF( ii == nbsp_bgrp ) THEN
iss1=ispin_bgrp( ii )
sa1 =f_bgrp( ii )/omega
iss2=iss1
sa2=0.0d0
!$cuf kernel do(1)
do ir = 1, dffts%nnr
rhos_d(ir,iss1) = rhos_d(ir,iss1) + sa1*( real(psis(ir + ioff)))**2
END DO
END IF
ioff = ioff + dffts%nnr
END DO
!
END DO
rhos = rhos_d
IF( nbgrp > 1 ) THEN
CALL mp_sum( rhos, inter_bgrp_comm )
END IF
DEALLOCATE( rhos_d )
DEALLOCATE( psis )
IF( ALLOCATED( ptmp ) ) DEALLOCATE( ptmp )   ! ptmp is never allocated in this routine
RETURN
END SUBROUTINE loop_over_states_gpu
#endif
!-----------------------------------------------------------------------
END SUBROUTINE rhoofr_cp
!-----------------------------------------------------------------------
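!
! Aside (a stand-alone sketch with a hypothetical program name): the
! Gamma-point packing used in loop_over_states_gpu above. Two real-space
! bands travel in one complex array, psi = psi_1 + i*psi_2, so a single
! complex FFT serves two bands; band 1 is recovered from REAL(psi) and
! band 2 from AIMAG(psi), exactly as accumulated into rhos_d.
!
PROGRAM gamma_pack_demo
   IMPLICIT NONE
   INTEGER, PARAMETER :: dp = KIND(1.0d0), nr = 8
   REAL(dp)    :: psi1(nr), psi2(nr), rho(nr)
   COMPLEX(dp) :: packed(nr)
   INTEGER :: ir
   CALL RANDOM_NUMBER(psi1)
   CALL RANDOM_NUMBER(psi2)
   packed = CMPLX(psi1, psi2, KIND=dp)   ! two real bands in one complex array
   DO ir = 1, nr                         ! per-band |psi|^2, as for rhos_d
      rho(ir) = REAL(packed(ir))**2 + AIMAG(packed(ir))**2
   END DO
   PRINT *, MAXVAL(ABS(rho - (psi1**2 + psi2**2)))   ! ~0: the packing is lossless
END PROGRAM gamma_pack_demo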
@ -525,50 +637,32 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
REAL(DP), INTENT(OUT) :: drhor(dfftp%nnr,nspin,3,3)
COMPLEX(DP), INTENT(OUT) :: drhog(dfftp%ngm,nspin,3,3)
! local
INTEGER i, j, isup, isdw, iv, jv, ig, ijv, is, iss, ia, ir, ijs
INTEGER i, j, isup, isdw, iv, jv, ig, ijv, is, iss, ia, ir, ijs, itid
REAL(DP) :: asumt, dsumt
COMPLEX(DP) fp, fm, ci
#if defined(__INTEL_COMPILER)
#if __INTEL_COMPILER >= 1300
!dir$ attributes align: 4096 :: v, dqgbt,qv
#endif
#endif
COMPLEX(DP), ALLOCATABLE :: v(:)
COMPLEX(DP), ALLOCATABLE:: dqgbt(:,:)
COMPLEX(DP), ALLOCATABLE :: qv(:)
COMPLEX(DP), ALLOCATABLE :: fg1(:), fg2(:)
!
INTEGER :: itid, mytid, ntids
#if defined(_OPENMP)
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
!
!$omp parallel default(none), private(i,j,iss,ir,ig,mytid,ntids,itid), shared(nspin,dfftp,drhor,drhog,rhor,rhog,ainv)
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#else
mytid = 0
ntids = 1
#endif
itid = 0
!$omp parallel do collapse(3) default(none), private(i,j,iss,ir,ig), shared(nspin,dfftp,drhor,drhog,rhor,rhog,ainv)
DO j=1,3
DO i=1,3
DO iss=1,nspin
IF( MOD( itid, ntids ) == mytid ) THEN
DO ir=1,dfftp%nnr
drhor(ir,iss,i,j)=-rhor(ir,iss)*ainv(j,i)
END DO
DO ig=1,dfftp%ngm
drhog(ig,iss,i,j)=-rhog(ig,iss)*ainv(j,i)
END DO
END IF
itid = itid + 1
DO ir=1,dfftp%nnr
drhor(ir,iss,i,j)=-rhor(ir,iss)*ainv(j,i)
END DO
DO ig=1,dfftp%ngm
drhog(ig,iss,i,j)=-rhog(ig,iss)*ainv(j,i)
END DO
END DO
END DO
END DO
!$omp end parallel
!$omp end parallel do
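!
! Aside (a stand-alone sketch with a hypothetical program name): the
! collapse(3) clause introduced above fuses the three nested loops into a
! single iteration space that OpenMP distributes over threads, replacing
! the manual mytid/itid round-robin of the previous version.
!
PROGRAM collapse_demo
   USE omp_lib
   IMPLICIT NONE
   INTEGER :: i, j, iss, cnt
   cnt = 0
   !$omp parallel do collapse(3) reduction(+:cnt)
   DO j = 1, 3
      DO i = 1, 3
         DO iss = 1, 2
            cnt = cnt + 1   ! each of the 18 fused iterations runs exactly once
         END DO
      END DO
   END DO
   !$omp end parallel do
   PRINT *, 'iterations =', cnt   ! 18, whatever the thread count
END PROGRAM collapse_demo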
IF ( nkbus <= 0 ) THEN
GO TO 1000
@ -589,8 +683,8 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
!$omp parallel default(none) &
!$omp shared(nat, ityp, ngb, nh, eigrb, dfftb, irb, v, &
!$omp ci, i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
!$omp private(mytid, ntids, is, ia, iv, jv, ijv, ig, iss, &
!$omp i, j, dqgb, qgb, nhm, rhovan, drhovan, upf ) &
!$omp private( is, ia, iv, jv, ijv, ig, iss, &
!$omp qv, fg1, fg2, itid, dqgbt, dsumt, asumt )
ALLOCATE( qv( dfftb%nnr ) )
@ -598,12 +692,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#endif
iss=1
DO ia=1,nat
@ -617,11 +706,11 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
#endif
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
IF ( omp_get_thread_num() /= itid ) THEN
itid = MOD( itid + 1, omp_get_num_threads() )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
itid = MOD( itid + 1, omp_get_num_threads() )
END IF
#endif
@ -761,7 +850,7 @@ END SUBROUTINE drhov
!
!-----------------------------------------------------------------------
SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
SUBROUTINE rhov(rhovan,rhog,rhor)
!-----------------------------------------------------------------------
! Add Vanderbilt contribution to rho(r) and rho(g)
!
@ -772,7 +861,7 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
USE kinds, ONLY: dp
USE ions_base, ONLY: nat, na, nsp, ityp
USE io_global, ONLY: stdout
USE mp_global, ONLY: intra_bgrp_comm
USE mp_global, ONLY: intra_bgrp_comm, inter_bgrp_comm
USE mp, ONLY: mp_sum
USE uspp_param, ONLY: nh, nhm, upf
USE uspp, ONLY: deeq, nkbus
@ -786,19 +875,21 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
USE fft_interfaces, ONLY: fwfft, invfft
USE fft_base, ONLY: dfftb, dfftp, dfftb
USE fft_helper_subroutines, ONLY: fftx_add_threed2oned_gamma
USE cp_main_variables, ONLY: irb, eigrb, iabox, nabox
!
IMPLICIT NONE
!
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
INTEGER, INTENT(in) :: irb(3,nat)
COMPLEX(DP), INTENT(in):: eigrb(ngb,nat)
!
REAL(DP), INTENT(inout):: rhor(dfftp%nnr,nspin)
COMPLEX(DP), INTENT(inout):: rhog(dfftp%ngm,nspin)
!
INTEGER :: isup, isdw, nfft, ifft, iv, jv, ig, ijv, is, iss, isa, ia, ir, i, j
INTEGER, PARAMETER :: isup = 1
INTEGER, PARAMETER :: isdw = 2
INTEGER :: iv, jv, ig, ijv, is, iss, ia, ir, i, j, iia
REAL(DP) :: sumrho
COMPLEX(DP) :: ci, fp, fm, ca
COMPLEX(DP) :: fp, fm, ca
COMPLEX(DP), PARAMETER :: ci=(0.d0,1.d0)
#if defined(__INTEL_COMPILER)
#if __INTEL_COMPILER >= 1300
!dir$ attributes align: 4096 :: qgbt, v, qv
@ -809,8 +900,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
COMPLEX(DP), ALLOCATABLE :: qv(:)
COMPLEX(DP), ALLOCATABLE :: fg1(:), fg2(:)
INTEGER :: mytid, ntids
#if defined(_OPENMP)
INTEGER :: itid, mytid, ntids
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
@ -822,7 +913,6 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
END IF
CALL start_clock( 'rhov' )
ci=(0.d0,1.d0)
!
!
ALLOCATE( v( dfftp%nnr ) )
@ -830,13 +920,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
! private variables need to be initialized, otherwise
! they have an undetermined value outside the parallel region
!
#if defined(_OPENMP)
mytid = 0
ntids = 1
itid = 0
#endif
iss = 1
isa = 1
!
IF(nspin.EQ.1) THEN
!
@ -845,12 +930,8 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
!$omp parallel default(none) &
!$omp shared(na, ngb, nh, rhovan, qgb, eigrb, dfftb, iverbosity, omegab, irb, v, &
!$omp stdout, ci, rhor, dfftp, upf, nsp, ityp, nat ) &
!$omp private(mytid, ntids, is, ia, nfft, ifft, iv, jv, ijv, sumrho, qgbt, ig, iss, isa, ca, &
!$omp qv, fg1, fg2, itid, ir )
iss=1
isa=1
!$omp stdout, rhor, dfftp, upf, nsp, ityp, nat, nspin, iabox, nabox, inter_bgrp_comm ) &
!$omp private(mytid, ntids, is, ia, iia, iv, jv, ijv, sumrho, qgbt, ig, ca, qv, ir )
!$omp workshare
v (:) = (0.d0, 0.d0)
@ -859,149 +940,96 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#endif
ALLOCATE( qgbt( ngb, 2 ) )
ALLOCATE( qgbt( ngb, nspin ) )
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
nfft = 1
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) ) THEN
CYCLE
END IF
#endif
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
END IF
#endif
DO ifft=1,nfft
qgbt(:,ifft) = (0.d0, 0.d0)
DO iv= 1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
sumrho=rhovan(ijv,ia+ifft-1,iss)
IF(iv.NE.jv) sumrho=2.d0*sumrho
DO ig=1,ngb
qgbt(ig,ifft)=qgbt(ig,ifft) + sumrho*qgb(ig,ijv,is)
END DO
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox(iia)
is = ityp(ia)
qgbt(:,1) = (0.d0, 0.d0)
DO iv= 1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
sumrho=rhovan(ijv,ia,1)
IF(iv.NE.jv) sumrho=2.d0*sumrho
DO ig=1,ngb
qgbt(ig,1)=qgbt(ig,1) + sumrho*qgb(ig,ijv,is)
END DO
END DO
END DO
!
! add structure factor
!
IF(nfft.EQ.2)THEN
fg1 = eigrb(1:ngb,ia )*qgbt(1:ngb,1)
fg2 = eigrb(1:ngb,ia+1 )*qgbt(1:ngb,2)
CALL fft_oned2box( qv, fg1, fg2 )
ELSE
fg1 = eigrb(1:ngb,ia )*qgbt(1:ngb,1)
CALL fft_oned2box( qv, fg1 )
ENDIF
qgbt(1:ngb,1) = eigrb(1:ngb,ia)*qgbt(1:ngb,1)
CALL fft_oned2box( qv, qgbt(:,1) )
CALL invfft( qv, dfftb, ia )
!
! qv = US augmentation charge in real space on box grid
! for atomic species is, real(qv)=atom ia, imag(qv)=atom ia+1
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
& omegab*DBLE(qgbt(1,2))
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
!
! add qv(r) to v(r), in real space on the dense grid
!
CALL box2grid(irb(:,ia),1,qv,v)
IF (nfft.EQ.2) CALL box2grid(irb(:,ia+1),2,qv,v)
!
END IF
END DO
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
DEALLOCATE(qv)
DEALLOCATE(qgbt)
!
! rhor(r) = total (smooth + US) charge density in real space
!
!$omp barrier
!$omp master
!
CALL mp_sum( v, inter_bgrp_comm )
!
!$omp end master
!$omp barrier
!$omp do
DO ir=1,dfftp%nnr
rhor(ir,1)=rhor(ir,1)+DBLE(v(ir))
END DO
!$omp end do
!$omp end parallel
iss = 1
DO ir=1,dfftp%nnr
rhor(ir,iss)=rhor(ir,iss)+DBLE(v(ir))
END DO
!
IF( iverbosity > 1 ) THEN
ca = SUM(v)
CALL mp_sum( ca, intra_bgrp_comm )
WRITE( stdout,'(a,2f12.8)') &
& ' rhov: int n_v(r) dr = ',omega*ca/(dfftp%nr1*dfftp%nr2*dfftp%nr3)
ENDIF
!
CALL fwfft('Rho',v, dfftp )
!
IF( iverbosity > 1 ) THEN
WRITE( stdout,*) ' rhov: smooth ',omega*rhog(1,iss)
WRITE( stdout,*) ' rhov: vander ',omega*v(1)
WRITE( stdout,*) ' rhov: all ',omega*(rhog(1,iss)+v(1))
ENDIF
!
! rhog(g) = total (smooth + US) charge density in G-space
!
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,iss) )
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,1) )
IF( iverbosity > 1 ) WRITE( stdout,'(a,2f12.8)') &
& ' rhov: n_v(g=0) = ',omega*DBLE(rhog(1,iss))
!
ELSE
!
! nspin=2: two fft at a time, one for spin up and one for spin down
!
isup=1
isdw=2
!$omp parallel default(none) &
!$omp shared(na, ngb, nh, rhovan, qgb, eigrb, dfftb, iverbosity, omegab, irb, v, &
!$omp stdout, rhor, dfftp, upf, nsp, ityp, nat, nspin, iabox, nabox, inter_bgrp_comm ) &
!$omp private(mytid, ntids, is, ia, iia, iv, jv, ijv, sumrho, qgbt, ig, ca, qv, fg1, fg2, ir )
!$omp workshare
v (:) = (0.d0, 0.d0)
!$omp end workshare
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#endif
ALLOCATE( qgbt( ngb, 2 ) )
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
isa=1
DO ia=1,nat
is = ityp(ia)
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) ) CYCLE
#endif
IF( upf(is)%tvanp ) THEN
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox(iia)
is = ityp(ia)
DO iss=1,2
qgbt(:,iss) = (0.d0, 0.d0)
DO iv=1,nh(is)
@ -1015,42 +1043,40 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
END DO
END DO
END DO
!
! add structure factor
!
!
! add structure factor
!
fg1 = eigrb(1:ngb,ia)*qgbt(1:ngb,1)
fg2 = eigrb(1:ngb,ia)*qgbt(1:ngb,2)
CALL fft_oned2box( qv, fg1, fg2 )
!
!
CALL invfft( qv,dfftb,ia)
!
! qv is now the US augmentation charge for atomic species is
! and atom ia: real(qv)=spin up, imag(qv)=spin down
!
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: up g-space = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: up r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
WRITE( stdout,'(a,f12.8)') ' rhov: dw g-space = ', &
& omegab*DBLE(qgbt(1,2))
WRITE( stdout,'(a,f12.8)') ' rhov: dw r-sp = ', &
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
!
! add qv(r) to v(r), in real space on the dense grid
!
!
! qv is now the US augmentation charge for atomic species is
! and atom ia: real(qv)=spin up, imag(qv)=spin down
!
! add qv(r) to v(r), in real space on the dense grid
!
CALL box2grid(irb(:,ia),qv,v)
END IF
END IF
END DO
!
DEALLOCATE(qgbt)
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!$omp barrier
!$omp master
CALL mp_sum( v, inter_bgrp_comm )
!$omp end master
!$omp barrier
!$omp do
DO ir=1,dfftp%nnr
rhor(ir,isup)=rhor(ir,isup)+DBLE(v(ir))
rhor(ir,isdw)=rhor(ir,isdw)+AIMAG(v(ir))
END DO
!$omp end do
!$omp end parallel
!
IF( iverbosity > 1 ) THEN
ca = SUM(v)
@ -1078,10 +1104,6 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
& ' rhov: n_v(g=0) up = ',omega*DBLE (rhog(1,isup)), &
& ' rhov: n_v(g=0) down = ',omega*DBLE(rhog(1,isdw))
END IF
DEALLOCATE(qgbt)
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!
ENDIF
@ -1092,4 +1114,62 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
1000 CONTINUE
!
RETURN
CONTAINS
SUBROUTINE print_rhov()
IF( nspin == 1 ) THEN
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom g-sp = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: 1-atom r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
ELSE
IF( iverbosity > 1 ) THEN
ca = SUM(qv)
WRITE( stdout,'(a,f12.8)') ' rhov: up g-space = ', &
& omegab*DBLE(qgbt(1,1))
WRITE( stdout,'(a,f12.8)') ' rhov: up r-sp = ', &
& omegab*DBLE(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
WRITE( stdout,'(a,f12.8)') ' rhov: dw g-space = ', &
& omegab*DBLE(qgbt(1,2))
WRITE( stdout,'(a,f12.8)') ' rhov: dw r-sp = ', &
& omegab*AIMAG(ca)/(dfftb%nr1*dfftb%nr2*dfftb%nr3)
ENDIF
ENDIF
END SUBROUTINE
END SUBROUTINE rhov
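!
! Aside (a stand-alone sketch with a hypothetical program name): the
! thread round-robin used in rhov above. Each OpenMP thread takes every
! ntids-th entry of the precomputed atom list iabox(1:nabox), so the
! expensive per-atom box FFTs are statically interleaved across threads.
!
PROGRAM roundrobin_demo
   USE omp_lib
   IMPLICIT NONE
   INTEGER, PARAMETER :: nabox = 10
   INTEGER :: iabox(nabox), iia, mytid, ntids
   iabox = [( iia, iia = 1, nabox )]   ! stand-in for the list built by initbox
   !$omp parallel private(iia, mytid, ntids)
   mytid = omp_get_thread_num()
   ntids = omp_get_num_threads()
   DO iia = 1, nabox
      IF ( MOD( iia - 1, ntids ) == mytid ) THEN
         PRINT *, 'thread', mytid, 'handles atom', iabox(iia)
      END IF
   END DO
   !$omp end parallel
END PROGRAM roundrobin_demo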
SUBROUTINE rhoofr_host &
( nfi, c_bgrp, irb, eigrb, bec_bgrp, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, &
enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
#if defined (__CUDA)
USE cudafor
#endif
USE cp_interfaces
IMPLICIT NONE
INTEGER nfi
COMPLEX(DP) c_bgrp( :, : )
INTEGER irb( :, : )
COMPLEX(DP) eigrb( :, : )
REAL(DP) bec_bgrp(:,:)
REAL(DP) dbec(:,:,:,:)
REAL(DP) rhovan(:, :, : )
REAL(DP) rhor(:,:)
REAL(DP) drhor(:,:,:,:)
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
COMPLEX(DP), ALLOCATABLE DEVICEATTR :: c(:,:)
ALLOCATE( c, SOURCE=c_bgrp )
CALL rhoofr(nfi, c_bgrp, c, bec_bgrp, dbec, rhovan, rhor, &
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
DEALLOCATE( c )
END SUBROUTINE rhoofr_host
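!
! Aside: rhoofr_host leans on sourced allocation, ALLOCATE( c, SOURCE=c_bgrp ),
! which under CUDA Fortran sizes the device array and performs the
! host-to-device copy in one statement; in a host-only build the same line
! is a plain host copy. A minimal sketch (the name "mirror_demo" is
! hypothetical):
!
SUBROUTINE mirror_demo( a_host )
   USE kinds, ONLY: DP
   IMPLICIT NONE
   REAL(DP) :: a_host( :, : )
   REAL(DP), ALLOCATABLE DEVICEATTR :: a_d( :, : )
   ALLOCATE( a_d, SOURCE = a_host )   ! allocate + copy (host-to-device iff __CUDA)
   ! ... hand a_d to device code here ...
   DEALLOCATE( a_d )
END SUBROUTINE mirror_demo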

View File

@ -7,6 +7,12 @@
!
! written by Carlo Cavazzoni
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
!=----------------------------------------------------------------------------=!
MODULE cp_interfaces
!=----------------------------------------------------------------------------=!
@ -44,7 +50,6 @@
PUBLIC :: packgam
PUBLIC :: ortho
PUBLIC :: ortho_gamma
PUBLIC :: nlfh
PUBLIC :: nlfl_bgrp
@ -96,7 +101,6 @@
PUBLIC :: dotcsc
PUBLIC :: nlsm1
PUBLIC :: nlsm2_bgrp
PUBLIC :: calbec_bgrp
PUBLIC :: ennl
PUBLIC :: calrhovan
PUBLIC :: calbec
@ -104,8 +108,8 @@
PUBLIC :: dennl
PUBLIC :: nlfq_bgrp
PUBLIC :: collect_bec
PUBLIC :: beta_eigr
PUBLIC :: nlsm1us
PUBLIC :: dbeta_eigr
! ------------------------------------ !
@ -126,6 +130,22 @@
INTEGER, INTENT(IN) :: n, nspin
REAL(DP), OPTIONAL :: v1( ldv, * )
END SUBROUTINE dforce_x
#if defined (__CUDA)
SUBROUTINE dforce_gpu_x( i, bec, vkb, c, df, da, v, ldv, ispin, f, n, nspin )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: i
REAL(DP) :: bec(:,:)
COMPLEX(DP), DEVICE :: vkb(:,:)
COMPLEX(DP), DEVICE :: c(:,:)
COMPLEX(DP) :: df(:), da(:)
INTEGER, INTENT(IN) :: ldv
REAL(DP), DEVICE :: v( :, : )
INTEGER :: ispin( : )
REAL(DP) :: f( : )
INTEGER, INTENT(IN) :: n, nspin
END SUBROUTINE dforce_gpu_x
#endif
END INTERFACE
@ -225,6 +245,26 @@
INTERFACE rhoofr
SUBROUTINE rhoofr_cp &
( nfi, c_bgrp, c_d, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER nfi
COMPLEX(DP) :: c_bgrp( :, : )
COMPLEX(DP) DEVICEATTR :: c_d( :, : )
REAL(DP) bec(:,:)
REAL(DP) dbec(:,:,:,:)
REAL(DP) rhovan(:, :, : )
REAL(DP) rhor(:,:)
REAL(DP) drhor(:,:,:,:)
COMPLEX(DP) rhog( :, : )
COMPLEX(DP) drhog( :, :, :, : )
REAL(DP) rhos(:,:)
REAL(DP) enl, ekin
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
END SUBROUTINE rhoofr_cp
SUBROUTINE rhoofr_host &
( nfi, c_bgrp, irb, eigrb, bec, dbec, rhovan, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin, tstress, ndwwf )
USE kinds, ONLY: DP
IMPLICIT NONE
@ -244,7 +284,7 @@
REAL(DP) denl(3,3), dekin(6)
LOGICAL, OPTIONAL, INTENT(IN) :: tstress
INTEGER, OPTIONAL, INTENT(IN) :: ndwwf
END SUBROUTINE rhoofr_cp
END SUBROUTINE rhoofr_host
END INTERFACE
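!
! Aside (a minimal sketch with hypothetical names): rhoofr is a generic
! name whose two specifics differ in their argument lists, so the compiler
! selects the device-aware entry (rhoofr_cp, with c_d) or the host wrapper
! (rhoofr_host) from the call site alone. The same pattern in isolation:
!
MODULE overload_demo
   USE kinds, ONLY: DP
   IMPLICIT NONE
   INTERFACE rho_demo
      MODULE PROCEDURE rho_demo_host, rho_demo_dev
   END INTERFACE
CONTAINS
   SUBROUTINE rho_demo_host( c )
      COMPLEX(DP) :: c( :, : )                ! resolved by rho_demo(c)
   END SUBROUTINE rho_demo_host
   SUBROUTINE rho_demo_dev( c, c_d )
      COMPLEX(DP) :: c( :, : )
      COMPLEX(DP) DEVICEATTR :: c_d( :, : )   ! resolved by rho_demo(c, c_d)
   END SUBROUTINE rho_demo_dev
END MODULE overload_demo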
INTERFACE checkrho
@ -309,7 +349,7 @@
INTERFACE runcp_uspp
SUBROUTINE runcp_uspp_x &
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp, fromscra, restart )
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
USE kinds, ONLY: DP
IMPLICIT NONE
integer, intent(in) :: nfi
@ -318,6 +358,7 @@
real(DP) :: rhos(:,:)
real(DP) :: bec_bgrp(:,:)
complex(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
complex(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
logical, optional, intent(in) :: fromscra
logical, optional, intent(in) :: restart
END SUBROUTINE
@ -377,36 +418,19 @@
INTERFACE ortho
SUBROUTINE ortho_x &
( eigr, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
( betae, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: idesc( :, : )
COMPLEX(DP) :: eigr( :, : )
COMPLEX(DP) :: betae( :, : )
COMPLEX(DP) :: cp_bgrp( :, : ), phi_bgrp( :, : )
REAL(DP) :: x0( :, :, : ), diff, ccc
INTEGER :: iter
REAL(DP) :: bephi(:,:)
REAL(DP) :: becp_bgrp(:,:)
END SUBROUTINE
END INTERFACE
INTERFACE ortho_gamma
SUBROUTINE ortho_gamma_x &
( iopt, cp, ngwx, phi, becp_dist, qbecp, nkbx, bephi, qbephi, &
x0, nx0, idesc, diff, iter, n, nss, istart )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: iopt
INTEGER, INTENT(IN) :: ngwx, nkbx, nx0
INTEGER, INTENT(IN) :: n, nss, istart
COMPLEX(DP) :: phi( ngwx, n ), cp( ngwx, n )
REAL(DP) :: bephi( :, : )
REAL(DP) :: becp_dist(:,:)
REAL(DP) :: qbephi( :, : ), qbecp( :, : )
REAL(DP) :: x0( nx0, nx0 )
INTEGER, INTENT(IN) :: idesc( : )
INTEGER, INTENT(OUT) :: iter
REAL(DP), INTENT(OUT) :: diff
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: becp_bgrp, bephi, cp_bgrp, phi_bgrp, betae
#endif
END SUBROUTINE
END INTERFACE
@ -748,15 +772,14 @@
INTERFACE move_electrons
SUBROUTINE move_electrons_x( &
nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb, &
nfi, tprint, tfirst, tlast, b1, b2, b3, fion, enthal, enb, &
& enbi, fccc, ccc, dt2bye, stress,l_cprestart )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nfi
LOGICAL, INTENT(IN) :: tfirst, tlast
LOGICAL, INTENT(IN) :: tprint, tfirst, tlast
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
REAL(DP) :: fion(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
REAL(DP), INTENT(IN) :: dt2bye
REAL(DP) :: fccc, ccc
REAL(DP) :: enb, enbi
@ -820,6 +843,13 @@
IMPLICIT NONE
COMPLEX(DP) :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_expand_x
#if defined (__CUDA)
SUBROUTINE c_bgrp_expand_gpu_x( c_bgrp )
USE kinds, ONLY: DP
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_expand_gpu_x
#endif
END INTERFACE
INTERFACE c_bgrp_pack
SUBROUTINE c_bgrp_pack_x( c_bgrp )
@ -827,6 +857,13 @@
IMPLICIT NONE
COMPLEX(DP) :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_pack_x
#if defined (__CUDA)
SUBROUTINE c_bgrp_pack_gpu_x( c_bgrp )
USE kinds, ONLY: DP
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
END SUBROUTINE c_bgrp_pack_gpu_x
#endif
END INTERFACE
INTERFACE vofrho
@ -856,6 +893,16 @@
REAL(DP), INTENT(IN) :: f( : )
REAL(DP) :: enkin_x
END FUNCTION enkin_x
#if defined (__CUDA)
FUNCTION enkin_gpu_x( c, f, n )
USE kinds, ONLY: dp
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
REAL(DP), DEVICE, INTENT(IN) :: f( : )
REAL(DP) :: enkin_gpu_x
END FUNCTION enkin_gpu_x
#endif
END INTERFACE
INTERFACE newinit
@ -889,44 +936,45 @@
END INTERFACE
INTERFACE dotcsc
SUBROUTINE dotcsc_x( eigr, cp, ngw, n )
SUBROUTINE dotcsc_x( betae, cp, ngw, n )
USE kinds, ONLY: dp
IMPLICIT NONE
INTEGER, INTENT(IN) :: ngw, n
COMPLEX(DP), INTENT(IN) :: eigr(:,:), cp(:,:)
COMPLEX(DP), INTENT(IN) :: cp(:,:)
COMPLEX(DP), INTENT(INOUT) :: betae(:,:)
END SUBROUTINE dotcsc_x
END INTERFACE
INTERFACE nlsm1
SUBROUTINE nlsm1_x ( n, nspmn, nspmx, eigr, c, becp, pptype_ )
SUBROUTINE nlsm1_x ( n, betae, c, becp, pptype_ )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: n, nspmn, nspmx
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c( :, : )
INTEGER, INTENT(IN) :: n
COMPLEX(DP), INTENT(IN) :: c( :, : )
COMPLEX(DP), INTENT(INOUT) :: betae( :, : )
REAL(DP), INTENT(OUT) :: becp( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE nlsm1_x
END INTERFACE
INTERFACE nlsm2_bgrp
SUBROUTINE nlsm2_bgrp_x( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
SUBROUTINE nlsm2_bgrp_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c_bgrp( :, : )
COMPLEX(DP), INTENT(IN) :: betae( :, : ), c_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
END SUBROUTINE nlsm2_bgrp_x
END INTERFACE
INTERFACE calbec_bgrp
SUBROUTINE calbec_bgrp_x ( nspmn, nspmx, eigr, c_bgrp, bec_bgrp, pptype_ )
#if defined (__CUDA)
SUBROUTINE nlsm2_bgrp_gpu_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nspmn, nspmx
COMPLEX(DP), INTENT(IN) :: eigr( :, : ), c_bgrp( :, : )
REAL(DP), INTENT(OUT) :: bec_bgrp( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE calbec_bgrp_x
INTEGER, INTENT(IN) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
COMPLEX(DP), INTENT(IN), DEVICE :: betae( :, : )
COMPLEX(DP), INTENT(IN), DEVICE :: c_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
END SUBROUTINE nlsm2_bgrp_gpu_x
#endif
END INTERFACE
INTERFACE ennl
@ -950,12 +998,13 @@
END INTERFACE
INTERFACE calbec
SUBROUTINE calbec_x( nspmn, nspmx, eigr, c, bec, pptype_ )
SUBROUTINE calbec_x( n, betae, c, bec, pptype_ )
USE kinds, ONLY: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nspmn, nspmx
REAL(DP), INTENT(OUT) :: bec( :, : )
COMPLEX(DP), INTENT(IN) :: c( :, : ), eigr( :, : )
INTEGER, INTENT(IN) :: n
REAL(DP), INTENT(OUT) :: bec( :, : )
COMPLEX(DP), INTENT(IN) :: c( :, : )
COMPLEX(DP), INTENT(INOUT) :: betae( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE calbec_x
END INTERFACE
@ -983,10 +1032,11 @@
END INTERFACE
INTERFACE nlfq_bgrp
SUBROUTINE nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
SUBROUTINE nlfq_bgrp_x( c_bgrp, betae, bec_bgrp, becdr_bgrp, fion )
USE kinds, ONLY: DP
IMPLICIT NONE
COMPLEX(DP), INTENT(IN) :: c_bgrp( :, : ), eigr( :, : )
COMPLEX(DP), INTENT(IN) DEVICEATTR :: c_bgrp( :, : )
COMPLEX(DP), INTENT(IN) DEVICEATTR :: betae( :, : )
REAL(DP), INTENT(IN) :: bec_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
REAL(DP), INTENT(OUT) :: fion( :, : )
@ -1003,28 +1053,25 @@
END SUBROUTINE collect_bec_x
END INTERFACE
INTERFACE beta_eigr
SUBROUTINE beta_eigr_x ( beigr, nspmn, nspmx, eigr, pptype_ )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: nspmn, nspmx
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
COMPLEX(DP), INTENT(OUT) :: beigr( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
END SUBROUTINE beta_eigr_x
END INTERFACE
INTERFACE nlsm1us
SUBROUTINE nlsm1us_x ( n, beigr, c, becp )
SUBROUTINE nlsm1us_x ( n, betae, c, becp )
USE kinds, ONLY : DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: n
COMPLEX(DP), INTENT(IN) :: beigr( :, : )
COMPLEX(DP), INTENT(IN) :: c( :, : )
REAL(DP), INTENT(OUT) :: becp( :, : )
COMPLEX(DP) DEVICEATTR , INTENT(INOUT) :: betae( :, : )
COMPLEX(DP) DEVICEATTR , INTENT(IN) :: c( :, : )
REAL(DP) DEVICEATTR , INTENT(OUT) :: becp( :, : )
END SUBROUTINE nlsm1us_x
END INTERFACE
INTERFACE dbeta_eigr
SUBROUTINE dbeta_eigr_x( dbeigr, eigr )
USE kinds, ONLY : DP
IMPLICIT NONE
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
COMPLEX(DP), INTENT(OUT) :: dbeigr( :, :, :, :)
END SUBROUTINE dbeta_eigr_x
END INTERFACE
!=----------------------------------------------------------------------------=!

View File

@ -1092,6 +1092,7 @@ subroutine nlinit
use uspp, ONLY : aainit, beta, qq_nt, dvan, nhtol, nhtolm, indv,&
dbeta
use uspp_param, ONLY : upf, lmaxq, nbetam, lmaxkb, nhm, nh, ish
use uspp_gpum, ONLY : using_qq_nt, using_qq_nt_d, qq_nt_d
use atom, ONLY : rgrid
use qgb_mod, ONLY : qgb, dqgb
use smallbox_gvec, ONLY : ngb
@ -1503,7 +1504,7 @@ end subroutine dylmr2_
!-----------------------------------------------------------------------
SUBROUTINE dotcsc_x( eigr, cp, ngw, n )
SUBROUTINE dotcsc_x( betae, cp, ngw, n )
!-----------------------------------------------------------------------
!
USE kinds, ONLY: DP
@ -1514,13 +1515,14 @@ end subroutine dylmr2_
USE uspp_param, ONLY: nh, ish, upf
USE mp, ONLY: mp_sum
USE mp_global, ONLY: intra_bgrp_comm, nbgrp, inter_bgrp_comm
USE cp_interfaces, ONLY: nlsm1
USE cp_interfaces, ONLY: calbec
USE electrons_base, ONLY: ispin, ispin_bgrp, nbspx_bgrp, ibgrp_g2l, iupdwn, nupdwn, nbspx
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ngw, n
COMPLEX(DP), INTENT(IN) :: eigr(:,:), cp(:,:)
COMPLEX(DP), INTENT(IN) :: cp(:,:)
COMPLEX(DP), INTENT(INOUT) :: betae(:,:)
! local variables
REAL(DP) rsum, csc(n) ! automatic array
COMPLEX(DP) temp(ngw) ! automatic array
@ -1536,7 +1538,7 @@ end subroutine dylmr2_
! < beta | phi > is real. only the i lowest:
!
CALL nlsm1( nbspx_bgrp, 1, nsp, eigr, cp, becp, 2 )
CALL calbec( nbspx_bgrp, betae, cp, becp, 2 )
nnn = MIN( 12, n )
@ -1640,30 +1642,73 @@ end subroutine dylmr2_
! local
INTEGER :: ig, i
REAL(DP) :: sk(n) ! automatic array
REAL(DP) :: sk, rsum
!
sk = 0.0d0
!$omp parallel do reduction(+:sk) default(none) &
!$omp shared(c,g2kin,gstart,ngw,n,f) private(i,ig,rsum)
DO i=1,n
sk(i)=0.0d0
rsum = 0.0d0
DO ig=gstart,ngw
sk(i)=sk(i)+DBLE(CONJG(c(ig,i))*c(ig,i))*g2kin(ig)
rsum = rsum + DBLE(CONJG(c(ig,i))*c(ig,i)) * g2kin(ig)
END DO
sk = sk + f(i) * rsum
END DO
!$omp end parallel do
CALL mp_sum( sk(1:n), intra_bgrp_comm )
enkin_x=0.0d0
DO i=1,n
enkin_x=enkin_x+f(i)*sk(i)
END DO
CALL mp_sum( sk, intra_bgrp_comm )
! ... reciprocal-space vectors are in units of alat/(2 pi) so a
! ... multiplicative factor (2 pi/alat)**2 is required
enkin_x = enkin_x * tpiba2
enkin_x = tpiba2 * sk
!
RETURN
END FUNCTION enkin_x
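!
! For reference, the quantity reduced into the scalar sk above is (a
! sketch, assuming g2kin(ig) carries the kinetic factor in units of
! (2*pi/alat)**2, which the final tpiba2 rescaling converts):
!
!   E_kin = tpiba2 * \sum_n f_n \sum_{ig >= gstart} g2kin(ig) * |c(ig,n)|**2
!
! i.e. exactly the double loop now expressed as an OpenMP reduction.
!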
#if defined (__CUDA)
!-----------------------------------------------------------------------
FUNCTION enkin_gpu_x( c, f, n )
!-----------------------------------------------------------------------
!
USE kinds, ONLY: DP
USE constants, ONLY: pi, fpi
USE gvecw, ONLY: ngw
USE gvect, ONLY: gstart
USE gvecw, ONLY: g2kin_d
USE mp, ONLY: mp_sum
USE mp_global, ONLY: intra_bgrp_comm
USE cell_base, ONLY: tpiba2
USE cudafor
IMPLICIT NONE
REAL(DP) :: enkin_gpu_x
INTEGER, INTENT(IN) :: n
COMPLEX(DP), DEVICE, INTENT(IN) :: c( :, : )
REAL(DP), DEVICE, INTENT(IN) :: f( : )
!
! local
INTEGER :: ig, i
REAL(DP) :: sk
!
sk=0.0d0
!$cuf kernel do(2) <<<*,*>>>
DO i=1,n
DO ig=gstart,ngw
sk = sk + f(i) * DBLE(CONJG(c(ig,i))*c(ig,i)) * g2kin_d(ig)
END DO
END DO
CALL mp_sum( sk, intra_bgrp_comm )
enkin_gpu_x = tpiba2 * sk
!
RETURN
END FUNCTION enkin_gpu_x
#endif
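!
! Aside (a stand-alone sketch with a hypothetical program name; assumes
! the NVIDIA CUDA Fortran compiler): scalar accumulations inside a
! !$cuf kernel loop, as in enkin_gpu_x above, are compiled into
! device-side reductions automatically.
!
PROGRAM cuf_reduction_demo
   USE cudafor
   IMPLICIT NONE
   INTEGER, PARAMETER :: n = 1024
   REAL(8), DEVICE :: x_d(n)
   REAL(8) :: s
   INTEGER :: i
   x_d = 1.0d0
   s = 0.0d0
   !$cuf kernel do(1) <<<*,*>>>
   DO i = 1, n
      s = s + x_d(i)   ! turned into a device reduction on the host scalar s
   END DO
   PRINT *, s          ! 1024.0
END PROGRAM cuf_reduction_demo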
!-------------------------------------------------------------------------
SUBROUTINE nlfl_bgrp_x( bec_bgrp, becdr_bgrp, lambda, idesc, fion )
!-----------------------------------------------------------------------
@ -1932,25 +1977,34 @@ end subroutine dylmr2_
USE gvecw, ONLY : ngw
USE uspp, ONLY : beta, nhtol, indv_ijkb0
USE uspp_param, ONLY : nh, upf
USE gvect, ONLY : gstart
!
IMPLICIT NONE
COMPLEX(DP), INTENT(IN) :: eigr( :, : )
COMPLEX(DP), INTENT(OUT) :: betae( :, : )
!
INTEGER :: is, iv, ia, inl, ig, isa
COMPLEX(DP), PARAMETER, DIMENSION(4) :: cfact = & ! (l == 0), (l == 1), (l == 2), (l == 3)
[( 1.0_dp , 0.0_dp ), ( 0.0_dp , -1.0_dp ), ( -1.0_dp , 0.0_dp ), ( 0.0_dp , 1.0_dp )]
COMPLEX(DP) :: ci
!
CALL start_clock( 'prefor' )
!$omp parallel do default(shared) private(ia,is,iv,ci,inl,ig)
DO ia=1,nat
is=ityp(ia)
DO iv=1,nh(is)
ci=(0.0d0,-1.0d0)**nhtol(iv,is)
ci=cfact( nhtol(iv,is) + 1 )
inl = indv_ijkb0(ia) + iv
DO ig=1,ngw
betae(ig,inl)=ci*beta(ig,iv,is)*eigr(ig,ia)
END DO
!beigr(1,inl)=betae(1,inl)
!DO ig=gstart,ngw
! beigr(ig,inl)=2.0d0 * betae(ig,inl)
!END DO
END DO
END DO
!$omp end parallel do
CALL stop_clock( 'prefor' )
!
RETURN
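!
! Worked check of the cfact table above: (-i)**l cycles with period 4,
!
!   (-i)**0 = 1,  (-i)**1 = -i,  (-i)**2 = -1,  (-i)**3 = i,
!
! so cfact(nhtol(iv,is)+1) reproduces ci = (0.0d0,-1.0d0)**nhtol(iv,is)
! without evaluating a complex power inside the loop.
!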

View File

@ -24,6 +24,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
USE core, ONLY : rhoc
USE uspp_param, ONLY : nhm, nh, ish
USE uspp, ONLY : nkb, vkb, becsum, deeq, okvan, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE energies, ONLY : eht, epseu, exc, etot, eself, enl, &
ekin, atot, entropy, egrand, enthal, &
ekincm, print_energies
@ -80,7 +81,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
electrons_nosevel, electrons_noseupd
USE pres_ai_mod, ONLY : P_ext, P_in, P_fin, pvar, volclu, &
surfclu, Surf_t, abivol, abisur
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, cm_d, phi, c0_d
USE wannier_module, ONLY : allocate_wannier
USE cp_interfaces, ONLY : printout_new, move_electrons, newinit
USE cell_nose, ONLY : xnhh0, xnhhm, xnhhp, vnhh, temph, &
@ -96,14 +97,14 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
ema0bg, sfac, eigr, iprint_stdout, &
irb, taub, eigrb, rhog, rhos, &
rhor, bephi, becp_bgrp, nfi, idesc, &
drhor, drhog, bec_bgrp, dbec
drhor, drhog, bec_bgrp, dbec, bec_d, iabox, nabox
USE autopilot, ONLY : event_step, event_index, &
max_event_step, restart_p
USE cell_base, ONLY : s_to_r, r_to_s
USE wannier_subroutines, ONLY : wannier_startup, wf_closing_options, &
ef_enthalpy
USE cp_interfaces, ONLY : writefile, eigs, strucf, phfacs
USE cp_interfaces, ONLY : ortho, elec_fakekine, calbec_bgrp, calbec, caldbec_bgrp
USE cp_interfaces, ONLY : ortho, elec_fakekine, calbec, caldbec_bgrp
USE constraints_module, ONLY : check_constraint, remove_constr_force
USE cp_autopilot, ONLY : pilot
USE ions_nose, ONLY : ions_nose_allocate, ions_nose_shiftvar
@ -117,6 +118,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
USE london_module, ONLY : energy_london, force_london, stres_london
USE input_parameters, ONLY : tcpbo
USE xc_lib, ONLY : xclib_dft_is, start_exx, exx_is_active
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
!
@ -266,7 +268,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
IF ( (okvan .or. nlcc_any ) .AND. (tfor .OR. thdyn .OR. tfirst) ) THEN
!
CALL initbox( tau0, alat, at, ainv, taub, irb )
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
!
CALL phbox( taub, iverbosity, eigrb )
!
@ -288,13 +290,15 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
END IF
!
! ... why this call ??? from Paolo Umari
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
!phi(:,iupdwn(2):nbsp) = phi(:,1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi, [1, ubound(phi)], 1, [1, nbsp])
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
IF ( tefield .or. tefield2 ) THEN
!
CALL calbec( 1, nsp, eigr, c0_bgrp, bec_bgrp ) ! ATTENTION
!
END IF
CALL dev_memcpy( c0_d, c0_bgrp )
!
! Autopilot (Dynamic Rules) Implementation
!
@ -322,13 +326,6 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
!=======================================================================
!
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
! ... fake electronic kinetic energy
!
IF ( .NOT. tcg ) THEN
@ -339,9 +336,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
END IF
!
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3), &
fion, c0_bgrp, cm_bgrp, phi_bgrp, &
enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
CALL move_electrons( nfi, tprint, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3), &
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress, .false. )
!
IF (lda_plus_u) fion = fion + forceh
!
@ -537,6 +533,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
! ... prefor calculates vkb
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
END IF
!
@ -553,13 +550,17 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
IF ( tortho ) THEN
!
CALL ortho( eigr, cm_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#if defined (__CUDA)
CALL ortho( vkb_d, cm_d, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#else
CALL ortho( vkb, cm_bgrp, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#endif
!
ELSE
!
CALL gram_bgrp( vkb, bec_bgrp, nkb, cm_bgrp, ngw )
!
IF ( iverbosity > 2 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
IF ( iverbosity > 2 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp_bgrp )
!
END IF
!
@ -568,23 +569,31 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
IF ( iverbosity > 1 ) CALL laxlib_print_matrix( lambda, idesc, nbsp, 9, nudx, 1.D0, ionode, stdout )
!
IF ( tortho ) THEN
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, cm_bgrp, idesc )
#if defined (__CUDA)
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_d, cm_d, idesc )
CALL dev_memcpy( bec_bgrp, bec_d )
CALL dev_memcpy( cm_bgrp, cm_d )
#else
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_bgrp, cm_bgrp, idesc )
#endif
END IF
!
IF( force_pairing ) THEN
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
phi_bgrp(:,iupdwn(2):nbsp) = phi_bgrp(:,1:nupdwn(2))
c0_bgrp(:,iupdwn(2):nbsp) = c0_bgrp(:,1:nupdwn(2))
cm_bgrp(:,iupdwn(2):nbsp) = cm_bgrp(:,1:nupdwn(2))
!phi(:,iupdwn(2):nbsp) = phi(:,1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi, [1, ubound(phi)], 1, [1, nbsp])
lambda(:,:, 2) = lambda(:,:, 1)
ENDIF
!
CALL calbec_bgrp( 1, nsp, eigr, cm_bgrp, bec_bgrp, 1 )
! the following compute only on NC pseudo components
CALL calbec( nbsp_bgrp, vkb, cm_bgrp, bec_bgrp, 1 )
!
IF ( tpre ) THEN
CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
END IF
!
IF ( iverbosity > 1 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp_bgrp )
IF ( iverbosity > 1 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp_bgrp )
!
END IF
!
@ -801,7 +810,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
! ... restart with CP
!
IF ( okvan .or. nlcc_any ) THEN
CALL initbox( tau0, alat, at, ainv, taub, irb )
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
CALL phbox( taub, iverbosity, eigrb )
END IF
CALL r_to_s( tau0, taus, nat, ainv )
@ -816,9 +825,8 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
lambdam = lambda
!
CALL move_electrons( nfi, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3),&
fion, c0_bgrp, cm_bgrp, phi_bgrp, enthal, enb,&
enbi, fccc, ccc, dt2bye, stress,.true. )
CALL move_electrons( nfi, tprint, tfirst, tlast, bg(:,1), bg(:,2), bg(:,3),&
fion, enthal, enb, enbi, fccc, ccc, dt2bye, stress,.true. )
!
END IF
!
@ -1094,6 +1102,7 @@ SUBROUTINE terminate_run()
CALL print_clock( 'nlfq' )
CALL print_clock( 'nlsm1' )
CALL print_clock( 'nlsm2' )
CALL print_clock( 'nlsm1us' )
CALL print_clock( 'fft' )
CALL print_clock( 'ffts' )
CALL print_clock( 'fftw' )
@ -1114,6 +1123,7 @@ SUBROUTINE terminate_run()
CALL print_clock( 'new_ns' )
CALL print_clock( 'strucf' )
CALL print_clock( 'calbec' )
CALL print_clock( 'exch_corr' )
!==============================================================
IF (ts_vdw) THEN
WRITE( stdout, '(/5x,"Called by tsvdw:")' )
View File
@ -15,11 +15,13 @@ SUBROUTINE deallocate_modules_var()
!
USE core, ONLY : deallocate_core
USE uspp, ONLY : deallocate_uspp
USE uspp_gpum, ONLY : deallocate_uspp_gpu
USE electrons_base, ONLY : deallocate_elct
USE efield_module, ONLY : deallocate_efield
USE ensemble_dft, ONLY : deallocate_ensemble_dft
USE cg_module, ONLY : deallocate_cg
USE gvect, ONLY : deallocate_gvect
USE gvect_gpum, ONLY : deallocate_gvect_gpu
USE gvecw, ONLY : deallocate_gvecw
USE smallbox_gvec, ONLY : deallocate_smallbox_gvec
USE local_pseudo, ONLY : deallocate_local_pseudo
@ -59,8 +61,10 @@ SUBROUTINE deallocate_modules_var()
CALL deallocate_cg( )
CALL deallocate_core()
CALL deallocate_uspp()
CALL deallocate_uspp_gpu()
CALL deallocate_gvect(.TRUE.) ! the value .TRUE. is hard-coded in init.f90:195;
! here it prevents a double free of the gg variable.
CALL deallocate_gvect_gpu()
CALL deallocate_gvecw()
CALL deallocate_smallbox_gvec( )
CALL deallocate_local_pseudo()
View File
@ -63,6 +63,8 @@
complex(DP), ALLOCATABLE :: self_rhog( :,: )
LOGICAL :: ttsic
real(DP) :: detmp(3,3)
CALL start_clock( 'exch_corr' )
!
! filling of gradr with the gradient of rho using fft's
!
@ -246,6 +248,8 @@
IF( ALLOCATED( gradr ) ) DEALLOCATE( gradr )
CALL stop_clock( 'exch_corr' )
5555 format(1x,f12.5,1x,f12.5,1x,f12.5/ &
& 1x,f12.5,1x,f12.5,1x,f12.5/ &
& 1x,f12.5,1x,f12.5,1x,f12.5//)
@ -511,29 +515,29 @@ subroutine exch_corr_cp(nnr,nspin,grhor,rhor,etxc)
! h contains D(rho*Exc)/D(|grad rho|) * (grad rho) / |grad rho|
!
!$omp parallel default(none), shared(nnr,grhor,h), private(ipol,k)
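! collapse(2) fuses the ipol and k loops into a single iteration
! space, so all 3*nnr iterations are shared among the threads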
!$omp do collapse(2)
do ipol = 1, 3
!$omp do
do k = 1, nnr
grhor (ipol, k, 1) = h (k, 1, 1) * grhor (ipol, k, 1)
enddo
!$omp end do
end do
!$omp end do
!$omp end parallel
!
!
else
!
!$omp parallel default(none), shared(nnr,grhor,h), private(ipol,k,grup,grdw)
!$omp do collapse(2)
do ipol = 1, 3
!$omp do
do k = 1, nnr
grup = grhor (ipol, k, 1)
grdw = grhor (ipol, k, 2)
grhor (ipol, k, 1) = h (k, 1, 1) * grup + h (k, 1, 2) * grdw
grhor (ipol, k, 2) = h (k, 2, 2) * grdw + h (k, 2, 1) * grup
enddo
!$omp end do
enddo
!$omp end do
!$omp end parallel
!
end if
View File
@ -80,21 +80,14 @@
!=======================================================================
nogrp_ = fftx_ntgrp(dffts)
IF( nogrp_ > 1 ) THEN
CALL errore('dforce','Task group not supported',1)
END IF
ALLOCATE( psi( dffts%nnr_tg ) )
!
#if defined(__MPI)
CALL c2psi_gamma_tg( dffts, psi, c, i, n )
CALL invfft('tgWave', psi, dffts)
#else
CALL c2psi_gamma( dffts, psi, c(:,i), c(:,i+1) )
!
CALL invfft( 'Wave', psi, dffts )
#endif
!
! the following avoids a potential out-of-bounds error
!
@ -213,11 +206,7 @@
!
END IF
!
#if defined(__MPI)
CALL fwfft( 'tgWave', psi, dffts )
#else
CALL fwfft( 'Wave', psi, dffts )
#endif
!
! note : the factor 0.5 appears
! in the kinetic energy because it is defined as 0.5*g**2
@ -343,3 +332,231 @@
!
RETURN
END SUBROUTINE dforce_x
#if defined (__CUDA)
!-------------------------------------------------------------------------
SUBROUTINE dforce_gpu_x ( i, bec, vkb, c, df, da, v, ldv, ispin, f, n, nspin )
!-----------------------------------------------------------------------
!computes: the generalized force df=cmplx(dfr,dfi,kind=DP) acting on the i-th
!          electron state at the Gamma point of the Brillouin zone
!          represented by the vector c=cmplx(cr,ci,kind=DP)
!
! d_n(g) = f_n { 0.5 g^2 c_n(g) + [vc_n](g) +
! sum_i,ij d^q_i,ij (-i)**l beta_i,i(g)
! e^-ig.r_i < beta_i,j | c_n >}
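!      here f_n is the occupation of state n and v the total local
!      potential; at the Gamma point two real-valued bands are packed
!      into one complex FFT array, psi = c_n + i*c_n+1, exploiting
!      c*(g) = c(-g) (see the packing loops below)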
!
USE parallel_include
USE kinds, ONLY: dp
USE control_flags, ONLY: iprint
USE uspp, ONLY: nhsa=>nkb, dvan, deeq, indv_ijkb0
USE uspp_param, ONLY: nhm, nh
USE constants, ONLY: pi, fpi
USE ions_base, ONLY: nsp, na, nat, ityp
USE gvecw, ONLY: ngw, g2kin_d
USE cell_base, ONLY: tpiba2
USE ensemble_dft, ONLY: tens
USE xc_lib, ONLY: xclib_dft_is, exx_is_active
USE fft_base, ONLY: dffts
USE fft_interfaces, ONLY: fwfft, invfft
USE mp_global, ONLY: me_bgrp
USE control_flags, ONLY: many_fft
USE fft_helper_subroutines
USE device_memcpy_m, ONLY : dev_memcpy
USE cudafor
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: i
REAL(DP) :: bec(:,:)
COMPLEX(DP), DEVICE :: vkb(:,:)
COMPLEX(DP), DEVICE :: c(:,:)
COMPLEX(DP) :: df(:), da(:)
INTEGER, INTENT(IN) :: ldv
REAL(DP), DEVICE :: v( :, : )
INTEGER :: ispin( : )
REAL(DP) :: f( : )
INTEGER, INTENT(IN) :: n, nspin
!
! local variables
!
INTEGER :: iv, jv, ia, is, iss1, iss2, ir, ig, inl, jnl
INTEGER :: igno, igrp, ierr, ii
INTEGER :: idx, ioff
REAL(DP) :: fi, fip, dd, dv
COMPLEX(DP) :: fp, fm
complex(DP), parameter :: ci=(0.0d0,1.0d0)
REAL(DP), ALLOCATABLE :: af( :, : ), aa( :, : )
REAL(DP), ALLOCATABLE, DEVICE :: af_d( :, : ), aa_d( :, : )
COMPLEX(DP), ALLOCATABLE, DEVICE :: psi(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: df_d(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: da_d(:)
INTEGER, DEVICE, POINTER :: nl_d(:), nlm_d(:)
!
CALL start_clock( 'dforce' )
!
IF(xclib_dft_is('hybrid').AND.exx_is_active()) THEN
CALL errore(' dforce ', ' dft_is_hybrid and exx_is_active NOT implemented ', 1 )
END IF
ALLOCATE( psi( dffts%nnr * many_fft ) )
ALLOCATE( df_d( SIZE( df ) ) )
ALLOCATE( da_d( SIZE( da ) ) )
!
psi = 0.0d0
nl_d => dffts%nl_d
nlm_d => dffts%nlm_d
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < n ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psi( nlm_d( ig ) + ioff) = CONJG( c( ig, ii ) ) + ci * conjg( c( ig, ii+1))
psi( nl_d( ig ) + ioff) = c( ig, ii ) + ci * c( ig, ii+1)
end do
ELSE IF( ii == n ) THEN
!$cuf kernel do(1)
do ig = 1, dffts%ngw
psi( nlm_d( ig ) + ioff) = CONJG( c( ig, ii ) )
psi( nl_d( ig ) + ioff) = c( ig, ii )
end do
END IF
ioff = ioff + dffts%nnr
END DO
!
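! batched back-transform: many_fft band pairs are brought to real
! space in a single FFT call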
CALL invfft( 'Wave', psi, dffts, many_fft )
!
ioff = 0
DO ii = i, i + 2 * many_fft - 1, 2
IF( ii < n ) THEN
iss1=ispin( ii )
iss2=ispin( ii + 1 )
!$cuf kernel do(1)
DO ir=1,dffts%nnr
psi(ir+ioff)=CMPLX( v(ir,iss1)* DBLE(psi(ir+ioff)), &
v(ir,iss2)*AIMAG(psi(ir+ioff)) ,kind=DP)
END DO
ELSE IF( ii == n ) THEN
iss1=ispin( ii )
iss2=iss1
!$cuf kernel do(1)
DO ir=1,dffts%nnr
psi(ir+ioff)=CMPLX( v(ir,iss1)* DBLE(psi(ir+ioff)), &
v(ir,iss2)*AIMAG(psi(ir+ioff)) ,kind=DP)
END DO
END IF
ioff = ioff + dffts%nnr
END DO
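! the local potential has been applied in real space; transform the
! products back to reciprocal space, again in a single batched call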
CALL fwfft( 'Wave', psi, dffts, many_fft )
igno = 0
ioff = 0
DO idx = 1, 2 * many_fft, 2
IF( idx + i - 1 <= n ) THEN
if (tens) then
fi = -0.5d0
fip = -0.5d0
else
fi = -0.5d0*f(i+idx-1)
fip = -0.5d0*f(i+idx)
endif
CALL fftx_psi2c_gamma_gpu( dffts, psi( 1+ioff : ioff+dffts%nnr ), df_d(1+igno:igno+ngw), da_d(1+igno:igno+ngw))
!$cuf kernel do(1)
DO ig=1,ngw
df_d(ig+igno)= fi*(tpiba2*g2kin_d(ig)* c(ig,idx+i-1)+df_d(ig+igno))
da_d(ig+igno)=fip*(tpiba2*g2kin_d(ig)* c(ig,idx+i )+da_d(ig+igno))
END DO
END IF
igno = igno + ngw
ioff = ioff + dffts%nnr
ENDDO
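! df_d/da_d now hold the kinetic plus local-potential terms scaled by
! -f_n/2; the nonlocal (beta projector) term is added below if nhsa > 0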
!
IF( nhsa > 0 ) THEN
!
! aa_i,i,n = sum_j d_i,ij <beta_i,j|c_n>
!
ALLOCATE( af( nhsa, many_fft ), aa( nhsa, many_fft ) )
ALLOCATE( af_d( nhsa, many_fft ), aa_d( nhsa, many_fft ) )
!
!$omp parallel do default(none), &
!$omp shared(many_fft,i,n,tens,f,nat,ityp,nh,dvan,indv_ijkb0,deeq,af,aa,bec,ispin), &
!$omp private(idx,igrp,fi,fip,ia,is,iv,jv,inl,jnl,dv,dd,iss1,iss2)
DO idx = 1, 2*many_fft , 2
igrp = idx/2+1
af(:,igrp) = 0.0d0
aa(:,igrp) = 0.0d0
IF( idx + i - 1 <= n ) THEN
IF( i + idx - 1 /= n ) THEN
fi = f(i+idx-1)
fip= f(i+idx)
iss1=ispin( i+idx-1 )
iss2=ispin( i+idx )
ELSE
fi = f(i+idx-1)
iss1=ispin( i+idx-1 )
END IF
IF (tens) THEN
fi = 1.0d0
fip= 1.0d0
ENDIF
!
DO ia = 1, nat
is = ityp(ia)
DO iv = 1, nh(is)
DO jv = 1, nh(is)
dv = dvan(iv,jv,is)
inl = indv_ijkb0(ia) + iv
jnl = indv_ijkb0(ia) + jv
IF( i + idx - 1 /= n ) THEN
dd = deeq(iv,jv,ia,iss1) + dv
af(inl,igrp) = af(inl,igrp) - fi * dd * bec(jnl,i+idx-1)
dd = deeq(iv,jv,ia,iss2) + dv
aa(inl,igrp) = aa(inl,igrp) - fip * dd * bec(jnl,i+idx)
ELSE
dd = deeq(iv,jv,ia,iss1) + dv
af(inl,igrp) = af(inl,igrp) - fi * dd * bec(jnl,i+idx-1)
END IF
END DO
END DO
END DO
END IF
END DO
!$omp end parallel do
CALL dev_memcpy( af_d, af )
CALL dev_memcpy( aa_d, aa )
IF( ngw > 0 ) THEN
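! the nonlocal term is added with a real GEMM: the complex arrays are
! viewed as real with leading dimension 2*ngw, which is exact here
! because af_d/aa_d are real coefficients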
CALL MYDGEMM ( 'N', 'N', 2*ngw, many_fft , nhsa, 1.0d0, vkb, 2*ngw, af_d, nhsa, 1.0d0, df_d, 2*ngw)
CALL MYDGEMM ( 'N', 'N', 2*ngw, many_fft , nhsa, 1.0d0, vkb, 2*ngw, aa_d, nhsa, 1.0d0, da_d, 2*ngw)
END IF
!
DEALLOCATE( aa, af )
DEALLOCATE( aa_d, af_d )
!
ENDIF
CALL dev_memcpy( df, df_d )
CALL dev_memcpy( da, da_d )
!
DEALLOCATE( df_d )
DEALLOCATE( da_d )
DEALLOCATE( psi )
NULLIFY(nl_d)
NULLIFY(nlm_d)
!
CALL stop_clock( 'dforce' )
!
RETURN
END SUBROUTINE dforce_gpu_x
#endif
View File
@ -30,6 +30,7 @@ SUBROUTINE from_scratch( )
USE energies, ONLY : dft_energy_type, debug_energies
USE dener, ONLY : denl, denl6, dekin6, detot
USE uspp, ONLY : vkb, becsum, deeq, nkb, okvan, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE io_global, ONLY : stdout, ionode
USE core, ONLY : rhoc
USE gvecw, ONLY : ngw
@ -43,21 +44,24 @@ SUBROUTINE from_scratch( )
USE cp_interfaces, ONLY : runcp_uspp, runcp_uspp_force_pairing, &
strucf, phfacs, nlfh, vofrho, nlfl_bgrp, prefor
USE cp_interfaces, ONLY : rhoofr, ortho, wave_rand_init, elec_fakekine
USE cp_interfaces, ONLY : compute_stress, dotcsc, calbec_bgrp, caldbec_bgrp
USE cp_interfaces, ONLY : compute_stress, dotcsc, calbec, caldbec_bgrp
USE cp_interfaces, ONLY : nlfq_bgrp
USE printout_base, ONLY : printout_pos
USE orthogonalize_base, ONLY : updatc, calphi_bgrp
USE wave_base, ONLY : wave_steepest
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, c0_d, phi, cm_d
USE fft_base, ONLY : dfftp, dffts
USE time_step, ONLY : delt
USE cp_main_variables, ONLY : idesc, bephi, becp_bgrp, nfi, &
sfac, eigr, taub, irb, eigrb, bec_bgrp, &
USE cp_main_variables, ONLY : idesc, bephi, becp_bgrp, nfi, iabox, nabox, &
sfac, eigr, taub, irb, eigrb, bec_bgrp, bec_d, &
lambda, lambdam, lambdap, ema0bg, rhog, rhor, rhos, &
vpot, ht0, edft, becdr_bgrp, dbec, drhor, drhog
USE mp_global, ONLY : inter_bgrp_comm, nbgrp, me_bgrp
USE mp, ONLY : mp_sum
USE mp_world, ONLY : mpime, world_comm
USE mp, ONLY : mp_sum, mp_barrier
USE matrix_inversion
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
!
@ -115,7 +119,7 @@ SUBROUTINE from_scratch( )
CALL strucf( sfac, eigts1, eigts2, eigts3, mill, dffts%ngm )
!
IF ( okvan .OR. nlcc_any ) THEN
CALL initbox ( tau0, alat, at, ainv, taub, irb )
CALL initbox ( tau0, alat, at, ainv, taub, irb, iabox, nabox )
CALL phbox( taub, iverbosity, eigrb )
END IF
!
@ -133,6 +137,7 @@ SUBROUTINE from_scratch( )
! ... prefor calculates vkb (used by gram)
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
nspin_wfc = nspin
IF( force_pairing ) nspin_wfc = 1
@ -141,7 +146,9 @@ SUBROUTINE from_scratch( )
IF( force_pairing ) cm_bgrp(:,iupdwn(2):iupdwn(2)+nupdwn(2)-1) = cm_bgrp(:,1:nupdwn(2))
!
if( iverbosity > 1 ) CALL dotcsc( eigr, cm_bgrp, ngw, nbsp )
if( iverbosity > 1 ) CALL dotcsc( vkb, cm_bgrp, ngw, nbsp )
!
CALL dev_memcpy( cm_d, cm_bgrp )
!
! ... initialize bands
!
@ -188,11 +195,11 @@ SUBROUTINE from_scratch( )
!
IF( .NOT. tcg ) THEN
!
CALL calbec_bgrp ( 1, nsp, eigr, cm_bgrp, bec_bgrp )
CALL calbec ( nbsp_bgrp, vkb, cm_bgrp, bec_bgrp, 0 )
!
if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
!
CALL rhoofr( nfi, cm_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
CALL rhoofr( nfi, cm_bgrp, cm_d, bec_bgrp, dbec, becsum, rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
!
edft%enl = enl
edft%ekin = ekin
@ -201,7 +208,7 @@ SUBROUTINE from_scratch( )
!
! put core charge (if present) in rhoc(r)
!
if ( nlcc_any ) CALL set_cc( irb, eigrb, rhoc )
if ( nlcc_any ) CALL set_cc( rhoc )
!
IF( .NOT. tcg ) THEN
@ -229,7 +236,7 @@ SUBROUTINE from_scratch( )
if( iverbosity > 1 ) &
CALL printout_pos( stdout, fion, nat, ityp, head = ' fion ' )
CALL newd( vpot, irb, eigrb, becsum, fion )
CALL newd( vpot, becsum, fion, .true. )
!
IF( force_pairing ) THEN
!
@ -240,26 +247,42 @@ SUBROUTINE from_scratch( )
!
ELSE
!
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, c0_bgrp, fromscra = .TRUE. )
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, cm_bgrp, cm_d, c0_bgrp, c0_d, fromscra = .TRUE. )
!
ENDIF
!
CALL dev_memcpy( c0_d, c0_bgrp ) ! c0 contains the updated wave functions
!
! nlfq needs deeq bec
!
IF( ttforce ) THEN
CALL nlfq_bgrp( cm_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
#if defined (__CUDA)
CALL nlfq_bgrp( cm_d, vkb_d, bec_bgrp, becdr_bgrp, fion )
#else
CALL nlfq_bgrp( cm_bgrp, vkb, bec_bgrp, becdr_bgrp, fion )
#endif
END IF
!
! calphi calculates phi
! the electron mass rises with g**2
!
CALL calphi_bgrp( cm_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
#if defined (__CUDA)
CALL calphi_bgrp( cm_d, ngw, bec_bgrp, nkb, vkb_d, phi, nbspx_bgrp, ema0bg )
#else
CALL calphi_bgrp( cm_bgrp, ngw, bec_bgrp, nkb, vkb, phi, nbspx_bgrp, ema0bg )
#endif
!
IF( force_pairing ) THEN
! phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi( :, 1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
END IF
!
IF( force_pairing ) &
& phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi_bgrp( :, 1:nupdwn(2))
if( tortho ) then
CALL ortho( eigr, c0_bgrp, phi_bgrp, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#if defined (__CUDA)
CALL ortho( vkb_d, c0_d, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#else
CALL ortho( vkb, c0_bgrp, phi, lambda, idesc, bigr, iter, ccc, bephi, becp_bgrp )
#endif
else
CALL gram_bgrp( vkb, bec_bgrp, nkb, c0_bgrp, ngw )
endif
@ -275,23 +298,30 @@ SUBROUTINE from_scratch( )
if ( tstress ) CALL nlfh( stress, bec_bgrp, dbec, lambda, idesc )
!
IF ( tortho ) THEN
CALL updatc( ccc, lambda, phi_bgrp, bephi, becp_bgrp, bec_bgrp, c0_bgrp, idesc )
#if defined (__CUDA)
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_d, c0_d, idesc )
CALL dev_memcpy( c0_bgrp, c0_d )
CALL dev_memcpy( bec_bgrp, bec_d )
#else
CALL updatc( ccc, lambda, phi, bephi, becp_bgrp, bec_bgrp, c0_bgrp, idesc )
#endif
END IF
!
IF( force_pairing ) THEN
!
c0_bgrp ( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = c0_bgrp( :, 1:nupdwn(2))
phi_bgrp( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi_bgrp( :, 1:nupdwn(2))
CALL dev_memcpy(phi(:,iupdwn(2):), phi(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
!phi( :, iupdwn(2):(iupdwn(2)+nupdwn(2)-1) ) = phi( :, 1:nupdwn(2))
lambda(:,:,2) = lambda(:,:,1)
!
ENDIF
!
!
CALL calbec_bgrp ( 1, nsp, eigr, c0_bgrp, bec_bgrp, 1 )
! the following computes only the norm-conserving pseudopotential components
CALL calbec ( nbsp_bgrp, vkb, c0_bgrp, bec_bgrp, 1 )
!
if ( tstress ) CALL caldbec_bgrp( eigr, cm_bgrp, dbec, idesc )
if ( iverbosity > 1 ) CALL dotcsc( eigr, c0_bgrp, ngw, nbsp_bgrp )
if ( iverbosity > 1 ) CALL dotcsc( vkb, c0_bgrp, ngw, nbsp_bgrp )
!
xnhp0 = 0.0d0
xnhpm = 0.0d0
@ -325,3 +355,10 @@ SUBROUTINE from_scratch( )
RETURN
!
END SUBROUTINE from_scratch
subroutine hangup
  USE mp_world, ONLY : world_comm
  USE mp,       ONLY : mp_barrier
  IMPLICIT NONE
  call mp_barrier(world_comm)
  CALL stop_cp_run()
end subroutine hangup
View File
@ -16,33 +16,47 @@ SUBROUTINE gram_bgrp( betae, bec_bgrp, nkbx, cp_bgrp, ngwx )
USE gvecw, ONLY : ngw
USE electrons_base, ONLY : nbspx_bgrp, ibgrp_g2l, nupdwn, iupdwn, nbspx, iupdwn_bgrp, nspin
USE kinds, ONLY : DP
USE mp_global, ONLY : inter_bgrp_comm
USE mp, ONLY : mp_sum
USE gvect, ONLY : gstart
USE mp_global, ONLY : intra_bgrp_comm, inter_bgrp_comm, me_bgrp, nproc_bgrp
USE mp_world, ONLY : mpime
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nkbx, ngwx
REAL(DP) :: bec_bgrp( nkbx, nbspx_bgrp )
COMPLEX(DP) :: cp_bgrp( ngwx, nbspx_bgrp ), betae( ngwx, nkbx )
COMPLEX(DP) :: cp_bgrp( ngwx, nbspx_bgrp )
COMPLEX(DP), INTENT(IN) :: betae( ngwx, nkbx )
!
REAL(DP) :: anorm
REAL(DP), ALLOCATABLE :: csc( : )
COMPLEX(DP), ALLOCATABLE :: ctmp( : )
REAL(DP), ALLOCATABLE :: temp(:)
COMPLEX(DP), ALLOCATABLE :: cp_tmp(:)
REAL(DP), ALLOCATABLE :: bec_tmp(:)
REAL(DP), ALLOCATABLE :: csc2( : )
INTEGER :: i,k,j, ig, ibgrp_k, ibgrp_i, nbgrp_im1, iss
REAL(DP), PARAMETER :: one = 1.d0
REAL(DP), PARAMETER :: mone = -1.d0
REAL(DP) :: g0
!
CALL start_clock( 'gram' )
g0 = 0.0d0
IF (gstart == 2) g0 = 1.0d0
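! g0 = 1 only on the process holding the g = 0 component (gstart == 2);
! it is used below to undo the double counting of that term in the
! Gamma-point dot products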
ALLOCATE( csc( nbspx ) )
ALLOCATE( ctmp( ngwx ) )
ALLOCATE( cp_tmp( ngwx ) )
ALLOCATE( bec_tmp( nkbx ) )
ALLOCATE( csc2( SIZE( csc ) ) )
!
DO iss = 1, nspin
DO i = iupdwn(iss), iupdwn(iss) + nupdwn(iss) - 1
!
ibgrp_i = ibgrp_g2l( i )
!
CALL gracsc_bgrp( bec_bgrp, betae, cp_bgrp, i, csc, iss, nbgrp_im1 )
CALL gracsc_bgrp( i, csc, iss, nbgrp_im1 )
!
! calculate orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
!
@ -60,14 +74,19 @@ SUBROUTINE gram_bgrp( betae, bec_bgrp, nkbx, cp_bgrp, ngwx )
IF( ibgrp_i > 0 ) THEN
cp_bgrp( :, ibgrp_i ) = ctmp
anorm = cscnorm( bec_bgrp, cp_bgrp, ibgrp_i, nbspx_bgrp )
CALL dscal( 2*ngw, 1.0d0/anorm, cp_bgrp(1,ibgrp_i), 1 )
CALL dscal( nkbx, 1.0d0/anorm, bec_bgrp(1,ibgrp_i), 1 )
cp_bgrp(:,ibgrp_i) = cp_bgrp(:,ibgrp_i) / anorm
bec_bgrp(:,ibgrp_i) = bec_bgrp(:,ibgrp_i) / anorm
!CALL dscal( 2*ngw, 1.0d0/anorm, cp_bgrp(1,ibgrp_i), 1 )
!CALL dscal( nkbx, 1.0d0/anorm, bec_bgrp(1,ibgrp_i), 1 )
END IF
END DO
END DO
!
DEALLOCATE( ctmp )
DEALLOCATE( csc )
DEALLOCATE( csc2 )
DEALLOCATE( bec_tmp )
DEALLOCATE( cp_tmp )
CALL stop_clock( 'gram' )
!
@ -84,7 +103,6 @@ CONTAINS
!
USE ions_base, ONLY: nat, ityp
USE gvecw, ONLY: ngw
USE gvect, ONLY: gstart
USE uspp_param, ONLY: nh, upf
USE uspp, ONLY: qq_nt, indv_ijkb0
USE mp, ONLY: mp_sum
@ -97,47 +115,39 @@ CONTAINS
REAL(DP), INTENT(IN) :: bec( :, : )
COMPLEX(DP), INTENT(IN) :: cp( :, : )
!
REAL(DP) :: cscnorm
REAL(DP) :: cscnorm, ddot
!
INTEGER ig, is, iv, jv, ia, inl, jnl
INTEGER :: is, iv, jv, ia, indv
REAL(DP) rsum
REAL(DP), ALLOCATABLE:: temp(:)
!
ALLOCATE(temp(ngw))
rsum = 2.d0 * ddot(2*ngw,cp(1,i),1,cp(1,i),1) - g0 * REAL(CONJG(cp(1,i))*cp(1,i), DP)
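! Gamma-point norm: <cp|cp> = 2*sum_g |cp(g)|^2 with the g = 0 term
! counted once, hence the g0 correction to the ddot result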
!
DO ig=1,ngw
temp(ig)=DBLE(CONJG(cp(ig,i))*cp(ig,i))
DO ia=1,nat
IF ( MOD( ia, nproc_bgrp ) == me_bgrp ) THEN
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
indv = indv_ijkb0(ia)
DO iv=1,nh(is)
DO jv=1,nh(is)
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
rsum = rsum + qq_nt(iv,jv,is)*bec(indv+iv,i)*bec(indv+jv,i)
ENDIF
END DO
END DO
END IF
END IF
END DO
rsum=2.d0*SUM(temp)
IF (gstart == 2) rsum=rsum-temp(1)
CALL mp_sum( rsum, intra_bgrp_comm )
!
DO ia=1,nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
DO iv=1,nh(is)
inl=indv_ijkb0(ia) + iv
DO jv=1,nh(is)
jnl=indv_ijkb0(ia) + jv
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
rsum = rsum + qq_nt(iv,jv,is)*bec(inl,i)*bec(jnl,i)
ENDIF
END DO
END DO
END IF
END DO
!
cscnorm=SQRT(rsum)
DEALLOCATE(temp)
!
RETURN
END FUNCTION cscnorm
!
!
!-------------------------------------------------------------------------
SUBROUTINE gracsc_bgrp( bec_bgrp, betae, cp_bgrp, i, csc, iss, nk )
SUBROUTINE gracsc_bgrp( i, csc, iss, nk )
!-----------------------------------------------------------------------
! requires in input the updated bec(k) for k<i
! on output: bec(i) is recalculated
@ -148,7 +158,6 @@ CONTAINS
USE electrons_base, ONLY: ispin, ispin_bgrp, nbspx_bgrp, ibgrp_g2l, iupdwn, nupdwn, nbspx
USE gvecw, ONLY: ngw
USE mp, ONLY: mp_sum
USE mp_global, ONLY: intra_bgrp_comm, inter_bgrp_comm, me_bgrp, nproc_bgrp
USE kinds, ONLY: DP
USE gvect, ONLY: gstart
!
@ -156,97 +165,67 @@ CONTAINS
!
INTEGER, INTENT(IN) :: i, iss
INTEGER, INTENT(OUT) :: nk
COMPLEX(DP) :: betae( :, : )
REAL(DP) :: bec_bgrp( :, : )
COMPLEX(DP) :: cp_bgrp( :, : )
REAL(DP) :: csc( : )
INTEGER :: k, kmax,ig, is, iv, jv, ia, inl, jnl, ibgrp_k, ibgrp_i
REAL(DP) :: rsum
REAL(DP), ALLOCATABLE :: temp(:)
COMPLEX(DP), ALLOCATABLE :: cp_tmp(:)
REAL(DP), ALLOCATABLE :: bec_tmp(:)
REAL(DP), ALLOCATABLE :: csc2( : )
#if defined(_OPENMP)
INTEGER :: mytid, ntids, omp_get_thread_num, omp_get_num_threads
#endif
INTEGER :: k, kmax_bgrp, kmax,ig, is, iv, jv, ia, inl, jnl, ibgrp_k, ibgrp_i
REAL(DP) :: rsum, ddot
INTEGER :: omp_get_thread_num, omp_get_num_threads
!
! calculate csc(k)=<cp(i)|cp(k)>, k<i
!
kmax = i - 1
!
ALLOCATE( cp_tmp( ngwx ) )
ALLOCATE( bec_tmp( nkbx ) )
ALLOCATE( csc2( SIZE( csc ) ) )
cp_tmp = 0.0d0
csc = 0.0d0
ibgrp_i = ibgrp_g2l( i )
IF( ibgrp_i > 0 ) cp_tmp = cp_bgrp( :, ibgrp_i )
IF( ibgrp_i > 0 ) THEN
cp_tmp = cp_bgrp( :, ibgrp_i )
ELSE
cp_tmp = 0.0d0
END IF
CALL mp_sum( cp_tmp, inter_bgrp_comm )
!$omp parallel default(none), &
!$omp shared(iupdwn,kmax,ispin,ibgrp_g2l,ngw,cp_bgrp,cp_tmp,csc,betae,bec_bgrp,i,iss,gstart), &
!$omp shared(upf,nat,ityp,indv_ijkb0,nh), &
!$omp private( temp, k, ig, inl, ibgrp_k, ibgrp_i, is, ia )
ALLOCATE( temp( ngw ) )
!$omp do
kmax_bgrp = 0
nk = 0
DO k = iupdwn( iss ), kmax
IF ( ispin(i) .EQ. ispin(k) ) THEN
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO ig = 1, ngw
temp(ig) = DBLE( cp_bgrp(ig,ibgrp_k) * CONJG(cp_tmp(ig)) )
END DO
csc(k) = 2.0d0 * SUM(temp)
IF (gstart == 2) csc(k) = csc(k) - temp(1)
END IF
ENDIF
IF( ibgrp_g2l( k ) > 0 ) THEN
kmax_bgrp = ibgrp_g2l( k )
nk = nk + 1
END IF
END DO
!$omp end do
!
!
! calculate bec(i)=<cp(i)|beta>
!
ibgrp_i = ibgrp_g2l( i )
!
kmax_bgrp = kmax_bgrp - iupdwn_bgrp(iss) + 1
IF( kmax_bgrp > 0 .AND. ngw > 0 ) &
CALL dgemv( 'T', 2*ngw, kmax_bgrp, 1.0d0, cp_bgrp(1,iupdwn_bgrp(iss)), 2*ngwx, cp_tmp, 1, 0.0d0, csc2, 1 )
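! all overlaps <cp(k)|cp(i)>, k < i, of the locally stored bands in a
! single matrix-vector product; the Gamma-point weights (2 for g > 0,
! 1 for g = 0) are applied in the loop below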
nk = 0
DO k = iupdwn( iss ), kmax
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
nk = nk + 1
csc(k) = 2.0d0 * csc2(nk) - g0 * DBLE( cp_bgrp(1,ibgrp_k) * CONJG(cp_tmp(1)) )
END IF
END DO
IF( ibgrp_i > 0 ) THEN
!$omp do
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
DO iv=1,nh(is)
inl=indv_ijkb0(ia)+iv
DO ig=1,ngw
temp(ig) = DBLE( cp_bgrp(ig,ibgrp_i) * CONJG(betae(ig,inl)) )
END DO
bec_bgrp(inl,ibgrp_i)=2.d0*SUM(temp)
IF (gstart == 2) bec_bgrp(inl,ibgrp_i)= bec_bgrp(inl,ibgrp_i)-temp(1)
bec_tmp(inl) = 2.d0 * DDOT( 2*ngw, cp_bgrp(1,ibgrp_i), 1, betae(1,inl), 1) &
- g0 * DBLE(cp_bgrp(1,ibgrp_i) * CONJG(betae(1,inl)))
END DO
ELSE
inl= indv_ijkb0(ia)
bec_tmp( inl + 1: inl + nh(is) ) = 0.0d0
END IF
END DO
!$omp end do
CALL mp_sum( bec_tmp, intra_bgrp_comm ) ! parallel sum over G vectors within a band group
bec_bgrp( : , ibgrp_i ) = bec_tmp( : )
ELSE
bec_tmp = 0.0d0
END IF
DEALLOCATE( temp )
!$omp end parallel
CALL mp_sum( csc, intra_bgrp_comm )
CALL mp_sum( csc, inter_bgrp_comm )
IF( ibgrp_i > 0 ) THEN
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
inl=indv_ijkb0(ia)
CALL mp_sum( bec_bgrp( inl + 1: inl + nh(is), ibgrp_i ), intra_bgrp_comm )
END IF
END DO
END IF
bec_tmp = 0.0d0
IF( ibgrp_i > 0 ) bec_tmp = bec_bgrp(:,ibgrp_i )
CALL mp_sum( bec_tmp, inter_bgrp_comm )
!
@ -254,62 +233,46 @@ CONTAINS
!
csc2 = 0.0d0
!$omp parallel default(none), &
!$omp shared(iupdwn,iss,kmax,nproc_bgrp,me_bgrp,ispin,i,ibgrp_g2l,nh), &
!$omp parallel if( (kmax - iupdwn( iss )) > omp_get_num_threads() ) default(none), &
!$omp shared(iupdwn,iss,kmax,nproc_bgrp,me_bgrp,nbspx,i,ibgrp_g2l,nh), &
!$omp shared(indv_ijkb0,qq_nt,na,bec_tmp,bec_bgrp,csc2,nat,ityp,upf), &
!$omp private( k, is, iv, jv, ia, inl, jnl, rsum, ibgrp_k, ntids, mytid )
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#endif
DO k=iupdwn(iss), kmax
IF ( MOD( k, nproc_bgrp ) /= me_bgrp ) CYCLE
#if defined(_OPENMP)
! distribute bands round robin to threads
IF( MOD( k / nproc_bgrp, ntids ) /= mytid ) CYCLE
#endif
IF (ispin(i).EQ.ispin(k)) THEN
rsum=0.d0
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO ia = 1, nat
!$omp private( k, is, iv, jv, ia, inl, jnl, rsum, ibgrp_k )
!$omp do
DO k = iupdwn( iss ), kmax
rsum=0.d0
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO ia = 1, nat
IF ( MOD( ia-1, nproc_bgrp ) == me_bgrp ) THEN
is=ityp(ia)
IF( upf(is)%tvanp ) THEN
inl = indv_ijkb0(ia)
DO iv=1,nh(is)
inl=indv_ijkb0(ia)+iv
DO jv=1,nh(is)
jnl=indv_ijkb0(ia)+jv
IF(ABS(qq_nt(iv,jv,is)).GT.1.e-5) THEN
rsum = rsum + qq_nt(iv,jv,is)*bec_tmp(inl)*bec_bgrp(jnl,ibgrp_k)
rsum = rsum + qq_nt(iv,jv,is)*bec_tmp(inl+iv)*bec_bgrp(inl+jv,ibgrp_k)
ENDIF
END DO
END DO
END IF
END DO
END IF
csc2(k)=csc2(k)+rsum
END IF
END DO
ENDIF
csc2(k)=csc2(k)+rsum
END DO
!$omp end do
!$omp end parallel
!
! orthogonalized cp(i) : |cp(i)>=|cp(i)>-\sum_k<i csc(k)|cp(k)>
!
! corresponding bec: bec(i)=<cp(i)|beta>-\sum_k csc(k)<cp(k)|beta>
!
CALL mp_sum( csc, intra_bgrp_comm )
CALL mp_sum( csc2, intra_bgrp_comm )
CALL mp_sum( csc, inter_bgrp_comm )
CALL mp_sum( csc2, inter_bgrp_comm )
csc = csc + csc2
bec_tmp = 0.0d0
DO k = iupdwn(iss), kmax
ibgrp_k = ibgrp_g2l( k )
IF( ibgrp_k > 0 ) THEN
DO inl=1,nkbx
bec_tmp(inl)=bec_tmp(inl)-csc(k)*bec_bgrp(inl,ibgrp_k)
END DO
END IF
END DO
nk = 0
DO k = iupdwn(iss), kmax
ibgrp_k = ibgrp_g2l( k )
@ -318,12 +281,15 @@ CONTAINS
csc( nk ) = csc( k )
END IF
END DO
IF( nk > 0 .AND. ngw > 0 ) THEN
CALL dgemv( 'N', nkbx, nk, -1.0d0, bec_bgrp(1,iupdwn_bgrp(iss)), nkbx, csc, 1, 0.0d0, bec_tmp, 1 )
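! bec(i) <- bec(i) - sum_k csc(k)*bec(k), again as a single
! matrix-vector product over the bands stored in this band group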
ELSE
bec_tmp = 0.0d0
END IF
CALL mp_sum( bec_tmp, inter_bgrp_comm )
IF( ibgrp_i > 0 ) bec_bgrp(:,ibgrp_i ) = bec_bgrp(:,ibgrp_i ) + bec_tmp
DEALLOCATE( csc2 )
DEALLOCATE( bec_tmp )
DEALLOCATE( cp_tmp )
!
RETURN
END SUBROUTINE gracsc_bgrp
View File
@ -31,6 +31,7 @@
USE gvect, ONLY : mill_g, eigts1,eigts2,eigts3, g, gg, &
ecutrho, gcutm, gvect_init, mill, &
ig_l2g, gstart, ngm, ngm_g, gshells
USE gvect_gpum, ONLY : using_g, using_g_d
use gvecs, only : gcutms, gvecs_init, ngms
use gvecw, only : gkcut, gvecw_init, g2kin_init
USE smallbox_subs, ONLY : ggenb
@ -195,6 +196,11 @@
!
END IF
CALL using_g(2)
#if defined (__CUDA)
CALL using_g_d(0)
#endif
CALL gshells (.TRUE.)
!
! ... allocate and generate (modified) kinetic energy
@ -401,6 +407,7 @@
cell_base_reinit
USE gvecw, ONLY : g2kin_init
USE gvect, ONLY : g, gg, ngm, mill
USE gvect_gpum, ONLY : using_g, using_g_d
USE fft_base, ONLY : dfftp, dfftb
USE small_box, ONLY : small_box_set
USE smallbox_subs, ONLY : gcalb
@ -425,10 +432,15 @@
!
! re-calculate G-vectors and kinetic energy
!
CALL using_g(2)
!
do ig = 1, dfftp%ngm
g(:,ig)= mill(1,ig)*bg(:,1) + mill(2,ig)*bg(:,2) + mill(3,ig)*bg(:,3)
gg(ig)=g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2
enddo
#if defined (__CUDA)
CALL using_g_d(0)
#endif
!
call g2kin_init ( gg, tpiba2 )
!
View File
@ -27,8 +27,9 @@ SUBROUTINE init_run()
USE fft_base, ONLY : dfftp, dffts
USE electrons_base, ONLY : nspin, nbsp, nbspx, nupdwn, f
USE uspp, ONLY : nkb, vkb, deeq, becsum,nkbus
USE uspp_gpum, ONLY : vkb_d
USE core, ONLY : rhoc
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, phi_bgrp
USE wavefunctions, ONLY : c0_bgrp, cm_bgrp, allocate_cp_wavefunctions
USE ensemble_dft, ONLY : tens, z0t
USE cg_module, ONLY : tcg
USE electrons_base, ONLY : nudx
@ -80,6 +81,9 @@ SUBROUTINE init_run()
USE cell_base, ONLY : ref_tpiba2, init_tpiba2
USE tsvdw_module, ONLY : tsvdw_initialize
USE exx_module, ONLY : exx_initialize
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
!
@ -171,10 +175,7 @@ SUBROUTINE init_run()
!
! initialize wave functions descriptors and allocate wf
!
IF(lwfpbe0nscf) ALLOCATE(cv0( ngw, vnbsp ) ) ! Lingzhu Kong
ALLOCATE( c0_bgrp( ngw, nbspx ) )
ALLOCATE( cm_bgrp( ngw, nbspx ) )
ALLOCATE( phi_bgrp( ngw, nbspx ) )
CALL allocate_cp_wavefunctions( ngw, nbspx, vnbsp, lwfpbe0nscf )
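! allocate_cp_wavefunctions is expected to allocate and zero c0_bgrp,
! cm_bgrp, phi (plus cv0 when lwfpbe0nscf is .TRUE.) together with the
! corresponding device copies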
!
IF ( iverbosity > 1 ) THEN
!
@ -203,6 +204,9 @@ SUBROUTINE init_run()
ALLOCATE( deeq( nhm, nhm, nat, nspin ) )
!
ALLOCATE( vkb( ngw, nkb ) )
#if defined(__CUDA)
ALLOCATE( vkb_d( ngw, nkb ) )
#endif
!
IF ( xclib_dft_is('meta') .AND. tens ) &
CALL errore( ' init_run ', 'ensemble_dft not implemented for metaGGA', 1 )
@ -264,11 +268,6 @@ SUBROUTINE init_run()
!
hnew = h
!
IF(lwfpbe0nscf) cv0=( 0.D0, 0.D0 ) ! Lingzhu Kong
cm_bgrp = ( 0.D0, 0.D0 )
c0_bgrp = ( 0.D0, 0.D0 )
phi_bgrp = ( 0.D0, 0.D0 )
!
IF ( tens ) then
CALL id_matrix_init( idesc, nspin )
CALL h_matrix_init( idesc, nspin )
View File
@ -7,7 +7,7 @@
!
!
!====================================================================
SUBROUTINE inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
SUBROUTINE inner_loop_cold( nfi, tfirst, tlast, eigr, irb, eigrb, &
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, &
sfac, c0, bec, dbec, firstiter, vpot )
!====================================================================
@ -109,7 +109,7 @@
! calculates the overlaps bec between the wavefunctions c0
! and the beta functions
CALL calbec( 1, nsp, eigr, c0, bec )
CALL calbec( n, betae, c0, bec )
! rotates the wavefunctions c0 and the overlaps bec
! (the occupation matrix f_ij becomes diagonal f_i)
@ -119,7 +119,7 @@
! calculates the electronic charge density
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
IF(nlcc_any) CALL set_cc( rhoc )
! calculates the SCF potential, the total energy
! and the ionic forces
@ -141,7 +141,7 @@
! the augmentation charges and the
! corresponding contribution to the ionic force
CALL newd( vpot, irb, eigrb, rhovan, fion2 )
CALL newd( vpot, rhovan, fion2, .true. )
! operates the Hamiltonian on the wavefunction c0
h0c0( :, : )= 0.D0
@ -260,7 +260,7 @@
! calculates the electronic charge density
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
IF(nlcc_any) CALL set_cc( rhoc )
! calculates the SCF potential, the total energy
! and the ionic forces
@ -409,7 +409,7 @@
! calculates the electronic charge density
CALL rhoofr( nfi, c0diag, irb, eigrb, becdiag, dbec, rhovan, &
rhor, drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
IF(nlcc_any) CALL set_cc( irb, eigrb, rhoc )
IF(nlcc_any) CALL set_cc( rhoc )
! calculates the SCF potential, the total energy
! and the ionic forces
View File
@ -119,10 +119,11 @@
USE step_penalty, ONLY: penalty_e, penalty_f
USE mp_pools, ONLY: intra_pool_comm, me_pool, nproc_pool
USE mp_bands, only: nbgrp
USE cp_interfaces, only: nlsm1, nlsm2_bgrp
USE cp_interfaces, only: calbec, nlsm2_bgrp
!
implicit none
complex(DP), intent(in) :: c(ngw,nx), eigr(ngw,nat), betae(ngw,nkb)
complex(DP), intent(in) :: c(ngw,nx), eigr(ngw,nat)
complex(DP), intent(inout) :: betae(ngw,nkb)
complex(DP), intent(out) :: hpsi(ngw,nx)
real(DP), INTENT(OUT) :: forceh(3,nat)
!
@ -243,10 +244,10 @@
allocate(dns(ldmx,ldmx,nspin,nat))
allocate (spsi(ngw,n))
!
call nlsm1 ( n, 1, nsp, eigr, c, bp )
call calbec ( n, betae, c, bp )
call s_wfc ( n, bp, betae, c, spsi )
call nlsm2_bgrp( ngw, nkb, eigr, c, dbp, nx, n )
call nlsm2_bgrp( ngw, nkb, eigr, wfcU, wdb, nwfcU, nwfcU )
call nlsm2_bgrp( ngw, nkb, betae, c, dbp, nx, n )
call nlsm2_bgrp( ngw, nkb, betae, wfcU, wdb, nwfcU, nwfcU )
!
! poor-man parallelization over bands
! - if nproc_pool=1 : nb_s=1, nb_e=n, mykey=0
@ -620,12 +621,13 @@
USE gvect, ONLY: gstart
USE ions_base, ONLY: nsp, nat
USE uspp, ONLY: nkb
USE cp_interfaces, only: nlsm1
USE cp_interfaces, only: calbec
!
IMPLICIT NONE
INTEGER, INTENT(IN) :: nx, n, nwfcU, offset(nat), &
Hubbard_l(nsp)
COMPLEX(DP), INTENT(IN) :: c( ngw, nx ), eigr(ngw,nat), betae(ngw,nkb)
COMPLEX(DP), INTENT(IN) :: c( ngw, nx ), eigr(ngw,nat)
COMPLEX(DP), INTENT(INOUT) :: betae(ngw,nkb)
!
COMPLEX(DP), INTENT(OUT):: wfcU(ngw, nwfcU), &
& swfc(ngw, nwfcU)
@ -641,7 +643,7 @@
!
! calculate bec = <beta|wfc>
!
CALL nlsm1( nwfcU, 1, nsp, eigr, wfcU, becwfc )
CALL calbec( nwfcU, betae, wfcU, becwfc )
!
! calculate swfc = S|wfc>
!
View File
@ -42,6 +42,8 @@ MODULE cp_main_variables
REAL(DP), ALLOCATABLE :: taub(:,:)
COMPLEX(DP), ALLOCATABLE :: eigrb(:,:)
INTEGER, ALLOCATABLE :: irb(:,:)
INTEGER, ALLOCATABLE :: iabox(:)
INTEGER :: nabox
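! iabox(1:nabox): atoms whose small-box (augmentation charge) FFT is
! assigned to this process; filled by initbox together with irb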
!
! ... nonlocal projectors:
! ... bec = scalar product of projectors and wave functions
@ -53,8 +55,12 @@ MODULE cp_main_variables
REAL(DP), ALLOCATABLE :: bephi(:,:) ! distributed (orhto group)
REAL(DP), ALLOCATABLE :: becp_bgrp(:,:) ! distributed becp (band group)
REAL(DP), ALLOCATABLE :: bec_bgrp(:,:) ! distributed bec (band group)
REAL(DP), ALLOCATABLE :: bec_d(:,:) ! device (GPU) copy of bec (band group)
REAL(DP), ALLOCATABLE :: becdr_bgrp(:,:,:) ! distributed becdr (band group)
REAL(DP), ALLOCATABLE :: dbec(:,:,:,:) ! derivative of bec distributed(ortho group)
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: becp_bgrp, bephi, bec_d
#endif
!
! ... mass preconditioning
!
@ -148,6 +154,10 @@ MODULE cp_main_variables
ALLOCATE( irb( 3, nat ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate irb ', ierr )
ALLOCATE( iabox( nat ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate iabox ', ierr )
nabox = 0
!
IF ( xclib_dft_is('meta') ) THEN
!
@ -249,6 +259,11 @@ MODULE cp_main_variables
ALLOCATE( bec_bgrp( nhsa, nbspx_bgrp ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate bec_bgrp ', ierr )
#if defined (__CUDA)
ALLOCATE( bec_d( nhsa, nbspx_bgrp ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate bec_d ', ierr )
#endif
ALLOCATE( bephi( nhsa, nspin*nrcx ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_mainvar ', ' unable to allocate becphi ', ierr )
@ -281,12 +296,14 @@ MODULE cp_main_variables
IF( ALLOCATED( sfac ) ) DEALLOCATE( sfac )
IF( ALLOCATED( eigrb ) ) DEALLOCATE( eigrb )
IF( ALLOCATED( irb ) ) DEALLOCATE( irb )
IF( ALLOCATED( iabox ) ) DEALLOCATE( iabox )
IF( ALLOCATED( rhor ) ) DEALLOCATE( rhor )
IF( ALLOCATED( rhos ) ) DEALLOCATE( rhos )
IF( ALLOCATED( rhog ) ) DEALLOCATE( rhog )
IF( ALLOCATED( drhog ) ) DEALLOCATE( drhog )
IF( ALLOCATED( drhor ) ) DEALLOCATE( drhor )
IF( ALLOCATED( bec_bgrp ) ) DEALLOCATE( bec_bgrp )
IF( ALLOCATED( bec_d ) ) DEALLOCATE( bec_d )
IF( ALLOCATED( becdr_bgrp ) ) DEALLOCATE( becdr_bgrp )
IF( ALLOCATED( bephi ) ) DEALLOCATE( bephi )
IF( ALLOCATED( becp_bgrp ) ) DEALLOCATE( becp_bgrp )
View File
@ -7,8 +7,8 @@
!
!
!----------------------------------------------------------------------------
SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
cm_bgrp, phi_bgrp, enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
SUBROUTINE move_electrons_x( nfi, tprint, tfirst, tlast, b1, b2, b3, fion, &
enthal, enb, enbi, fccc, ccc, dt2bye, stress, l_cprestart )
!----------------------------------------------------------------------------
!
! ... this routine updates the electronic degrees of freedom
@ -17,10 +17,11 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
USE control_flags, ONLY : lwf, tfor, tprnfor, thdyn
USE cg_module, ONLY : tcg
USE cp_main_variables, ONLY : eigr, irb, eigrb, rhog, rhos, rhor, drhor, &
drhog, sfac, ema0bg, bec_bgrp, becdr_bgrp, &
drhog, sfac, ema0bg, bec_bgrp, becdr_bgrp, &
taub, lambda, lambdam, lambdap, vpot, dbec, idesc
USE cell_base, ONLY : omega, ibrav, h, press
USE uspp, ONLY : becsum, vkb, nkb, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE energies, ONLY : ekin, enl, entropy, etot
USE electrons_base, ONLY : nbsp, nspin, f, nudx, nupdwn, nbspx_bgrp, nbsp_bgrp
USE core, ONLY : rhoc
@ -43,16 +44,16 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
USE electrons_module, ONLY : distribute_c, collect_c, distribute_b
USE gvect, ONLY : eigts1, eigts2, eigts3
USE control_flags, ONLY : lwfpbe0nscf ! exx_wf related
USE wavefunctions, ONLY : cv0 ! Lingzhu Kong
USE wavefunctions, ONLY : cv0, c0_bgrp, cm_bgrp, phi, c0_d, cm_d
USE xc_lib, ONLY : xclib_dft_is, exx_is_active
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nfi
LOGICAL, INTENT(IN) :: tfirst, tlast
LOGICAL, INTENT(IN) :: tprint, tfirst, tlast
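! tprint flags a printout step: force and stress contributions that are
! needed only for output are skipped when it is .FALSE.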
REAL(DP), INTENT(IN) :: b1(3), b2(3), b3(3)
REAL(DP) :: fion(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:), phi_bgrp(:,:)
REAL(DP), INTENT(IN) :: dt2bye
REAL(DP) :: fccc, ccc
REAL(DP) :: enb, enbi
@ -67,10 +68,14 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
CALL start_clock('move_electrons')
electron_dynamic: IF ( tcg ) THEN
!
#if defined (__CUDA)
CALL errore(' move_electrons ', ' GPU version of runcg not yet implemented ', 1 )
#else
CALL runcg_uspp( nfi, tfirst, tlast, eigr, bec_bgrp, irb, eigrb, &
rhor, rhog, rhos, rhoc, eigts1, eigts2, eigts3, sfac, &
fion, ema0bg, becdr_bgrp, lambdap, lambda, SIZE(lambda,1), vpot, c0_bgrp, &
cm_bgrp, phi_bgrp, dbec, l_cprestart )
cm_bgrp, phi, dbec, l_cprestart )
#endif
!
CALL compute_stress( stress, detot, h, omega )
!
@ -80,7 +85,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
CALL get_wannier_center( tfirst, cm_bgrp, bec_bgrp, eigr, &
eigrb, taub, irb, ibrav, b1, b2, b3 )
!
CALL rhoofr( nfi, c0_bgrp, irb, eigrb, bec_bgrp, dbec, becsum, rhor, &
CALL rhoofr( nfi, c0_bgrp, c0_d, bec_bgrp, dbec, becsum, rhor, &
drhor, rhog, drhog, rhos, enl, denl, ekin, dekin6 )
!
!=================================================================
@ -105,7 +110,7 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
!=================================================================
! ... put core charge (if present) in rhoc(r)
!
IF ( nlcc_any ) CALL set_cc( irb, eigrb, rhoc )
IF ( nlcc_any ) CALL set_cc( rhoc )
!
IF ( lwf ) THEN
!
@ -154,9 +159,10 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
!
!=======================================================================
!
CALL newd( vpot, irb, eigrb, becsum, fion )
CALL newd( vpot, becsum, fion, tprint )
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
IF( force_pairing ) THEN
!
@ -165,23 +171,29 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
!
ELSE
!
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, cm_bgrp )
CALL runcp_uspp( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d )
!
ENDIF
!
CALL dev_memcpy( cm_d, cm_bgrp ) ! cm contains the updated wavefunctions
!
!----------------------------------------------------------------------
! contribution to fion due to lambda
!----------------------------------------------------------------------
!
! ... nlfq needs deeq bec
!
IF ( tfor .OR. tprnfor ) THEN
CALL nlfq_bgrp( c0_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
IF ( tfor .OR. ( tprnfor .AND. tprint ) ) THEN
#if defined (__CUDA)
CALL nlfq_bgrp( c0_d, vkb_d, bec_bgrp, becdr_bgrp, fion )
#else
CALL nlfq_bgrp( c0_bgrp, vkb, bec_bgrp, becdr_bgrp, fion )
#endif
END IF
!
IF ( (tfor.or.tprnfor) .AND. tefield ) &
IF ( (tfor.or.(tprnfor.AND.tprint)) .AND. tefield ) &
CALL bforceion( fion, .TRUE. , ipolp, qmat, bec_bgrp, becdr_bgrp, gqq, evalue )
IF ( (tfor.or.tprnfor) .AND. tefield2 ) &
IF ( (tfor.or.(tprnfor.AND.tprint)) .AND. tefield2 ) &
CALL bforceion( fion, .TRUE. , ipolp2, qmat2, bec_bgrp, becdr_bgrp, gqq2, evalue2 )
!
IF( force_pairing ) THEN
@ -199,13 +211,17 @@ SUBROUTINE move_electrons_x( nfi, tfirst, tlast, b1, b2, b3, fion, c0_bgrp, &
! ... calphi calculates phi
! ... the electron mass rises with g**2
!
CALL calphi_bgrp( c0_bgrp, ngw, bec_bgrp, nkb, vkb, phi_bgrp, nbspx_bgrp, ema0bg )
#if defined (__CUDA)
CALL calphi_bgrp( c0_d, ngw, bec_bgrp, nkb, vkb_d, phi, nbspx_bgrp, ema0bg )
#else
CALL calphi_bgrp( c0_bgrp, ngw, bec_bgrp, nkb, vkb, phi, nbspx_bgrp, ema0bg )
#endif
!
! ... begin try and error loop (only one step!)
!
! ... nlfl and nlfh need: lambda (guessed) becdr
!
IF ( tfor .OR. tprnfor ) THEN
IF ( tfor .OR. (tprnfor .AND. tprint) ) THEN
CALL nlfl_bgrp( bec_bgrp, becdr_bgrp, lambda, idesc, fion )
END IF
!
View File
@ -6,7 +6,7 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------
SUBROUTINE newd(vr,irb,eigrb,rhovan,fion)
SUBROUTINE newd( vr, rhovan, fion, tprint )
!-----------------------------------------------------------------------
!
! this routine calculates array deeq:
@ -32,28 +32,28 @@
my_bgrp_id, nbgrp
USE fft_interfaces, ONLY: invfft
USE fft_base, ONLY: dfftb, dfftp
USE cp_main_variables,ONLY: irb, eigrb, iabox, nabox
!
IMPLICIT NONE
! input
INTEGER irb(3,nat)
REAL(DP) rhovan(nhm*(nhm+1)/2,nat,nspin)
COMPLEX(DP) eigrb(ngb,nat)
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
REAL(DP) vr(dfftp%nnr,nspin)
LOGICAL, INTENT(IN) :: tprint
! output
REAL(DP) fion(3,nat)
! local
INTEGER isup,isdw,iss, iv,ijv,jv, ik, nfft, ia, is, ig
REAL(DP) fac, fac1, fac2, res
COMPLEX(DP) ci, facg1, facg2
INTEGER :: isup,isdw,iss, iv,ijv,jv, ik, nfft, ia, iia, is, ig
REAL(DP) :: fac, fac1, fac2, res
COMPLEX(DP), PARAMETER :: ci = (0.d0,1.d0)
COMPLEX(DP) :: facg1, facg2
COMPLEX(DP), ALLOCATABLE :: qv(:), fg1(:), fg2(:)
REAL(DP), ALLOCATABLE :: fvan(:,:)
INTEGER :: na_bgrp, ia_bgrp
INTEGER :: mytid, ntids
#if defined(_OPENMP)
INTEGER :: itid, mytid, ntids, omp_get_thread_num, omp_get_num_threads
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
!
IF ( dfftb%nr1==0 .OR. dfftb%nr2==0 .OR. dfftb%nr3==0 ) THEN
RETURN
@ -62,263 +62,151 @@
CALL start_clock( 'newd' )
ALLOCATE( fvan( 3, nat ) )
ci=(0.d0,1.d0)
fac=omegab/DBLE(dfftb%nr1*dfftb%nr2*dfftb%nr3)
deeq (:,:,:,:) = 0.d0
fvan (:,:) = 0.d0
fac=omegab/DBLE(dfftb%nr1*dfftb%nr2*dfftb%nr3)
!$omp parallel default(none) &
!$omp shared(ngb, nh, qgb, eigrb, dfftb, irb, vr, ci, deeq, &
!$omp fac, nspin, my_bgrp_id, nbgrp, ityp, upf, nat ) &
!$omp private(mytid, ntids, is, ia, nfft, iv, jv, ijv, ig, qv, fg1, fg2, itid, res, iss )
!$omp shared(ngb, nh, qgb, eigrb, dfftb, irb, vr, deeq, tfor, thdyn, tprnfor, tprint, nabox, &
!$omp fac, nspin, my_bgrp_id, nbgrp, ityp, upf, nat, fvan, tpibab, gxb, rhovan, iabox ) &
!$omp private(mytid, ntids, is, ia, iia, nfft, iv, jv, ijv, ig, qv, fg1, fg2, res, &
!$omp iss, isup, isdw, fac2, facg1, fac1 )
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#else
mytid = 0
ntids = 1
#endif
!$omp workshare
deeq (:,:,:,:) = 0.d0
fvan (:,:) = 0.d0
!$omp end workshare
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
!
! calculation of deeq_i,lm = \int V_eff(r) q_i,lm(r) dr
!
DO ia = 1, nat
is = ityp(ia)
IF( .NOT. upf(is)%tvanp ) &
CYCLE
#if defined(__MPI)
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
!
! calculation of deeq_i,lm = \int V_eff(r) q_i,lm(r) dr
!
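! loop only over the augmentation boxes assigned to this process
! (iabox); boxes are distributed round robin over the OpenMP threads
! via MOD( iia - 1, ntids )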
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox( iia )
is = ityp(ia)
nfft = 1
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
CALL fft_oned2box( qv, fg1 )
CALL invfft( qv, dfftb, ia )
DO iss=1,nspin
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
deeq(iv,jv,ia,iss) = fac * res
IF (iv.NE.jv) &
& deeq(jv,iv,ia,iss)=deeq(iv,jv,ia,iss)
END DO
END DO
END DO
END IF
#endif
nfft = 1
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
END IF
#endif
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF (nfft.EQ.2) THEN
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
fg2 = eigrb(1:ngb,ia+1)*qgb(1:ngb,ijv,is)
CALL fft_oned2box( qv, fg1, fg2 )
ELSE
fg1 = eigrb(1:ngb,ia )*qgb(1:ngb,ijv,is)
CALL fft_oned2box( qv, fg1 )
END IF
!
CALL invfft( qv, dfftb, ia )
!
DO iss=1,nspin
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
deeq(iv,jv,ia,iss) = fac * res
IF (iv.NE.jv) &
& deeq(jv,iv,ia,iss)=deeq(iv,jv,ia,iss)
IF (nfft.EQ.2) THEN
res = boxdotgrid(irb(:,ia+1),2,qv,vr(:,iss))
deeq(iv,jv,ia+1,iss) = fac * res
IF (iv.NE.jv) &
& deeq(jv,iv,ia+1,iss)=deeq(iv,jv,ia+1,iss)
END IF
END DO
END DO
END DO
END DO
IF ( tfor .OR. thdyn .OR. (tprnfor.AND.tprint) ) THEN
!
! calculation of fion_i = \int V_eff(r) \sum_lm rho_lm (dq_i,lm(r)/dR_i) dr
!
IF( nspin == 1 ) THEN
!
! case nspin=1: two ffts at the same time, on two atoms (if possible)
!
iss=1
nfft=1
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox( iia )
is = ityp(ia)
DO ik=1,3
qv(:) = (0.d0, 0.d0)
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,iss)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,iss)
ENDIF
DO ig=1,ngb
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is)*fac1
fg1(ig) = eigrb(ig,ia )*facg1
END DO
CALL fft_add_oned2box( qv, fg1 )
END DO
END DO
CALL invfft( qv, dfftb, ia)
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
fvan(ik,ia) = res
END DO
END IF
END DO
ELSE
!
! case nspin=2: up and down spin fft's combined into a single fft
!
isup=1
isdw=2
DO iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox( iia )
is = ityp(ia)
DO ik=1,3
qv(:) = (0.d0, 0.d0)
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,isup)
fac2=2.d0*fac*tpibab*rhovan(ijv,ia,isdw)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,isup)
fac2= fac*tpibab*rhovan(ijv,ia,isdw)
END IF
DO ig=1,ngb
fg1(ig) = fac1 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
fg2(ig) = fac2 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
END DO
CALL fft_add_oned2box( qv, fg1, fg2 )
END DO
END DO
CALL invfft( qv, dfftb, ia)
fvan(ik,ia) = &
& boxdotgrid(irb(:,ia),isup,qv,vr(:,isup)) + &
& boxdotgrid(irb(:,ia),isdw,qv,vr(:,isdw))
END DO
END IF
END DO
END IF
END IF
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!$omp end parallel
CALL mp_sum( deeq, intra_bgrp_comm )
CALL mp_sum( deeq, inter_bgrp_comm )
IF (.NOT.( tfor .OR. thdyn .OR. tprnfor ) ) go to 10
!
! calculation of fion_i = \int V_eff(r) \sum_lm rho_lm (dq_i,lm(r)/dR_i) dr
!
IF( nspin == 1 ) THEN
! =================================================================
! case nspin=1: two ffts at the same time, on two atoms (if possible)
! -----------------------------------------------------------------
!$omp parallel default(none) &
!$omp shared(nat, ngb, nh, qgb, eigrb, dfftb, irb, vr, ci, deeq, &
!$omp fac, nspin, rhovan, tpibab, gxb, fvan, my_bgrp_id, nbgrp, ityp, upf ) &
!$omp private(mytid, ntids, is, ia, ik, nfft, iv, jv, ijv, ig, qv, itid, res, iss, &
!$omp fac1, fac2, facg1, facg2, fg1, fg2 )
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
iss=1
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
itid = 0
#endif
DO ia = 1, nat
is = ityp(ia)
IF( .NOT. upf(is)%tvanp ) &
CYCLE
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
END IF
#endif
nfft=1
#if defined(_OPENMP)
IF ( mytid /= itid ) THEN
itid = MOD( itid + 1, ntids )
CYCLE
ELSE
itid = MOD( itid + 1, ntids )
END IF
#endif
DO ik=1,3
qv(:) = (0.d0, 0.d0)
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,iss)
IF (nfft.EQ.2) fac2=2.d0*fac*tpibab* &
& rhovan(ijv,ia+1,iss)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,iss)
IF (nfft.EQ.2) fac2= fac*tpibab* &
& rhovan(ijv,ia+1,iss)
ENDIF
IF (nfft.EQ.2) THEN
DO ig=1,ngb
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is) * fac1
facg2 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is) * fac2
fg1(ig) = eigrb(ig,ia )*facg1
fg2(ig) = eigrb(ig,ia+1)*facg2
END DO
CALL fft_add_oned2box( qv, fg1, fg2 )
ELSE
DO ig=1,ngb
facg1 = CMPLX(0.d0,-gxb(ik,ig),kind=DP) * qgb(ig,ijv,is)*fac1
fg1(ig) = eigrb(ig,ia )*facg1
END DO
CALL fft_add_oned2box( qv, fg1 )
END IF
END DO
END DO
!
CALL invfft( qv, dfftb, ia)
!
res = boxdotgrid(irb(:,ia),1,qv,vr(:,iss))
fvan(ik,ia) = res
!
IF (nfft.EQ.2) THEN
res = boxdotgrid(irb(:,ia+1),2,qv,vr(:,iss))
fvan(ik,ia+1) = res
END IF
END DO
END DO
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
!$omp end parallel
ELSE
! =================================================================
! case nspin=2: up and down spin fft's combined into a single fft
! -----------------------------------------------------------------
ALLOCATE( qv( dfftb%nnr ) )
ALLOCATE( fg1( ngb ) )
ALLOCATE( fg2( ngb ) )
isup=1
isdw=2
DO ia = 1, nat
is = ityp(ia)
IF( .NOT. upf(is)%tvanp ) &
CYCLE
#if defined(__MPI)
IF ( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
END IF
#endif
DO ik=1,3
qv(:) = (0.d0, 0.d0)
!
DO iv=1,nh(is)
DO jv=iv,nh(is)
ijv = (jv-1)*jv/2 + iv
IF(iv.NE.jv) THEN
fac1=2.d0*fac*tpibab*rhovan(ijv,ia,isup)
fac2=2.d0*fac*tpibab*rhovan(ijv,ia,isdw)
ELSE
fac1= fac*tpibab*rhovan(ijv,ia,isup)
fac2= fac*tpibab*rhovan(ijv,ia,isdw)
END IF
DO ig=1,ngb
fg1(ig) = fac1 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
fg2(ig) = fac2 * CMPLX(0.d0,-gxb(ik,ig),kind=DP) * &
& qgb(ig,ijv,is) * eigrb(ig,ia)
END DO
CALL fft_add_oned2box( qv, fg1, fg2 )
END DO
END DO
!
CALL invfft( qv, dfftb, ia)
!
fvan(ik,ia) = &
& boxdotgrid(irb(:,ia),isup,qv,vr(:,isup)) + &
& boxdotgrid(irb(:,ia),isdw,qv,vr(:,isdw))
END DO
END DO
DEALLOCATE( qv )
DEALLOCATE( fg1 )
DEALLOCATE( fg2 )
IF ( tfor .OR. thdyn .OR. (tprnfor.AND.tprint) ) THEN
CALL mp_sum( fvan, intra_bgrp_comm )
CALL mp_sum( fvan, inter_bgrp_comm )
fion(:,:) = fion(:,:) - fvan(:,:)
END IF
CALL mp_sum( fvan, intra_bgrp_comm )
CALL mp_sum( fvan, inter_bgrp_comm )
fion(:,:) = fion(:,:) - fvan(:,:)
10 CONTINUE
CALL mp_sum( deeq, intra_bgrp_comm )
CALL mp_sum( deeq, inter_bgrp_comm )
!
CALL stop_clock( 'newd' )
!
View File
@ -5,108 +5,15 @@
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!-----------------------------------------------------------------------
SUBROUTINE beta_eigr_x ( beigr, nspmn, nspmx, eigr, pptype_ )
!-----------------------------------------------------------------------
! computes: the array becp
! beigr(ig,iv)=
! = [(-i)**l beta(g,iv,is) e^(-ig.r_ia)]^*
!
! routine makes use of c*(g)=c(-g) (g> see routine ggen)
! input : beta(ig,l,is), eigr, c
! output: becp as parameter
!
USE kinds, ONLY : DP
USE ions_base, only : nat, nsp, ityp
USE gvecw, only : ngw
USE uspp, only : nkb, nhtol, beta, indv_ijkb0
USE uspp_param, only : nh, upf, nhm
!
USE gvect, ONLY : gstart
!
implicit none
integer, intent(in) :: nspmn, nspmx
complex(DP), intent(in) :: eigr( :, : )
complex(DP), intent(out) :: beigr( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
! pptype_: pseudo type to process: 0 = all, 1 = norm-cons, 2 = ultra-soft
!
integer :: ig, is, iv, ia, l, inl
complex(DP) :: cfact
integer :: pptype
!
call start_clock( 'beta_eigr' )
IF( PRESENT( pptype_ ) ) THEN
pptype = pptype_
ELSE
pptype = 0
END IF
!allocate( wrk2( ngw, nkb ) )
beigr = 0.0d0
!$omp parallel default(none), &
!$omp shared(nat,ngw,nh,nhtol,beigr,beta,eigr,ityp,pptype,nspmn,nspmx,upf,gstart,indv_ijkb0), &
!$omp private(is,ia,iv,inl,l,cfact,ig)
!$omp do
DO ia = 1, nat
is = ityp(ia)
!
IF( pptype == 2 .AND. .NOT. upf(is)%tvanp ) CYCLE
IF( pptype == 1 .AND. upf(is)%tvanp ) CYCLE
IF( is >= nspmn .AND. is <= nspmx ) THEN
!
inl = indv_ijkb0(ia)
do iv = 1, nh( is )
!
l = nhtol( iv, is )
!
if (l == 0) then
cfact = cmplx( 1.0_dp , 0.0_dp )
else if (l == 1) then
cfact = - cmplx( 0.0_dp , 1.0_dp )
else if (l == 2) then
cfact = - cmplx( 0.0_dp , 1.0_dp )
cfact = cfact * cfact
else if (l == 3) then
cfact = - cmplx( 0.0_dp , 1.0_dp )
cfact = cfact * cfact * cfact
endif
!
! q = 0 component (with weight 1.0)
!
if (gstart == 2) then
beigr( 1, iv + inl ) = cfact * beta(1,iv,is) * eigr(1,ia)
end if
!
! q > 0 components (with weight 2.0)
!
do ig = gstart, ngw
beigr( ig, iv + inl ) = 2.0d0 * cfact * beta(ig,iv,is) * eigr(ig,ia)
end do
!
end do
!
END IF
END DO
!$omp end do
!$omp end parallel
call stop_clock( 'beta_eigr' )
RETURN
END SUBROUTINE beta_eigr_x
!-----------------------------------------------------------------------
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
!
!
!-----------------------------------------------------------------------
subroutine nlsm1us_x ( n, beigr, c, becp )
subroutine nlsm1us_x ( n, betae, c, becp )
!-----------------------------------------------------------------------
! computes: the array becp
@ -124,17 +31,36 @@
USE mp_global, ONLY : nproc_bgrp, intra_bgrp_comm
USE gvecw, only : ngw
USE uspp, only : nkb
USE gvect, ONLY : gstart
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
!
implicit none
integer, intent(in) :: n
complex(DP), intent(in) :: beigr( :, : ), c( :, : )
real(DP), intent(out) :: becp( :, : )
complex(DP) DEVICEATTR , intent(in) :: c( :, : )
complex(DP) DEVICEATTR , intent(inout) :: betae( :, : )
real(DP) DEVICEATTR , intent(out) :: becp( :, : )
INTEGER :: i
!
call start_clock( 'nlsm1us' )
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 1.0d0, beigr, 2*ngw, c, 2*ngw, 0.0d0, becp, nkb )
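! in-place Gamma-point weighting: the g = 0 row must enter the sum with
! weight 1 while the GEMM uses alpha = 2, so betae(1,:) is pre-scaled
! by 0.5 and restored afterwards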
IF( gstart > 1 ) THEN
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, nkb
betae( 1, i ) = 0.5d0 * betae( 1, i )
END DO
END IF
CALL MYDGEMM( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becp, nkb )
IF( gstart > 1 ) THEN
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, nkb
betae( 1, i ) = 2.0d0 * betae( 1, i )
END DO
END IF
END IF
IF( nproc_bgrp > 1 ) THEN
@ -149,7 +75,7 @@
!
!
!-----------------------------------------------------------------------
subroutine nlsm1_x ( n, nspmn, nspmx, eigr, c, becp, pptype_ )
subroutine nlsm1_x ( n, betae, c, becp, pptype_ )
!-----------------------------------------------------------------------
! computes: the array becp
@ -170,21 +96,20 @@
USE uspp, only : nkb, nhtol, beta, indv_ijkb0
USE uspp_param, only : nh, upf, nhm
USE gvect, ONLY : gstart
USE cp_interfaces, only : beta_eigr
!
implicit none
integer, intent(in) :: n, nspmn, nspmx
complex(DP), intent(in) :: eigr( :, : ), c( :, : )
real(DP), intent(out) :: becp( :, : )
integer, intent(in) :: n
complex(DP), intent(in) :: c( :, : )
complex(DP), intent(inout) :: betae( :, : )
real(DP), intent(out) :: becp( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
! pptype_: pseudo type to process: 0 = all, 1 = norm-cons, 2 = ultra-soft
!
integer :: ig, is, iv, ia, l, inl
real(DP), allocatable :: becps( :, : )
complex(DP), allocatable :: wrk2( :, : )
complex(DP) :: cfact
integer :: ig, is, iv, ia, l, inl
integer :: pptype
real(DP), allocatable :: becps( :, : )
LOGICAL :: nothing_to_do
!
call start_clock( 'nlsm1' )
@ -194,21 +119,40 @@
pptype = 0
END IF
allocate( wrk2( ngw, nkb ) )
allocate( becps( SIZE(becp,1), SIZE(becp,2) ) )
CALL beta_eigr ( wrk2, nspmn, nspmx, eigr, pptype_ )
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 1.0d0, wrk2, 2*ngw, c, 2*ngw, 0.0d0, becps, nkb )
IF( pptype == 1 ) THEN
nothing_to_do = .TRUE.
do is = 1, nsp
IF( .NOT. upf(is)%tvanp ) THEN
nothing_to_do = .FALSE.
END IF
END DO
IF( nothing_to_do ) GO TO 100
END IF
DEALLOCATE( wrk2 )
IF( pptype == 0 ) THEN
IF( ngw > 0 .AND. nkb > 0 ) THEN
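! same in-place Gamma-point weighting as in nlsm1us_x above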
IF( gstart > 1 ) betae( 1, : ) = 0.5d0 * betae( 1, : )
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becp, SIZE(becp,1) )
IF( gstart > 1 ) betae( 1, : ) = 2.0d0 * betae( 1, : )
END IF
IF( nproc_bgrp > 1 ) THEN
CALL mp_sum( becp, intra_bgrp_comm )
END IF
GO TO 100
END IF
allocate( becps( SIZE(becp,1), SIZE(becp,2) ) )
IF( ngw > 0 .AND. nkb > 0 ) THEN
IF( gstart > 1 ) betae( 1, : ) = 0.5d0 * betae( 1, : )
CALL dgemm( 'T', 'N', nkb, n, 2*ngw, 2.0d0, betae, 2*ngw, c, 2*ngw, 0.0d0, becps, nkb )
IF( gstart > 1 ) betae( 1, : ) = 2.0d0 * betae( 1, : )
END IF
IF( nproc_bgrp > 1 ) THEN
CALL mp_sum( becps, intra_bgrp_comm )
END IF
do is = nspmn, nspmx
do is = 1, nsp
IF( pptype == 2 .AND. .NOT. upf(is)%tvanp ) CYCLE
IF( pptype == 1 .AND. upf(is)%tvanp ) CYCLE
DO ia = 1, nat
@ -218,21 +162,21 @@
becp(inl+iv,:) = becps( inl+iv, : )
end do
END IF
end do
end do
end do
!
DEALLOCATE( becps )
100 CONTINUE
call stop_clock( 'nlsm1' )
return
end subroutine nlsm1_x
!-----------------------------------------------------------------------
!-------------------------------------------------------------------------
subroutine nlsm2_bgrp_x( ngw, nkb, eigr, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
subroutine nlsm2_bgrp_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
!-----------------------------------------------------------------------
! computes: the array becdr
@ -245,9 +189,6 @@
!
USE kinds, ONLY : DP
use ions_base, only : nsp, ityp, nat
use uspp, only : nhtol, beta, indv_ijkb0
use uspp_param, only : nh, upf
use cell_base, only : tpiba
use mp, only : mp_sum
use mp_global, only : nproc_bgrp, intra_bgrp_comm
@ -256,69 +197,40 @@
implicit none
integer, intent(in) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
complex(DP), intent(in) :: eigr(:,:), c_bgrp(:,:)
complex(DP), intent(in) :: betae(:,:), c_bgrp(:,:)
real(DP), intent(out) :: becdr_bgrp(:,:,:)
!
complex(DP), allocatable :: wrk2(:,:)
!
integer :: ig, is, iv, ia, k, l, inl
complex(DP) :: cfact
integer :: ig, iv, k, info
complex(DP) :: cfact1, cfact2
!
call start_clock( 'nlsm2' )
allocate( wrk2( ngw, nkb ) )
cfact2 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba * 2.0d0
cfact1 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba
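! cfact1/cfact2 carry the -i*tpiba derivative factor with Gamma-point
! weights: 1 for the g = 0 row, 2 for the g > 0 rows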
allocate( wrk2, MOLD = betae, STAT = info )
IF( info /= 0 ) &
CALL errore( ' nlsm2 ', ' allocating wrk2', ABS( info ) )
!
DO k = 1, 3
!
do iv=1,nkb
wrk2(1,iv) = cfact1 * g(k,1) * betae(1,iv)
end do
!$omp parallel do default(shared) private(iv,ig) collapse(2)
do iv=1,nkb
do ig=gstart,ngw
wrk2(ig,iv) = cfact2 * g(k,ig) * betae(ig,iv)
end do
end do
!$omp end parallel do
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1), 2*ngw, &
c_bgrp, 2*ngw, 0.0d0, becdr_bgrp( 1, 1, k ), nkb )
END IF
end do
deallocate( wrk2 )
IF( nproc_bgrp > 1 ) THEN
@ -331,7 +243,86 @@
end subroutine nlsm2_bgrp_x
!-----------------------------------------------------------------------
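!
! A note on the prefactors used above: betae is expected to carry the
! (-i)**l phase and the structure factor e^(iG.r_ia), so the derivative with
! respect to the atomic position only brings down an extra factor -i*G*tpiba.
! The G=0 term enters with weight 1 and the G>0 terms with weight 2, again
! from c(-G) = conjg(c(G)); equivalently (sketch):
!
!    cfact1 = -CMPLX( 0.0_dp, 1.0_dp, KIND=dp ) * tpiba   ! G = 0, weight 1
!    cfact2 = 2.0_dp * cfact1                             ! G > 0, weight 2
!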
#if defined (__CUDA)
!-------------------------------------------------------------------------
subroutine nlsm2_bgrp_gpu_x( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
!-----------------------------------------------------------------------
! computes: the array becdr
! becdr(ia,n,iv,is,k)
! =2.0 sum_g> g_k beta(g,iv,is) re[ (i)**(l+1) e^(ig.r_ia) c(g,n)]
!
! routine makes use of c*(g)=c(-g) (g> see routine ggen)
! input : eigr, c
! output: becdr
!
USE kinds, ONLY : DP
use cell_base, only : tpiba
use mp, only : mp_sum
use mp_global, only : nproc_bgrp, intra_bgrp_comm
use gvect, only : gstart
use gvect_gpum, only : g_d
USE device_memcpy_m, ONLY : dev_memcpy
USE cudafor
USE cublas
!
implicit none
integer, intent(in) :: ngw, nkb, nbspx_bgrp, nbsp_bgrp
complex(DP), intent(in), DEVICE :: c_bgrp(:,:)
complex(DP), intent(in), DEVICE :: betae(:,:)
real(DP), intent(out) :: becdr_bgrp(:,:,:)
!
complex(DP), allocatable, DEVICE :: wrk2(:,:)
real(DP), allocatable, DEVICE :: becdr_d(:,:)
!
integer :: ig, iv, k, info
complex(DP) :: cfact1, cfact2
!
call start_clock( 'nlsm2' )
ALLOCATE( wrk2, MOLD=betae, STAT = info )
IF( info /= 0 ) &
CALL errore( ' nlsm2 ', ' allocating wrk2', ABS( info ) )
ALLOCATE( becdr_d( SIZE( becdr_bgrp, 1 ), SIZE( becdr_bgrp, 2 ) ), STAT=info )
IF( info /= 0 ) &
CALL errore( ' nlsm2 ', ' allocating becdr_d ', ABS( info ) )
cfact2 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba * 2.0d0
cfact1 = - cmplx( 0.0_dp , 1.0_dp ) * tpiba
DO k = 1, 3
!$cuf kernel do(1) <<<*,*>>>
do iv=1,nkb
wrk2(1,iv) = cfact1 * g_d(k,1) * betae(1,iv)
end do
!$cuf kernel do(2) <<<*,*>>>
do iv=1,nkb
do ig=gstart,ngw
wrk2(ig,iv) = cfact2 * g_d(k,ig) * betae(ig,iv)
end do
end do
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL MYDGEMM( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1), 2*ngw, &
c_bgrp, 2*ngw, 0.0d0, becdr_d, nkb )
CALL dev_memcpy( becdr_bgrp(:,:,k), becdr_d )
END IF
end do
DEALLOCATE( becdr_d )
deallocate( wrk2 )
IF( nproc_bgrp > 1 ) THEN
CALL mp_sum( becdr_bgrp, intra_bgrp_comm )
END IF
call stop_clock( 'nlsm2' )
!
return
end subroutine nlsm2_bgrp_gpu_x
!-----------------------------------------------------------------------
#endif
!-----------------------------------------------------------------------
SUBROUTINE ennl_x( ennl_val, rhovan, bec_bgrp )
@ -356,35 +347,40 @@
! local
!
real(DP) :: sumt, sums(2), ennl_t
integer :: is, iv, jv, ijv, inl, jnl, ia, iss, i, indv
INTEGER :: omp_get_num_threads
!
ennl_t = 0.d0
!
!$omp parallel num_threads(min(4,omp_get_num_threads())) default(none) &
!$omp shared(nat,ityp,indv_ijkb0,nh,nbsp_bgrp,ispin_bgrp,f_bgrp,bec_bgrp,rhovan,dvan,nspin,ennl_t) &
!$omp private(ia,is,indv,iv,inl,jv,ijv,jnl,sums,iss,i,sumt)
!$omp do reduction(+:ennl_t)
do ia = 1, nat
is = ityp(ia)
indv = indv_ijkb0(ia)
do iv = 1, nh(is)
inl = indv + iv
do jv = iv, nh(is)
ijv = (jv-1)*jv/2 + iv
jnl = indv + jv
sums = 0.d0
do i = 1, nbsp_bgrp
iss = ispin_bgrp(i)
sums(iss) = sums(iss) + f_bgrp(i) * bec_bgrp(inl,i) * bec_bgrp(jnl,i)
end do
sumt = 0.d0
do iss = 1, nspin
rhovan( ijv, ia, iss ) = sums( iss )
sumt = sumt + sums( iss )
end do
if( iv .ne. jv ) sumt = 2.d0 * sumt
ennl_t = ennl_t + sumt * dvan( jv, iv, is)
end do
end do
end do
!$omp end do
!$omp end parallel
!
ennl_val = ennl_t
!
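!
! The pair (iv,jv) with jv >= iv is stored in packed upper-triangular order,
! ijv = (jv-1)*jv/2 + iv, the same layout used for rhovan.  A minimal check
! of the indexing (sketch):
!
!    ijv = 0
!    do jv = 1, nh(is)
!       do iv = 1, jv
!          ijv = ijv + 1        ! here ijv == (jv-1)*jv/2 + iv
!       end do
!    end do
!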
@ -435,10 +431,8 @@
end subroutine calrhovan_x
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
subroutine calbec_x ( n, betae, c, bec, pptype_ )
!-----------------------------------------------------------------------
! this routine calculates array bec
@ -450,59 +444,79 @@
!
USE kinds, ONLY : DP
use electrons_base, only : nbsp
use cp_interfaces, only : nlsm1
!
implicit none
!
INTEGER, INTENT(IN) :: n
real(DP), intent(out) :: bec( :, : )
complex(DP), intent(in) :: c( :, : )
complex(DP), intent(inout) :: betae( :, : )
INTEGER, INTENT(IN), OPTIONAL :: pptype_
! local variables
!
call start_clock( 'calbec' )
!
call nlsm1( n, betae, c, bec, pptype_ )
call stop_clock( 'calbec' )
!
return
end subroutine calbec_x
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
SUBROUTINE dbeta_eigr_x( dbeigr, eigr )
!-----------------------------------------------------------------------
!
USE kinds, ONLY : DP
use ions_base, only : nat, ityp
use uspp, only : nhtol, nkb, dbeta, indv_ijkb0
use uspp_param, only : nh, nhm
use gvect, only : gstart
use gvecw, only : ngw
!
implicit none
!
include 'laxlib.fh'
!
complex(DP), intent(out) :: dbeigr( :, :, :, : )
complex(DP), intent(in) :: eigr(:,:)
!
integer :: ig, is, iv, ia, l, inl, i, j
complex(DP) :: cfact(4)
!
!if (l == 0) then
cfact(1) = cmplx( 1.0_dp , 0.0_dp )
!else if (l == 1) then
cfact(2) = - cmplx( 0.0_dp , 1.0_dp )
!else if (l == 2) then
cfact(3) = - cmplx( 0.0_dp , 1.0_dp )
cfact(3) = cfact(3) * cfact(3)
!else if (l == 3) then
cfact(4) = - cmplx( 0.0_dp , 1.0_dp )
cfact(4) = cfact(4) * cfact(4) * cfact(4)
!endif
do j=1,3
do i=1,3
do ia = 1, nat
is = ityp(ia)
inl = indv_ijkb0(ia)
do iv=1,nh(is)
l=nhtol(iv,is)
! q = 0 component (with weight 1.0)
dbeigr(1,iv+inl,i,j)= cfact(l+1)*dbeta(1,iv,is,i,j)*eigr(1,ia)
! q > 0 components (with weight 2.0)
do ig = gstart, ngw
dbeigr(ig,iv+inl,i,j) = 2.0d0*cfact(l+1)*dbeta(ig,iv,is,i,j)*eigr(ig,ia)
end do
end do
end do
end do
end do
!
return
end subroutine dbeta_eigr_x
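!
! The cfact table above is just (-i)**l for l = 0..3; an equivalent one-line
! initialization (sketch, using Fortran complex exponentiation):
!
!    do l = 0, 3
!       cfact(l+1) = ( -CMPLX( 0.0_dp, 1.0_dp, KIND=dp ) )**l
!    end do
!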
!-----------------------------------------------------------------------
!-----------------------------------------------------------------------
@ -526,8 +540,9 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
use uspp_param, only : nh, nhm
use gvect, only : gstart
use gvecw, only : ngw
use electrons_base, only : nspin, iupdwn, nupdwn, nbspx_bgrp, iupdwn_bgrp, nupdwn_bgrp, &
ibgrp_g2l, i2gupdwn_bgrp, nbspx, nbsp_bgrp
use cp_interfaces, only : dbeta_eigr
!
implicit none
!
@ -538,7 +553,7 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
real(DP), intent(out) :: dbec( :, :, :, : )
integer, intent(in) :: idesc( :, : )
!
complex(DP), allocatable :: wrk2(:,:,:,:)
real(DP), allocatable :: dwrk_bgrp(:,:)
!
integer :: ig, is, iv, ia, l, inl, i, j, ii, iw, iss, nr, ir, istart, nss
@ -549,47 +564,23 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
!
dbec = 0.0d0
!
allocate( wrk2( ngw, nkb, 3, 3 ) )
allocate( dwrk_bgrp( nkb, nbspx_bgrp ) )
!
CALL dbeta_eigr( wrk2, eigr )
!
do j=1,3
do i=1,3
IF( ngw > 0 .AND. nkb > 0 ) THEN
CALL dgemm( 'T', 'N', nkb, nbsp_bgrp, 2*ngw, 1.0d0, wrk2(1,1,i,j), 2*ngw, &
c_bgrp, 2*ngw, 0.0d0, dwrk_bgrp(1,1), nkb )
END IF
if( nproc_bgrp > 1 ) then
call mp_sum( dwrk_bgrp, intra_bgrp_comm )
end if
do ia = 1, nat
is = ityp(ia)
inl = indv_ijkb0(ia)
do iss=1,nspin
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
nr = idesc( LAX_DESC_NR, iss )
@ -600,19 +591,18 @@ SUBROUTINE caldbec_bgrp_x( eigr, c_bgrp, dbec, idesc )
ibgrp_i = ibgrp_g2l( ii + ir - 1 + istart - 1 )
IF( ibgrp_i > 0 ) THEN
do iw = 1, nh(is)
dbec( inl + iw, ii + (iss-1)*nrcx, i, j ) = dwrk_bgrp( inl + iw, ibgrp_i )
end do
END IF
end do
END IF
end do
end do
end do
end do
deallocate( wrk2 )
deallocate( dwrk_bgrp )
if( nbgrp > 1 ) then
CALL mp_sum( dbec, inter_bgrp_comm )
end if
@ -659,6 +649,11 @@ subroutine dennl_x( bec_bgrp, dbec, drhovan, denl, idesc )
denl=0.d0
drhovan=0.0d0
!$omp parallel default(none) &
!$omp shared(nat,ityp,indv_ijkb0,nh,nbsp_bgrp,ispin_bgrp,f_bgrp,bec_bgrp,drhovan,dvan,nspin,denl) &
!$omp shared(idesc,iupdwn,nupdwn,ibgrp_g2l,nrcx,dbec) &
!$omp private(ia,is,iv,inl,jv,ijv,jnl,dsums,iss,i,dsum,ii,ir,k,j,nr,istart,nss,ibgrp)
!$omp do reduction(+:denl)
do ia=1,nat
is = ityp(ia)
do iv=1,nh(is)
@ -701,6 +696,8 @@ subroutine dennl_x( bec_bgrp, dbec, drhovan, denl, idesc )
end do
end do
end do
!$omp end do
!$omp end parallel
CALL mp_sum( denl, intra_bgrp_comm )
CALL mp_sum( drhovan, intra_bgrp_comm )
@ -720,7 +717,7 @@ end subroutine dennl_x
!-----------------------------------------------------------------------
subroutine nlfq_bgrp_x( c_bgrp, betae, bec_bgrp, becdr_bgrp, fion )
!-----------------------------------------------------------------------
!
! contribution to fion due to nonlocal part
@ -739,12 +736,16 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
!
implicit none
!
COMPLEX(DP), INTENT(IN) :: c_bgrp( :, : )
COMPLEX(DP), INTENT(IN) :: betae( :, : )
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: c_bgrp, betae
#endif
REAL(DP), INTENT(IN) :: bec_bgrp( :, : )
REAL(DP), INTENT(OUT) :: becdr_bgrp( :, :, : )
REAL(DP), INTENT(OUT) :: fion( :, : )
!
integer :: k, is, ia, inl, jnl, iv, jv, i
real(DP) :: temp
real(DP) :: sum_tmpdr
!
@ -758,7 +759,7 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
!
! nlsm2 fills becdr
!
call nlsm2_bgrp( ngw, nkb, betae, c_bgrp, becdr_bgrp, nbspx_bgrp, nbsp_bgrp )
!
allocate ( fion_loc( 3, nat ) )
!
@ -767,17 +768,17 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
!$omp parallel default(none), &
!$omp shared(becdr_bgrp,bec_bgrp,fion_loc,f_bgrp,deeq,dvan,nbsp_bgrp,indv_ijkb0,nh, &
!$omp nat,nhm,nbspx_bgrp,ispin_bgrp,nproc_bgrp,me_bgrp,ityp), &
!$omp private(tmpbec,tmpdr,is,ia,iv,jv,k,inl,jnl,temp,i,mytid,ntids,sum_tmpdr)
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#endif
allocate ( tmpbec( nbspx_bgrp, nhm ), tmpdr( nbspx_bgrp, nhm ) )
DO k = 1, 3
DO ia = 1, nat
is = ityp(ia)
! better if we distribute to MPI tasks too!
@ -790,29 +791,27 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
IF( MOD( ( ia + (k-1)*nat ) / nproc_bgrp, ntids ) /= mytid ) CYCLE
#endif
tmpbec = 0.d0
tmpdr = 0.d0
do jv=1,nh(is)
jnl = indv_ijkb0(ia) + jv
do iv=1,nh(is)
do i = 1, nbsp_bgrp
temp = dvan(iv,jv,is) + deeq(jv,iv,ia,ispin_bgrp( i ) )
tmpbec(i,iv) = tmpbec(i,iv) + temp * bec_bgrp(jnl,i)
end do
end do
end do
do iv = 1, nh(is)
inl = indv_ijkb0(ia) + iv
do i = 1, nbsp_bgrp
tmpdr(i,iv) = f_bgrp( i ) * becdr_bgrp( inl, i, k )
end do
end do
sum_tmpdr = 0.0d0
do iv = 1, nh(is)
do i = 1, nbsp_bgrp
sum_tmpdr = sum_tmpdr + tmpdr(i,iv)*tmpbec(i,iv)
end do
end do
@ -823,7 +822,6 @@ subroutine nlfq_bgrp_x( c_bgrp, eigr, bec_bgrp, becdr_bgrp, fion )
deallocate ( tmpbec, tmpdr )
!$omp end parallel
!
CALL mp_sum( fion_loc, intra_bgrp_comm )
IF( nbgrp > 1 ) THEN

View File

@ -305,7 +305,7 @@
!
!-----------------------------------------------------------------------
subroutine set_cc( rhoc )
!-----------------------------------------------------------------------
!
! Calculate core charge contribution in real space, rhoc(r)
@ -320,91 +320,76 @@
use core, only: rhocb
use fft_interfaces, only: invfft
use fft_base, only: dfftb, dfftp
USE cp_main_variables, ONLY: irb, eigrb
USE mp_global, ONLY: nproc_bgrp, me_bgrp, inter_bgrp_comm, my_bgrp_id, nbgrp
USE mp, ONLY: mp_sum
implicit none
! output
real(dp), intent(out) :: rhoc(dfftp%nnr)
! local
integer :: ig, is, ia, isa, iia
INTEGER :: nabox, iabox( nat )
complex(dp), PARAMETER :: ci = (0.d0,1.d0)
complex(dp), allocatable :: wrk1(:)
      complex(dp), allocatable :: qv(:), fg1(:)
INTEGER :: mytid, ntids
#if defined(_OPENMP)
INTEGER :: omp_get_thread_num, omp_get_num_threads
EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif
!
call start_clock( 'set_cc' )
allocate( wrk1 ( dfftp%nnr ) )
nabox = 0
DO ia = 1, nat
IF( .NOT. upf(ityp(ia))%nlcc ) CYCLE
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2 ( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) CYCLE
nabox = nabox + 1
iabox( nabox ) = ia
END DO
!
!$omp parallel default(none) &
!$omp shared( nsp, na, ngb, eigrb, dfftb, irb, rhocb, &
!$omp nat, wrk1, ityp, nabox, iabox ) &
!$omp private( mytid, ntids, is, ia, iia, ig, qv, fg1 )
allocate( qv ( dfftb%nnr ) )
allocate( fg1 ( ngb ) )
!
!$omp workshare
wrk1 = (0.d0, 0.d0)
!$omp end workshare
!
#if defined(_OPENMP)
mytid = omp_get_thread_num() ! take the thread ID
ntids = omp_get_num_threads() ! take the number of threads
#else
mytid = 0
ntids = 1
#endif
do iia = 1, nabox
IF( MOD( iia - 1, ntids ) == mytid ) THEN
ia = iabox(iia)
is = ityp(ia)
fg1 = eigrb(1:ngb,ia )*rhocb(1:ngb,is)
CALL fft_oned2box( qv, fg1 )
call invfft( qv, dfftb, ia )
call box2grid(irb(:,ia),1,qv,wrk1)
END IF
end do
!
deallocate( fg1 )
deallocate( qv )
!$omp end parallel
CALL mp_sum( wrk1, inter_bgrp_comm )
call dcopy( dfftp%nnr, wrk1, 2, rhoc, 1 )
deallocate( wrk1 )
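!
! The iabox/nabox list built above caches, once per call, the atoms this
! band group actually owns: NLCC atoms whose small box intersects the local
! grid, assigned round-robin by MOD(ia,nbgrp); threads then stride over the
! list.  An equivalent form of the thread test (sketch):
!
!    do iia = 1 + mytid, nabox, ntids   ! same set as MOD(iia-1,ntids)==mytid
!       ia = iabox(iia)
!       ! ... one small-box FFT per owned atom ...
!    end do
!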

View File

@ -6,17 +6,208 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
#if defined(__CUDA)
#define DGEMMDRV cublasDgemm
#define DEVICEATTR ,DEVICE
#else
#define DGEMMDRV dgemm
#define DEVICEATTR
#endif
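! With these macros a single call site drives either the host BLAS or cuBLAS,
! and DEVICEATTR adds the DEVICE attribute only on GPU builds; e.g. (sketch,
! with hypothetical matrices a, b, c):
!
!    REAL(DP), ALLOCATABLE DEVICEATTR :: a(:,:), b(:,:), c(:,:)
!    CALL DGEMMDRV( 'N', 'N', m, n, k, 1.0d0, a, m, b, k, 0.0d0, c, m )
!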
MODULE ortho_module
!
#if defined(__CUDA)
USE cudafor
#endif
USE kinds, ONLY: DP
IMPLICIT NONE
SAVE
REAL(DP), ALLOCATABLE DEVICEATTR :: s(:,:), sig(:,:), tau(:,:), stmp(:,:)
REAL(DP), ALLOCATABLE DEVICEATTR :: wrk(:,:), rhoa(:,:), rhos(:,:), rhod(:)
REAL(DP), ALLOCATABLE DEVICEATTR :: xloc(:,:)
CONTAINS
SUBROUTINE allocate_local_ortho_memory( nss, nx0 )
INTEGER, INTENT(IN) :: nss, nx0
INTEGER :: info
IF( ALLOCATED( rhos ) ) THEN
IF( nx0 == SIZE( rhos, 1 ) ) THEN
RETURN
ELSE
DEALLOCATE( rhos, rhoa, s, sig, tau, rhod )
IF(ALLOCATED(wrk)) DEALLOCATE(wrk)
IF(ALLOCATED(stmp)) DEALLOCATE(stmp)
END IF
END IF
ALLOCATE( rhos( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating rhos ', ABS( info ) )
rhos = 0
ALLOCATE( rhoa( nx0, nx0 ), STAT = info ) ! antisymmetric part of rho
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating rhoa ', ABS( info ) )
rhoa = 0
ALLOCATE( s( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating s ', ABS( info ) )
s = 0
ALLOCATE( sig( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating sig ', ABS( info ) )
sig = 0
ALLOCATE( tau( nx0, nx0 ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho_gamma ', ' allocating tau ', ABS( info ) )
tau = 0
ALLOCATE( rhod( nss ), STAT = info )
IF( info /= 0 ) &
        CALL errore( ' ortho_gamma ', ' allocating rhod ', ABS( info ) )
rhod = 0
#if defined(__CUDA)
ALLOCATE( wrk( nss, nss ), STAT = info )
IF( info /= 0 ) CALL errore( ' ortho_gamma ', ' allocating wrk ', 1 )
ALLOCATE( stmp( nss, nss ), STAT = info )
IF( info /= 0 ) CALL errore( ' ortho_gamma ', ' allocating stmp ', 1 )
#endif
END SUBROUTINE
SUBROUTINE deallocate_local_ortho_memory()
IF(ALLOCATED(s)) DEALLOCATE(s)
IF(ALLOCATED(sig)) DEALLOCATE(sig)
IF(ALLOCATED(tau)) DEALLOCATE(tau)
IF(ALLOCATED(stmp)) DEALLOCATE(stmp)
IF(ALLOCATED(wrk)) DEALLOCATE(wrk)
IF(ALLOCATED(rhoa)) DEALLOCATE(rhoa)
IF(ALLOCATED(rhos)) DEALLOCATE(rhos)
IF(ALLOCATED(rhod)) DEALLOCATE(rhod)
IF(ALLOCATED(xloc)) DEALLOCATE(xloc)
END SUBROUTINE deallocate_local_ortho_memory
SUBROUTINE x0_to_xloc( x0, nx0, ccc_, idesc )
REAL(DP), INTENT(IN) :: x0(:,:)
REAL(DP), INTENT(IN) :: ccc_
INTEGER, INTENT(IN) :: nx0
INTEGER, INTENT(IN) :: idesc(:)
include 'laxlib.fh'
INTEGER :: i, j, info
REAL(DP) DEVICEATTR :: ccc
IF( ALLOCATED(xloc) ) THEN
IF( SIZE(x0,1) /= SIZE(xloc,1) .OR. SIZE(x0,2) /= SIZE(xloc,2) ) THEN
DEALLOCATE(xloc)
END IF
END IF
IF( .NOT. ALLOCATED(xloc) ) THEN
ALLOCATE( xloc, MOLD=x0, STAT = info )
IF( info /= 0 ) &
CALL errore( ' x0_to_xloc ', ' allocating xloc ', ABS( info ) )
END IF
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
ENDIF
xloc = x0
ccc = ccc_
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, SIZE(xloc,2)
DO i = 1, SIZE(xloc,1)
xloc(i,j) = xloc(i,j) * ccc
END DO
END DO
END SUBROUTINE x0_to_xloc
SUBROUTINE xloc_to_x0( x0, nx0, ccc_, idesc )
REAL(DP), INTENT(OUT) :: x0(:,:)
INTEGER, INTENT(IN) :: nx0
REAL(DP), INTENT(IN) :: ccc_
INTEGER, INTENT(IN) :: idesc(:)
include 'laxlib.fh'
INTEGER :: i, j
REAL(DP) DEVICEATTR :: byccc
IF( idesc(LAX_DESC_ACTIVE_NODE) < 0 ) THEN
RETURN
ENDIF
IF( .NOT. ALLOCATED(xloc) ) THEN
CALL errore( ' xloc_to_x0 ', ' xloc not allocated ', 1 )
END IF
byccc = 1.0d0 / ccc_
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, SIZE(xloc,2)
DO i = 1, SIZE(xloc,1)
xloc(i,j) = xloc(i,j) * byccc
END DO
END DO
x0 = xloc
END SUBROUTINE xloc_to_x0
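! x0_to_xloc / xloc_to_x0 are a scale/unscale pair around the iteration:
! xloc = ccc * x0 on entry, x0 = xloc / ccc on exit, so with the same ccc a
! round trip is the identity (usage sketch):
!
!    CALL x0_to_xloc( x0, nx0, ccc, idesc )
!    ! ... the ortho iteration updates the module array xloc in place ...
!    CALL xloc_to_x0( x0, nx0, ccc, idesc )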
SUBROUTINE distribute_matrix( a, b, ir, nr, ic, nc, comm )
USE mp, ONLY: mp_bcast
REAL(DP) DEVICEATTR :: a(:,:), b(:,:)
INTEGER, INTENT(IN) :: ir, nr, ic, nc, comm
INTEGER :: i, j, info
CALL mp_bcast( a, 0, comm )
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nc
DO i = 1, nr
b( i, j ) = a( i + ir - 1, j + ic - 1 )
END DO
END DO
RETURN
END SUBROUTINE
SUBROUTINE collect_matrix( a, b, ir, nr, ic, nc, comm )
USE mp, ONLY: mp_sum
REAL(DP) DEVICEATTR :: a(:,:), b(:,:)
INTEGER, INTENT(IN) :: ir, nr, ic, nc, comm
INTEGER :: i, j, info
a = 0.0d0
!$cuf kernel do(2) <<<*,*>>>
DO j = 1, nc
DO i = 1, nr
a( ir + i - 1, ic + j - 1 ) = b( i, j )
END DO
END DO
CALL mp_sum( a, comm )
RETURN
END SUBROUTINE
SUBROUTINE consistency_check( a, idesc )
REAL(DP) DEVICEATTR, INTENT(IN) :: a(:,:)
INTEGER, INTENT(IN) :: idesc(:)
INTEGER :: i, j
include 'laxlib.fh'
!
! on some machines (IBM RS/6000 for instance) the following test allows
! to distinguish between Numbers and Sodium Nitride (NaN, Not a Number).
! If a matrix of Not-Numbers is passed to rs, the most likely outcome is
! that the program goes on forever doing nothing and writing nothing.
!
#if ! defined(__CUDA)
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
DO j = 1, idesc(LAX_DESC_NC)
DO i = 1, idesc(LAX_DESC_NR)
IF (a(i,j) /= a(i,j)) &
CALL errore(' ortho ',' ortho went bananas ',1)
END DO
END DO
END IF
#endif
RETURN
END SUBROUTINE
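! The comparison a(i,j) /= a(i,j) is the standard IEEE NaN probe: NaN is the
! only value that compares unequal to itself.  With an F2003 compiler the
! same check could be written through the intrinsic module (sketch):
!
!    USE, INTRINSIC :: ieee_arithmetic, ONLY: ieee_is_nan
!    IF( ieee_is_nan( a(i,j) ) ) CALL errore( ' ortho ', ' NaN in matrix ', 1 )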
!=----------------------------------------------------------------------------=!
SUBROUTINE ortho_gamma( cp, ngwx, phi, becp_dist, qbecp, nkbx, bephi, qbephi, &
nx0, idesc, diff, iter, n, nss, istart )
!=----------------------------------------------------------------------------=!
!
USE kinds, ONLY: DP
USE orthogonalize_base, ONLY: rhoset, sigset, tauset, ortho_iterate, &
use_parallel_diag
USE control_flags, ONLY: diagonalize_on_host
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, my_bgrp_id, inter_bgrp_comm, nbgrp
USE mp, ONLY: mp_sum, mp_bcast
USE mp_world, ONLY: mpime
USE device_memcpy_m
IMPLICIT NONE
@ -24,23 +215,21 @@
! ... Arguments
INTEGER, INTENT(IN) :: ngwx, nkbx, nx0
INTEGER, INTENT(IN) :: n, nss, istart
COMPLEX(DP) DEVICEATTR :: phi( :, : ), cp( :, : )
REAL(DP) DEVICEATTR :: bephi( :, : )
REAL(DP) DEVICEATTR :: becp_dist( :, : )
REAL(DP) DEVICEATTR :: qbephi( :, : ), qbecp( :, : )
INTEGER, INTENT(IN) :: idesc(:)
INTEGER, INTENT(OUT) :: iter
REAL(DP), INTENT(OUT) :: diff
! ... Locals
INTEGER :: i, j, info, nr, nc, ir, ic
INTEGER, SAVE :: icnt = 1
REAL(DP), ALLOCATABLE :: rhos_h(:,:), s_h(:,:), rhod_h(:)
!
! ... Subroutine body
!
@ -66,98 +255,13 @@
!
END IF
!
CALL allocate_local_ortho_memory(nss, nx0)
!
! rho = <s'c0|s|cp>
!
CALL start_clock( 'rhoset' )
!
CALL rhoset( cp, ngwx, phi, bephi, nkbx, qbecp, n, nss, istart, rhos, rhoa, nx0, idesc )
CALL stop_clock( 'rhoset' )
!
! sig = 1-<cp|s|cp>
!
@ -171,6 +275,74 @@
CALL tauset( phi, ngwx, bephi, nkbx, qbephi, n, nss, istart, tau, nx0, idesc )
CALL stop_clock( 'tauset' )
!
CALL consistency_check(rhos,idesc)
CALL start_clock( 'rsg' )
!
! ... Diagonalize symmetric part of rho (rhos)
! ... "s" is the matrix of eigenvectors, "rhod" is the array of eigenvalues
!
IF( use_parallel_diag ) THEN
!
#if defined(__CUDA)
IF( idesc(LAX_DESC_NR) == idesc(LAX_DESC_NC) .AND. idesc(LAX_DESC_NR) == idesc(LAX_DESC_N) ) THEN
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
CALL laxlib_diagonalize( nss, rhos, rhod, s, info )
END IF
ELSE IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
IF( diagonalize_on_host ) THEN ! tune here
ALLOCATE( rhos_h, SOURCE = rhos )
ALLOCATE( rhod_h, MOLD = rhod )
ALLOCATE( s_h, MOLD = s )
CALL laxlib_diagonalize( nss, rhos_h, rhod_h, s_h, idesc )
CALL dev_memcpy( s, s_h )
CALL dev_memcpy( rhod, rhod_h )
DEALLOCATE( rhos_h, rhod_h, s_h )
ELSE
CALL collect_matrix( wrk, rhos, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
IF( idesc(LAX_DESC_IC) == 1 .AND. idesc(LAX_DESC_IR) == 1 ) THEN
CALL laxlib_diagonalize( nss, wrk, rhod, stmp, info )
END IF
CALL distribute_matrix( stmp, s, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
CALL mp_bcast( rhod, 0, idesc(LAX_DESC_COMM) )
END IF
END IF
#else
CALL laxlib_diagonalize( nss, rhos, rhod, s, idesc )
#endif
!
ELSE
!
IF( idesc(LAX_DESC_ACTIVE_NODE) > 0 ) THEN
!
IF( idesc(LAX_DESC_NR) == idesc(LAX_DESC_NC) .AND. idesc(LAX_DESC_NR) == idesc(LAX_DESC_N) ) THEN
!
              ! rhos and s matrices are replicated, no need to collect them
#if defined(__CUDA)
CALL laxlib_diagonalize( nss, rhos, rhod, s, info )
#else
s = rhos
CALL laxlib_diagonalize( nss, s, rhod )
#endif
ELSE
!
CALL collect_matrix( wrk, rhos, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
!
#if defined(__CUDA)
CALL laxlib_diagonalize( nss, wrk, rhod, s, info )
#else
CALL laxlib_diagonalize( nss, wrk, rhod )
#endif
!
CALL distribute_matrix( wrk, s, ir, nr, ic, nc, idesc(LAX_DESC_COMM) )
!
END IF
!
END IF
!
END IF
!
CALL stop_clock( 'rsg' )
CALL start_clock( 'ortho_iter' )
!
IF( my_bgrp_id == 0 ) THEN
@ -180,15 +352,7 @@
     ! group are enough. Moreover, replicating the computation across groups could lead
! to small numerical differences and weird numerical effects.
!
CALL ortho_iterate( iter, diff, s, nx0, rhod, xloc, nx0, sig, rhoa, rhos, tau, nss, idesc)
!
END IF
!
@ -197,74 +361,137 @@
! All groups must have the same lambda matrix, in order to avoid weird
! numerical side effects.
!
CALL mp_bcast( xloc, 0, inter_bgrp_comm )
CALL mp_bcast( iter, 0, inter_bgrp_comm )
CALL mp_bcast( diff, 0, inter_bgrp_comm )
!
END IF
!
CALL consistency_check( xloc,idesc )
CALL stop_clock( 'ortho_iter' )
!
!
RETURN
END SUBROUTINE ortho_gamma
SUBROUTINE compute_qs_times_betas( bephi, bec_row, qbephi, qbecp, idesc )
USE uspp, ONLY: nkb, qq_nt, indv_ijkb0, nkbus
USE uspp_gpum, ONLY: qq_nt_d, using_qq_nt
USE uspp_param, ONLY: nh, upf
USE electrons_base, ONLY: nspin, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, nbsp, nupdwn, iupdwn
USE ions_base, ONLY: na, nat, nsp, ityp
#if defined (__CUDA)
USE cublas
#endif
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
REAL(DP), INTENT(OUT) DEVICEATTR :: qbephi(:,:,:), qbecp(:,:,:)
REAL(DP), INTENT(IN) DEVICEATTR :: bephi(:,:)
REAL(DP), INTENT(IN) DEVICEATTR :: bec_row(:,:)
INTEGER, INTENT(IN) :: idesc(:,:)
REAL(DP), ALLOCATABLE DEVICEATTR :: bec_col(:,:), bephi_col(:,:)
INTEGER :: nkbx, info, nrcx, iss, is, jv, iv, ia
INTEGER :: i, j, inl, jnl, indv, nc, nhs
nkbx = nkb
nrcx = idesc( LAX_DESC_NRCX, 1 )
IF( nspin > 1 ) nrcx = MAX( nrcx, idesc( LAX_DESC_NRCX, 2 ) )
qbephi = 0.d0
qbecp = 0.d0
CALL using_qq_nt(0)
!
IF( nkbus > 0 ) THEN
!
ALLOCATE( bec_col ( nkbx, nrcx*nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' compute_qs_times_betas ', ' allocating bec_col ', ABS( info ) )
ALLOCATE( bephi_col ( nkbx, nrcx*nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' compute_qs_times_betas ', ' allocating bephi_col ', ABS( info ) )
!
CALL redist_row2col( nupdwn(1), bephi, bephi_col, nkbx, nrcx, idesc(:,1) )
CALL redist_row2col( nupdwn(1), bec_row, bec_col, nkbx, nrcx, idesc(:,1) )
IF( nspin == 2 ) THEN
CALL redist_row2col( nupdwn(2), bephi(:,nrcx+1:), bephi_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
CALL redist_row2col( nupdwn(2), bec_row(:,nrcx+1:), bec_col(:,nrcx+1:), nkbx, nrcx, idesc(:,2) )
END IF
!
DO iss = 1, nspin
IF( idesc( LAX_DESC_ACTIVE_NODE, iss ) > 0 ) THEN
nc = idesc( LAX_DESC_NC, iss )
DO ia = 1, nat
is = ityp(ia)
IF( upf(is)%tvanp ) THEN
indv = indv_ijkb0(ia)
nhs = nh(is)
#if defined (__CUDA)
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt_d(1,1,is), SIZE(qq_nt_d,1), &
bephi_col(indv+1,(iss-1)*nrcx+1), SIZE(bephi_col,1), 0.0d0, qbephi(indv+1,1,iss), SIZE(qbephi,1))
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt_d(1,1,is), SIZE(qq_nt_d,1), &
bec_col(indv+1,(iss-1)*nrcx+1), SIZE(bec_col,1), 0.0d0, qbecp(indv+1,1,iss), SIZE(qbecp,1))
#else
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt(1,1,is), SIZE(qq_nt,1), &
bephi_col(indv+1,(iss-1)*nrcx+1), SIZE(bephi_col,1), 0.0d0, qbephi(indv+1,1,iss), SIZE(qbephi,1))
CALL DGEMMDRV('N', 'N', nhs, nc, nhs, 1.0d0, qq_nt(1,1,is), SIZE(qq_nt,1), &
bec_col(indv+1,(iss-1)*nrcx+1), SIZE(bec_col,1), 0.0d0, qbecp(indv+1,1,iss), SIZE(qbecp,1))
#endif
!!$cuf kernel do (2)
! DO iv=1,nhs
! DO i = 1, nc
! DO jv = 1, nhs
! qbephi(indv+iv,i,iss) = qbephi(indv+iv,i,iss) + &
! qq_nt_d(iv,jv,is) * bephi_col(indv+jv,i+(iss-1)*nrcx)
! qbecp(indv+iv,i,iss) = qbecp(indv+iv,i,iss) + &
! qq_nt_d(iv,jv,is) * bec_col(indv+jv,i+(iss-1)*nrcx)
! END DO
! END DO
! END DO
END IF
END DO
ENDIF
END DO
DEALLOCATE( bec_col )
DEALLOCATE( bephi_col )
END IF
END SUBROUTINE compute_qs_times_betas
SUBROUTINE keep_only_us(wrk)
USE uspp, ONLY: indv_ijkb0
USE uspp_param, ONLY: nh, upf
USE ions_base, ONLY: na, nat, nsp, ityp
#if defined (__CUDA)
USE cublas
#endif
IMPLICIT NONE
COMPLEX(DP) DEVICEATTR, INTENT(OUT) :: wrk(:,:)
INTEGER :: ia, is, inl, nhs, iv
DO ia = 1, nat
is = ityp(ia)
inl = indv_ijkb0(ia)
nhs = nh(is)
IF( .NOT. upf(is)%tvanp ) THEN
!$cuf kernel do (1)
DO iv = 1, nhs
wrk(:,iv+inl) = 0.0d0
END DO
END IF
END DO
END SUBROUTINE
END MODULE ortho_module
!=----------------------------------------------------------------------------=!
SUBROUTINE ortho_x( betae, cp_bgrp, phi_bgrp, x0, idesc, diff, iter, ccc, bephi, becp_bgrp )
!=----------------------------------------------------------------------------=!
!
! input = cp (non-orthonormal), beta
@ -279,72 +506,63 @@
! for vanderbilt pseudo pot - kl & ap
!
USE kinds, ONLY: DP
USE uspp, ONLY: nkb, nkbus
USE ions_base, ONLY: nsp
USE electrons_base, ONLY: f, nbsp_bgrp, iupdwn_bgrp, nupdwn_bgrp, nbsp, nspin, nupdwn, iupdwn
USE gvecw, ONLY: ngw
USE control_flags, ONLY: iprint, iverbosity, ortho_max
USE control_flags, ONLY: force_pairing
USE io_global, ONLY: stdout, ionode
USE cp_interfaces, ONLY: c_bgrp_expand, c_bgrp_pack, nlsm1, collect_bec, nlsm1us
      USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, inter_bgrp_comm ! DEBUG
USE mp_world, ONLY: mpime
USE orthogonalize_base, ONLY: bec_bgrp2ortho
USE mp, ONLY : mp_sum
USE ortho_module
USE device_memcpy_m
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
INTEGER, INTENT(IN) :: idesc(:,:)
COMPLEX(DP) DEVICEATTR :: betae(:,:)
COMPLEX(DP) DEVICEATTR :: cp_bgrp(:,:), phi_bgrp(:,:)
REAL(DP) :: x0(:,:,:), diff, ccc
INTEGER :: iter
REAL(DP) DEVICEATTR :: bephi(:,:)
REAL(DP) DEVICEATTR :: becp_bgrp(:,:)
!
REAL(DP), ALLOCATABLE DEVICEATTR :: bec_row(:,:)
REAL(DP), ALLOCATABLE DEVICEATTR :: qbephi(:,:,:), qbecp(:,:,:)
COMPLEX(DP), ALLOCATABLE DEVICEATTR :: wrk2(:,:)
INTEGER :: nkbx, info, iss, nspin_sub, nx0, ngwx, nrcx
!
CALL start_clock( 'ortho' )
!
nkbx = nkb
ngwx = SIZE( cp_bgrp, 1 )
!
nx0 = SIZE( x0, 1 )
nrcx = MAXVAL( idesc( LAX_DESC_NRCX, : ) )
!
! calculation of becp and bephi
!
ALLOCATE( bec_row( nkbx, nrcx*nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho ', ' allocating bec_row ', ABS( info ) )
IF( nkbus > 0 ) THEN
!
becp_bgrp = 0.0d0
!
ALLOCATE( wrk2, MOLD = betae )
CALL dev_memcpy( wrk2, betae )
CALL keep_only_us( wrk2 )
CALL nlsm1us ( nbsp_bgrp, wrk2, phi_bgrp, becp_bgrp )
CALL bec_bgrp2ortho( becp_bgrp, bephi, nrcx, idesc )
!
becp_bgrp = 0.0d0
!
CALL nlsm1us ( nbsp_bgrp, wrk2, cp_bgrp, becp_bgrp )
CALL bec_bgrp2ortho( becp_bgrp, bec_row, nrcx, idesc )
DEALLOCATE( wrk2 )
!
END IF
!
@ -353,95 +571,27 @@
ALLOCATE( qbephi( nkbx, nx0, nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho ', ' allocating qbephi ', ABS( info ) )
!
!
ALLOCATE( qbecp ( nkbx, nx0, nspin ), STAT = info )
IF( info /= 0 ) &
CALL errore( ' ortho ', ' allocating qbecp ', ABS( info ) )
!
CALL compute_qs_times_betas( bephi, bec_row, qbephi, qbecp, idesc )
!
! Expand cp and phi to contain all electronic band
!
CALL c_bgrp_expand( cp_bgrp )
CALL c_bgrp_expand( phi_bgrp )
!
!
nspin_sub = nspin
if( force_pairing ) nspin_sub = 1
!
DO iss = 1, nspin_sub
CALL x0_to_xloc( x0(:,:,iss), nx0, ccc, idesc(:,iss) )
CALL ortho_gamma( cp_bgrp, ngwx, phi_bgrp, bec_row(:,(iss-1)*nrcx+1:iss*nrcx), qbecp(:,:,iss), nkbx, &
bephi(:,((iss-1)*nrcx+1):iss*nrcx), &
qbephi(:,:,iss), nx0, idesc(:,iss), diff, iter, nbsp, nupdwn(iss), iupdwn(iss) )
IF( iter > ortho_max ) THEN
WRITE( stdout, 100 ) diff, iter
@ -452,16 +602,17 @@
WRITE( stdout, 100 ) diff, iter
ENDIF
!
CALL xloc_to_x0( x0(:,:,iss), nx0, ccc, idesc(:,iss) )
!
END DO
IF( force_pairing ) THEN
CALL dev_memcpy(cp_bgrp(:,iupdwn(2):), cp_bgrp(:,1:), [1, ngw], 1 , [1, nupdwn(2)], 1)
END IF
!
DEALLOCATE( qbecp )
DEALLOCATE( qbephi )
DEALLOCATE( bec_row )
!
! pack cp so that it contains only the bands in the band subgroup
!

File diff suppressed because it is too large

View File

@ -351,7 +351,11 @@
ALLOCATE( ftmp( 3, SIZE( fion, 2 ) ) )
ftmp = 0.0d0
!$omp parallel do reduction(+:ftmp) default(none) &
!$omp shared( gstart, dffts, sfac, rhops, screen_coul, rhoeg, nsp, gg, tpiba2, mill, g, &
!$omp nat, ityp, vps, ei1, ei2, ei3, tscreen ) &
!$omp private(ig, rp, is, rhet, rhog, fpibg, ig1, ig2, ig3, gxc, gyc, gzc, ia, cnvg, cvn, tx, &
!$omp ty, tz, teigr )
DO ig = gstart, dffts%ngm
RP = (0.D0,0.D0)
@ -389,6 +393,8 @@
END DO
!
!$omp end parallel do
!
fion = fion + DBLE(ftmp) * 2.D0 * omega * tpiba
DEALLOCATE( ftmp )
@ -444,6 +450,8 @@
INTEGER, DIMENSION(6), PARAMETER :: ALPHA = (/ 1,2,3,2,3,3 /)
INTEGER, DIMENSION(6), PARAMETER :: BETA = (/ 1,1,1,2,2,3 /)
INTEGER :: omp_get_num_threads
! ... SUBROUTINE BODY
ALLOCATE( rc( nsp, nsp ) )
@ -474,9 +482,13 @@
IA_S = gind_block( 1, nat, nproc_bgrp, me_bgrp )
IA_E = IA_S + NA_LOC - 1
!$omp parallel do reduction(+:esr,desr) num_threads(min(max(1,na_loc),omp_get_num_threads())) default(none) &
!$omp private(ia,ib,k,j,zv2_kj,rckj_m1,fact_pre,xlm,ylm,zlm,tzero,xlm0,ylm0,zlm0,ix,iy,iz,sxlm,tshift, &
!$omp rxlm,erre2,rlm,arg,esrtzero,addesr,addpre,repand,i,fxx ) &
!$omp shared(ia_s,ia_e,nat,ityp,zv2,rc,taus,iesr,hmat,fionloc,tstress,na_loc)
DO ia = ia_s, ia_e
k = ityp(ia)
DO ib = ia, nat
j = ityp(ib)
zv2_kj = zv2(k,j)
@ -536,15 +548,14 @@
END DO ! IX
END DO
END DO
!$omp end parallel do
!
! each processor add its own contribution to the array FION
!
! FION = FION+FIONLOC
!
CALL daxpy( 3*nat, 1.0d0, fionloc, 1, fion, 1 )
CALL mp_sum(esr, intra_bgrp_comm)

View File

@ -615,6 +615,7 @@
USE atom, ONLY: rgrid
USE uspp, ONLY: indv
use uspp, only: qq_nt, beta
use uspp_gpum, ONLY: using_qq_nt, using_qq_nt_d, qq_nt_d
USE betax, only: refg, qradx, mmx, dqradx
use smallbox_gvec, only: ngb
use control_flags, only: iprint, iverbosity
@ -624,6 +625,9 @@
use smallbox_gvec, only: gb, gxb
use small_box, only: omegab, tpibab
USE cp_interfaces, ONLY: fill_qrl
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
!
@ -796,6 +800,22 @@
end do
end do
CALL using_qq_nt(2)
#if defined (__CUDA)
CALL using_qq_nt_d(0)
!$cuf kernel do (3)
DO is = 1, SIZE(qq_nt_d,3)
DO jv=1,SIZE(qq_nt_d,2)
DO iv=1,SIZE(qq_nt_d,1)
IF( ABS( qq_nt_d(iv,jv,is) ) <= 1.D-5 ) THEN
qq_nt_d(iv,jv,is) = 0.0d0
END IF
END DO
END DO
END DO
#endif
!
!
if (tpre) then
! ---------------------------------------------------------------
@ -1039,6 +1059,7 @@
use gvecw, only: ngw
use cell_base, only: ainv
use uspp, only: qq_nt, nhtolm, beta
use uspp_gpum, ONLY: using_qq_nt, using_qq_nt_d, qq_nt_d
use constants, only: pi, fpi
use ions_base, only: nsp
use uspp_param, only: upf, lmaxq, lmaxkb, nbetam, nh
@ -1046,6 +1067,9 @@
use smallbox_gvec, only: gb, gxb, ngb
use small_box, only: omegab, tpibab
USE betax, ONLY: qradx, dqradx, refg, mmx
#if defined (__CUDA)
USE cudafor
#endif
!
implicit none
@ -1131,6 +1155,21 @@
end do
end do
CALL using_qq_nt(2)
#if defined (__CUDA)
CALL using_qq_nt_d(0)
!$cuf kernel do (3)
DO is = 1, SIZE(qq_nt_d,3)
DO jv=1,SIZE(qq_nt_d,2)
DO iv=1,SIZE(qq_nt_d,1)
IF( ABS( qq_nt_d(iv,jv,is) ) <= 1.D-5 ) THEN
qq_nt_d(iv,jv,is) = 0.0d0
END IF
END DO
END DO
END DO
#endif
!
if (tpre) then
! ---------------------------------------------------------------

View File

@ -28,17 +28,19 @@ SUBROUTINE from_restart( )
USE gvect, ONLY : mill, eigts1, eigts2, eigts3
USE printout_base, ONLY : printout_pos
USE gvecw, ONLY : ngw
USE cp_interfaces, ONLY : phfacs, strucf, prefor, calbec, caldbec_bgrp
USE energies, ONLY : eself, dft_energy_type
USE wave_base, ONLY : rande_base
USE efield_module, ONLY : efield_berry_setup, tefield, &
efield_berry_setup2, tefield2
USE uspp, ONLY : okvan, vkb, nkb, nlcc_any
USE uspp_gpum, ONLY : vkb_d
USE cp_main_variables, ONLY : ht0, htm, lambdap, lambda, lambdam, eigr, &
sfac, taub, irb, eigrb, edft, bec_bgrp, dbec, idesc, iabox, nabox
USE time_step, ONLY : delt
USE fft_base, ONLY : dfftp, dffts
USE matrix_inversion
USE device_memcpy_m, ONLY : dev_memcpy
!
IMPLICIT NONE
@ -153,7 +155,7 @@ SUBROUTINE from_restart( )
! ... to starting cell (from ndr or again standard input)
!
IF ( okvan .or. nlcc_any ) THEN
CALL initbox( tau0, alat, at, ainv, taub, irb, iabox, nabox )
CALL phbox( taub, iverbosity, eigrb )
END IF
!
@ -162,6 +164,7 @@ SUBROUTINE from_restart( )
CALL strucf( sfac, eigts1, eigts2, eigts3, mill, dffts%ngm )
!
CALL prefor( eigr, vkb )
CALL dev_memcpy( vkb_d, vkb )
!
CALL formf( .TRUE. , eself )
!
@ -187,7 +190,7 @@ SUBROUTINE from_restart( )
!
END IF
!
CALL calbec( nbsp_bgrp, vkb, c0_bgrp, bec_bgrp, 0 )
!
IF ( tpre ) CALL caldbec_bgrp( eigr, c0_bgrp, dbec, idesc )
!

View File

@ -10,9 +10,15 @@
!=----------------------------------------------------------------------------------=!
#if defined(__CUDA)
#define DEVICEATTR ,DEVICE
#else
#define DEVICEATTR
#endif
SUBROUTINE runcp_uspp_x &
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
!
! This subroutine performs a Car-Parrinello or Steepest-Descent step
! on the electronic variables, computing forces on electrons
@ -31,7 +37,7 @@
USE mp, ONLY : mp_sum
USE fft_base, ONLY : dffts
use wave_base, only : wave_steepest, wave_verlet
use control_flags, only : lwf, tsde, many_fft
use uspp, only : deeq, vkb
use gvect, only : gstart
use electrons_base, only : nbsp_bgrp, ispin_bgrp, f_bgrp, nspin, nupdwn_bgrp, iupdwn_bgrp
@ -41,6 +47,10 @@
USE cp_interfaces, ONLY : dforce
USE ldaU_cp, ONLY : lda_plus_u, vupsi
USE fft_helper_subroutines
#if defined (__CUDA)
USE uspp_gpum, ONLY : vkb_d
USE cudafor
#endif
!
IMPLICIT NONE
!
@ -50,6 +60,7 @@
REAL(DP) :: rhos(:,:)
REAL(DP) :: bec_bgrp(:,:)
COMPLEX(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
COMPLEX(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
LOGICAL, OPTIONAL, INTENT(IN) :: fromscra
LOGICAL, OPTIONAL, INTENT(IN) :: restart
!
@ -64,11 +75,22 @@
real(DP), allocatable :: emaver(:)
complex(DP), allocatable :: c2(:), c3(:), c2tmp(:), c3tmp(:)
REAL(DP), ALLOCATABLE :: tg_rhos(:,:), ftmp(:)
#if defined (__CUDA)
REAL(DP), ALLOCATABLE, DEVICE :: rhos_d(:,:)
#endif
INTEGER, ALLOCATABLE :: itmp(:)
integer :: i, nsiz, incr, idx, idx_in, ierr
integer :: iwfc, nwfc, is, ii, tg_rhos_siz, c2_siz
integer :: iflag
logical :: ttsde
INTEGER :: omp_get_num_threads
#if defined (__CUDA)
IF( dffts%has_task_groups ) THEN
CALL errore(' runcp_uspp ', ' task groups not implemented on GPU ',1)
END IF
ALLOCATE( rhos_d, SOURCE = rhos )
#endif
iflag = 0
!
@ -85,6 +107,9 @@
ELSE
tg_rhos_siz = 1
c2_siz = ngw
#if defined (__CUDA)
c2_siz = c2_siz * many_fft
#endif
END IF
!
@ -117,12 +142,13 @@
emadt2, emaver, verl1, verl2 )
ELSE
allocate( c2( c2_siz ), c3( c2_siz ) )
c2 = 0D0
c3 = 0D0
IF( dffts%has_task_groups ) THEN
ALLOCATE( tg_rhos( tg_rhos_siz, nspin ) )
!
! The potential in rhos is distributed across all processors
! We need to redistribute it so that it is completely contained in the
@ -136,7 +162,11 @@
ELSE
#if defined (__CUDA)
incr = 2 * many_fft
#else
incr = 2
#endif
END IF
@ -230,8 +260,14 @@
ELSE
#if defined (__CUDA)
CALL dforce( i, bec_bgrp, vkb_d, c0_d, c2, c3, rhos_d, &
SIZE(rhos_d,1), ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )
#else
CALL dforce( i, bec_bgrp, vkb, c0_bgrp, c2, c3, rhos, &
SIZE(rhos,1), ispin_bgrp, f_bgrp, nbsp_bgrp, nspin )
#endif
IF ( lda_plus_u ) THEN
c2(:) = c2(:) - vupsi(:,i)
c3(:) = c3(:) - vupsi(:,i+1)
@ -256,8 +292,10 @@
ENDDO
END IF
!$omp parallel num_threads(min(incr,omp_get_num_threads())) default(shared) private(idx_in, idx)
!$omp do
DO idx = 1, incr, 2
idx_in = idx/2+1
IF( i + idx - 1 <= nbsp_bgrp ) THEN
IF (tsde) THEN
CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2(:), ngw, idx_in )
@ -271,21 +309,26 @@
cm_bgrp(1,i+idx ) = CMPLX(real(cm_bgrp(1,i+idx )),0.0d0,kind=dp)
END IF
END IF
END DO
!$omp end do
!$omp end parallel
END DO
DEALLOCATE( c2 )
DEALLOCATE( c3 )
IF( dffts%has_task_groups ) THEN
DEALLOCATE( tg_rhos )
END IF
END IF
DEALLOCATE( emadt2 )
DEALLOCATE( emaver )
#if defined (__CUDA)
DEALLOCATE( rhos_d )
#endif
!
END SUBROUTINE runcp_uspp_x

View File

@ -6,17 +6,18 @@
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-----------------------------------------------------------------------
SUBROUTINE initbox ( tau0, alat, at, ainv, taub, irb, iabox, nabox )
!-----------------------------------------------------------------------
!
! sets the indexes irb and positions taub for the small boxes
! around atoms
!
USE kinds, ONLY: DP
USE ions_base, ONLY: nat, ityp
USE uspp_param, ONLY: upf
USE control_flags, ONLY: iverbosity
USE io_global, ONLY: stdout
USE mp_global, ONLY: nproc_bgrp, me_bgrp, intra_bgrp_comm, my_bgrp_id, nbgrp
USE fft_base, ONLY: dfftb, dfftp, fft_type_descriptor
USE fft_smallbox_type, ONLY: fft_box_set
@ -26,9 +27,10 @@
! output
INTEGER, INTENT(out) :: irb(3,nat)
REAL(DP), INTENT(out) :: taub(3,nat)
INTEGER, INTENT(out) :: iabox(nat), nabox
! local
REAL(DP) :: x(3), xmod
INTEGER :: nr(3), nrb(3), xint, ia, i, is
!
IF ( dfftb%nr1 < 1) CALL errore ('initbox', 'incorrect value for box grid dimensions', 1)
IF ( dfftb%nr2 < 1) CALL errore ('initbox', 'incorrect value for box grid dimensions', 2)
@ -96,6 +98,19 @@
CALL fft_box_set( dfftb, nat, irb, dfftp )
! build the local list of atom
nabox = 0
DO ia = 1, nat
IF( .NOT. upf(ityp(ia))%tvanp ) CYCLE
#if defined(__MPI)
IF( ( dfftb%np3( ia ) <= 0 ) .OR. ( dfftb%np2( ia ) <= 0 ) .OR. ( my_bgrp_id /= MOD( ia, nbgrp ) ) ) THEN
CYCLE
END IF
#endif
nabox = nabox + 1
iabox( nabox ) = ia
END DO
IF( iverbosity > 1 ) THEN
DO ia=1,nat
WRITE( stdout,2000) ia, (irb(i,ia),i=1,3)

View File

@ -79,7 +79,7 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
!
INTEGER iss, isup, isdw, ig, ir, i, j, k, ij, is, ia, inlc
REAL(DP) :: vtxc, vave, ebac, wz, eh, ehpre, enlc
COMPLEX(DP) fp, fm, drhop, zpseu, zh
COMPLEX(DP), ALLOCATABLE :: rhotmp(:), vtemp(:)
COMPLEX(DP), ALLOCATABLE :: drhot(:,:)
REAL(DP), ALLOCATABLE :: gagb(:,:), rhosave(:,:), newrhosave(:,:), rhocsave(:)
@ -104,6 +104,7 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
! ... dalbe(:) = delta( alpha(:), beta(:) )
REAL(DP), DIMENSION(6), PARAMETER :: dalbe = &
(/ 1.0_DP, 0.0_DP, 0.0_DP, 1.0_DP, 0.0_DP, 1.0_DP /)
COMPLEX(DP), PARAMETER :: ci = ( 0.0d0, 1.0d0 )
CALL start_clock( 'vofrho' )
@ -130,8 +131,6 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
!
END IF
!
! wz = factor for g.neq.0 because of c*(g)=c(-g)
!
wz = 2.0d0
@ -442,12 +441,12 @@ SUBROUTINE vofrho_x( nfi, rhor, drhor, rhog, drhog, rhos, rhoc, tfirst, &
END IF
IF( nspin == 1 ) THEN
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,1), 1)
ELSE
isup=1
isdw=2
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,isup), 1)
CALL zaxpy(dfftp%ngm, (1.0d0,0.0d0) , vtemp, 1, rhog(1,isdw), 1)
IF( ttsic ) THEN
rhog( 1:dfftp%ngm, isup ) = rhog( 1:dfftp%ngm, isup ) - self_vloc(1:dfftp%ngm)
rhog( 1:dfftp%ngm, isdw ) = rhog( 1:dfftp%ngm, isdw ) - self_vloc(1:dfftp%ngm)

View File

@ -464,6 +464,13 @@
!
! ... assign random values to wave functions
!
! 2.519 = 4^(2/3), equivalent to keep only (ngw_g/4) values
fac = 2.519d0
IF( ngw_g/4 < nbsp ) fac = 1.0d0
IF( ngw_g < nbsp ) THEN
        CALL errore(' wave_rand_init ', ' too few plane waves, linearly dependent electronic states! ', 1)
END IF
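  ! (The number of G-vectors below a kinetic-energy cutoff E scales as E**1.5,
  !  so lowering the cutoff by 4**(2/3) = 2.519 keeps N / (4**(2/3))**1.5 = N/4
  !  of them; fac = 1 falls back to the full sphere when ngw_g/4 < nbsp.)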
DO ib = 1, nbsp
IF( local ) THEN
@ -487,7 +494,7 @@
IF( ibgrp > 0 ) THEN
DO ig = 1, ngw
IF( local ) THEN
IF( gg(ig) < ggx / fac ) THEN
rranf1 = rnd( 1, mill(1,ig) ) * rnd( 2, mill(2,ig) ) * rnd( 3, mill(3,ig) )
rranf2 = 0.0d0
cm_bgrp( ig, ibgrp ) = ampre * CMPLX( rranf1, rranf2 ,kind=DP) / ( 1.0d0 + gg(ig) )
@ -599,3 +606,65 @@
RETURN
END SUBROUTINE c_bgrp_pack_x
#if defined (__CUDA)
SUBROUTINE c_bgrp_expand_gpu_x( c_bgrp )
USE kinds, ONLY: DP
USE mp, ONLY: mp_sum
USE electrons_base, ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
USE mp_global, ONLY: nbgrp, inter_bgrp_comm
USE cudafor
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
INTEGER :: iss, n1, n2, m1, m2, i
IF( nbgrp < 2 ) &
RETURN
DO iss = nspin, 1, -1
n1 = iupdwn_bgrp(iss)
n2 = n1 + nupdwn_bgrp(iss) - 1
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
m2 = m1 + nupdwn_bgrp(iss) - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = m2, m1, -1
c_bgrp(:,i) = c_bgrp(:,i-m1+n1)
END DO
END DO
DO iss = 1, nspin
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
m2 = m1 + nupdwn_bgrp(iss) - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = iupdwn(iss), m1-1
c_bgrp(:,i) = 0.0d0
END DO
!$cuf kernel do(1) <<<*,*>>>
DO i = m2+1, iupdwn(iss) + nupdwn(iss) - 1
c_bgrp(:,i) = 0.0d0
END DO
END DO
CALL mp_sum( c_bgrp, inter_bgrp_comm )
RETURN
END SUBROUTINE c_bgrp_expand_gpu_x
SUBROUTINE c_bgrp_pack_gpu_x( c_bgrp )
USE kinds, ONLY: DP
USE electrons_base, ONLY: nspin, i2gupdwn_bgrp, nupdwn, iupdwn_bgrp, iupdwn, nupdwn_bgrp
USE mp_global, ONLY: nbgrp
USE cudafor
IMPLICIT NONE
COMPLEX(DP), DEVICE :: c_bgrp(:,:)
INTEGER :: iss, n1, n2, m1, m2, i
IF( nbgrp < 2 ) &
RETURN
DO iss = 1, nspin
n1 = iupdwn_bgrp(iss)
n2 = n1 + nupdwn_bgrp(iss) - 1
m1 = iupdwn(iss)+i2gupdwn_bgrp(iss) - 1
m2 = m1 + nupdwn_bgrp(iss) - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = n1, n2
c_bgrp(:,i) = c_bgrp(:,i-n1+m1)
END DO
END DO
RETURN
END SUBROUTINE c_bgrp_pack_gpu_x
#endif
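! Note the copy directions above: c_bgrp_expand shifts bands towards higher
! indices, c_bgrp_pack towards lower ones.  In the serial equivalent the
! backward order is what keeps an overlapping copy safe, the usual memmove
! rule (sketch):
!
!    DO i = m2, m1, -1                  ! destination index >= source index
!       c_bgrp(:,i) = c_bgrp(:,i-m1+n1)
!    END DO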

View File

@ -616,6 +616,9 @@ CONTAINS
WRITE( stdout,*) ' Number of x-y planes for each processors: '
WRITE( stdout, fmt = '( 5(" |",I4,",",I4) )' ) ( ( desc%nr2p(j), &
desc%nr3p(i), i = 1, desc%nproc3 ), j=1,desc%nproc2 )
IF ( .not. desc%use_pencil_decomposition ) WRITE( stdout,*) ' Using Slab Decomposition'
IF ( desc%use_pencil_decomposition ) WRITE( stdout,*) ' Using Pencil Decomposition'
1000 FORMAT(3X, &
'Global Dimensions Local Dimensions Processor Grid',/,3X, &
'.X. .Y. .Z. .X. .Y. .Z. .X. .Y. .Z.',/, &

FFTXlib/tests/Makefile (new file)
View File

@ -0,0 +1,30 @@
# Makefile for FFTXlib testing
include ../../make.inc
MODFLAGS= $(MOD_FLAG).. $(MOD_FLAG).
SRCS = test_fft_scalar_gpu.f90 \
test_fft_scatter_mod_gpu.f90 \
test_fwinv_gpu.f90
EXECS = $(SRCS:.f90=.x)
all: common $(EXECS)
common: tester.o utils.o
%.x: %.o
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(QELIBS)
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name as the rule.
.PHONY: all clean
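# Usage sketch (assumes the top-level ./configure has generated make.inc):
#   cd FFTXlib/tests && make                     # builds one .x per test
#   mpirun -np 4 ./test_fft_scatter_mod_gpu.x    # MPI tests run under mpirun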

FFTXlib/tests/test_fft_scalar_gpu.f90 Normal file

@@ -0,0 +1,273 @@
#if defined(__CUDA)
program test_fft_scalar_gpu
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
!
CALL test%init()
!
test%tolerance64 = 1.d-13
!
CALL save_random_seed("test_fft_scalar_gpu", 0)
!
CALL test_cft_2xy_gpu(test)
!
CALL test_cft_1z_gpu(test)
!
CALL test_cfft3d_gpu(test)
!
CALL test_cfft3ds_gpu(test)
!
CALL test%print()
!
CONTAINS
!
SUBROUTINE fill_random(c, c_d, n)
USE cudafor
USE fft_param, ONLY : DP
implicit none
complex(DP), device :: c_d(:)
complex(DP) :: c(:)
integer, intent(in) :: n
!
real(DP), ALLOCATABLE :: rnd_aux(:)
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), KIND=DP)
c_d = c
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random
!
SUBROUTINE test_cft_1z_gpu(test)
USE fft_scalar, ONLY : cft_1z, cft_2xy, cfft3d, cfft3ds
USE fft_scalar, ONLY : cft_1z_gpu, cft_2xy_gpu, cfft3d_gpu, cfft3ds_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nsl=100, nz=56, ldz=56
!
! array for random values
real(DP) :: rnd_aux(2 * nsl * ldz)
!
! variables on device
complex(DP), device :: c_d(nsl*ldz)
complex(DP), device :: cout_d(nsl*ldz)
! variables on host
complex(DP) :: c(nsl*ldz)
complex(DP) :: cout(nsl*ldz)
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check forward direction
CALL cft_1z(c, nsl, nz, ldz, 1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, 1, cout_d, stream)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = cout_d
CALL test%assert_close( c, cout )
!
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check backward direction
CALL cft_1z(c, nsl, nz, ldz, -1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, -1, cout_d, stream)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = cout_d
CALL test%assert_close( c, cout )
!
! == Same as above, for inplace call ==
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check forward direction
CALL cft_1z(c, nsl, nz, ldz, 1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, 1, cout_d, stream, in_place=.true.)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = c_d
CALL test%assert_close( c, cout )
!
!
CALL fill_random(c, c_d, nsl*ldz)
!
! Check backward direction
CALL cft_1z(c, nsl, nz, ldz, -1, cout)
CALL cft_1z_gpu(c_d, nsl, nz, ldz, -1, cout_d, stream, in_place=.true.)
!
! Use c as auxiliary variable hosting GPU results
c = (0.d0, 0.d0)
c = c_d
CALL test%assert_close( c, cout )
!
END SUBROUTINE test_cft_1z_gpu
!
SUBROUTINE test_cft_2xy_gpu(test)
USE fft_scalar, ONLY : cft_2xy
USE fft_scalar, ONLY : cft_2xy_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nx = 10, ny = 10, nzl = 5, ldx=10, ldy=10
!
! array for random values
real(DP) :: rnd_aux(2 * nzl * ldx * ldy)
! variables on device
complex(DP), device :: c_d(nzl * ldx * ldy)
complex(DP), device :: tmp_d(nzl * ldx * ldy)
! variables on host
complex(DP) :: c(nzl * ldx * ldy)
complex(DP) :: tmp(nzl * ldx * ldy)
!
CALL fill_random(c, c_d, nzl * ldx * ldy)
!
CALL cft_2xy_gpu(c_d, tmp_d, nzl, nx, ny, ldx, ldy, 1, stream)
CALL cft_2xy(c, nzl, nx, ny, ldx, ldy, 1)
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
CALL fill_random(c, c_d, nzl * ldx * ldy)
!
CALL cft_2xy_gpu(c_d, tmp_d, nzl, nx, ny, ldx, ldy, -1, stream)
CALL cft_2xy(c, nzl, nx, ny, ldx, ldy, -1)
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
END SUBROUTINE test_cft_2xy_gpu
!
SUBROUTINE test_cfft3d_gpu(test)
USE fft_scalar, ONLY : cfft3d
USE fft_scalar, ONLY : cfft3d_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nx = 10, ny = 10, nz = 10
integer, parameter :: ldx= 10, ldy= 10, ldz= 10
#if ! defined(__DFTI)
integer, parameter :: howmany=1
#else
integer, parameter :: howmany=12
#endif
!
! array for random values
real(DP) :: rnd_aux(2 * howmany * ldx * ldy * ldz)
! variables on device
complex(DP), device :: c_d(howmany * ldx * ldy * ldz)
complex(DP), device :: tmp_d(howmany * ldx * ldy * ldz)
! variables on host
complex(DP) :: c(howmany * ldx * ldy * ldz)
complex(DP) :: tmp(howmany * ldx * ldy * ldz)
!
integer :: i, rs, re
!
#if ! defined(__DFTI)
print *, 'The current CPU scalar driver does not support howmany. Reverting to howmany 1'
#endif
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
!
CALL cfft3d( c, nx, ny, nz, ldx, ldy, ldz, howmany, 1 )
CALL cfft3d_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, 1, stream )
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
DO i=0, howmany - 1
rs = i * ldx * ldy * ldz + 1
re = rs + nx * ny * nz - 1
CALL test%assert_close( c(rs:re), tmp(rs:re) )
END DO
!
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
!
CALL cfft3d( c, nx, ny, nz, ldx, ldy, ldz, howmany, -1 )
CALL cfft3d_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, -1, stream )
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
DO i=0, howmany - 1
rs = i * ldx * ldy * ldz + 1
re = rs + nx * ny * nz - 1
CALL test%assert_close( c(rs:re), tmp(rs:re) )
END DO
!
END SUBROUTINE test_cfft3d_gpu
!
SUBROUTINE test_cfft3ds_gpu(test)
USE fft_scalar, ONLY : cfft3ds
USE fft_scalar, ONLY : cfft3ds_gpu
USE fft_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
! stream
integer(kind = cuda_stream_kind) :: stream = 0
! size
integer, parameter :: nx = 10, ny = 10, nz = 10
integer, parameter :: ldx= 10, ldy= 10, ldz= 10, howmany=1
!
! array for random values
real(DP) :: rnd_aux(2 * howmany * ldx * ldy * ldz)
! variables on device
complex(DP), device :: c_d(howmany * ldx * ldy * ldz)
complex(DP), device :: tmp_d(howmany * ldx * ldy * ldz)
! variables on host
complex(DP) :: c(howmany * ldx * ldy * ldz)
complex(DP) :: tmp(howmany * ldx * ldy * ldz)
integer :: do_fft_y(ldx), do_fft_z(ldx*ldy)
!
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
do_fft_y = 1; do_fft_z = 1
!
CALL cfft3ds( c, nx, ny, nz, ldx, ldy, ldz, howmany, 1, do_fft_z, do_fft_y)
CALL cfft3ds_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, 1, do_fft_z, do_fft_y, stream)
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
CALL fill_random(c, c_d, howmany * ldx * ldy * ldz)
!
CALL cfft3ds( c, nx, ny, nz, ldx, ldy, ldz, howmany, -1, do_fft_z, do_fft_y)
CALL cfft3ds_gpu( c_d, nx, ny, nz, ldx, ldy, ldz, howmany, -1, do_fft_z, do_fft_y, stream )
!
! Use tmp as auxiliary variable hosting the GPU results
tmp = c_d
CALL test%assert_close( c, tmp )
!
END SUBROUTINE test_cfft3ds_gpu
end program test_fft_scalar_gpu
#else
program test_fft_scalar_gpu
end program test_fft_scalar_gpu
#endif

FFTXlib/tests/test_fft_scatter_mod_gpu.f90 Normal file

@@ -0,0 +1,406 @@
#if defined(__CUDA)
program test_fft_scatter_mod_gpu
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
! MPI type
type mpi_t
integer :: me, n, root, comm
end type mpi_t
TYPE(mpi_t) :: mp
!
TYPE(tester_t) :: test
!
INTEGER :: ierr, level, i
!
#if defined(__MPI)
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
#else
CALL MPI_Init(ierr)
#endif
#endif
!
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
!
CALL test%init()
!
test%tolerance64 = 1.d-14
!
CALL save_random_seed("test_fft_scatter_mod_gpu", mp%me)
!
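! Run each scatter test for every nyfft value (number of task groups)
! that divides the number of MPI ranks, in both gamma-only and k modes.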
DO i = 1, mp%n
IF (MOD(mp%n,i) == 0 ) THEN
! gamma case
CALL test_fft_scatter_xy_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_xy_gpu_1(mp, test, .false., i)
!
! gamma case
CALL test_fft_scatter_yz_gpu_1(mp, test, .true., i)
! k case
CALL test_fft_scatter_yz_gpu_1(mp, test, .false., i)
END IF
END DO
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .true., 1)
CALL test_fft_scatter_many_yz_gpu_1(mp, test, .false., 1)
!
CALL collect_results(test)
!
IF (mp%me == mp%root) CALL test%print()
!
#if defined(__MPI)
CALL MPI_Finalize(ierr)
#endif
CONTAINS
!
SUBROUTINE mpi_data_init(mpme, npes, mproot, comm)
implicit none
integer, intent(out) :: mpme, npes, mproot, comm
integer :: ierr
mpme=0; npes=1; mproot=0; comm=0
#if defined(__MPI)
CALL mpi_comm_rank(MPI_COMM_WORLD, mpme, ierr)
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
comm = MPI_COMM_WORLD
#endif
END SUBROUTINE mpi_data_init
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
USE stick_base, ONLY : sticks_map
USE fft_param, ONLY : DP
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
CHARACTER(LEN=*), INTENT(IN) :: flavor
LOGICAL :: gamma_only
LOGICAL :: parallel
INTEGER :: comm, nyfft
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
!
REAL(DP) :: at(3,3), bg(3,3)
!
at = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(at))
bg = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(bg))
bg = 2.d0*pi*bg
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, nyfft=nyfft)
!
END SUBROUTINE fft_desc_init
SUBROUTINE fft_desc_finalize(dfft, smap)
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
!
CALL fft_type_deallocate(dfft)
CALL sticks_map_deallocate( smap )
END SUBROUTINE fft_desc_finalize
!
SUBROUTINE fill_random(c, c_d, n)
USE cudafor
USE fft_param, ONLY : DP
implicit none
complex(DP), device :: c_d(:)
complex(DP) :: c(:)
integer, intent(in) :: n
!
real(DP), ALLOCATABLE :: rnd_aux(:)
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), KIND=DP)
c_d = c
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random
!
SUBROUTINE test_fft_scatter_xy_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_xy
USE scatter_mod_gpu, ONLY : fft_scatter_xy_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:)
integer(kind = cuda_stream_kind) :: stream = 0
integer :: fft_sign = 2
integer :: vsiz, nr1p_, compare_len, me2
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
me2 = dfft%mype2 + 1
vsiz = dfft%nnr
compare_len = dfft%nr1x * dfft%my_nr2p * dfft%my_nr3p
if (ny > 1) then
! When using task groups, wave FFTs are not distributed along Y
fft_sign = 3
vsiz = dfft%nnr_tg
compare_len = dfft%nr1x * dfft%nr2x * dfft%my_nr3p
end if
!
! Allocate variables
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz), aux(vsiz))
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
!
! Test 1
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_xy( dfft, scatter_in, scatter_out, vsiz, fft_sign )
CALL fft_scatter_xy_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign, stream )
aux(1:compare_len) = scatter_out_d(1:compare_len)
!
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
! Test 2
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_xy( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
CALL fft_scatter_xy_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign, stream )
!
compare_len = dfft%nr2x * dfft%nr1w(me2) * dfft%my_nr3p
IF (ny > 1) compare_len = dfft%nr2x * dfft%nr1w_tg * dfft%my_nr3p
!
aux(1:compare_len) = scatter_out_d(1:compare_len)
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
!
END SUBROUTINE test_fft_scatter_xy_gpu_1
!
SUBROUTINE test_fft_scatter_yz_gpu_1(mp, test, gamma_only, ny)
!
! This test checks the wave FFT scatter, with parallel = .true. if
! called with more than one MPI rank.
!
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_yz
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_out(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:)
integer(kind = cuda_stream_kind) :: stream = 0
integer :: fft_sign = 2
integer :: vsiz, compare_len, my_nr1p_
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
vsiz = dfft%nnr
my_nr1p_ = count(dfft%ir1w > 0)
if (ny > 1) then
fft_sign = 3
vsiz = dfft%nnr_tg
my_nr1p_ = count(dfft%ir1w_tg > 0)
end if
!
! Allocate variables
ALLOCATE(scatter_in(vsiz), scatter_out(vsiz), aux(vsiz))
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz))
!
! Test 1
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_yz( dfft, scatter_in, scatter_out, vsiz, fft_sign )
CALL fft_scatter_yz_gpu( dfft, scatter_in_d, scatter_out_d, vsiz, fft_sign )
! Set the number of elements that should be strictly equivalent in the
! two implementations.
compare_len = dfft%my_nr3p*my_nr1p_*dfft%nr2x
aux(1:compare_len) = scatter_out_d(1:compare_len)
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
! Test 2
CALL fill_random(scatter_in, scatter_in_d, vsiz)
!
CALL fft_scatter_yz( dfft, scatter_out, scatter_in, vsiz, -1*fft_sign )
CALL fft_scatter_yz_gpu( dfft, scatter_out_d, scatter_in_d, vsiz, -1*fft_sign )
!
compare_len = dfft%nsw(mp%me+1)*dfft%nr3x
aux(1:compare_len) = scatter_out_d(1:compare_len)
! Check
CALL test%assert_close( scatter_out(1:compare_len), aux(1:compare_len) )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(scatter_in, scatter_out, aux, scatter_in_d, scatter_out_d)
!
END SUBROUTINE test_fft_scatter_yz_gpu_1
!
SUBROUTINE test_fft_scatter_many_yz_gpu_1(mp, test, gamma_only, ny)
!
! This test checks the wave FFT scatter, with parallel = .true. if
! called with more than one MPI rank.
!
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_yz
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
INTEGER, PARAMETER :: howmany = 4
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: scatter_in(:), scatter_in_cpy(:), scatter_out(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: scatter_in_d(:), scatter_out_d(:), aux_d(:)
! convenient variables for slices
integer :: i, l, start_in, end_in, start_out, end_out, nstick_zx, n3, n3x, vsiz
integer :: start_sl, end_sl
!integer(kind = cuda_stream_kind) :: streams(5)
!
parallel = mp%n .gt. 1
IF (ny > 1) print *, 'scatter_many does not support task grouping'
CALL fft_desc_init(dfft, smap, "wave", gamma_only, parallel, mp%comm, nyfft=ny)
!
! Allocate variables
vsiz = dfft%nnr*howmany
ALLOCATE(scatter_in(vsiz), scatter_in_cpy(vsiz), scatter_out(vsiz), aux(vsiz))
ALLOCATE(scatter_in_d(vsiz), scatter_out_d(vsiz), aux_d(vsiz))
!
! How FFT allocates bunches in this case:
nstick_zx = MAXVAL(dfft%nsw)
n3 = dfft%nr3
n3x = dfft%nr3x
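! Note: the batched (many) variant lays slice i out starting at
! i*nstick_zx*n3x, while the one-at-a-time calls space slices by dfft%nnr.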
! Test 1
CALL fill_random(scatter_in, scatter_in_d, vsiz)
scatter_in_cpy = scatter_in
!
!print *, 'dfft%nnr, nr3, nr2, nr1 : ', dfft%nnr, dfft%nr3, dfft%nr2, dfft%nr1
! Test 1.1, compare scatter of slices
DO i=0,howmany-1
start_in = i*dfft%nnr + 1
end_in = (i+1)*dfft%nnr
start_out = start_in
!
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
!
CALL fft_scatter_yz( dfft, scatter_in(start_in:end_in), scatter_out(start_out:end_out), dfft%nnr, 2 )
CALL fft_scatter_yz_gpu( dfft, scatter_in_d(start_in:end_in), scatter_out_d(start_out:end_out), dfft%nnr, 2 )
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
! Check
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
END DO
!
scatter_in = scatter_in_cpy
! Store data as expected in input
DO i=0,howmany-1
start_in = i*dfft%nnr + 1
end_in = (i+1)*dfft%nnr
start_out= i*nstick_zx*n3x+1
end_out = start_out + nstick_zx*n3 - 1
scatter_in_d( start_out : end_out ) = scatter_in(start_in:start_in+nstick_zx*n3-1)
END DO
CALL fft_scatter_many_yz_gpu ( dfft, scatter_in_d, scatter_out_d, vsiz, 2, howmany )
DO i=0,howmany-1
start_out = i*dfft%nnr + 1
end_out = (start_out-1) + dfft%my_nr3p*dfft%nr1w(dfft%mype2 +1)*dfft%nr2x
!
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
!
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
END DO
!
!
! Test 2
CALL fill_random(scatter_in, scatter_in_d, vsiz)
scatter_in_cpy = scatter_in
!
DO i=0,howmany-1
! Input data for fft_scatter_yz call
start_in = i*dfft%nnr + 1
end_in = (i+1)*dfft%nnr
!
! Where to store output data
start_out = start_in
end_out = end_in
CALL fft_scatter_yz( dfft, scatter_out(start_out:end_out), scatter_in(start_in:end_in), dfft%nnr, -2 )
CALL fft_scatter_yz_gpu( dfft, scatter_out_d(start_out:end_out), scatter_in_d(start_in:end_in), dfft%nnr, -2 )
!
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
!
! Check only relevant part
end_out = start_out + dfft%nsw(mp%me+1)*n3 - 1
CALL test%assert_close( scatter_out(start_out:end_out), aux(start_out:end_out) )
END DO
!
!
! Now repeat the test, but doing all the FFTs in a single shot
scatter_in_d(1:vsiz) = scatter_in_cpy(1:vsiz)
!
CALL fft_scatter_many_yz_gpu ( dfft, scatter_out_d, scatter_in_d, vsiz, -2, howmany )
DO i=0,howmany-1
! Extract data from GPU. Data are spaced by nstick_zx*n3x
start_out = 1 + i*nstick_zx*n3x
end_out = (i+1)*nstick_zx*n3x
aux(start_out:end_out) = scatter_out_d(start_out:end_out)
!
start_in = i*dfft%nnr + 1
end_in = i*dfft%nnr + dfft%nsw(mp%me+1)*n3x
!
! Extract only the data needed to compare the two methods. Results from
! the previous calls to fft_scatter_yz are separated by nnr, while the
! batched results are separated by nstick_zx*n3x. Starting from start_out
! we compare dfft%nsw(mp%me+1)*n3x elements (don't forget the -1!).
start_sl = start_out
end_sl = (start_out - 1) + dfft%nsw(mp%me+1)*n3x
CALL test%assert_close( aux(start_sl:end_sl), scatter_out(start_in:end_in) )
END DO
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(scatter_in, scatter_in_cpy, scatter_out, aux, scatter_in_d, scatter_out_d)
!
END SUBROUTINE test_fft_scatter_many_yz_gpu_1
end program test_fft_scatter_mod_gpu
! dummy subroutines
subroutine stop_clock( label )
character(len=*) :: label
end subroutine stop_clock
subroutine start_clock( label )
character(len=*) :: label
end subroutine start_clock
!
#else
program test_fft_scatter_mod_gpu
end program test_fft_scatter_mod_gpu
#endif

FFTXlib/tests/test_fwinv_gpu.f90 Normal file

@@ -0,0 +1,424 @@
#if defined(__CUDA)
program test_fwinv_gpu
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
! MPI type
type mpi_t
integer :: me, n, root, comm
end type mpi_t
TYPE(mpi_t) :: mp
!
TYPE(tester_t) :: test
!
INTEGER :: ierr, level, i
!
#if defined(__MPI)
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED,level, ierr)
#else
CALL MPI_Init(ierr)
#endif
#endif
!
CALL mpi_data_init(mp%me, mp%n, mp%root, mp%comm)
!
CALL test%init()
!
test%tolerance64 = 1.d-12
!
CALL save_random_seed("test_fwinv_gpu", mp%me)
!
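! Run the forward/inverse FFT tests for every nyfft value that divides
! the number of MPI ranks, in both gamma-only and k modes.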
DO i = 1, mp%n
IF (MOD(mp%n,i) == 0 ) THEN
CALL test_fwfft_gpu_1(mp, test, .true., i)
CALL test_fwfft_gpu_1(mp, test, .false., i)
!
CALL test_invfft_gpu_1(mp, test, .true., i)
CALL test_invfft_gpu_1(mp, test, .false., i)
END IF
END DO
CALL test_fwfft_many_gpu_1(mp, test, .true., 1)
CALL test_fwfft_many_gpu_1(mp, test, .false., 1)
!
CALL test_invfft_many_gpu_1(mp, test, .true., 1)
CALL test_invfft_many_gpu_1(mp, test, .false., 1)
!
CALL collect_results(test)
!
IF (mp%me == mp%root) CALL test%print()
!
#if defined(__MPI)
CALL MPI_Finalize(ierr)
#endif
CONTAINS
!
SUBROUTINE mpi_data_init(mpme, npes, mproot, comm)
implicit none
integer, intent(out) :: mpme, npes, mproot, comm
integer :: ierr
mpme=0; npes=1; mproot=0; comm=0
#if defined(__MPI)
CALL mpi_comm_rank(MPI_COMM_WORLD, mpme, ierr)
CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
comm = MPI_COMM_WORLD
#endif
END SUBROUTINE mpi_data_init
!
SUBROUTINE calc_bg(at, bg)
USE fft_param, ONLY : DP
implicit none
REAL(DP), PARAMETER :: pi=4.D0*DATAN(1.D0)
REAL(DP), INTENT(IN) :: at(3,3)
REAL(DP), INTENT(OUT) :: bg(3,3)
REAL(DP) :: ucvol
!
ucvol=at(1,1)*(at(2,2)*at(3,3)-at(3,2)*at(2,3))+&
& at(2,1)*(at(3,2)*at(1,3)-at(1,2)*at(3,3))+&
& at(3,1)*(at(1,2)*at(2,3)-at(2,2)*at(1,3))
!calculate reciprocal-space lattice vectors
bg(1,1)=2.0*pi*(at(2,2)*at(3,3)-at(3,2)*at(2,3))/ucvol
bg(2,1)=2.0*pi*(at(3,2)*at(1,3)-at(1,2)*at(3,3))/ucvol
bg(3,1)=2.0*pi*(at(1,2)*at(2,3)-at(2,2)*at(1,3))/ucvol
bg(1,2)=2.0*pi*(at(2,3)*at(3,1)-at(3,3)*at(2,1))/ucvol
bg(2,2)=2.0*pi*(at(3,3)*at(1,1)-at(1,3)*at(3,1))/ucvol
bg(3,2)=2.0*pi*(at(1,3)*at(2,1)-at(2,3)*at(1,1))/ucvol
bg(1,3)=2.0*pi*(at(2,1)*at(3,2)-at(3,1)*at(2,2))/ucvol
bg(2,3)=2.0*pi*(at(3,1)*at(1,2)-at(1,1)*at(3,2))/ucvol
bg(3,3)=2.0*pi*(at(1,1)*at(2,2)-at(2,1)*at(1,2))/ucvol
END SUBROUTINE calc_bg
!
SUBROUTINE fft_desc_init(dfft, smap, flavor, gamma_only, parallel, comm, nyfft)
USE stick_base
USE fft_types, ONLY : fft_type_descriptor, fft_type_init
USE fft_param, ONLY : DP
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
CHARACTER(LEN=*), INTENT(IN) :: flavor
LOGICAL :: gamma_only
LOGICAL :: parallel
INTEGER :: comm, nyfft
!
REAL(DP) :: at(3,3), bg(3,3)
!
at = RESHAPE((/10.d0, 0.d0, 0.d0, 0.d0, 10.d0, 0.d0, 0.d0, 0.d0, 10.d0/), shape(at))
CALL calc_bg(at, bg)
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, nyfft=nyfft)
!
END SUBROUTINE fft_desc_init
SUBROUTINE fft_desc_finalize(dfft, smap)
USE fft_types, ONLY : fft_type_descriptor, fft_type_deallocate
USE stick_base, ONLY : sticks_map, sticks_map_deallocate
implicit none
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
!
CALL fft_type_deallocate(dfft)
CALL sticks_map_deallocate( smap )
END SUBROUTINE fft_desc_finalize
!
SUBROUTINE fill_random(c, c_d, n)
USE cudafor
USE fft_param, ONLY : DP
implicit none
complex(DP), device :: c_d(:)
complex(DP) :: c(:)
integer, intent(in) :: n
!
real(DP), ALLOCATABLE :: rnd_aux(:)
!
ALLOCATE (rnd_aux(2*n))
CALL RANDOM_NUMBER(rnd_aux)
c = CMPLX(rnd_aux(1:n), rnd_aux(n+1:2*n), KIND=DP)
c_d = c
DEALLOCATE(rnd_aux)
END SUBROUTINE fill_random
!
SUBROUTINE test_fwfft_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : fwfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
INTEGER :: i
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
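! fwfft/invfft time themselves via start_clock/stop_clock with these
! labels; the dummy clock routines at the end of this file do nothing,
! so any non-empty placeholder label works.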
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Allocate variables
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
ALLOCATE(data_in_d(dfft%nnr_tg))
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
!
CALL fwfft( 'tgWave' , data_in, dfft, 1 )
CALL fwfft( 'tgWave' , data_in_d, dfft, 1 )
ELSE
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL fwfft( 'Wave' , data_in, dfft, 1 )
CALL fwfft( 'Wave' , data_in_d, dfft, 1 )
ENDIF
aux = data_in_d
! Check
CALL test%assert_close( data_in(1:dfft%ngw), aux(1:dfft%ngw) )
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL fwfft( 'Rho' , data_in, dfft, 1 )
CALL fwfft( 'Rho' , data_in_d, dfft, 1 )
aux = data_in_d
! Check
CALL test%assert_close( data_in(1:dfft%ngm), aux(1:dfft%ngm) )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_fwfft_gpu_1
!
SUBROUTINE test_invfft_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : invfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Allocate variables
ALLOCATE(data_in(dfft%nnr_tg), aux(dfft%nnr_tg))
ALLOCATE(data_in_d(dfft%nnr_tg))
CALL fill_random(data_in, data_in_d, dfft%nnr_tg)
!
CALL invfft( 'tgWave' , data_in, dfft, 1 )
CALL invfft( 'tgWave' , data_in_d, dfft, 1 )
ELSE
!
! Allocate variables
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL invfft( 'Wave' , data_in, dfft, 1 )
CALL invfft( 'Wave' , data_in_d, dfft, 1 )
ENDIF
aux = data_in_d
! Check
CALL test%assert_close( data_in, aux )
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr))
CALL fill_random(data_in, data_in_d, dfft%nnr)
!
CALL invfft( 'Rho' , data_in, dfft, 1 )
CALL invfft( 'Rho' , data_in_d, dfft, 1 )
aux = data_in_d
! Check
CALL test%assert_close( data_in, aux )
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_invfft_gpu_1
!
SUBROUTINE test_fwfft_many_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : fwfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
integer, parameter :: howmany=4
INTEGER :: i, start
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Not (yet?) possible
RETURN
ELSE
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr*howmany))
ALLOCATE(data_in_d(dfft%nnr*howmany))
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
!
CALL fwfft( 'Wave' , data_in_d, dfft, howmany=howmany)
!
DO i=0,howmany-1
start = i*dfft%nnr
CALL fwfft( 'Wave' , data_in(1+start:), dfft, 1 )
aux(start+1:start+dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%ngw), aux(start+1:start+dfft%ngw) )
END DO
!
ENDIF
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr*howmany))
!
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
!
CALL fwfft( 'Rho' , data_in_d, dfft, howmany)
DO i=0,howmany-1
start = i*dfft%nnr
CALL fwfft( 'Rho' , data_in(1+start:), dfft, 1 )
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%ngm), aux(1:dfft%ngm) )
END DO
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_fwfft_many_gpu_1
!
SUBROUTINE test_invfft_many_gpu_1(mp, test, gamma_only, ny)
USE cudafor
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE fft_interfaces, ONLY : invfft
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
!
TYPE(fft_type_descriptor) :: dfft
TYPE(sticks_map) :: smap
LOGICAL, INTENT(IN) :: gamma_only
INTEGER, INTENT(IN) :: ny
!
LOGICAL :: parallel
COMPLEX(DP), ALLOCATABLE :: data_in(:), aux(:)
COMPLEX(DP), ALLOCATABLE, DEVICE :: data_in_d(:)
INTEGER(kind = cuda_stream_kind) :: strm = 0
integer, parameter :: howmany=4
integer :: start, i
!
parallel = mp%n .gt. 1
CALL fft_desc_init(dfft, smap, 'wave', gamma_only, parallel, mp%comm, nyfft=ny)
dfft%rho_clock_label='bla' ; dfft%wave_clock_label='bla'
!
! Test 1
!
IF ( ny .gt. 1 ) THEN
! Not (yet?) possible
RETURN
ELSE
!
! Allocate variables
ALLOCATE(data_in(howmany*dfft%nnr), aux(howmany*dfft%nnr))
ALLOCATE(data_in_d(howmany*dfft%nnr))
CALL fill_random(data_in, data_in_d, howmany*dfft%nnr)
!
!CALL invfft( 'Wave' , data_in, dfft, 1 )
CALL invfft( 'Wave' , data_in_d, dfft, howmany=howmany, stream=strm )
DO i=0,howmany-1
start = i*dfft%nnr
CALL invfft( 'Wave' , data_in(1+start:), dfft, 1 )
aux(start+1:start+dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%nnr), aux(start+1:start+dfft%nnr) )
END DO
ENDIF
!
! Test 2
!
DEALLOCATE(data_in, data_in_d, aux)
ALLOCATE(data_in(dfft%nnr*howmany), aux(dfft%nnr))
ALLOCATE(data_in_d(dfft%nnr*howmany))
CALL fill_random(data_in, data_in_d, dfft%nnr*howmany)
!
CALL invfft( 'Rho' , data_in_d, dfft, howmany )
!
DO i=0,howmany-1
start = i*dfft%nnr
CALL invfft( 'Rho' , data_in(1+start:), dfft, 1 )
aux(1:dfft%nnr) = data_in_d(start+1:start+dfft%nnr)
! Check
CALL test%assert_close( data_in(start+1:start+dfft%nnr), aux(1:dfft%nnr) )
END DO
!
CALL fft_desc_finalize(dfft, smap)
DEALLOCATE(data_in, data_in_d, aux)
!
END SUBROUTINE test_invfft_many_gpu_1
end program test_fwinv_gpu
! dummy subroutines
subroutine stop_clock( label )
character(len=*) :: label
end subroutine stop_clock
subroutine start_clock( label )
character(len=*) :: label
end subroutine start_clock
!
#else
program test_fwinv_gpu
end program test_fwinv_gpu
#endif

FFTXlib/tests/tester.f90 Normal file

@@ -0,0 +1,862 @@
! This file is part of fortran_tester
! Copyright 2015 Pierre de Buyl and Peter Colberg
! 2016 Pierre de Buyl and Stefano Szaghi
! 2018 Pierre de Buyl and Pietro Bonfa
! License: BSD
!> Routines to test Fortran programs
!!
!! fortran_tester is a pure-Fortran module. It provides a datatype to hold test results and
!! routines to test for equality, closeness, and positivity of variables. The routines are
!! overloaded and the resulting interface consists of a small number of names.
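!!
!! A minimal usage sketch (kinds from iso_fortran_env):
!!
!!     use, intrinsic :: iso_fortran_env, only : int32, real64
!!     use tester
!!     type(tester_t) :: test
!!     call test%init()
!!     call test%assert_equal(1_int32, 1_int32)
!!     call test%assert_close(1.0_real64, 1.0_real64 + 1.0e-16_real64)
!!     call test%print()   ! reports the error count; stops if any test failed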
module tester
use, intrinsic :: iso_fortran_env, only : int8, int16, int32, int64, real32, real64
implicit none
private
public :: tester_t
!> The main **tester** class.
type :: tester_t
integer(int32) :: n_errors=0_int32 !< Number of errors.
integer(int32) :: n_tests=0_int32 !< Number of tests.
real(real32) :: tolerance32=2._real32*epsilon(1._real32) !< Real tolerance, 32 bits.
real(real64) :: tolerance64=2._real64*epsilon(1._real64) !< Real tolerance, 64 bits.
contains
procedure :: init !< Initialize the tester.
procedure :: print !< Print tests results.
generic, public :: assert_equal => &
assert_equal_i8, &
assert_equal_i16, &
assert_equal_i32, &
assert_equal_i64, &
assert_equal_r32, &
assert_equal_r64, &
assert_equal_c32, &
assert_equal_c64, &
assert_equal_l, &
assert_equal_i8_1, &
assert_equal_i16_1, &
assert_equal_i32_1, &
assert_equal_i64_1, &
assert_equal_r32_1, &
assert_equal_r64_1, &
assert_equal_c32_1, &
assert_equal_c64_1, &
assert_equal_l_1 !< Check if two values (integer, real, complex or logical) are equal.
procedure, private :: assert_equal_i8 !< Check if two integers (8 bits) are equal.
procedure, private :: assert_equal_i16 !< Check if two integers (16 bits) are equal.
procedure, private :: assert_equal_i32 !< Check if two integers (32 bits) are equal.
procedure, private :: assert_equal_i64 !< Check if two integers (64 bits) are equal.
procedure, private :: assert_equal_r32 !< Check if two reals (32 bits) are equal.
procedure, private :: assert_equal_r64 !< Check if two reals (64 bits) are equal.
procedure, private :: assert_equal_c32 !< Check if two complex numbers (32 bits) are equal.
procedure, private :: assert_equal_c64 !< Check if two complex numbers (64 bits) are equal.
procedure, private :: assert_equal_l !< Check if two logicals are equal.
procedure, private :: assert_equal_i8_1 !< Check if two integer (8 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i16_1 !< Check if two integer (16 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i32_1 !< Check if two integer (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i64_1 !< Check if two integer (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r32_1 !< Check if two real (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r64_1 !< Check if two real (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_l_1 !< Check if two logical arrays (rank 1) are equal.
generic, public :: assert_positive => &
assert_positive_i8, &
assert_positive_i16, &
assert_positive_i32, &
assert_positive_i64, &
assert_positive_r32, &
assert_positive_r64, &
assert_positive_i8_1, &
assert_positive_i16_1, &
assert_positive_i32_1, &
assert_positive_i64_1, &
assert_positive_r32_1, &
assert_positive_r64_1 !< Check if a number (integer or real) is positive.
procedure, private :: assert_positive_i8 !< Check if an integer (8 bits) is positive.
procedure, private :: assert_positive_i16 !< Check if an integer (16 bits) is positive.
procedure, private :: assert_positive_i32 !< Check if an integer (32 bits) is positive.
procedure, private :: assert_positive_i64 !< Check if an integer (64 bits) is positive.
procedure, private :: assert_positive_r32 !< Check if a real (32 bits) is positive.
procedure, private :: assert_positive_r64 !< Check if a real (64 bits) is positive.
procedure, private :: assert_positive_i8_1 !< Check if an integer (8 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i16_1 !< Check if an integer (16 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i32_1 !< Check if an integer (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i64_1 !< Check if an integer (64 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r32_1 !< Check if a real (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r64_1 !< Check if a real (64 bits) array (rank 1) is positive.
generic, public :: assert_close => &
assert_close_r32, &
assert_close_r64, &
assert_close_c32, &
assert_close_c64, &
assert_close_r32_1, &
assert_close_r64_1, &
assert_close_c32_1, &
assert_close_c64_1 !< Check if two values (real or complex) are close with respect to a tolerance.
procedure, private :: assert_close_r32 !< Check if two reals (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r64 !< Check if two reals (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c32 !< Check if two complex numbers (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c64 !< Check if two complex numbers (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r32_1 !< Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_r64_1 !< Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
end type tester_t
contains
!> Initialize the tester.
subroutine init(this, tolerance32, tolerance64)
class(tester_t), intent(out) :: this !< The tester.
real(real32), intent(in), optional :: tolerance32 !< Real tolerance, 32 bits.
real(real64), intent(in), optional :: tolerance64 !< Real tolerance, 64 bits.
this% n_errors = 0
this% n_tests = 0
if (present(tolerance64)) then
this% tolerance64 = tolerance64
else
this% tolerance64 = 2._real64*epsilon(1._real64)
end if
if (present(tolerance32)) then
this% tolerance32 = tolerance32
else
this% tolerance32 = 2._real32*epsilon(1._real32)
end if
end subroutine init
!> Print tests results.
subroutine print(this, errorstop)
class(tester_t), intent(in) :: this !< The tester.
logical, intent(in), optional :: errorstop !< Flag to activate error stop if one test fails.
logical :: do_errorstop
if (present(errorstop)) then
do_errorstop = errorstop
else
do_errorstop = .true.
end if
write(*,*) 'fortran_tester:', this% n_errors, ' error(s) for', this% n_tests, 'test(s)'
if (this% n_errors == 0) then
write(*,*) 'fortran_tester: all tests succeeded'
else
write(*,*) 'fortran_tester: tests failed'
if (do_errorstop) then
stop 1
end if
end if
end subroutine print
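!
! Note on the optional "fail" flag accepted by every assertion below:
! when fail=.true. a mismatch is expected and is not counted as an
! error; otherwise any mismatch increments n_errors. n_tests always grows.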
!> Check if two integers (8 bits) are equal.
subroutine assert_equal_i8(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i1 !< Value to compare.
integer(int8), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i8
!> Check if two integers (16 bits) are equal.
subroutine assert_equal_i16(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i1 !< Value to compare.
integer(int16), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i16
!> Check if two integers (32 bits) are equal.
subroutine assert_equal_i32(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i1 !< Value to compare.
integer(int32), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i32
!> Check if two integers (64 bits) are equal.
subroutine assert_equal_i64(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i1 !< Value to compare.
integer(int64), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i64
!> Check if two reals (32 bits) are equal.
subroutine assert_equal_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r32
!> Check if two reals (64 bits) are equal.
subroutine assert_equal_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r64
!> Check if two complex numbers (32 bits) are equal.
subroutine assert_equal_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c32
!> Check if two complex numbers (64 bits) are equal.
subroutine assert_equal_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c64
!> Check if two logicals are equal.
subroutine assert_equal_l(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in) :: l1 !< Value to compare.
logical, intent(in) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (l1 .neqv. l2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_l
!> Check if two integer (8 bits) arrays (rank 1) are equal.
subroutine assert_equal_i8_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int8), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i8_1
!> Check if two integer (16 bits) arrays (rank 1) are equal.
subroutine assert_equal_i16_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int16), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i16_1
!> Check if two integer (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_i32_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int32), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i32_1
!> Check if two integer (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_i64_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int64), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i64_1
!> Check if two real (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r1 !< Value to compare.
real(real32), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r32_1
!> Check if two real (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r1 !< Value to compare.
real(real64), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r64_1
!> Check if two complex (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real32), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real64), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c64_1
!> Check if two logical arrays (rank 1) are equal.
subroutine assert_equal_l_1(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in), dimension(:) :: l1 !< Value to compare.
logical, intent(in), dimension(:) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
integer :: k
this% n_tests = this% n_tests + 1
if ( size(l1) .ne. size(l2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
do k = 1, size(l1)
if (l1(k) .neqv. l2(k)) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
exit
end if
end do
end if
end subroutine assert_equal_l_1
!> Check if an integer (8 bits) is positive.
subroutine assert_positive_i8(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8
!> Check if an integer (16 bits) is positive.
subroutine assert_positive_i16(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16
!> Check if an integer (32 bits) is positive.
subroutine assert_positive_i32(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32
!> Check if an integer (64 bits) is positive.
subroutine assert_positive_i64(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64
!> Check if a real (32 bits) is positive.
subroutine assert_positive_r32(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32
!> Check if a real (64 bits) is positive.
subroutine assert_positive_r64(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64
!> Check if an integer (8 bits) array (rank 1) is positive.
subroutine assert_positive_i8_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8_1
!> Check if an integer (16 bits) array (rank 1) is positive.
subroutine assert_positive_i16_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16_1
!> Check if an integer (32 bits) array (rank 1) is positive.
subroutine assert_positive_i32_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32_1
!> Check if an integer (64 bits) array (rank 1) is positive.
subroutine assert_positive_i64_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64_1
!> Check if a real (32 bits) array (rank 1) is positive.
subroutine assert_positive_r32_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32_1
!> Check if a real (64 bits) array (rank 1) is positive.
subroutine assert_positive_r64_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64_1
!> Check if two reals (32 bits) are close with respect to a tolerance.
subroutine assert_close_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r32
!> Check if two reals (64 bits) are close with respect to a tolerance.
subroutine assert_close_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r64
!> Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in), dimension(:) :: r1 !< Value to compare.
real(real32), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r32_1
!> Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in), dimension(:) :: r1 !< Value to compare.
real(real64), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r64_1
!> Check if two complex numbers (32 bits) are close with respect to a tolerance.
subroutine assert_close_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c32
!> Check if two complex numbers (64 bits) are close with respect to a tolerance.
subroutine assert_close_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c64
!> Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real32), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real64), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c64_1
end module tester
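For orientation, a minimal driver for the assert API above could look as follows. This is a sketch: the `init` and `print` type-bound procedures and the generic `assert_*` names are assumed from the earlier part of this module, outside the hunk shown here.

! hypothetical driver exercising the tester module
program tester_demo
  use tester
  implicit none
  type(tester_t) :: test
  !
  call test% init()                          ! reset n_tests/n_errors, set default tolerances
  call test% assert_equal(0, 0)              ! exact integer comparison
  call test% assert_close(1.0d0, 1.0d0)      ! |r1-r2| is checked against tolerance64
  call test% assert_positive([1.0d0, 2.0d0]) ! rank-1 check: error counted if minval < 0
  call test% print()                         ! report n_errors out of n_tests
end program tester_demo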

94
FFTXlib/tests/utils.f90 Normal file
View File

@ -0,0 +1,94 @@
SUBROUTINE collect_results(test)
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: itottests, itoterr, ierr, me
!
#if defined(__MPI)
!
CALL MPI_REDUCE(test%n_errors, itoterr, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_REDUCE(test%n_tests, itottests, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
test%n_tests = itottests
test%n_errors = itoterr
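! NB: MPI_REDUCE defines itottests/itoterr only on rank 0, so the
! totals stored above are meaningful on the root rank only, where
! the summary is printed.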
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
#endif
END SUBROUTINE collect_results
SUBROUTINE save_random_seed(test_name, mpime)
IMPLICIT NONE
CHARACTER(len=*), INTENT(IN) :: test_name
INTEGER, INTENT(IN) :: mpime
!
INTEGER, PARAMETER :: in_unit=20, out_unit=21
CHARACTER(len=80) :: fname
INTEGER :: n, istat
INTEGER, ALLOCATABLE :: seed(:)
!
CALL random_seed(size = n)
ALLOCATE(seed(n))
! First try if the OS provides a random number generator
OPEN(UNIT=in_unit, file="/dev/urandom", access="stream", &
form="unformatted", action="read", status="old", iostat=istat)
IF (istat == 0) THEN
READ(in_unit) seed
CLOSE(in_unit)
ELSE
! Fallback to stupid algorithm. Actually we do not really need
! high-quality random numbers
CALL random_seed(get=seed)
seed = seed + mpime
END IF
!
CALL random_seed(put=seed)
!
WRITE(fname, '("rnd_seed_",A,I4.4)') TRIM(test_name), mpime
fname = TRIM(fname)
!
OPEN (UNIT=out_unit,FILE=fname,ACTION="write",STATUS="replace")
!
WRITE (out_unit,*) n
WRITE (out_unit,*) seed
CLOSE (out_unit)
DEALLOCATE(seed)
!
END SUBROUTINE save_random_seed
SUBROUTINE no_test
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!TYPE(tester_t) :: test
INTEGER :: ierr
!
#if defined(__MPI)
CALL MPI_Init(ierr)
#endif
!CALL test%init()
!CALL print_results(test)
#if defined(__MPI)
CALL mpi_finalize(ierr)
#endif
END SUBROUTINE no_test
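Taken together, the helpers above wrap a test in the pattern sketched below. The program name, the asserts and the final `print` call are illustrative; only `save_random_seed`, `collect_results` and the MPI boilerplate come from this file.

! hypothetical skeleton of an FFTXlib test driver
PROGRAM test_example
#if defined(__MPI)
  USE mpi
#endif
  USE tester
  IMPLICIT NONE
  TYPE(tester_t) :: test
  INTEGER :: ierr, mpime
  !
  ierr = 0; mpime = 0
#if defined(__MPI)
  CALL MPI_Init(ierr)
  CALL MPI_Comm_rank(MPI_COMM_WORLD, mpime, ierr)
#endif
  CALL test%init()
  CALL save_random_seed("example", mpime)  ! reproducible per-rank seed
  ! ... exercise the library and call test%assert_* here ...
  CALL collect_results(test)               ! totals gathered on rank 0
#if defined(__MPI)
  CALL MPI_Finalize(ierr)
#endif
  IF (mpime == 0) CALL test%print()        ! assumed reporting routine
END PROGRAM test_example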

View File

@ -9,6 +9,8 @@ CG = \
ccgdiagg.o \
rcgdiagg.o
-include make.gpu
all : libcg.a

526
KS_Solvers/CG/ccgdiagg_gpu.f90 Normal file
View File

@ -0,0 +1,526 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
FUNCTION KSDdot( n, A, incx, B, incy) result( res )
!
USE util_param, ONLY : DP
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: incx,incy,n
!
#if defined(__CUDA)
REAL(DP), DEVICE, INTENT(IN) :: A(n), B(n)
#else
REAL(DP), INTENT(IN) :: A(n), B(n)
REAL(DP), EXTERNAL :: ddot
#endif
!
REAL(DP) :: res
!
#if defined(__CUDA)
res = cublasDdot( n, A, incx, B, incy )
#else
res = ddot( n, A, incx, B, incy )
#endif
!
RETURN
!
END FUNCTION KSDdot
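! NB: the callers below exploit the equivalence, for complex arrays stored
! as interleaved (Re,Im) pairs,
! ddot(2*n, a, 1, b, 1) = REAL( zdotc(n, a, 1, b, 1) )
! which is why they pass kdim2 = 2*kdim and work with real scalars only.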
! define __VERBOSE to print a message after each eigenvalue is computed
!
!----------------------------------------------------------------------------
SUBROUTINE ccgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, precondition_d, &
npwx, npw, nbnd, npol, psi_d, e_d, btype, &
ethr, maxter, reorder, notconv, avg_iter )
!----------------------------------------------------------------------------
!
! ... "poor man" iterative diagonalization of a complex hermitian matrix
! ... through preconditioned conjugate gradient algorithm
! ... Band-by-band algorithm with minimal use of memory
! ... Calls hs_1psi and s_1psi to calculate H|psi> + S|psi> and S|psi>
! ... Works for generalized eigenvalue problem (US pseudopotentials) as well
!
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm
USE mp, ONLY : mp_sum
#if defined(__VERBOSE)
USE util_param, ONLY : stdout
#endif
USE device_memcpy_m, ONLY : dev_memset, dev_memcpy
!
IMPLICIT NONE
!
! ... Mathematical constants
!
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npwx, npw, nbnd, npol, maxter
INTEGER, INTENT(IN) :: btype(nbnd)
REAL(DP), INTENT(IN) :: precondition_d(npwx*npol), ethr
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nbnd)
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
INTEGER, INTENT(OUT) :: notconv
REAL(DP), INTENT(OUT) :: avg_iter
#if defined(__CUDA)
attributes(DEVICE) :: precondition_d, psi_d, e_d
#endif
!
! ... local variables
!
INTEGER :: i, j, k, m, m_start, m_end, iter, moved
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:)
COMPLEX(DP), ALLOCATABLE :: scg_d(:), ppsi_d(:), g0_d(:), lagrange_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d, lagrange_d
#endif
COMPLEX(DP), ALLOCATABLE :: lagrange(:)
REAL(DP) :: gamma, ddot_temp, es_1, es(2)
REAL(DP), ALLOCATABLE :: e(:)
REAL(DP) :: a0, b0, gg0, gg, gg1, cg0, e0, psi_norm, sint, cost
REAL(DP) :: theta, cos2t, sin2t
LOGICAL :: reorder
INTEGER :: kdim, kdmx, kdim2, ierr, istat
REAL(DP) :: empty_ethr, ethr_m
!
! ... external functions
!
REAL (DP), EXTERNAL :: ksDdot
EXTERNAL hs_1psi_gpu, s_1psi_gpu
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! s_1psi( npwx, npw, psi, spsi )
!
CALL start_clock( 'ccgdiagg' )
!
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
!
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx * npol
kdmx = npwx * npol
!
END IF
!
kdim2 = 2 * kdim
!
ALLOCATE( hpsi_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate hpsi_d ', ABS(ierr) )
ALLOCATE( spsi_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate spsi_d ', ABS(ierr) )
ALLOCATE( g_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate g_d ', ABS(ierr) )
ALLOCATE( cg_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate cg_d ', ABS(ierr) )
ALLOCATE( scg_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate scg_d ', ABS(ierr) )
ALLOCATE( ppsi_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate ppsi_d ', ABS(ierr) )
ALLOCATE( g0_d(kdmx), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate g0_d ', ABS(ierr) )
!
ALLOCATE( lagrange_d(nbnd), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate lagrange_d ', ABS(ierr) )
!
ALLOCATE( lagrange(nbnd), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' ccgdiagg ',' cannot allocate lagrange ', ABS(ierr) )
ALLOCATE( e(nbnd) )
!
avg_iter = 0.D0
notconv = 0
moved = 0
!
! ... every eigenfunction is calculated separately
!
DO m = 1, nbnd
!
IF ( btype(m) == 1 ) THEN
!
ethr_m = ethr
!
ELSE
!
ethr_m = empty_ethr
!
END IF
!
CALL dev_memset( spsi_d , ZERO )
CALL dev_memset( scg_d , ZERO )
CALL dev_memset( hpsi_d , ZERO )
CALL dev_memset( g_d , ZERO )
CALL dev_memset( cg_d , ZERO )
CALL dev_memset( g0_d , ZERO )
CALL dev_memset( ppsi_d , ZERO )
CALL dev_memset( lagrange_d , ZERO )
!
! ... calculate S|psi>
!
CALL s_1psi_gpu( npwx, npw, psi_d(1,m), spsi_d )
!
! ... orthogonalize starting eigenfunction to those already calculated
!
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
lagrange = ZERO
if(m_start.le.m_end) &
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
kdmx, spsi_d, 1, ZERO, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
!
CALL mp_sum( lagrange( 1:m ), intra_bgrp_comm )
!
psi_norm = DBLE( lagrange(m) )
lagrange_d(1:m) = lagrange(1:m)
!
DO j = 1, m - 1
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
!
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
!
END DO
!
psi_norm = psi_norm - ( DBLE( lagrange(j) )**2 + AIMAG( lagrange(j) )**2 )
!
END DO
!
psi_norm = SQRT( psi_norm )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
psi_d(i,m) = psi_d(i,m) / psi_norm
END DO
!
! ... calculate starting gradient (|hpsi> = H|psi>) ...
!
CALL hs_1psi_gpu( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
!
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
!
! ... NB: ddot(2*npw,a,1,b,1) = REAL( zdotc(npw,a,1,b,1) )
!
e(m) = ksDdot( kdim2, psi_d(1,m), 1, hpsi_d, 1 )
!
CALL mp_sum( e(m), intra_bgrp_comm )
!
! ... start iteration for this band
!
iterate: DO iter = 1, maxter
!
! ... calculate P (PHP)|y>
! ... ( P = preconditioning matrix, assumed diagonal )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g_d(i) = hpsi_d(i) / precondition_d(i)
ppsi_d(i) = spsi_d(i) / precondition_d(i)
END DO
!
! ... ppsi is now S P^2|y> ( = S P^2|psi> )
!
es(1) = ksDdot( kdim2, spsi_d(1), 1, g_d(1), 1 )
es(2) = ksDdot( kdim2, spsi_d(1), 1, ppsi_d(1), 1 )
!
CALL mp_sum( es , intra_bgrp_comm )
!
es(1) = es(1) / es(2)
es_1 = es(1)
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g_d(i) = g_d(i) - es_1 * ppsi_d(i)
END DO
!
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
! ... <g| S P^2|y> = 0
! ... orthogonalize to lowest eigenfunctions (already calculated)
!
! ... scg is used as workspace
!
CALL s_1psi_gpu( npwx, npw, g_d(1), scg_d(1) )
!
lagrange(1:m-1) = ZERO
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
if(m_start.le.m_end) &
CALL ZGEMV( 'C', kdim, m_end-m_start+1, ONE, psi_d(1,m_start), &
kdmx, scg_d, 1, ZERO, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange(m_start:m_end) = lagrange_d(m_start:m_end)
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
!
CALL mp_sum( lagrange( 1:m-1 ), intra_bgrp_comm )
!
lagrange_d(1:m) = lagrange(1:m)
!
DO j = 1, ( m - 1 )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g_d(i) = g_d(i) - lagrange_d(j) * psi_d(i,j)
scg_d(i) = scg_d(i) - lagrange_d(j) * psi_d(i,j)
END DO
!
END DO
!
IF ( iter /= 1 ) THEN
!
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
!
gg1 = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
!
CALL mp_sum( gg1, intra_bgrp_comm )
!
END IF
!
! ... gg is <g(n+1)|S|g(n+1)>
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
g0_d(i) = scg_d(i) * precondition_d(i)
END DO
!
gg = ksDdot( kdim2, g_d(1), 1, g0_d(1), 1 )
!
CALL mp_sum( gg, intra_bgrp_comm )
!
IF ( iter == 1 ) THEN
!
! ... starting iteration, the conjugate gradient |cg> = |g>
!
gg0 = gg
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
cg_d(i) = g_d(i)
END DO
!
ELSE
!
! ... |cg(n+1)> = |g(n+1)> + gamma(n) * |cg(n)>
!
! ... Polak-Ribiere formula :
!
gamma = ( gg - gg1 ) / gg0
gg0 = gg
!
!
!
! See comment below
!!DO i = 1, kdmx
!! cg_d(i) = g_d(i) + cg_d(i) * gamma
!!END DO
!
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
! ... is not 0. In fact :
! ... <y(n+1)| S P^2 |cg(n)> = sin(theta)*<cg(n)|S|cg(n)>
!
psi_norm = gamma * cg0 * sint
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
! NB: the correction below is folded into the same kernel: this breaks the step-by-step logic, but is done here for performance
cg_d(i) = (g_d(i) + cg_d(i) * gamma) - psi_norm * psi_d(i,m)
END DO
!
END IF
!
! ... |cg> contains now the conjugate gradient
!
! ... |scg> is S|cg>
!
CALL hs_1psi_gpu( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
!
cg0 = ksDdot( kdim2, cg_d(1), 1, scg_d(1), 1 )
!
CALL mp_sum( cg0 , intra_bgrp_comm )
!
cg0 = SQRT( cg0 )
!
! ... |ppsi> contains now HP|cg>
! ... minimize <y(t)|PHP|y(t)> , where :
! ... |y(t)> = cos(t)|y> + sin(t)/cg0 |cg>
! ... Note that <y|P^2S|y> = 1, <y|P^2S|cg> = 0 ,
! ... <cg|P^2S|cg> = cg0^2
! ... so that the result is correctly normalized :
! ... <y(t)|P^2S|y(t)> = 1
!
a0 = 2.D0 * ksDdot( kdim2, psi_d(1,m), 1, ppsi_d(1), 1 ) / cg0
!
CALL mp_sum( a0 , intra_bgrp_comm )
!
b0 = ksDdot( kdim2, cg_d(1), 1, ppsi_d(1), 1 ) / cg0**2
!
CALL mp_sum( b0 , intra_bgrp_comm )
!
e0 = e(m)
!
theta = 0.5D0 * ATAN( a0 / ( e0 - b0 ) )
!
cost = COS( theta )
sint = SIN( theta )
!
cos2t = cost*cost - sint*sint
sin2t = 2.D0*cost*sint
!
es(1) = 0.5D0 * ( ( e0 - b0 ) * cos2t + a0 * sin2t + e0 + b0 )
es(2) = 0.5D0 * ( - ( e0 - b0 ) * cos2t - a0 * sin2t + e0 + b0 )
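! ... (derivation: with |y(t)> = cos(t)|y> + sin(t)/cg0 |cg> one gets
! ... E(t) = e0*cos(t)**2 + a0*sin(t)*cos(t) + b0*sin(t)**2
! ...      = 0.5*( (e0-b0)*cos(2t) + a0*sin(2t) + e0 + b0 )
! ... and dE/dt = 0 gives tan(2t) = a0/(e0-b0), i.e. the theta above;
! ... es(1) and es(2) are the two stationary values)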
!
! ... there are two possible solutions, choose the minimum
!
IF ( es(2) < es(1) ) THEN
!
theta = theta + 0.5D0 * pi
!
cost = COS( theta )
sint = SIN( theta )
!
END IF
!
! ... new estimate of the eigenvalue
!
e(m) = MIN( es(1), es(2) )
! ... upgrade |psi>
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
END DO
!
! ... here one could test convergence on the energy
!
IF ( ABS( e(m) - e0 ) < ethr_m ) EXIT iterate
!
! ... upgrade H|psi> and S|psi>
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, kdmx
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
!
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
END DO
!
END DO iterate
!
#if defined(__VERBOSE)
IF ( iter >= maxter ) THEN
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (not converged after ",i3,&
& " iterations)")') m, e(m)*13.6058, iter
ELSE
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (",i3," iterations)")') &
m, e(m)*13.6058, iter
END IF
FLUSH (stdout)
#endif
IF ( iter >= maxter ) notconv = notconv + 1
!
avg_iter = avg_iter + iter + 1
! ... reorder eigenvalues if they are not in the right order
! ... ( this CAN and WILL happen in not-so-special cases )
!
!
IF ( m > 1 .AND. reorder ) THEN
!
IF ( e(m) - e(m-1) < - 2.D0 * ethr_m ) THEN
! ... if the last calculated eigenvalue is not the largest...
!
DO i = m - 2, 1, - 1
!
IF ( e(m) - e(i) > 2.D0 * ethr_m ) EXIT
!
END DO
!
i = i + 1
!
moved = moved + 1
!
! ... last calculated eigenvalue should be in the
! ... i-th position: reorder
!
e0 = e(m)
!
!$cuf kernel do(1) <<<*,*>>>
DO k = 1, kdmx
ppsi_d(k) = psi_d(k,m)
END DO
!
DO j = m, i + 1, - 1
!
e(j) = e(j-1)
!
!$cuf kernel do(1) <<<*,*>>>
DO k = 1, kdmx
psi_d(k,j) = psi_d(k,j-1)
END DO
!
END DO
!
e(i) = e0
!
!$cuf kernel do(1) <<<*,*>>>
DO k = 1, kdmx
psi_d(k,i) = ppsi_d(k)
END DO
!
! ... this procedure should be good if only a few inversions occur,
! ... extremely inefficient if eigenvectors are often in bad order
! ... ( but this should not happen )
!
END IF
!
END IF
!
END DO
!
avg_iter = avg_iter / DBLE( nbnd )
!
! store the eigenvalues, accumulated on the host, back into the device array e_d
CALL dev_memcpy(e_d, e)
!
DEALLOCATE( lagrange )
DEALLOCATE( e )
DEALLOCATE( lagrange_d )
DEALLOCATE( ppsi_d )
DEALLOCATE( g0_d )
DEALLOCATE( cg_d )
DEALLOCATE( g_d )
DEALLOCATE( hpsi_d )
DEALLOCATE( scg_d )
DEALLOCATE( spsi_d )
!
CALL stop_clock( 'ccgdiagg' )
!
RETURN
!
END SUBROUTINE ccgdiagg_gpu

6
KS_Solvers/CG/make.gpu Normal file
View File

@ -0,0 +1,6 @@
# Makefile for CG GPU
CG += \
rcgdiagg_gpu.o \
ccgdiagg_gpu.o

543
KS_Solvers/CG/rcgdiagg_gpu.f90 Normal file
View File

@ -0,0 +1,543 @@
!
! Copyright (C) 2002-2006 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
SUBROUTINE cgcudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cudafor
use cublas
#endif
IMPLICIT NONE
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE cgcudaDGEMV
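! NB: in the gamma-only code below scalar products run in real arithmetic
! over half of the G-sphere: the factor 2.D0 accounts for psi(-G) =
! conjg(psi(G)), and when this process holds G=0 (gstart == 2) the doubly
! counted G=0 contribution is subtracted explicitly after each product.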
! define __VERBOSE to print a message after each eigenvalue is computed
!----------------------------------------------------------------------------
SUBROUTINE rcgdiagg_gpu( hs_1psi_gpu, s_1psi_gpu, precondition_d, &
npwx, npw, nbnd, psi_d, e_d, btype, &
ethr, maxter, reorder, notconv, avg_iter )
!----------------------------------------------------------------------------
!
! ... "poor man" iterative diagonalization of a complex hermitian matrix
! ... through preconditioned conjugate gradient algorithm
! ... Band-by-band algorithm with minimal use of memory
! ... Calls hs_1psi and s_1psi to calculate H|psi> + S|psi> and S|psi>
! ... Works for generalized eigenvalue problem (US pseudopotentials) as well
!
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, gstart
USE mp, ONLY : mp_sum
#if defined(__VERBOSE)
USE util_param, ONLY : stdout
#endif
!
IMPLICIT NONE
!
! ... Mathematical constants
!
REAL(DP), PARAMETER :: pi = 3.14159265358979323846_DP
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npwx, npw, nbnd, maxter
INTEGER, INTENT(IN) :: btype(nbnd)
REAL(DP), INTENT(IN) :: precondition_d(npw), ethr
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nbnd)
REAL(DP), INTENT(INOUT) :: e_d(nbnd)
INTEGER, INTENT(OUT) :: notconv
REAL(DP), INTENT(OUT) :: avg_iter
#if defined(__CUDA)
attributes(DEVICE) :: precondition_d, psi_d, e_d
#endif
!
! ... local variables
!
INTEGER :: i, j, l, m, m_start, m_end, iter, moved
REAL(DP), ALLOCATABLE :: lagrange_d(:)
REAL(DP), ALLOCATABLE :: lagrange(:), e(:)
COMPLEX(DP), ALLOCATABLE :: hpsi_d(:), spsi_d(:), g_d(:), cg_d(:), &
scg_d(:), ppsi_d(:), g0_d(:), psi_aux(:)
COMPLEX(DP) :: psi1, hpsi1, spsi1, ppsi1, scg1, cg1, g1, g01
REAL(DP) :: psi_norm, a0, b0, gg0, gamma, gg, gg1, &
cg0, e0, es(2), aux
REAL(DP) :: es1
REAL(DP) :: theta, cost, sint, cos2t, sin2t
LOGICAL :: reorder
INTEGER :: npw2, npwx2
REAL(DP) :: empty_ethr, ethr_m
#if defined(__CUDA)
attributes(DEVICE) :: lagrange_d, hpsi_d, spsi_d, g_d, cg_d, scg_d, ppsi_d, g0_d
#endif
!
! ... external functions
!
REAL(DP), EXTERNAL :: ksDdot
EXTERNAL hs_1psi_gpu, s_1psi_gpu
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! s_1psi( npwx, npw, psi, spsi )
!
!
CALL start_clock( 'rcgdiagg' )
!
IF ( gstart == -1 ) CALL errore( 'rcgdiagg', 'gstart variable not initialized', 1 )
!
empty_ethr = MAX( ( ethr * 5.D0 ), 1.D-5 )
!
npw2 = 2 * npw
npwx2 = 2 * npwx
!
ALLOCATE( spsi_d( npwx ) )
ALLOCATE( scg_d( npwx ) )
ALLOCATE( hpsi_d( npwx ) )
ALLOCATE( g_d( npwx ) )
ALLOCATE( cg_d( npwx ) )
ALLOCATE( g0_d( npwx ) )
ALLOCATE( ppsi_d( npwx ) )
!
ALLOCATE( lagrange_d( nbnd ) )
ALLOCATE( lagrange ( nbnd ) )
ALLOCATE( e ( nbnd ) )
ALLOCATE( psi_aux ( nbnd ) )
!
! Sync eigenvalues that will remain on the Host
e(1:nbnd) = e_d(1:nbnd)
!print *, 'init ', e(1:nbnd)
!
avg_iter = 0.D0
notconv = 0
moved = 0
!
! ... every eigenfunction is calculated separately
!
DO m = 1, nbnd
IF ( btype(m) == 1 ) THEN
!
ethr_m = ethr
!
ELSE
!
ethr_m = empty_ethr
!
END IF
!
! ... calculate S|psi>
!
CALL s_1psi_gpu( npwx, npw, psi_d(1,m), spsi_d )
!
! ... orthogonalize starting eigenfunction to those already calculated
!
call divide(inter_bgrp_comm,m,m_start,m_end); !write(*,*) m,m_start,m_end
lagrange = 0.d0
if(m_start.le.m_end) &
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi_d(1,m_start), npwx2, spsi_d, 1, 0.D0, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
!print *, 'lagrange ', lagrange(1:m)
CALL mp_sum( lagrange( 1:m ), inter_bgrp_comm )
IF ( gstart == 2 ) THEN
psi_aux(1:m) = psi_d(1,1:m)
spsi1 = spsi_d(1)
lagrange(1:m) = lagrange(1:m) - psi_aux(1:m) * spsi1
END IF
!
CALL mp_sum( lagrange( 1:m ), intra_bgrp_comm )
!
psi_norm = lagrange(m)
lagrange_d(1:m) = lagrange(1:m)
!
DO j = 1, m - 1
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npwx
psi_d(i,m) = psi_d(i,m) - lagrange_d(j) * psi_d(i,j)
END DO
!
!print *, 'psi_norm ', j, psi_norm
psi_norm = psi_norm - lagrange(j)**2
!
END DO
!
psi_norm = SQRT( psi_norm )
!print *, 'psi_norm 178', psi_norm
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npwx
psi_d(i,m) = psi_d(i,m) / psi_norm
! ... set Im[ psi(G=0) ] - needed for numerical stability
IF (i == 1) THEN
IF ( gstart == 2 ) psi_d(1,m) = CMPLX( DBLE(psi_d(1,m)), 0.D0 ,kind=DP)
END IF
END DO
!
! ... calculate starting gradient (|hpsi> = H|psi>) ...
!
CALL hs_1psi_gpu( npwx, npw, psi_d(1,m), hpsi_d, spsi_d )
!
! ... and starting eigenvalue (e = <y|PHP|y> = <psi|H|psi>)
!
! ... NB: ddot(2*npw,a,1,b,1) = DBLE( zdotc(npw,a,1,b,1) )
!
e(m) = 2.D0 * ksDdot( npw2, psi_d(1,m), 1, hpsi_d, 1 )
!print *, 'e(m)', e(m)
IF ( gstart == 2 ) THEN
psi1 = psi_d(1,m)
hpsi1 = hpsi_d(1)
e(m) = e(m) - psi1 * hpsi1
END IF
!
CALL mp_sum( e(m), intra_bgrp_comm )
!print *, 'before iterate', psi1, hpsi1, spsi1, e(1:nbnd)
!
! ... start iteration for this band
!
iterate: DO iter = 1, maxter
!
! ... calculate P (PHP)|y>
! ... ( P = preconditioning matrix, assumed diagonal )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, npw
g_d(i) = hpsi_d(i) / precondition_d(i)
ppsi_d(i) = spsi_d(i) / precondition_d(i)
END DO
!
! ... ppsi is now S P^2|y> ( = S P^2|psi> )
!
es(1) = 2.D0 * ksDdot( npw2, spsi_d(1), 1, g_d(1), 1 )
es(2) = 2.D0 * ksDdot( npw2, spsi_d(1), 1, ppsi_d(1), 1 )
!
IF ( gstart == 2 ) THEN
!
g1 = g_d(1); ppsi1 = ppsi_d(1); spsi1 = spsi_d(1)
!
es(1) = es(1) - spsi1 * g1
es(2) = es(2) - spsi1 * ppsi1
!
END IF
!
CALL mp_sum( es , intra_bgrp_comm )
!
es(1) = es(1) / es(2)
!
es1 = es(1)
!$cuf kernel do
DO i=1, npwx
g_d(i) = g_d(i) - es1 * ppsi_d(i)
END DO
!
! ... e1 = <y| S P^2 PHP|y> / <y| S S P^2|y> ensures that
! ... <g| S P^2|y> = 0
!
! ... orthogonalize to lowest eigenfunctions (already calculated)
!
! ... scg is used as workspace
!
CALL s_1psi_gpu( npwx, npw, g_d(1), scg_d(1) )
!
lagrange(1:m-1) = 0.d0
call divide(inter_bgrp_comm,m-1,m_start,m_end); !write(*,*) m-1,m_start,m_end
if(m_start.le.m_end) &
CALL cgcudaDGEMV( 'T', npw2, m_end-m_start+1, 2.D0, psi_d(1,m_start), npwx2, scg_d, 1, 0.D0, lagrange_d(m_start), 1 )
if(m_start.le.m_end) lagrange( m_start:m_end ) = lagrange_d( m_start:m_end )
CALL mp_sum( lagrange( 1:m-1 ), inter_bgrp_comm )
IF ( gstart == 2 ) THEN
psi_aux(1:m-1) = psi_d(1,1:m-1)
scg1 = scg_d(1)
lagrange(1:m-1) = lagrange(1:m-1) - psi_aux(1:m-1) * scg1
END IF
!
CALL mp_sum( lagrange( 1:m-1 ), intra_bgrp_comm )
!
DO j = 1, ( m - 1 )
!
aux = lagrange(j)
!$cuf kernel do(1)
DO i = 1, npwx
g_d(i) = g_d(i) - aux * psi_d(i,j)
scg_d(i) = scg_d(i) - aux * psi_d(i,j)
END DO
!
END DO
!
IF ( iter /= 1 ) THEN
!
! ... gg1 is <g(n+1)|S|g(n)> (used in Polak-Ribiere formula)
!
gg1 = 2.D0 * ksDdot( npw2, g_d(1), 1, g0_d(1), 1 )
IF ( gstart == 2 ) THEN
g1 = g_d(1) ; g01 = g0_d(1)
gg1 = gg1 - g1 * g01
END IF
!
CALL mp_sum( gg1 , intra_bgrp_comm )
!
END IF
!
! ... gg is <g(n+1)|S|g(n+1)>
!
!$cuf kernel do
do i=1, npwx
g0_d(i) = scg_d(i)
end do
!
!$cuf kernel do
do i=1, npw
g0_d(i) = g0_d(i) * precondition_d(i)
end do
!
gg = 2.D0 * ksDdot( npw2, g_d(1), 1, g0_d(1), 1 )
IF ( gstart == 2 ) THEN
g1 = g_d(1) ; g01 = g0_d(1)
gg = gg - g1*g01
END IF
!
CALL mp_sum( gg , intra_bgrp_comm )
!
IF ( iter == 1 ) THEN
!
! ... starting iteration, the conjugate gradient |cg> = |g>
!
gg0 = gg
!
!$cuf kernel do
DO i=1, npwx
cg_d(i) = g_d(i)
! ... |cg> contains now the conjugate gradient
! ... set Im[ cg(G=0) ] - needed for numerical stability
IF ( gstart == 2 .and. i == 1 ) cg_d(1) = CMPLX( DBLE(cg_d(1)), 0.D0 ,kind=DP)
END DO
!
ELSE
!
! ... |cg(n+1)> = |g(n+1)> + gamma(n) * |cg(n)>
!
! ... Polak-Ribiere formula :
!
gamma = ( gg - gg1 ) / gg0
gg0 = gg
!
!$cuf kernel do
do i=1, npwx
cg_d(i) = g_d(i) + cg_d(i) * gamma
end do
!
! ... The following is needed because <y(n+1)| S P^2 |cg(n+1)>
! ... is not 0. In fact :
! ... <y(n+1)| S P^2 |cg(n)> = sin(theta)*<cg(n)|S|cg(n)>
!
psi_norm = gamma * cg0 * sint
!
!$cuf kernel do
do i=1, npwx
cg_d(i) = cg_d(i) - psi_norm * psi_d(i,m)
! ... |cg> contains now the conjugate gradient
! ... set Im[ cg(G=0) ] - needed for numerical stability
IF ( gstart == 2 .and. i == 1 ) cg_d(1) = CMPLX( DBLE(cg_d(1)), 0.D0 ,kind=DP)
end do
!
END IF
!
! ... |scg> is S|cg>
!
CALL hs_1psi_gpu( npwx, npw, cg_d(1), ppsi_d(1), scg_d(1) )
!
cg0 = 2.D0 * ksDdot( npw2, cg_d(1), 1, scg_d(1), 1 )
IF ( gstart == 2 ) THEN
cg1 = cg_d(1) ; scg1 = scg_d(1)
cg0 = cg0 - cg1*scg1
END IF
!
CALL mp_sum( cg0 , intra_bgrp_comm )
!
cg0 = SQRT( cg0 )
!
! ... |ppsi> contains now HP|cg>
! ... minimize <y(t)|PHP|y(t)> , where :
! ... |y(t)> = cos(t)|y> + sin(t)/cg0 |cg>
! ... Note that <y|P^2S|y> = 1, <y|P^2S|cg> = 0 ,
! ... <cg|P^2S|cg> = cg0^2
! ... so that the result is correctly normalized :
! ... <y(t)|P^2S|y(t)> = 1
!
a0 = 4.D0 * ksDdot( npw2, psi_d(1,m), 1, ppsi_d(1), 1 )
IF ( gstart == 2 ) THEN
psi1 = psi_d(1,m)
ppsi1 = ppsi_d(1)
a0 = a0 - 2.D0 * psi1 * ppsi1
END IF
!
a0 = a0 / cg0
!
CALL mp_sum( a0 , intra_bgrp_comm )
!
b0 = 2.D0 * ksDdot( npw2, cg_d(1), 1, ppsi_d(1), 1 )
IF ( gstart == 2 ) THEN
cg1 = cg_d(1)
ppsi1 = ppsi_d(1)
b0 = b0 - cg1 * ppsi1
END IF
!
b0 = b0 / cg0**2
!
CALL mp_sum( b0 , intra_bgrp_comm )
!
e0 = e(m)
!
theta = 0.5D0 * ATAN( a0 / ( e0 - b0 ) )
!
cost = COS( theta )
sint = SIN( theta )
!
cos2t = cost*cost - sint*sint
sin2t = 2.D0*cost*sint
!
es(1) = 0.5D0 * ( ( e0 - b0 ) * cos2t + a0 * sin2t + e0 + b0 )
es(2) = 0.5D0 * ( - ( e0 - b0 ) * cos2t - a0 * sin2t + e0 + b0 )
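! ... (same line minimization as in ccgdiagg_gpu: dE/dt = 0 gives
! ... tan(2*theta) = a0/(e0-b0), with es(1), es(2) the stationary values)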
!
! ... there are two possible solutions, choose the minimum
!
IF ( es(2) < es(1) ) THEN
!
theta = theta + 0.5D0 * pi
!
cost = COS( theta )
sint = SIN( theta )
!
END IF
!
! ... new estimate of the eigenvalue
!
e(m) = MIN( es(1), es(2) )
!
! ... upgrade |psi>
!
!$cuf kernel do
do i=1, npwx
psi_d(i,m) = cost * psi_d(i,m) + sint / cg0 * cg_d(i)
end do
!
! ... here one could test convergence on the energy
!
IF ( ABS( e(m) - e0 ) < ethr_m ) EXIT iterate
!
! ... upgrade H|psi> and S|psi>
!
!$cuf kernel do
do i=1, npwx
spsi_d(i) = cost * spsi_d(i) + sint / cg0 * scg_d(i)
!
hpsi_d(i) = cost * hpsi_d(i) + sint / cg0 * ppsi_d(i)
end do
!
END DO iterate
!
#if defined(__VERBOSE)
IF ( iter >= maxter ) THEN
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (not converged after ",i3,&
& " iterations)")') m, e(m)*13.6058, iter
ELSE
WRITE(stdout,'("e(",i4,") = ",f12.6," eV (",i3," iterations)")') &
m, e(m)*13.6058, iter
END IF
FLUSH (stdout)
#endif
IF ( iter >= maxter ) notconv = notconv + 1
!
avg_iter = avg_iter + iter + 1
!
! ... reorder eigenvalues if they are not in the right order
! ... ( this CAN and WILL happen in not-so-special cases )
!
IF ( m > 1 .AND. reorder ) THEN
!
IF ( e(m) - e(m-1) < - 2.D0 * ethr_m ) THEN
!
! ... if the last calculated eigenvalue is not the largest...
!
DO i = m - 2, 1, - 1
!
IF ( e(m) - e(i) > 2.D0 * ethr_m ) EXIT
!
END DO
!
i = i + 1
!
moved = moved + 1
!
! ... last calculated eigenvalue should be in the
! ... i-th position: reorder
!
e0 = e(m)
!
!$cuf kernel do
do l=1, npwx
ppsi_d(l) = psi_d(l,m)
end do
!
DO j = m, i + 1, - 1
!
e(j) = e(j-1)
!
!$cuf kernel do
do l=1, npwx
psi_d(l,j) = psi_d(l,j-1)
end do
!
END DO
!
e(i) = e0
!
!$cuf kernel do
do l=1, npwx
psi_d(l,i) = ppsi_d(l)
end do
!
! ... this procedure should be good if only a few inversions occur,
! ... extremely inefficient if eigenvectors are often in bad order
! ... ( but this should not happen )
!
END IF
!
END IF
!
END DO
!
avg_iter = avg_iter / DBLE( nbnd )
e_d(1:nbnd) = e(1:nbnd)
!
DEALLOCATE( lagrange )
DEALLOCATE( lagrange_d )
DEALLOCATE( e )
DEALLOCATE( psi_aux )
DEALLOCATE( ppsi_d )
DEALLOCATE( g0_d )
DEALLOCATE( cg_d )
DEALLOCATE( g_d )
DEALLOCATE( hpsi_d )
DEALLOCATE( scg_d )
DEALLOCATE( spsi_d )
!
CALL stop_clock( 'rcgdiagg' )
!
RETURN
!
END SUBROUTINE rcgdiagg_gpu

View File

@ -11,6 +11,8 @@ rotate_HSpsi_k.o \
rotate_wfc_gamma.o \
rotate_wfc_k.o
-include make.gpu
all : libdense.a

19
KS_Solvers/DENSE/make.gpu Normal file
View File

@ -0,0 +1,19 @@
# Makefile for DENSE GPU
DENSE += \
rotate_wfc_k_gpu.o \
rotate_wfc_gamma_gpu.o \
rotate_HSpsi_k_gpu.o \
rotate_HSpsi_gamma_gpu.o
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/mp.o
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_HSpsi_gamma_gpu.o : ../../UtilXlib/util_param.o
rotate_wfc_gamma_gpu.o : ../../LAXlib/la_module.o
rotate_wfc_gamma_gpu.o : ../../UtilXlib/mp.o
rotate_wfc_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_wfc_gamma_gpu.o : ../../UtilXlib/util_param.o
rotate_wfc_k_gpu.o : ../../LAXlib/la_module.o
rotate_wfc_k_gpu.o : ../../UtilXlib/mp.o
rotate_wfc_k_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_wfc_k_gpu.o : ../../UtilXlib/util_param.o

222
KS_Solvers/DENSE/rotate_HSpsi_gamma_gpu.f90 Normal file
View File

@ -0,0 +1,222 @@
!
! Copyright (C) 2003-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
#define ZERO ( 0.D0, 0.D0 )
!----------------------------------------------------------------------------
SUBROUTINE rotate_HSpsi_gamma_gpu( npwx, npw, nstart, nbnd, psi_d, hpsi_d, overlap, spsi_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for Gamma-only calculations
! ... This version assumes real wavefunctions (k=0) with only
! ... half plane waves stored: psi(-G)=psi*(G), except G=0
!
! GPU version by Ivan Carnimeo
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum, mp_barrier, mp_allgather, mp_type_create_column_section, mp_type_free
!
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: &
npw, & ! dimension of the matrices (psi,Hpsi,Spsi) to be rotated
npwx, & ! leading dimension of the wavefunction-related matrices
nstart, & ! input number of states
nbnd ! output number of states
LOGICAL, INTENT(IN) :: overlap ! if .FALSE. : spsi is not needed (and not used)
REAL(DP), INTENT(OUT) :: e_d(nbnd) ! eigenvalues of the reduced H matrix
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nstart), hpsi_d(npwx,nstart) ! input and output psi, Hpsi,
COMPLEX(DP), INTENT(INOUT), OPTIONAL :: spsi_d(npwx,nstart) ! ... and optionally Spsi
!
! ... local variables
!
INTEGER :: kdim, kdmx
INTEGER :: n_start, n_end, my_n, recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj ! indexes for cuf kernel loops
!
! ... device variables
!
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
REAL(DP), ALLOCATABLE :: hh_d(:,:), ss_d(:,:), vv_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined (__CUDA)
attributes (device) :: psi_d, hpsi_d, spsi_d, e_d
attributes (device) :: aux_d, hh_d, ss_d, vv_d, en_d
#endif
!
IF ( gstart == -1 ) CALL errore( 'rotHSw', 'gstart variable not initialized', 1 )
IF ( overlap .AND..NOT.present(spsi_d) ) call errore( 'rotHSw','spsi array needed with overlap=.TRUE.',1)
!
call start_clock('rotHSw'); !write(*,*) 'start rotHSw' ; FLUSH(6)
!
! ... set Im[ psi(G=0) ] etc - needed for numerical stability
!
IF ( gstart == 2 ) then
!$cuf kernel do(1)
DO ii = 1, nstart
psi_d (1,ii) = CMPLX( DBLE( psi_d (1,ii) ), 0.D0,kind=DP)
hpsi_d(1,ii) = CMPLX( DBLE( hpsi_d(1,ii) ), 0.D0,kind=DP)
END DO
IF (overlap) THEN
!$cuf kernel do(1)
DO ii = 1, nstart
spsi_d(1,ii) = CMPLX( DBLE( spsi_d(1,ii) ), 0.D0,kind=DP)
END DO
END IF
END IF
kdim = 2 * npw
kdmx = 2 * npwx
!
ALLOCATE( hh_d( nstart, nstart ) )
ALLOCATE( ss_d( nstart, nstart ) )
ALLOCATE( vv_d( nstart, nstart ) )
ALLOCATE( en_d( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
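! ... (Rayleigh-Ritz sketch: build the nstart x nstart subspace matrices
! ... hh = psi^T H psi and ss = psi^T S psi, solve the generalized
! ... eigenproblem hh v = e ss v with diaghg, then rotate
! ... psi/hpsi/spsi <- psi/hpsi/spsi * v to keep the nbnd lowest states;
! ... column blocks are distributed over band groups and recollected
! ... with mp_allgather)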
!
call start_clock('rotHSw:hc'); !write(*,*) 'start rotHSw:hc' ; FLUSH(6)
CALL mp_type_create_column_section(hh_d(1,1), 0, nstart, nstart, column_type)
CALL divide_all(inter_bgrp_comm,nstart,n_start,n_end,recv_counts,displs)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, hpsi_d(1,n_start),kdmx, 0.D0, hh_d(1,n_start),nstart )
IF ( gstart == 2 ) call gpu_DGER( nstart, my_n, -1.D0, psi_d,kdmx, hpsi_d(1,n_start),kdmx, hh_d(1,n_start),nstart )
call start_clock('rotHSw:hc:s1')
CALL mp_sum( hh_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s1')
! call start_clock('rotHSw:hc:b1'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b1')
call start_clock('rotHSw:hc:s2')
CALL mp_allgather(hh_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s2')
!
IF ( overlap ) THEN
!
if (n_start .le. n_end) &
CALL gpu_DGEMM('T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, spsi_d(1,n_start),kdmx, 0.D0, ss_d(1,n_start),nstart)
IF ( gstart == 2 ) CALL gpu_DGER(nstart, my_n, -1.D0, psi_d,kdmx, spsi_d(1,n_start),kdmx, ss_d(1,n_start),nstart)
!
ELSE
!
if (n_start .le. n_end) &
CALL gpu_DGEMM('T','N', nstart,my_n,kdim, 2.D0, psi_d,kdmx, psi_d(1,n_start),kdmx, 0.D0, ss_d(1,n_start),nstart)
IF ( gstart == 2 ) CALL gpu_DGER(nstart, my_n, -1.D0, psi_d,kdmx, psi_d(1,n_start),kdmx, ss_d(1,n_start),nstart)
!
END IF
call start_clock('rotHSw:hc:s3')
CALL mp_sum( ss_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s3')
! call start_clock('rotHSw:hc:b2'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b2')
call start_clock('rotHSw:hc:s4')
CALL mp_allgather(ss_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s4')
CALL mp_type_free( column_type )
call stop_clock('rotHSw:hc'); !write(*,*) 'stop rotHSw:hc' ; FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotHSw:diag'); !write(*,*) 'start rotHSw:diag' ; FLUSH(6)
CALL diaghg( nstart, nbnd, hh_d, ss_d, nstart, en_d, vv_d, me_bgrp, root_bgrp, intra_bgrp_comm )
!$cuf kernel do(1)
DO ii = 1, nbnd
e_d(ii) = en_d(ii)
END DO
call stop_clock('rotHSw:diag'); !write(*,*) 'stop rotHSw:diag' ; FLUSH(6)
!
! ... update the basis set
!
call start_clock('rotHSw:evc'); !write(*,*) 'start rotHSw:evc' ; FLUSH(6)
CALL mp_type_create_column_section(psi_d(1,1), 0, npwx, npwx, column_type)
CALL divide_all(inter_bgrp_comm,nbnd,n_start,n_end,recv_counts,displs)
ALLOCATE( aux_d ( npwx, nbnd ) )
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0, psi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = n_start, n_end
psi_d(ii,jj) = aux_d(ii,jj)
END DO
END DO
! call start_clock('rotHSw:ev:b3'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b3')
call start_clock('rotHSw:ev:s5')
CALL mp_allgather(psi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s5')
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0,hpsi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
!$cuf kernel do (2)
DO ii = 1, npwx
DO jj = n_start, n_end
hpsi_d(ii,jj) = aux_d(ii,jj)
END DO
END DO
! call start_clock('rotHSw:ev:b4'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b4')
call start_clock('rotHSw:ev:s6')
CALL mp_allgather(hpsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s6')
IF (overlap) THEN
if (n_start .le. n_end) &
CALL gpu_DGEMM( 'N','N', kdim,my_n,nstart, 1.D0,spsi_d,kdmx,vv_d(1,n_start),nstart, 0.D0, aux_d(1,n_start),kdmx )
!$cuf kernel do (2)
DO ii = 1, npwx
DO jj = n_start, n_end
spsi_d(ii,jj) = aux_d(ii,jj)
END DO
END DO
! call start_clock('rotHSw:ev:b5'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b5')
call start_clock('rotHSw:ev:s7')
CALL mp_allgather(spsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s7')
ELSE IF (present(spsi_d)) THEN
!$cuf kernel do (2)
DO ii = 1, npwx
DO jj = 1, nbnd
spsi_d(ii,jj) = psi_d(ii,jj)
END DO
END DO
END IF
DEALLOCATE( aux_d )
CALL mp_type_free( column_type )
call stop_clock('rotHSw:evc'); !write(*,*) 'stop rotHSw:evc' ; FLUSH(6)
!
DEALLOCATE( en_d )
DEALLOCATE( vv_d )
DEALLOCATE( ss_d )
DEALLOCATE( hh_d )
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
!call print_clock('rotHSw')
!call print_clock('rotHSw:hc')
!call print_clock('rotHSw:diag')
!call print_clock('rotHSw:evc')
!
RETURN
!
END SUBROUTINE rotate_HSpsi_gamma_gpu

286
KS_Solvers/DENSE/rotate_HSpsi_k_gpu.f90 Normal file
View File

@ -0,0 +1,286 @@
!
! Copyright (C) 2003-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
#define ZERO ( 0.D0, 0.D0 )
!----------------------------------------------------------------------------
SUBROUTINE rotate_HSpsi_k_gpu( npwx, npw, nstart, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for colinear, k-point calculations
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp, ONLY : mp_sum, mp_barrier, mp_allgather, mp_type_create_column_section, mp_type_free
USE device_memcpy_m, ONLY: dev_memcpy, dev_memset
!
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: &
npw, & ! dimension of the matrices (psi,Hpsi,Spsi) to be rotated
npwx, & ! leading dimension of the wavefunction-related matrices
nstart, & ! input number of states
nbnd, & ! output number of states
npol ! number of spin polarizations
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart), hpsi_d(npwx*npol,nstart) ! input and output psi, Hpsi,
COMPLEX(DP), INTENT(INOUT), OPTIONAL :: spsi_d(npwx*npol,nstart) ! ... and optionally Spsi
LOGICAL, INTENT(IN) :: overlap ! if .FALSE. : spsi is not needed (and not used)
REAL(DP), INTENT(OUT) :: e_d(nbnd) ! eigenvalues of the reduced H matrix
!
! ... local variables
!
INTEGER :: kdim, kdmx
INTEGER :: n_start, n_end, my_n, recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj ! indexes for cuf kernel loops
!
! ... device variables
!
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
COMPLEX(DP), ALLOCATABLE :: hh_d(:,:), ss_d(:,:), vv_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined (__CUDA)
attributes(device) :: aux_d, psi_d, hpsi_d, spsi_d
attributes(device) :: hh_d, ss_d, vv_d, en_d, e_d
#endif
!
IF ( overlap .AND..NOT.present(spsi_d) ) call errore( 'rotHSw','spsi_d array needed with overlap=.TRUE.',1)
!
call start_clock('rotHSw'); !write(*,*) 'start rotHSw' ; FLUSH(6)
!
if (npol == 2 .and. npw < npwx ) then ! pack wfcs so that pw's are contiguous
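! (the two spinor components of each column sit at offsets 0 and npwx;
! moving the second block to offset npw makes the kdim = npol*npw
! elements seen by the ZGEMMs below contiguous)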
call start_clock('rotHSw:move'); !write(*,*) 'start rotHSw:move' ; FLUSH(6)
ALLOCATE ( aux_d ( npwx, nstart ) )
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
aux_d(ii,jj) = psi_d (npwx+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
psi_d (npw+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
aux_d(ii,jj) = hpsi_d(npwx+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
hpsi_d(npw+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
aux_d(ii,jj) = spsi_d(npwx+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nstart
spsi_d(npw+ii,jj) = aux_d(ii,jj)
END DO
END DO
DEALLOCATE( aux_d )
call stop_clock('rotHSw:move'); !write(*,*) 'stop rotHSw:move' ; FLUSH(6)
end if
kdim = npol * npw
kdmx = npol * npwx
!
ALLOCATE( hh_d( nstart, nstart ) )
ALLOCATE( ss_d( nstart, nstart ) )
ALLOCATE( vv_d( nstart, nstart ) )
ALLOCATE( en_d( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
call start_clock('rotHSw:hc'); !write(*,*) 'start rotHSw:hc' ; FLUSH(6)
CALL mp_type_create_column_section(hh_d(1,1), 0, nstart, nstart, column_type)
CALL divide_all(inter_bgrp_comm,nstart,n_start,n_end,recv_counts,displs)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, hpsi_d(1,n_start), kdmx, (0.D0,0.D0), &
hh_d(1,n_start), nstart )
call start_clock('rotHSw:hc:s1')
CALL mp_sum( hh_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s1')
!call start_clock('rotHSw:hc:b1'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b1')
call start_clock('rotHSw:hc:s2')
CALL mp_allgather(hh_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s2')
!
IF ( overlap ) THEN
!
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, spsi_d(1,n_start), kdmx, &
(0.D0,0.D0), ss_d(1,n_start), nstart )
!
ELSE
!
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, kdmx, psi_d(1,n_start), kdmx, &
(0.D0,0.D0), ss_d(1,n_start), nstart )
!
END IF
call start_clock('rotHSw:hc:s3')
CALL mp_sum( ss_d(:,n_start:n_end), intra_bgrp_comm ) ! this section only needs to be collected inside bgrp
call stop_clock('rotHSw:hc:s3')
!call start_clock('rotHSw:hc:b2'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:hc:b2')
call start_clock('rotHSw:hc:s4')
CALL mp_allgather(ss_d, column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:hc:s4')
CALL mp_type_free( column_type )
call stop_clock('rotHSw:hc'); !write(*,*) 'stop rotHSw:hc' ; FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotHSw:diag'); !write(*,*) 'start rotHSw:diag' ; FLUSH(6)
CALL diaghg( nstart, nbnd, hh_d, ss_d, nstart, en_d, vv_d, me_bgrp, root_bgrp, intra_bgrp_comm )
CALL dev_memcpy(e_d, en_d, [1,nbnd])
call stop_clock('rotHSw:diag'); !write(*,*) 'stop rotHSw:diag' ; FLUSH(6)
!
! ... update the basis set
!
call start_clock('rotHSw:evc'); !write(*,*) 'start rotHSw:evc' ; FLUSH(6)
CALL mp_type_create_column_section(psi_d(1,1), 0, npwx, npwx, column_type)
CALL divide_all(inter_bgrp_comm,nbnd,n_start,n_end,recv_counts,displs)
ALLOCATE( aux_d ( kdmx, nbnd ) )
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), psi_d, kdmx, vv_d(1,n_start), nstart, &
(0.D0,0.D0), aux_d(1,n_start), kdmx )
CALL dev_memcpy(psi_d, aux_d, [1, kdmx], 1, [n_start,n_end])
!call start_clock('rotHSw:ev:b3'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b3')
call start_clock('rotHSw:ev:s5')
CALL mp_allgather(psi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s5')
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), hpsi_d, kdmx, vv_d(1,n_start), nstart, &
(0.D0,0.D0), aux_d(1,n_start), kdmx )
CALL dev_memcpy(hpsi_d, aux_d, [1, kdmx], 1, [n_start,n_end]) !call start_clock('rotHSw:ev:b4'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b4')
call start_clock('rotHSw:ev:s6')
CALL mp_allgather(hpsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s6')
IF (overlap) THEN
if (n_start .le. n_end) &
CALL gpu_ZGEMM( 'N','N', kdim, my_n, nstart, (1.D0,0.D0), spsi_d, kdmx, vv_d(1,n_start), &
nstart, (0.D0,0.D0), aux_d(1,n_start), kdmx )
CALL dev_memcpy(spsi_d, aux_d, [1, kdmx], 1, [n_start,n_end]) !call start_clock('rotHSw:ev:b5'); CALL mp_barrier( inter_bgrp_comm ); call stop_clock('rotHSw:ev:b5')
call start_clock('rotHSw:ev:s7')
CALL mp_allgather(spsi_d(:,1:nbnd), column_type, recv_counts, displs, inter_bgrp_comm)
call stop_clock('rotHSw:ev:s7')
ELSE IF (present(spsi_d)) THEN
CALL dev_memcpy(spsi_d, psi_d, [1, kdmx], 1, [n_start,n_end])
END IF
DEALLOCATE( aux_d )
CALL mp_type_free( column_type )
call stop_clock('rotHSw:evc'); !write(*,*) 'stop rotHSw:evc' ; FLUSH(6)
!
DEALLOCATE( vv_d )
DEALLOCATE( ss_d )
DEALLOCATE( hh_d )
DEALLOCATE( en_d )
!
call stop_clock('rotHSw'); !write(*,*) 'stop rotHSw' ; FLUSH(6)
!call print_clock('rotHSw')
!call print_clock('rotHSw:hc')
!call print_clock('rotHSw:diag')
!call print_clock('rotHSw:evc')
!
if (npol== 2 .and. npw < npwx ) then ! unpack wfcs to its original order
call start_clock('rotHSw:move'); !write(*,*) 'start rotHSw:move' ; FLUSH(6)
ALLOCATE ( aux_d ( npwx, nbnd ) )
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
aux_d(ii,jj) = psi_d (npw+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
psi_d(npwx+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
aux_d(ii,jj) = hpsi_d(npw+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
hpsi_d(npwx+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
aux_d(ii,jj) = spsi_d(npw+ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = 1, npw
DO jj = 1, nbnd
spsi_d(npwx+ii,jj) = aux_d(ii,jj)
END DO
END DO
!$cuf kernel do(2)
DO ii = npw+1, npwx
DO jj = 1, nbnd
psi_d(ii,jj) = ZERO
END DO
END DO
!$cuf kernel do(2)
DO ii = npw+1, npwx
DO jj = 1, nbnd
hpsi_d(ii,jj) = ZERO
END DO
END DO
!$cuf kernel do(2)
DO ii = npw+1, npwx
DO jj = 1, nbnd
spsi_d(ii,jj) = ZERO
END DO
END DO
DEALLOCATE( aux_d )
call stop_clock('rotHSw:move'); !write(*,*) 'stop rotHSw:move' ; FLUSH(6)
end if
!
RETURN
!
END SUBROUTINE rotate_HSpsi_k_gpu

198
KS_Solvers/DENSE/rotate_wfc_gamma_gpu.f90 Normal file
View File

@ -0,0 +1,198 @@
!
! Copyright (C) 2003-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_wfc_gamma_gpu( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, psi_d, evc_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for Gamma-only calculations
! ... This version assumes real wavefunctions (k=0) with only
! ... half plane waves stored: psi_d(-G)=psi_d*(G), except G=0
!
USE LAXlib, ONLY : diaghg
#if defined(__CUDA)
USE cudafor
USE cublas
#else
#define cublasDGEMM dgemm
#endif
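! (without CUDA the cublasDGEMM calls below resolve to plain BLAS dgemm
! through the preprocessor macro above, so the same source serves both builds)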
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, &
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
!INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER :: npw, npwx, nstart, nbnd
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi_d, as declared in the calling pgm unit
! input number of states
! output number of states
LOGICAL :: overlap
! if .FALSE. : S|psi_d> not needed
COMPLEX(DP) :: psi_d(npwx,nstart), evc_d(npwx,nbnd)
! input and output eigenvectors (may overlap)
REAL(DP) :: e_d(nbnd)
! eigenvalues
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, evc_d, e_d
#endif
!
! ... local variables
!
INTEGER :: npw2, npwx2
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
REAL(DP), ALLOCATABLE :: hr_d(:,:), sr_d(:,:), vr_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: aux_d, hr_d, sr_d, vr_d, en_d
#endif
INTEGER :: n_start, n_end, my_n, i, j
!
EXTERNAL h_psi_gpu, s_psi_gpu
! h_psi(npwx,npw,nvec,psi_d,hpsi)
! calculates H|psi_d>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi_d> (if needed)
! Vectors psi_d,hpsi,spsi are dimensioned (npwx,npol,nvec)
npw2 = 2 * npw
npwx2 = 2 * npwx
IF ( gstart == -1 ) CALL errore( 'rotwfcg', 'gstart variable not initialized', 1 )
!
ALLOCATE( aux_d( npwx, nstart ) )
ALLOCATE( hr_d( nstart, nstart ) )
ALLOCATE( sr_d( nstart, nstart ) )
ALLOCATE( vr_d( nstart, nstart ) )
ALLOCATE( en_d( nstart ) )
call start_clock('rotwfcg'); !write(*,*) 'start rotwfcg' ; FLUSH(6)
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
! ... set Im[ psi_d(G=0) ] - needed for numerical stability
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO i=1,nstart
psi_d(1,i) = CMPLX( DBLE( psi_d(1,i) ), 0.D0,kind=DP)
END DO
END IF
!
call start_clock('rotwfcg:hpsi'); !write(*,*) 'start rotwfcg:hpsi' ; FLUSH(6)
CALL h_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
call stop_clock('rotwfcg:hpsi'); !write(*,*) 'stop rotwfcg:hpsi' ; FLUSH(6)
!
call start_clock('rotwfcg:hc'); !write(*,*) 'start rotwfcg:hc' ; FLUSH(6)
hr_d=0.D0
CALL divide(inter_bgrp_comm,nstart,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, 0.D0, hr_d(1,n_start), nstart )
IF ( gstart == 2 ) call CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, hr_d(1,n_start), nstart )
CALL mp_sum( hr_d, inter_bgrp_comm )
!
CALL mp_sum( hr_d, intra_bgrp_comm )
!
sr_d=0.D0
IF ( overlap ) THEN
!
CALL s_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
!
if (n_start .le. n_end) &
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
IF ( gstart == 2 ) CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
npwx2, aux_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
!
ELSE
!
if (n_start .le. n_end) &
CALL cublasDGEMM( 'T','N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, psi_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
IF ( gstart == 2 ) CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, &
npwx2, psi_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
!
END IF
CALL mp_sum( sr_d, inter_bgrp_comm )
!
CALL mp_sum( sr_d, intra_bgrp_comm )
call stop_clock('rotwfcg:hc'); !write(*,*) 'stop rotwfcg:hc' ; FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotwfcg:diag'); !write(*,*) 'start rotwfcg:diag' ; FLUSH(6)
CALL diaghg( nstart, nbnd, hr_d, sr_d, nstart, en_d, vr_d, me_bgrp, root_bgrp, intra_bgrp_comm )
call stop_clock('rotwfcg:diag'); !write(*,*) 'stop rotwfcg:diag' ; FLUSH(6)
call start_clock('rotwfcg:evc_d'); !write(*,*) 'start rotwfcg:evc_d' ; FLUSH(6)
!
!$cuf kernel do(1)
DO i=1, nbnd
e_d(i) = en_d(i)
END DO
!
! ... update the basis set
!
aux_d=(0.D0,0.D0)
if (n_start .le. n_end) &
CALL cublasDGEMM( 'N','N', npw2, nbnd, my_n, 1.D0, psi_d(1,n_start), &
npwx2, vr_d(n_start,1), nstart, 0.D0, aux_d, npwx2 )
CALL mp_sum( aux_d, inter_bgrp_comm )
!
!$cuf kernel do(2)
DO i=1, nbnd
DO j=1, npwx
evc_d(j,i) = aux_d(j,i)
END DO
END DO
call stop_clock('rotwfcg:evc_d'); !write(*,*) 'stop rotwfcg:evc_d' ; FLUSH(6)
!
DEALLOCATE( en_d )
DEALLOCATE( vr_d )
DEALLOCATE( sr_d )
DEALLOCATE( hr_d )
DEALLOCATE( aux_d )
call stop_clock('rotwfcg'); !write(*,*) 'stop rotwfcg' ; FLUSH(6)
!call print_clock('rotwfcg')
!call print_clock('rotwfcg:hpsi')
!call print_clock('rotwfcg:hc')
!call print_clock('rotwfcg:diag')
!call print_clock('rotwfcg:evc_d')
!
RETURN
!
END SUBROUTINE rotate_wfc_gamma_gpu
! In principle this can go away .......
SUBROUTINE CGcudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
END SUBROUTINE CGcudaDGER

170
KS_Solvers/DENSE/rotate_wfc_k_gpu.f90 Normal file
View File

@ -0,0 +1,170 @@
!
! Copyright (C) 2001-2007 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_wfc_k_gpu( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, npol, psi_d, evc_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_wfc for colinear, k-point calculations
!
USE LAXlib
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id,&
nbgrp, my_bgrp_id, me_bgrp, root_bgrp
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
!INCLUDE 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! number of spin polarizations
LOGICAL :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP) :: psi_d(npwx*npol,nstart), evc_d(npwx*npol,nbnd)
! input and output eigenvectors (may overlap)
REAL(DP) :: e_d(nbnd)
! eigenvalues
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, evc_d, e_d
#endif
!
! ... local variables
!
INTEGER :: kdim, kdmx
COMPLEX(DP), ALLOCATABLE :: aux_d(:,:)
COMPLEX(DP), ALLOCATABLE :: hc_d(:,:), sc_d(:,:), vc_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: aux_d, hc_d, sc_d, vc_d, en_d
#endif
INTEGER :: n_start, n_end, my_n, i, j
!
EXTERNAL h_psi_gpu, s_psi_gpu
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx*npol
kdmx = npwx*npol
!
END IF
!
ALLOCATE( aux_d(kdmx, nstart ) )
ALLOCATE( hc_d( nstart, nstart) )
ALLOCATE( sc_d( nstart, nstart) )
ALLOCATE( vc_d( nstart, nstart) )
ALLOCATE( en_d( nstart ) )
call start_clock('rotwfck'); !write(*,*) 'start rotwfck';FLUSH(6)
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
call start_clock('rotwfck:hpsi'); !write(*,*) 'start rotwfck:hpsi';FLUSH(6)
CALL h_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
call stop_clock('rotwfck:hpsi') ; !write(*,*) 'stop rotwfck:hpsi';FLUSH(6)
!
call start_clock('rotwfck:hc'); !write(*,*) 'start rotwfck:hc';FLUSH(6)
hc_d=(0.D0,0.D0)
CALL divide(inter_bgrp_comm,nstart,n_start,n_end)
my_n = n_end - n_start + 1; !write (*,*) nstart,n_start,n_end
if (n_start .le. n_end) &
call ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
kdmx, aux_d(1,n_start), kdmx, (0.D0,0.D0), hc_d(1,n_start), nstart )
CALL mp_sum( hc_d, inter_bgrp_comm )
!
CALL mp_sum( hc_d, intra_bgrp_comm )
!
sc_d=(0.D0,0.D0)
IF ( overlap ) THEN
!
CALL s_psi_gpu( npwx, npw, nstart, psi_d, aux_d )
if (n_start .le. n_end) &
CALL ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
kdmx, aux_d(1,n_start), kdmx, (0.D0,0.D0), sc_d(1,n_start), nstart )
!
ELSE
!
if (n_start .le. n_end) &
CALL ZGEMM( 'C','N', nstart, my_n, kdim, (1.D0,0.D0), psi_d, &
kdmx, psi_d(1,n_start), kdmx, (0.D0,0.D0), sc_d(1,n_start), nstart )
!
END IF
CALL mp_sum( sc_d, inter_bgrp_comm )
!
CALL mp_sum( sc_d, intra_bgrp_comm )
call stop_clock('rotwfck:hc'); !write(*,*) 'stop rotwfck:hc';FLUSH(6)
!
! ... Diagonalize
!
call start_clock('rotwfck:diag'); !write(*,*) 'start rotwfck:diag';FLUSH(6)
!! cdiaghg on GPU. See interface from LAXlib module
CALL diaghg( nstart, nbnd, hc_d, sc_d, nstart, en_d, vc_d, me_bgrp, root_bgrp, intra_bgrp_comm )
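  ! diaghg solves the generalized eigenproblem hc_d x = e sc_d x on the
  ! nstart-dimensional subspace and returns the nbnd lowest eigenvalues in en_d
  ! and the corresponding eigenvectors in vc_d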
call stop_clock('rotwfck:diag'); !write(*,*) 'stop rotwfck:diag';FLUSH(6)
call start_clock('rotwfck:evc'); !write(*,*) 'start rotwfck:evc';FLUSH(6)
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1,nbnd
e_d(i) = en_d(i)
END DO
!
! ... update the basis set
!
aux_d=(0.D0,0.D0)
if (n_start .le. n_end) &
CALL ZGEMM( 'N','N', kdim, nbnd, my_n, (1.D0,0.D0), psi_d(1,n_start), &
kdmx, vc_d(n_start,1), nstart, (0.D0,0.D0), aux_d, kdmx )
CALL mp_sum( aux_d, inter_bgrp_comm )
!
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1, nbnd
DO j=1, kdmx
evc_d(j,i) = aux_d(j,i)
END DO
END DO
!
call stop_clock('rotwfck:evc') ! ; write(*,*) 'stop rotwfck;evc';FLUSH(6)
!
DEALLOCATE( en_d )
DEALLOCATE( vc_d )
DEALLOCATE( sc_d )
DEALLOCATE( hc_d )
DEALLOCATE( aux_d )
call stop_clock('rotwfck'); !write(*,*) 'stop rotwfck';FLUSH(6)
!call print_clock('rotwfck')
!call print_clock('rotwfck:hpsi')
!call print_clock('rotwfck:hc')
!call print_clock('rotwfck:diag')
!call print_clock('rotwfck:evc')
!
RETURN
!
END SUBROUTINE rotate_wfc_k_gpu
!

View File

@ -9,6 +9,8 @@ DAVID = \
cegterg.o \
regterg.o
-include make.gpu
all : libdavid.a

File diff suppressed because it is too large

View File

@ -0,0 +1,13 @@
# Makefile for DAVID GPU
DAVID += \
cegterg_gpu.o \
regterg_gpu.o
ifdef PGI_POWER_WORKAROUND
# Circumvents bug in the Power implementation of PGI compilers
F90FLAGS:= -O1 $(filter-out -fast,$(F90FLAGS))
endif
cegterg_gpu.o : ../../UtilXlib/util_param.o
regterg_gpu.o : ../../UtilXlib/util_param.o

View File

@ -22,7 +22,6 @@ SUBROUTINE regterg( h_psi, s_psi, uspp, g_psi, &
! ... where H is a Hermitian operator, e is a real scalar,
! ... S is an uspp matrix, evc is a complex vector
! ... (real wavefunctions with only half plane waves stored)
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, &
nbgrp, my_bgrp_id, me_bgrp, root_bgrp

File diff suppressed because it is too large

View File

@ -32,7 +32,10 @@ DENSE/rotate_HSpsi_gamma.o \
DENSE/rotate_HSpsi_k.o \
DENSE/rotate_wfc_gamma.o \
DENSE/rotate_wfc_k.o
# add here other objects, e.g. NewSolver/*.o
-include make.gpu
# add here other objects, e.g. ParO/*.o NewSolver/*.o
libks_solvers.a: $(ALLOBJS)
$(AR) $(ARFLAGS) $@ $?

View File

@ -6,8 +6,11 @@ include ../../make.inc
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG).
PPCG = \
generic_cublas.o \
ppcg_gamma.o \
ppcg_k.o
ppcg_gamma_gpu.o \
ppcg_k.o \
ppcg_k_gpu.o
all : libppcg.a

View File

@ -0,0 +1,221 @@
subroutine gpu_DGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 transa, transb
integer :: m, n, k, lda, ldb, ldc
DOUBLE PRECISION :: alpha, beta
DOUBLE PRECISION, dimension(lda, *) :: A
DOUBLE PRECISION, dimension(ldb, *) :: B
DOUBLE PRECISION, dimension(ldc, *) :: C
#if defined(__CUDA)
attributes(device) :: A, B, C
call cublasDGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
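! NOTE: when __CUDA is not defined the body above is empty, so this wrapper
! (like the others in this file) is a no-op; no host BLAS fallback is provided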
return
end subroutine gpu_DGEMM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_ZGEMM (transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 transa, transb
integer :: m, n, k, lda, ldb, ldc
DOUBLE COMPLEX :: alpha, beta
DOUBLE COMPLEX, dimension(lda, *) :: A
DOUBLE COMPLEX, dimension(ldb, *) :: B
DOUBLE COMPLEX, dimension(ldc, *) :: C
#if defined(__CUDA)
attributes(device) :: A, B, C
call cublasZGEMM(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
return
end subroutine gpu_ZGEMM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_DGER (m, n, alpha, x, incx, y, incy, a, lda)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: m, n, lda, incx, incy
DOUBLE PRECISION :: alpha
DOUBLE PRECISION, dimension(lda, *) :: A
DOUBLE PRECISION, dimension(*) :: x, y
#if defined(__CUDA)
attributes(device) :: A, x, y
call cublasDGER(m, n, alpha, x, incx, y, incy, a, lda)
#endif
return
end subroutine gpu_DGER
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
function gpu_DDOT (n, dx, incx, dy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
DOUBLE PRECISION :: gpu_DDOT
integer :: n, incx, incy
DOUBLE PRECISION, dimension(*) :: dx, dy
#if defined(__CUDA)
attributes(device) :: dx, dy
gpu_DDOT=cublasDDOT(n, dx, incx, dy, incy)
#endif
return
end function gpu_DDOT
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
subroutine gpu_DTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#if defined(__CUDA)
USE cublas
#endif
implicit none
character*1 :: side, uplo, transa, diag
integer :: m, n, lda, ldb
DOUBLE PRECISION :: alpha
DOUBLE PRECISION, dimension(lda, *) :: a
DOUBLE PRECISION, dimension(ldb, *) :: b
#if defined(__CUDA)
attributes(device) :: a, b
call cublasDTRSM(side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb)
#endif
return
end subroutine gpu_DTRSM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_memset(array, val, length)
!
#if defined(__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: length
REAL(DP), INTENT(OUT) :: array(length)
#if defined(__CUDA)
attributes(device) :: array
#endif
REAL(DP), INTENT(IN) :: val
!
INTEGER :: i
!
IF (length<=0) RETURN
!
!$cuf kernel do(1)
DO i=1, length
array(i) = val
ENDDO
!
END SUBROUTINE gpu_threaded_memset
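! Typical use (see gpu_threaded_assign below): zero a COMPLEX(DP) device buffer
! by viewing it as reals, e.g.
!   CALL gpu_threaded_memset( array_out, 0.d0, 2*kdimx*nact )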
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_assign(array_out, array_in, kdimx, nact, use_idx, idx, bgrp_root_only)
! assign (copy) a complex array in a threaded way
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, 1:nact ) or
! array_out( 1:kdimx, 1:nact ) = array_in( 1:kdimx, idx(1:nact) )
! when use_idx is .true. (the index array idx is then used)
! if bgrp_root_only is .true. the assignment is made only by the
! MPI root process of the bgrp; on the other processes array_out is zeroed
#if defined(__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : root_bgrp_id, nbgrp, my_bgrp_id
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: kdimx, nact
COMPLEX(DP), INTENT(OUT) :: array_out( kdimx, nact )
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, * )
INTEGER, INTENT(IN) :: idx( * )
#if defined(__CUDA)
attributes(device) :: array_out, array_in, idx
#endif
LOGICAL, INTENT(IN) :: bgrp_root_only
LOGICAL, INTENT(IN) :: use_idx
!
INTEGER, PARAMETER :: blocksz = 256
INTEGER :: nblock
INTEGER :: i, j
!
IF (kdimx <=0 .OR. nact<= 0) RETURN
!
IF (bgrp_root_only .AND. ( my_bgrp_id /= root_bgrp_id ) ) THEN
call gpu_threaded_memset( array_out, 0.d0, 2*kdimx*nact ) ! zero the device array, viewed as reals
RETURN
END IF
nblock = (kdimx - 1)/blocksz + 1
IF (use_idx ) THEN
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) )
ENDDO
ENDDO
ELSE
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
ENDDO
ENDDO
END IF
!
END SUBROUTINE gpu_threaded_assign
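! Illustrative call (buffer names are placeholders): gather the nact columns of
! psi_d selected by idx_d into aux_d on every process of the band group:
!   CALL gpu_threaded_assign( aux_d, psi_d, kdimx, nact, .TRUE., idx_d, .FALSE. )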
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2, a2_in )
! assign (copy) a complex array in a threaded way
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) or
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) + a2_in( 1:kdimx, idx(1:nact) )
! when use_a2 is .true.
! (the index array idx is mandatory here; otherwise one could use the previous routine)
#if defined(__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: kdimx, nact
COMPLEX(DP), INTENT(INOUT) :: array_out( kdimx, * ) ! we don't want to mess with unreferenced columns
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, nact )
COMPLEX(DP), INTENT(IN) :: a2_in ( kdimx, * )
INTEGER, INTENT(IN) :: idx( * )
#if defined(__CUDA)
attributes(device) :: array_out, array_in, a2_in, idx
#endif
LOGICAL, INTENT(IN) :: use_a2
!
INTEGER, PARAMETER :: blocksz = 256
INTEGER :: nblock
INTEGER :: i, j
!
IF (kdimx <=0 .OR. nact<= 0) RETURN
!
nblock = (kdimx - 1)/blocksz + 1
IF ( use_a2) THEN
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = &
array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) + &
a2_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) )
ENDDO
ENDDO
ELSE
!$cuf kernel do(2)
DO i=1, nact
DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
ENDDO
ENDDO
END IF
!
END SUBROUTINE gpu_threaded_backassign
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -1,6 +1,14 @@
generic_cublas.o : ../../UtilXlib/mp_bands_util.o
generic_cublas.o : ../../UtilXlib/util_param.o
ppcg_gamma.o : ../../UtilXlib/mp.o
ppcg_gamma.o : ../../UtilXlib/mp_bands_util.o
ppcg_gamma.o : ../../UtilXlib/util_param.o
ppcg_gamma_gpu.o : ../../UtilXlib/mp.o
ppcg_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
ppcg_gamma_gpu.o : ../../UtilXlib/util_param.o
ppcg_k.o : ../../UtilXlib/mp.o
ppcg_k.o : ../../UtilXlib/mp_bands_util.o
ppcg_k.o : ../../UtilXlib/util_param.o
ppcg_k_gpu.o : ../../UtilXlib/mp.o
ppcg_k_gpu.o : ../../UtilXlib/mp_bands_util.o
ppcg_k_gpu.o : ../../UtilXlib/util_param.o

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -13,6 +13,8 @@ pcg_k.o \
bpcg_gamma.o \
bpcg_k.o
-include make.gpu
all : libparo.a
libparo.a: $(PARO)

View File

@ -0,0 +1,404 @@
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
!
! The following file is for solving step 2 of the parallel orbital updating algorithm.
!
! Ivan Carnimeo: GPU version
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
!
!----------------------------------------------------------------------------
SUBROUTINE bpcg_gamma_gpu( hs_psi_gpu, g_1psi_gpu, psi0_d, spsi0_d, npw, npwx, nbnd, nvec, psi_d, hpsi_d, spsi_d, ethr, e_d, nhpsi )
!----------------------------------------------------------------------------
!
! Block Preconditioned Conjugate Gradient solution of the linear system
!
! [ H - e S ]|\tilde\psi> = Pc [ e S - H ] |psi>
!
! the search targets the space orthogonal to the current best wfcs (psi0);
! the solution is sought until the residual norm is a fixed fraction of the RHS norm;
! in this way, the more accurate the original problem is, the more accurately the correction is computed
!
! in order to avoid unnecessary HSpsi evaluations this version assumes psi, hpsi and spsi are all
! provided as input and returns their updated estimates for further use
!
! Ivan Carnimeo: GPU version
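!
! Schematically, each active vector l goes through a standard PCG step below:
!   gamma = <p|(H - e S)|p> ,   alpha = g0 / gamma
!   psi   = psi + alpha p   ,   z = K (b + e S psi - H psi)   (K = g_1psi preconditioner)
!   beta  = (g1 - g2) / g0      (Polak-Ribiere) ,   p = z + beta p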
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm, gstart
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
! The following variables are temporary
COMPLEX(DP), INTENT(IN) :: psi0_d(npwx,nbnd) ! psi0 needed to compute the Pv projection
COMPLEX(DP), INTENT(IN) :: spsi0_d(npwx,nbnd) ! Spsi0 needed to compute the Pv projection
!
INTEGER, INTENT(IN) :: npw, npwx, nbnd, nvec ! input dimensions
REAL(DP), INTENT(IN) :: ethr ! threshold for convergence.
REAL(DP), INTENT(INOUT) :: e_d(nvec) ! current estimate of the target eigenvalues
COMPLEX(DP),INTENT(INOUT) :: psi_d(npwx,nvec),hpsi_d(npwx,nvec),spsi_d(npwx,nvec) !
! input: the current estimate of the wfcs
! output: the estimated correction vectors
INTEGER, INTENT(INOUT) :: nhpsi ! (updated) number of Hpsi evaluations
!
! ... LOCAL variables
!
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
!
REAL(DP), ALLOCATABLE :: g0(:), g1(:), g2(:), gamma(:), ethr_cg(:), ff(:), ff0(:)
INTEGER, ALLOCATABLE :: cg_iter(:)
REAL(DP) :: beta, ee
INTEGER :: npw2, npwx2, i, l, block_size, done, nactive, nnew, newdone
!
! ... DEVICE variables
!
EXTERNAL g_1psi_gpu, hs_psi_gpu
REAL(DP), EXTERNAL :: gpu_DDOT
COMPLEX(DP), ALLOCATABLE :: b_d(:,:), & ! RHS for testing
p_d(:,:), hp_d(:,:), sp_d(:,:), z_d(:,:) ! additional working vectors
REAL(DP), ALLOCATABLE :: spsi0vec_d (:,:) ! the product of spsi0 and a group of vectors
COMPLEX(DP) :: tmp, tmp_d
INTEGER :: ii, jj
REAL(DP), ALLOCATABLE :: alpha(:)
#if defined (__CUDA)
attributes(device) :: psi_d, spsi_d, hpsi_d
attributes(device) :: psi0_d, spsi0_d, spsi0vec_d
attributes(device) :: e_d, b_d, z_d, p_d, hp_d, sp_d
attributes(device) :: tmp_d
#endif
!
!
CALL start_clock( 'pcg' ); !write (6,*) ' enter pcg' , e(1:2) ; FLUSH(6)
!
npw2 = 2*npw
npwx2 = 2*npwx
block_size = min(nvec,64)
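! (64 is a heuristic cap: it bounds the device workspace while still batching
!  enough vectors per H*psi application)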
!
ALLOCATE( g0( block_size ), g1( block_size ), g2( block_size ), gamma( block_size ) )
ALLOCATE( ethr_cg( block_size ), ff( block_size ), ff0( block_size ), cg_iter( block_size ) )
ALLOCATE( z_d( npwx, block_size ), b_d( npwx, block_size ) )
ALLOCATE( p_d(npwx,block_size), hp_d(npwx,block_size), sp_d(npwx,block_size) )
ALLOCATE( spsi0vec_d(nbnd, block_size) )
ALLOCATE( alpha( block_size ) )
!
!
done = 0 ! the number of correction vectors already solved
nactive = 0 ! the number of correction vectors currently being updated
cg_iter = 0 ! how many iterations each active vector has completed (<= maxter)
MAIN_LOOP: & ! This is a continuous loop. It terminates only when nactive vanishes
DO
nnew = min(done+block_size,nvec)-(done+nactive) ! number of new corrections to be added to the search
if ( nnew > 0 ) then ! add nnew vectors to the active list
!write(6,*) ' nnew =', nnew
do l=nactive+1,nactive+nnew
i=l+done
!write(6,*) ' l =',l,' i =',i
!write (6,*) ' enter pcg' , e(i) ; FLUSH(6)
!$cuf kernel do(1)
DO ii = 1, npwx
b_d(ii,l) = e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i) ! initial gradient and saved RHS for later
END DO
!$cuf kernel do(1)
DO ii = 1, npwx
z_d(ii,l) = b_d(ii,l)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i)) ! initial preconditioned gradient
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_DGEMM( 'T','N', nbnd,nnew,npw2, 2.D0, spsi0_d, npwx2, z_d(:,nactive+1), npwx2, 0.D0, spsi0vec_d, nbnd )
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, nnew, -1.D0, spsi0_d, npwx2, z_d(:,nactive+1), npwx2, spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_DGEMM( 'N','N', npw2,nnew,nbnd,-1.D0, psi0_d, npwx2, spsi0vec_d, nbnd, 1.D0, z_d(:,nactive+1), npwx2 )
CALL stop_clock( 'pcg:ortho' )
!-
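! (gamma-point trick: the wavefunctions are real in G-space, so <z|b> is
!  2*Re<z|b> over the stored half sphere, minus the doubly counted G=0 term
!  on the process that holds it, i.e. when gstart == 2; the same convention
!  explains the 2.D0/DGER corrections in the projection above)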
do l=nactive+1,nactive+nnew; i=l+done
g0(l) = 2.D0*gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1)
IF (gstart==2) g0(l)=g0(l) - gpu_DDOT(2,z_d(1,l),1,b_d(1,l),1)
end do
CALL mp_sum( g0(nactive+1:nactive+nnew), intra_bgrp_comm ) ! g0 = < initial z | initial gradient b >
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
ff(l) = 0.d0 ; ff0(l) = ff(l)
!write (6,*) 0, g0(l), ff(l)
! ethr_cg = ethr ! CG convergence threshold could be set from input but it is not ...
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
! fixed function of the RHS (see ethr_cg update later).
ethr_cg(l) = max ( 0.01*ethr, ethr_cg(l) * g0(l) ) ! here we set the convergence of the correction
!write(6,*) 'ethr_cg :', ethr_cg(l)
! zero the trial solution
psi_d(:,i) = ZERO
hpsi_d(:,i) = ZERO
spsi_d(:,i) = ZERO
! initial search direction
!$cuf kernel do(1)
DO ii = 1, npwx
p_d(ii,l) = z_d(ii,l)
END DO
cg_iter(l) = 0 ! this is a new correction vector, reset its iteration count
end do
nactive = nactive + nnew
end if
!write(6,*) ' done =',done, ' nactive =',nactive
! iterate: ! DO cg_iter = 1, maxter ! THIS IS THE ENTRY POINT OF THE PCG LOOP
if ( nactive == 0 ) EXIT MAIN_LOOP ! this is the only MAIN_LOOP EXIT condition
cg_iter(1:nactive) = cg_iter(1:nactive) + 1 ! update iteration counters
CALL start_clock( 'pcg:hs_1psi' )
! do l = 1, nactive ! THIS COULD/SHOULD BE A GLOBAL CALL (ONLY WITHIN ONE BGRP THOUGH)
! CALL hs_1psi( npwx, npw, p(:,l), hp(:,l), sp(:,l) ) ! apply H to a single wavefunction (no bgrp parallelization here!)
! end do
CALL hs_psi_gpu( npwx, npw, nactive, p_d, hp_d, sp_d ) ! apply H and S to the nactive search directions (no bgrp parallelization here!)
CALL stop_clock( 'pcg:hs_1psi' )
do l = 1, nactive; i=l+done
gamma(l) = 2.D0*gpu_DDOT(npw2,p_d(:,l),1,hp_d(:,l),1) - e_d(i) * 2.D0*gpu_DDOT(npw2,p_d(:,l),1,sp_d(:,l),1)
IF (gstart==2) gamma(l) = gamma(l) - gpu_DDOT(2,p_d(1,l),1,hp_d(1,l),1) + e_d(i) * gpu_DDOT(2,p_d(1,l),1,sp_d(1,l),1)
end do
CALL mp_sum( gamma(1:nactive), intra_bgrp_comm ) ! gamma = < p | hp - e sp >
do l = 1, nactive; i=l+done
!write(6,*) ' l =',l,' i =',i
alpha(l) = g0(l)/gamma(l)
!write(6,*) 'g0, gamma, alpha :', g0(l), gamma(l), alpha(l)
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d(ii,i) = psi_d(ii,i) + tmp * p_d(ii,l) ! updated solution
hpsi_d(ii,i) = hpsi_d(ii,i) + tmp * hp_d(ii,l) ! updated solution
spsi_d(ii,i) = spsi_d(ii,i) + tmp * sp_d(ii,l) ! updated solution
END DO
g2(l) = 2.D0 * ( gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1) + &
e_d(i) * gpu_DDOT(npw2,z_d(:,l),1,spsi_d(:,i),1) - &
gpu_DDOT(npw2,z_d(:,l),1,hpsi_d(:,i),1) )
IF (gstart==2) g2(l) = g2(l) - gpu_DDOT(2,z_d(1,l),1,b_d(1,l),1) - &
e_d(i)*gpu_DDOT(2,z_d(1,l),1,spsi_d(1,i),1) + &
gpu_DDOT(2,z_d(1,l),1,hpsi_d(1,i),1)
end do
CALL mp_sum( g2(1:nactive), intra_bgrp_comm ) ! g2 = < old z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i=l+done ! update the preconditioned gradient
!$cuf kernel do(1)
DO ii = 1, npwx
z_d(ii,l) = b_d(ii,l) + e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i))
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_DGEMM( 'T','N', nbnd,nactive,npw2, 2.D0, spsi0_d, npwx2, z_d, npwx2, 0.D0, spsi0vec_d, nbnd )
IF ( gstart == 2 ) CALL gpu_DGER( nbnd, nactive, -1.D0, spsi0_d, npwx2, z_d, npwx2, spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_DGEMM( 'N','N', npw2,nactive,nbnd,-1.D0, psi0_d, npwx2, spsi0vec_d, nbnd, 1.D0, z_d, npwx2 )
CALL stop_clock( 'pcg:ortho' )
!-
do l = 1, nactive; i=l+done
g1(l) = 2.D0 * ( gpu_DDOT(npw2,z_d(:,l),1,b_d(:,l),1) + &
e_d(i) * gpu_DDOT(npw2,z_d(:,l),1,spsi_d(:,i),1) - &
gpu_DDOT(npw2,z_d(:,l),1,hpsi_d(:,i),1) )
IF (gstart==2) THEN
!$cuf kernel do(1)
do ii = 1, 1
tmp_d = b_d(1,l) + e_d(i)*spsi_d(1,i) - hpsi_d(1,i)
end do
g1(l) = g1(l) - gpu_DDOT(2,z_d(1,l),1,tmp_d,1)
END IF
end do
CALL mp_sum( g1(1:nactive), intra_bgrp_comm ) ! g1 = < new z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i = l + done ! evaluate the function ff
ff(l) = - ( e_d(i)*gpu_DDOT(npw2,psi_d(:,i),1,spsi_d(:,i),1) - gpu_DDOT(npw2,psi_d(:,i),1,hpsi_d(:,i),1) ) &
- 2.D0 * gpu_DDOT(npw2,psi_d(:,i),1,b_d(:,l),1)
if (gstart==2) THEN
!$cuf kernel do(1)
do ii = 1, 1
tmp_d = e_d(i)*spsi_d(1,i) - hpsi_d(1,i) + 2.D0 * b_d(1,l)
end do
ff(l) = ff(l) + 0.5D0 * gpu_DDOT(2,psi_d(1,i),1,tmp_d,1)
END IF
end do
CALL mp_sum( ff(1:nactive), intra_bgrp_comm ) ! function minimum -0.5 < psi | e spsi - hpsi > - < psi | b >
newdone = 0 ! number of correction vectors that converge (or are done) at this iteration
do l = 1, nactive; i = l + done
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
IF ( ff(l) > ff0(l) .AND. ff0(l) < 0.d0 ) THEN
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d(ii,i) = psi_d(ii,i) - tmp * p_d(ii,l) ! fallback solution: if last iter failed to improve ff0
hpsi_d(ii,i) = hpsi_d(ii,i) - tmp * hp_d(ii,l)! exit without updating and ...
spsi_d(ii,i) = spsi_d(ii,i) - tmp * sp_d(ii,l)! hope next time it will be better
END DO
END IF
!write(6,*) 'g0, g1, g2 :', g0(l), g1(l), g2(l)
!write(6,*) 'ff0, ff : ', ff0(l), ff(l)
IF ( ABS ( g1(l) ) < ethr_cg(l) .OR. ( ff(l) > ff0(l) ) .OR. cg_iter(l) == maxter) THEN ! EXIT iterate
!write (6,*) ' exit pcg loop'
!write(6,*) ' l =',l,' i =',i
!if ( cg_iter(l) == maxter.and. ABS(g1(l)) > ethr_cg(l)) write (6,*) 'CG not converged maxter exceeded', cg_iter(l), g1(l), g0(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) < ethr_cg(l)) write (6,*) 'CG correction converged ', cg_iter(l), g1(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) > g0(l) ) write (6,*) 'CG not converged ', cg_iter(l), g1(l), g0(l), ethr_cg(l)
nhpsi = nhpsi + cg_iter(l) ! update nhpsi count
IF (.NOT. (ABS(g1(l))< ethr_cg(l) .OR. (ff(l)>ff0(l)) ) .AND. cg_iter(l)==maxter) nhpsi = nhpsi + 1 ! because this would be the count
newdone = newdone + 1 ! one more solution found (or no more active anyway)
!write(6,*) ' newdone = ', newdone
CALL start_clock( 'pcg:move' )
!write(6,*) ' swapping converged psi/hpsi/spsi i = ',i, " with i' = ",done+newdone
! swap the terminated vector with the first in the list of the active ones
!$cuf kernel do(1)
DO ii = 1, npwx
p_d (ii,l) = psi_d (ii,done+newdone)
psi_d (ii,done+newdone) = psi_d (ii,i)
psi_d (ii,i) = p_d (ii,l)
hp_d(ii,l) = hpsi_d(ii,done+newdone)
hpsi_d(ii,done+newdone) = hpsi_d(ii,i)
hpsi_d(ii,i) = hp_d(ii,l)
sp_d(ii,l) = spsi_d(ii,done+newdone)
spsi_d(ii,done+newdone) = spsi_d(ii,i)
spsi_d(ii,i) = sp_d(ii,l)
END DO
ee = e_d(done+newdone)
!$cuf kernel do(1)
DO ii = 1, 1
e_d(done+newdone) = e_d(i)
END DO
e_d(i) = ee
!write(6,*) ' overwrite converged p/hp/etc l = ',l, ' with newdone = ',newdone
! move information of the swapped active vector in the right place to keep going
!$cuf kernel do(1)
DO ii = 1, npwx
p_d(ii,l) = p_d(ii,newdone)
hp_d(ii,l) = hp_d(ii,newdone)
sp_d(ii,l) = sp_d(ii,newdone)
b_d(ii,l) = b_d(ii,newdone)
z_d(ii,l) = z_d(ii,newdone)
END DO
ff0(l) = ff0(newdone) ; ff(l) = ff(newdone)
alpha(l) = alpha(newdone)
g0(l) = g0(newdone) ; g1(l) = g1(newdone) ; g2(l) = g2(newdone)
cg_iter(l) = cg_iter(newdone) ; ethr_cg(l) = ethr_cg(newdone)
CALL stop_clock( 'pcg:move' )
ELSE
!write(6,*) ' l =',l,' i =',i
beta = (g1(l)-g2(l))/g0(l) ! Polak - Ribiere style update
g0(l) = g1(l) ! < new z | new gradient > -> < old z | old gradient >
!$cuf kernel do(1)
DO ii = 1, npwx
p_d(ii,l) = z_d(ii,l) + beta * p_d(ii,l) ! updated search direction
END DO
!write(6,*) 'beta :', beta
ff0(l) = ff(l) ! updated minimum value reached by the function
END IF
end do
IF ( newdone > 0 ) THEN
done = done + newdone
nactive = nactive - newdone
!write(6,*) ' there have been ', newdone, ' new converged solution'
!write(6,*) ' done = ', done, ' nactive =', nactive
CALL start_clock( 'pcg:move' )
do l=1, nactive
!write(6,*) ' l+newdone =',l+newdone,' -> l =',l
!$cuf kernel do(1)
DO ii = 1, npwx
p_d (ii,l) = p_d (ii,l+newdone)
hp_d(ii,l) = hp_d(ii,l+newdone)
sp_d(ii,l) = sp_d(ii,l+newdone)
b_d(ii,l) = b_d(ii,l+newdone)
z_d(ii,l) = z_d(ii,l+newdone)
END DO
ff0(l) = ff0(l+newdone) ; ff(l) = ff(l+newdone)
g0(l) = g0(l+newdone) ; g1(l) = g1(l+newdone) ; g2(l) = g2(l+newdone)
cg_iter(l) = cg_iter(l+newdone) ; ethr_cg(l) = ethr_cg(l+newdone)
end do
CALL stop_clock( 'pcg:move' )
END IF
! END DO iterate Here is where the pcg loop would terminate
END DO MAIN_LOOP
!write (6,*) ' exit pcg loop'
DEALLOCATE( b_d, z_d, p_d, hp_d, sp_d )
DEALLOCATE( ethr_cg, ff, ff0, cg_iter )
DEALLOCATE( g0, g1, g2, gamma )
DEALLOCATE( spsi0vec_d )
DEALLOCATE( alpha )
!
CALL stop_clock( 'pcg' )
!
RETURN
!
END SUBROUTINE bpcg_gamma_gpu
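! For reference, paro_gamma_new_gpu (later in this commit) invokes this routine as
!   CALL bpcg_gamma_gpu( hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, how_many, &
!                        psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi )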

View File

@ -0,0 +1,389 @@
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
! * GPU version Ivan Carnimeo
!
! The following file is for solving step 2 of the parallel orbital updating algorithm.
!
#define ZERO ( 0.D0, 0.D0 )
#define ONE ( 1.D0, 0.D0 )
!
!----------------------------------------------------------------------------
SUBROUTINE bpcg_k_gpu( hs_psi_gpu, g_1psi_gpu, psi0_d, spsi0_d, npw, npwx, nbnd, npol, nvec, &
psi_d, hpsi_d, spsi_d, ethr, e_d, nhpsi )
!----------------------------------------------------------------------------
!
! Block Preconditioned Conjugate Gradient solution of the linear system
!
! [ H - e S ]|\tilde\psi> = Pc [ e S - H ] |psi>
!
! the search targets the space orthogonal to the current best wfcs (psi0);
! the solution is sought until the residual norm is a fixed fraction of the RHS norm;
! in this way, the more accurate the original problem is, the more accurately the correction is computed
!
! in order to avoid unnecessary HSpsi evaluations this version assumes psi, hpsi and spsi are all
! provided as input and returns their updated estimates for further use
!
#if defined (__CUDA)
USE cudafor
#endif
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
! The following variables are temporary
COMPLEX(DP),INTENT(IN) :: psi0_d(npwx*npol,nbnd) ! psi0 needed to compute the Pv projection
COMPLEX(DP),INTENT(IN) :: spsi0_d(npwx*npol,nbnd) ! Spsi0 needed to compute the Pv projection
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol, nvec ! input dimensions
REAL(DP), INTENT(IN) :: ethr ! threshold for convergence.
REAL(DP), INTENT(INOUT) :: e_d(nvec) ! current estimate of the target eigenvalues
COMPLEX(DP),INTENT(INOUT) :: psi_d(npwx*npol,nvec),hpsi_d(npwx*npol,nvec),spsi_d(npwx*npol,nvec) !
! input: the current estimate of the wfcs
! output: the estimated correction vectors
INTEGER, INTENT(INOUT) :: nhpsi ! (updated) number of Hpsi evaluations
!
! ... LOCAL variables
!
INTEGER, PARAMETER :: maxter = 5 ! maximum number of CG iterations
!
COMPLEX(DP), ALLOCATABLE :: b_d(:,:), & ! RHS for testing
p_d(:,:), hp_d(:,:), sp_d(:,:), z_d(:,:) ! additional working vectors
COMPLEX(DP), ALLOCATABLE :: spsi0vec_d (:,:) ! the product of spsi0 and a group of vectors
REAL(DP), ALLOCATABLE :: g0(:), g1(:), g2(:), gamma(:), ethr_cg(:), ff(:), ff0(:)
REAL(DP), ALLOCATABLE :: alpha(:)
INTEGER, ALLOCATABLE :: cg_iter(:)
REAL(DP) :: beta, ee
INTEGER :: kdim, kdmx, i, l, block_size, done, nactive, nnew, newdone
!
REAL(DP), EXTERNAL :: gpu_DDOT
EXTERNAL g_1psi_gpu, hs_psi_gpu
! hs_1psi( npwx, npw, psi, hpsi, spsi )
! hs_psi( npwx, npw, nvec, psi, hpsi, spsi )
!
!
INTEGER :: ii, jj ! cuf kernel indices
REAL(DP) :: tmp
#if defined (__CUDA)
attributes(device) :: psi_d, hpsi_d, spsi_d, psi0_d, spsi0_d, spsi0vec_d
attributes(device) :: e_d
attributes(device) :: b_d, p_d, hp_d, sp_d, z_d
#endif
!
CALL start_clock( 'pcg' ); !write (6,*) ' enter pcg' , e(1:2), 'npol = ', npol ; FLUSH(6)
!
kdim = npwx*(npol-1) + npw
kdmx = npwx* npol
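! kdim: number of meaningful spinor components (both spin components packed
! with leading dimension npwx); kdmx: allocated leading dimension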
block_size = min(nvec,64)
!
ALLOCATE( g0( block_size ), g1( block_size ), g2( block_size ), gamma( block_size ) )
ALLOCATE( ethr_cg( block_size ), ff( block_size ), ff0( block_size ), cg_iter( block_size ) )
ALLOCATE( z_d( kdmx, block_size ), b_d( kdmx, block_size ) )
ALLOCATE( p_d(kdmx,block_size), hp_d(kdmx,block_size), sp_d(kdmx,block_size) )
ALLOCATE( spsi0vec_d(nbnd, block_size) )
ALLOCATE( alpha( block_size ) )
!
done = 0 ! the number of correction vectors already solved
nactive = 0 ! the number of correction vectors currently being updated
cg_iter = 0 ! how many iterations each active vector has completed (<= maxter)
MAIN_LOOP: & ! This is a continuous loop. It terminates only when nactive vanishes
DO
nnew = min(done+block_size,nvec)-(done+nactive) ! number of new corrections to be added to the search
if ( nnew > 0 ) then ! add nnew vectors to the active list
!write(6,*) ' nnew =', nnew
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
!write (6,*) ' enter pcg' , e(i), 'npol = ', npol ; FLUSH(6)
!$cuf kernel do(1)
DO ii = 1, kdmx
b_d(ii,l) = e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i) ! initial gradient and saved RHS for later
END DO
!$cuf kernel do(1)
DO ii = 1, kdmx
z_d(ii,l) = b_d(ii,l)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i)) ! initial preconditioned gradient
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_ZGEMM( 'C', 'N', nbnd, nnew, kdim, ONE, spsi0_d, kdmx, z_d(:,nactive+1), kdmx, ZERO, &
spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_ZGEMM( 'N', 'N', kdim, nnew, nbnd, (-1.D0,0.D0), psi0_d, kdmx, spsi0vec_d, nbnd, ONE, &
z_d(:,nactive+1), kdmx )
CALL stop_clock( 'pcg:ortho' )
!-
do l=nactive+1,nactive+nnew; i=l+done
g0(l) = gpu_DDOT( 2*kdim, z_d(:,l), 1, b_d(:,l), 1 )
end do
CALL mp_sum( g0(nactive+1:nactive+nnew), intra_bgrp_comm ) ! g0 = < initial z | initial gradient b >
do l=nactive+1,nactive+nnew; i=l+done
!write(6,*) ' l =',l,' i =',i
ff(l) = 0.d0 ; ff0(l) = ff(l)
!write (6,*) 0, g0(l), ff(l)
! ethr_cg = ethr ! CG convergence threshold could be set from input but it is not ...
ethr_cg(l) = 1.0D-2 ! it makes more sense to fix the convergence of the CG solution to a
! fixed function of the RHS (see ethr_cg update later).
ethr_cg(l) = max ( 0.01*ethr, ethr_cg(l) * g0(l) ) ! here we set the convergence of the correction
!write(6,*) 'ethr_cg :', ethr_cg(l)
! zero the trial solution
!$cuf kernel do(1)
DO ii = 1, kdmx
psi_d(ii,i) = ZERO
hpsi_d(ii,i) = ZERO
spsi_d(ii,i) = ZERO
END DO
! initial search direction
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d(ii,l) = z_d(ii,l)
END DO
cg_iter(l) = 0 ! this is a new correction vector, reset its iteration count
end do
nactive = nactive + nnew
end if
!write(6,*) ' done =',done, ' nactive =',nactive
! iterate: ! DO cg_iter = 1, maxter ! THIS IS THE ENTRY POINT OF THE PCG LOOP
if ( nactive == 0 ) EXIT MAIN_LOOP ! this is the only MAIN_LOOP EXIT condition
cg_iter(1:nactive) = cg_iter(1:nactive) + 1 ! update iteration counters
CALL start_clock( 'pcg:hs_1psi' )
! do l = 1, nactive ! THIS COULD/SHOULD BE A GLOBAL CALL (ONLY WITHIN ONE BGRP THOUGH)
! CALL hs_1psi( npwx, npw, p(:,l), hp(:,l), sp(:,l) ) ! apply H to a single wavefunction (no bgrp parallelization here!)
! end do
CALL hs_psi_gpu( npwx, npw, nactive, p_d, hp_d, sp_d ) ! apply H and S to the nactive search directions (no bgrp parallelization here!)
CALL stop_clock( 'pcg:hs_1psi' )
do l = 1, nactive; i=l+done
gamma(l) = gpu_DDOT( 2*kdim, p_d(:,l), 1, hp_d(:,l), 1 )
gamma(l) = gamma(l) - e_d(i) * gpu_DDOT( 2*kdim, p_d(:,l), 1, sp_d(:,l), 1 )
end do
CALL mp_sum( gamma(1:nactive), intra_bgrp_comm ) ! gamma = < p | hp - e sp >
do l = 1, nactive; i=l+done
!write(6,*) ' l =',l,' i =',i
alpha(l) = g0(l)/gamma(l)
!write(6,*) 'g0, gamma, alpha :', g0(l), gamma(l), alpha(l)
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, kdmx
psi_d(ii,i) = psi_d(ii,i) + tmp * p_d(ii,l) ! updated solution
hpsi_d(ii,i) = hpsi_d(ii,i) + tmp * hp_d(ii,l) ! updated solution
spsi_d(ii,i) = spsi_d(ii,i) + tmp * sp_d(ii,l) ! updated solution
END DO
g2(l) = gpu_DDOT(2*kdim,z_d(:,l),1,b_d(:,l),1) &
+ e_d(i) * gpu_DDOT(2*kdim,z_d(:,l),1,spsi_d(:,i),1) &
- gpu_DDOT(2*kdim,z_d(:,l),1,hpsi_d(:,i),1)
end do
CALL mp_sum( g2(1:nactive), intra_bgrp_comm ) ! g2 = < old z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i=l+done ! update the preconditioned gradient
!$cuf kernel do(1)
DO ii = 1, kdmx
z_d(ii,l) = b_d(ii,l) + e_d(i) * spsi_d(ii,i) - hpsi_d(ii,i)
END DO
call g_1psi_gpu(npwx,npw,z_d(:,l),e_d(i))
end do
!- project on conduction bands
CALL start_clock( 'pcg:ortho' )
CALL gpu_ZGEMM( 'C', 'N', nbnd, nactive, kdim, ONE, spsi0_d, kdmx, z_d, kdmx, ZERO, spsi0vec_d, nbnd )
CALL mp_sum( spsi0vec_d, intra_bgrp_comm )
CALL gpu_ZGEMM( 'N', 'N', kdim, nactive, nbnd, (-1.D0,0.D0), psi0_d, kdmx, spsi0vec_d, nbnd, ONE, z_d, kdmx )
CALL stop_clock( 'pcg:ortho' )
!-
do l = 1, nactive; i=l+done
g1(l) = gpu_DDOT(2*kdim,z_d(:,l),1,b_d(:,l),1) &
+ e_d(i) * gpu_DDOT(2*kdim,z_d(:,l),1,spsi_d(:,i),1) &
- gpu_DDOT(2*kdim,z_d(:,l),1,hpsi_d(:,i),1)
end do
CALL mp_sum( g1(1:nactive), intra_bgrp_comm ) ! g1 = < new z | new gradient b + e spsi - hpsi >
do l = 1, nactive; i = l + done ! evaluate the function ff
ff(l) = -0.5_DP * ( e_d(i)*gpu_DDOT(2*kdim,psi_d(:,i),1,spsi_d(:,i),1)&
-gpu_DDOT(2*kdim,psi_d(:,i),1,hpsi_d(:,i),1) )
ff(l) = ff(l) - gpu_DDOT(2*kdim,psi_d(:,i),1,b_d(:,l),1)
end do
CALL mp_sum( ff(1:nactive), intra_bgrp_comm ) ! function minimum -0.5 < psi | e spsi - hpsi > - < psi | b >
newdone = 0 ! number of correction vectors that converge (or are done) at this iteration
do l = 1, nactive; i = l + done
!write (6,*) cg_iter(l), g1(l), ff(l), gamma(l)
IF ( ff(l) > ff0(l) .AND. ff0(l) < 0.d0 ) THEN
tmp = alpha(l)
!$cuf kernel do(1)
DO ii = 1, kdmx
psi_d(ii,i) = psi_d(ii,i) - tmp * p_d(ii,l) ! fallback solution: if last iter failed to improve ff0
hpsi_d(ii,i) = hpsi_d(ii,i) - tmp * hp_d(ii,l)! exit without updating and ...
spsi_d(ii,i) = spsi_d(ii,i) - tmp * sp_d(ii,l)! hope next time it will be better
END DO
END IF
!write(6,*) 'g0, g1, g2 :', g0(l), g1(l), g2(l)
!write(6,*) 'ff0, ff : ', ff0(l), ff(l)
IF ( ABS ( g1(l) ) < ethr_cg(l) .OR. ( ff(l) > ff0(l) ) .OR. cg_iter(l) == maxter) THEN ! EXIT iterate
!write (6,*) ' exit pcg loop'
!write(6,*) ' l =',l,' i =',i
!if ( cg_iter(l) == maxter.and. ABS(g1(l)) > ethr_cg(l)) write (6,*) 'CG not converged maxter exceeded', cg_iter(l), g1(l), g0(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) < ethr_cg(l)) write (6,*) 'CG correction converged ', cg_iter(l), g1(l), ethr_cg(l)
!IF ( ABS ( g1(l) ) > g0(l) ) write (6,*) 'CG not converged ', cg_iter(l), g1(l), g0(l), ethr_cg(l)
nhpsi = nhpsi + cg_iter(l) ! update nhpsi count
IF (.NOT. (ABS(g1(l))< ethr_cg(l) .OR. (ff(l)>ff0(l)) ) .AND. cg_iter(l)==maxter) nhpsi = nhpsi + 1 ! because this would be the count
newdone = newdone + 1 ! one more solution found (or no more active anyway)
!write(6,*) ' newdone = ', newdone
CALL start_clock( 'pcg:move' )
!write(6,*) ' swapping converged psi/hpsi/spsi i = ',i, " with i' = ",done+newdone
! swap the terminated vector with the first in the list of the active ones
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d (ii,l) = psi_d (ii,done+newdone)
psi_d (ii,done+newdone) = psi_d (ii,i)
psi_d (ii,i) = p_d (ii,l)
hp_d(ii,l) = hpsi_d(ii,done+newdone)
hpsi_d(ii,done+newdone) = hpsi_d(ii,i)
hpsi_d(ii,i) = hp_d(ii,l)
sp_d(ii,l) = spsi_d(ii,done+newdone)
spsi_d(ii,done+newdone) = spsi_d(ii,i)
spsi_d(ii,i) = sp_d(ii,l)
END DO
ee = e_d(done+newdone)
!$cuf kernel do(1)
DO ii = 1, 1
e_d(done+newdone) = e_d(i)
END DO
e_d(i) = ee
!write(6,*) ' overwrite converged p/hp/etc l = ',l, ' with newdone = ',newdone
! move information of the swapped active vector in the right place to keep going
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d(ii,l) = p_d(ii,newdone)
hp_d(ii,l) = hp_d(ii,newdone)
sp_d(ii,l) = sp_d(ii,newdone)
b_d(ii,l) = b_d(ii,newdone)
z_d(ii,l) = z_d(ii,newdone)
END DO
alpha(l) = alpha(newdone)
ff0(l) = ff0(newdone)
ff(l) = ff(newdone)
g0(l) = g0(newdone)
g1(l) = g1(newdone)
g2(l) = g2(newdone)
cg_iter(l) = cg_iter(newdone)
ethr_cg(l) = ethr_cg(newdone)
CALL stop_clock( 'pcg:move' )
ELSE
!write(6,*) ' l =',l,' i =',i
beta = (g1(l)-g2(l))/g0(l) ! Polak - Ribiere style update
g0(l) = g1(l) ! < new z | new gradient > -> < old z | old gradient >
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d(ii,l) = z_d(ii,l) + beta * p_d(ii,l) ! updated search direction
END DO
!write(6,*) 'beta :', beta
ff0(l) = ff(l) ! updated minimum value reached by the function
END IF
end do
IF ( newdone > 0 ) THEN
done = done + newdone
nactive = nactive - newdone
!write(6,*) ' there have been ', newdone, ' new converged solution'
!write(6,*) ' done = ', done, ' nactive =', nactive
CALL start_clock( 'pcg:move' )
do l=1, nactive
!write(6,*) ' l+newdone =',l+newdone,' -> l =',l
!$cuf kernel do(1)
DO ii = 1, kdmx
p_d (ii,l) = p_d (ii,l+newdone)
hp_d(ii,l) = hp_d(ii,l+newdone)
sp_d(ii,l) = sp_d(ii,l+newdone)
b_d(ii,l) = b_d(ii,l+newdone)
z_d(ii,l) = z_d(ii,l+newdone)
END DO
ff0(l) = ff0(l+newdone) ; ff(l) = ff(l+newdone)
g0(l) = g0(l+newdone) ; g1(l) = g1(l+newdone) ; g2(l) = g2(l+newdone)
cg_iter(l) = cg_iter(l+newdone) ; ethr_cg(l) = ethr_cg(l+newdone)
end do
CALL stop_clock( 'pcg:move' )
END IF
! END DO iterate Here is where the pcg loop would terminate
END DO MAIN_LOOP
!write (6,*) ' exit pcg loop'
DEALLOCATE( spsi0vec_d )
DEALLOCATE( b_d, p_d, hp_d, sp_d, z_d )
DEALLOCATE( ethr_cg, ff, ff0, cg_iter )
DEALLOCATE( g0, g1, g2, gamma )
DEALLOCATE( alpha )
!
CALL stop_clock( 'pcg' )
!
RETURN
!
END SUBROUTINE bpcg_k_gpu

KS_Solvers/ParO/make.gpu (new file, 13 lines)
View File

@ -0,0 +1,13 @@
PARO += \
paro_gamma_new_gpu.o \
paro_k_new_gpu.o \
bpcg_gamma_gpu.o \
bpcg_k_gpu.o
bpcg_gamma_gpu.o : ../../UtilXlib/mp.o
bpcg_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
bpcg_gamma_gpu.o : ../../UtilXlib/util_param.o
paro_gamma_new_gpu.o : ../../UtilXlib/mp.o
paro_gamma_new_gpu.o : ../../UtilXlib/mp_bands_util.o
paro_gamma_new_gpu.o : ../../UtilXlib/util_param.o

View File

@ -0,0 +1,327 @@
!
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
!
! GPU version by Ivan Carnimeo
!
!-------------------------------------------------------------------------------
SUBROUTINE paro_gamma_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap, &
npwx, npw, nbnd, evc_d, eig_d, btype, ethr, notconv, nhpsi )
!-------------------------------------------------------------------------------
!paro_flag = 1: modified parallel orbital-updating method
!
#if defined (__CUDA)
USE cudafor
#endif
! global variables
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : inter_bgrp_comm, nbgrp, my_bgrp_id
USE mp, ONLY : mp_sum, mp_allgather, mp_barrier, &
mp_type_create_column_section, mp_type_free
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
! I/O variables
LOGICAL, INTENT(IN) :: overlap
INTEGER, INTENT(IN) :: npw, npwx, nbnd
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx,nbnd)
REAL(DP), INTENT(IN) :: ethr
REAL(DP), INTENT(INOUT) :: eig_d(nbnd)
INTEGER, INTENT(IN) :: btype(nbnd)
INTEGER, INTENT(OUT) :: notconv, nhpsi
! INTEGER, INTENT(IN) :: paro_flag
! local variables (used in the call to cegterg )
!------------------------------------------------------------------------
EXTERNAL h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu
! subroutine h_psi (npwx,npw,nvec,evc,hpsi) computes H*evc using band parallelization
! subroutine s_psi (npwx,npw,nvec,evc,spsi) computes S*evc using band parallelization
! subroutine hs_1psi(npwx,npw,evc,hpsi,spsi) computes H*evc and S*evc for a single band
! subroutine g_1psi (npwx,npw,psi,eig) computes g*psi for a single band
!
! ... local variables
!
INTEGER :: itry, paro_ntr, nconv, nextra, nactive, nbase, ntrust, ndiag, nvecx, nproc_ortho
LOGICAL, ALLOCATABLE :: conv(:)
REAL(DP), PARAMETER :: extra_factor = 0.5 ! workspace is at most this factor larger than nbnd
INTEGER, PARAMETER :: min_extra = 4 ! but at least this much larger
INTEGER :: ibnd, ibnd_start, ibnd_end, how_many, lbnd, kbnd, last_unconverged, &
recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
!civn 2fix: these are needed only for __MPI = true (protate)
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
REAL(DP), ALLOCATABLE :: eig(:), ew(:)
!
!
! ... device variables
!
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
REAL(DP), ALLOCATABLE :: ew_d(:)
LOGICAL, ALLOCATABLE :: conv_d(:)
#if defined (__CUDA)
attributes(device) :: psi_d, hpsi_d, spsi_d
attributes(device) :: evc_d, eig_d, ew_d
attributes(device) :: conv_d
#endif
!
! ... init local variables
!
CALL laxlib_getval( nproc_ortho = nproc_ortho )
paro_ntr = 20
nvecx = nbnd + max ( nint ( extra_factor * nbnd ), min_extra )
!
CALL start_clock( 'paro_gamma' ); !write (6,*) ' enter paro diag'
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx, npwx, column_type)
ALLOCATE ( conv(nbnd) )
ALLOCATE ( conv_d(nbnd) )
ALLOCATE ( psi_d(npwx,nvecx), hpsi_d(npwx,nvecx), spsi_d(npwx,nvecx), ew_d(nvecx) )
CALL start_clock( 'paro:init' );
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d(ii,1:nbnd) = evc_d(ii,1:nbnd) ! copy input evc into work vector
END DO
call h_psi_gpu (npwx,npw,nbnd,psi_d,hpsi_d) ! computes H*psi
call s_psi_gpu (npwx,npw,nbnd,psi_d,spsi_d) ! computes S*psi
nhpsi = 0 ; IF (my_bgrp_id==0) nhpsi = nbnd
CALL stop_clock( 'paro:init' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_gamma_gpu ( npwx, npw, nbnd, nbnd, psi_d, hpsi_d, overlap, spsi_d, eig_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx,nvecx), hpsi(npwx,nvecx), spsi(npwx,nvecx), eig(nbnd) )
eig = eig_d
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
CALL protate_HSpsi_gamma( npwx, npw, nbnd, nbnd, psi, hpsi, overlap, spsi, eig )
eig_d = eig
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
DEALLOCATE ( psi, hpsi, spsi, eig )
ENDIF
#endif
!write (6,'(10f10.4)') psi(1:5,1:3)
!write (6,*) eig(1:nbnd)
ParO_loop : &
DO itry = 1,paro_ntr
!write (6,*) ' paro_itry =', itry, ethr
!----------------------------
nactive = nbnd - (nconv+1)/2 ! number of correction vectors to be computed (<nbnd)
notconv = nbnd - nconv ! number of needed roots
nextra = nactive - notconv ! number of extra vectors
nbase = nconv + nactive ! number of orbitals the correction should be orthogonal to (<2*nbnd)
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<2*nbnd)
!----------------------------
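! NB: the bookkeeping above is immediately recomputed below, this time capped
! by the available workspace nvecx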
nactive = min ( (nvecx-nconv)/2, nvecx-nbnd) ! number of corrections there is space for
notconv = nbnd - nconv ! number of needed roots
nextra = max ( nactive - notconv, 0 ) ! number of extra vectors, if any
nbase = max ( nconv + nactive , nbnd ) ! number of orbitals to be orthogonal to (<nvecx)
ntrust = min ( nconv + nactive , nbnd ) ! number of orbitals that will be actually corrected
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<nvecx)
!write (6,*) itry, notconv, conv
!write (6,*) ' nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag =', nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag
CALL divide_all(inter_bgrp_comm,nactive,ibnd_start,ibnd_end,recv_counts,displs)
how_many = ibnd_end - ibnd_start + 1
!write (6,*) nactive, ibnd_start, ibnd_end, recv_counts, displs
CALL start_clock( 'paro:pack' );
lbnd = 1; kbnd = 1
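! (unconverged roots are packed past nbase and distributed round-robin across
!  band groups; recv_counts/displs come from divide_all on inter_bgrp_comm)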
DO ibnd = 1, ntrust ! pack unconverged roots in the available space
IF (.NOT.conv(ibnd) ) THEN
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1
ew_d(kbnd) = eig_d(ibnd)
END DO
last_unconverged = ibnd
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END IF
END DO
DO ibnd = nbnd+1, nbase ! add extra vectors when space allows
!$cuf kernel do(1)
DO ii = 1, npwx
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1
ew_d(kbnd) = eig_d(last_unconverged)
END DO
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END DO
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = nbase+1, nbase+how_many
kk = jj + ibnd_start - 1
psi_d (ii,jj) = psi_d (ii,kk)
hpsi_d(ii,jj) = hpsi_d(ii,kk)
spsi_d(ii,jj) = spsi_d(ii,kk)
END DO
END DO
!$cuf kernel do(1)
DO ii = 1, how_many
kk = ii + ibnd_start - 1
ew_d(ii) = ew_d(kk)
END DO
CALL stop_clock( 'paro:pack' );
! write (6,*) ' check nactive = ', lbnd, nactive
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
CALL bpcg_gamma_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, how_many, &
psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi)
CALL start_clock( 'paro:mp_bar' );
CALL mp_barrier(inter_bgrp_comm)
CALL stop_clock( 'paro:mp_bar' );
CALL start_clock( 'paro:mp_sum' );
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = nbase+1, nbase+how_many
kk = jj + ibnd_start - 1
psi_d(ii, kk) = psi_d(ii, jj)
hpsi_d(ii, kk) = hpsi_d(ii, jj)
spsi_d(ii, kk) = spsi_d(ii, jj)
END DO
END DO
CALL mp_allgather(psi_d (:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(hpsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(spsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL stop_clock( 'paro:mp_sum' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_gamma_gpu ( npwx, npw, ndiag, ndiag, psi_d, hpsi_d, overlap, spsi_d, ew_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx,nvecx), hpsi(npwx,nvecx), spsi(npwx,nvecx), ew(nvecx) )
ew = ew_d
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
CALL protate_HSpsi_gamma( npwx, npw, ndiag, ndiag, psi, hpsi, overlap, spsi, ew )
ew_d = ew
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
DEALLOCATE ( psi, hpsi, spsi, ew )
ENDIF
#endif
!write (6,*) ' ew : ', ew(1:nbnd)
! only the first nbnd eigenvalues are relevant for convergence
! but only those that have actually been corrected should be trusted
conv(1:nbnd) = .FALSE.
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(1)
DO ii = 1, ntrust
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
END DO
conv = conv_d
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
!$cuf kernel do(1)
DO ii = 1, nbnd
eig_d(ii) = ew_d(ii)
END DO
IF ( nconv == nbnd ) EXIT ParO_loop
END DO ParO_loop
!$cuf kernel do(2)
DO ii = 1, npwx
DO jj = 1, nbnd
evc_d(ii,jj) = psi_d(ii,jj)
END DO
END DO
CALL mp_sum(nhpsi,inter_bgrp_comm)
DEALLOCATE ( ew_d, conv )
DEALLOCATE ( conv_d )
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
CALL mp_type_free( column_type )
CALL stop_clock( 'paro_gamma' ); !write (6,*) ' exit paro diag'
END SUBROUTINE paro_gamma_new_gpu

View File

@ -0,0 +1,325 @@
!
! Copyright (C) 2015-2016 Aihui Zhou's group
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
!
! We propose some parallel orbital-updating based plane-wave basis methods
! for electronic structure calculations, which aim at the solution of the corresponding eigenvalue
! problems. Compared to traditional plane-wave methods, our methods feature two-level
! parallelization, which gives them a great advantage in large-scale parallel calculations.
!
! The following is the parallel orbital-updating algorithm:
! 1. Choose initial $E_{\mathrm{cut}}^{(0)}$ and then obtain $V_{N_G^{0}}$, use the SCF method to solve
! the Kohn-Sham equation in $V_{G_0}$ and get the initial $(\lambda_i^{0},u_i^{0}), i=1, \cdots, N$
! and let $n=0$.
! 2. For $i=1,2,\ldots,N$, find $e_i^{n+1/2}\in V_{G_n}$ satisfying
! $$a(\rho_{in}^{n}; e_i^{n+1/2}, v) = -[(a(\rho_{in}^{n}; u_i^{n}, v) - \lambda_i^{n} (u_i^{n}, v))] $$
! in parallel, where $\rho_{in}^{n}$ is the input charge density obtained from the orbitals of the
! $n$-th iteration or of previous iterations.
! 3. Find $\{\lambda_i^{n+1},u_i^{n+1}\} \in \mathbf{R}\times \tilde{V}_N$ satisfying
! $$a(\tilde{\rho}; u_i^{n+1}, v) = ( \lambda_i^{n+1}u_i^{n+1}, v) \quad \forall v \in \tilde{V}_N$$
! where $\tilde{V}_N = \mathrm{span}\{e_1^{n+1/2},\ldots,e_N^{n+1/2},u_1^{n},\ldots,u_N^{n}\}$,
! $\tilde{\rho}(x)$ is the input charge density obtained from the previous orbits.
! 4. Convergence check: if not converged, set $n=n+1$, go to step 2; else, stop.
!
! Detailed information can be found in:
! X. Dai, X. Gong, A. Zhou, J. Zhu,
! A parallel orbital-updating approach for electronic structure calculations, arXiv:1405.0260 (2014).
! X. Dai, Z. Liu, X. Zhang, A. Zhou,
! A Parallel Orbital-updating Based Optimization Method for Electronic Structure Calculations,
! arXiv:1510.07230 (2015).
! Yan Pan, Xiaoying Dai, Xingao Gong, Stefano de Gironcoli, Gian-Marco Rignanese, and Aihui Zhou,
! A Parallel Orbital-updating Based Plane Wave Basis Method. J. Comp. Phys. 348, 482-492 (2017).
!
! The file is written mainly by Stefano de Gironcoli and Yan Pan.
!
!-------------------------------------------------------------------------------
SUBROUTINE paro_k_new_gpu( h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu, overlap, &
npwx, npw, nbnd, npol, evc_d, eig_d, btype, ethr, notconv, nhpsi )
!-------------------------------------------------------------------------------
!paro_flag = 1: modified parallel orbital-updating method
#if defined (__CUDA)
USE cudafor
#endif
! global variables
USE util_param, ONLY : DP, stdout
USE mp_bands_util, ONLY : inter_bgrp_comm, nbgrp, my_bgrp_id
USE mp, ONLY : mp_sum, mp_allgather, mp_barrier, &
mp_type_create_column_section, mp_type_free
IMPLICIT NONE
!
INCLUDE 'laxlib.fh'
! I/O variables
LOGICAL, INTENT(IN) :: overlap
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx*npol,nbnd)
REAL(DP), INTENT(IN) :: ethr
REAL(DP), INTENT(INOUT) :: eig_d(nbnd)
REAL(DP), ALLOCATABLE :: eig(:) ! copy for protate
INTEGER, INTENT(IN) :: btype(nbnd)
INTEGER, INTENT(OUT) :: notconv, nhpsi
! INTEGER, INTENT(IN) :: paro_flag
! local variables (used in the call to cegterg )
!------------------------------------------------------------------------
EXTERNAL h_psi_gpu, s_psi_gpu, hs_psi_gpu, g_1psi_gpu
! subroutine h_psi (npwx,npw,nvec,evc,hpsi) computes H*evc using band parallelization
! subroutine s_psi (npwx,npw,nvec,evc,spsi) computes S*evc using band parallelization
! subroutine hs_1psi(npwx,npw,evc,hpsi,spsi) computes H*evc and S*evc for a single band
! subroutine g_1psi (npwx,npw,psi,eig) computes g*psi for a single band
!
! ... local variables
!
INTEGER :: itry, paro_ntr, nconv, nextra, nactive, nbase, ntrust, ndiag, nvecx, nproc_ortho
LOGICAL, ALLOCATABLE :: conv(:)
REAL(DP), PARAMETER :: extra_factor = 0.5 ! workspace is at most this factor larger than nbnd
INTEGER, PARAMETER :: min_extra = 4 ! but at least this large
INTEGER :: ibnd, ibnd_start, ibnd_end, how_many, lbnd, kbnd, last_unconverged, &
recv_counts(nbgrp), displs(nbgrp), column_type
INTEGER :: ii, jj, kk ! indexes for cuf kernel loops
!
!civn 2fix: these are needed only for __MPI = true (protate)
REAL(DP), ALLOCATABLE :: ew(:)
COMPLEX(DP), ALLOCATABLE :: psi(:,:), hpsi(:,:), spsi(:,:)
!
! .. device variables
!
REAL(DP), ALLOCATABLE :: ew_d(:)
COMPLEX(DP), ALLOCATABLE :: psi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
LOGICAL, ALLOCATABLE :: conv_d(:)
#if defined (__CUDA)
attributes(device) :: evc_d, eig_d
attributes(device) :: psi_d, hpsi_d, spsi_d, ew_d
attributes(device) :: conv_d
#endif
!
! ... init local variables
!
CALL laxlib_getval( nproc_ortho = nproc_ortho )
paro_ntr = 20
nvecx = nbnd + max ( nint ( extra_factor * nbnd ), min_extra )
!
CALL start_clock( 'paro_k' ); !write (6,*) ' enter paro diag'
CALL mp_type_create_column_section(evc_d(1,1), 0, npwx*npol, npwx*npol, column_type)
ALLOCATE ( ew_d(nvecx), conv(nbnd) )
ALLOCATE ( conv_d(nbnd) )
ALLOCATE ( psi_d(npwx*npol,nvecx), hpsi_d(npwx*npol,nvecx), spsi_d(npwx*npol,nvecx) )
CALL start_clock( 'paro:init' );
conv(:) = .FALSE. ; nconv = COUNT ( conv(:) )
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(2)
DO ii = 1, npwx*npol
DO jj = 1, nbnd
psi_d(ii,jj) = evc_d(ii,jj)
END DO
END DO
call h_psi_gpu (npwx,npw,nbnd,psi_d,hpsi_d) ! computes H*psi
call s_psi_gpu (npwx,npw,nbnd,psi_d,spsi_d) ! computes S*psi
nhpsi = 0 ; IF (my_bgrp_id==0) nhpsi = nbnd
CALL stop_clock( 'paro:init' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_k_gpu ( npwx, npw, nbnd, nbnd, npol, psi_d, hpsi_d, overlap, spsi_d, eig_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), eig(nbnd) )
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
eig = eig_d
CALL protate_HSpsi_k( npwx, npw, nbnd, nbnd, npol, psi, hpsi, overlap, spsi, eig )
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
eig_d = eig
DEALLOCATE ( psi, hpsi, spsi, eig )
ENDIF
#endif
!write (6,'(10f10.4)') psi(1:5,1:3)
!write (6,*) eig(1:nbnd)
ParO_loop : &
DO itry = 1,paro_ntr
!write (6,*) ' paro_itry =', itry, ethr
!----------------------------
nactive = nbnd - (nconv+1)/2 ! number of correction vectors to be computed (<nbnd)
notconv = nbnd - nconv ! number of needed roots
nextra = nactive - notconv ! number of extra vectors
nbase = nconv + nactive ! number of orbitals the correction should be orthogonal to (<2*nbnd)
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<2*nbnd)
!---------------------------- (the estimates above are recomputed below, enforcing the available workspace nvecx)
nactive = min ( (nvecx-nconv)/2, nvecx-nbnd) ! number of corrections there is space for
notconv = nbnd - nconv ! number of needed roots
nextra = max ( nactive - notconv, 0 ) ! number of extra vectors, if any
nbase = max ( nconv + nactive , nbnd ) ! number of orbitals to be orthogonal to (<nvecx)
ntrust = min ( nconv + nactive , nbnd ) ! number of orbitals that will be actually corrected
ndiag = nbase + nactive ! dimension of the matrix to be diagonalized at this iteration (<nvecx)
!write (6,*) itry, notconv, conv
!write (6,*) ' nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag =', nvecx, nbnd, nconv, notconv, nextra, nactive, nbase, ntrust, ndiag
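! A worked example of the sizing above (illustrative only): with nbnd=8,
! nvecx = 8 + max(nint(0.5*8),4) = 12; if nconv=2 bands have converged,
!    nactive = min((12-2)/2, 12-8) = 4   corrections computed this iteration
!    notconv = 8 - 2              = 6    roots still needed
!    nextra  = max(4-6,0)         = 0    no extra vectors
!    nbase   = max(2+4,8)         = 8    so corrections go to columns 9..12
!    ntrust  = min(2+4,8)         = 6    bands actually corrected
!    ndiag   = 8 + 4              = 12   size of the Rayleigh-Ritz problem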
CALL divide_all(inter_bgrp_comm,nactive,ibnd_start,ibnd_end,recv_counts,displs)
how_many = ibnd_end - ibnd_start + 1
!write (6,*) nactive, ibnd_start, ibnd_end, recv_counts, displs
CALL start_clock( 'paro:pack' );
lbnd = 1; kbnd = 1
DO ibnd = 1, ntrust ! pack unconverged roots in the available space
IF (.NOT.conv(ibnd) ) THEN
!$cuf kernel do(1)
DO ii = 1, npwx*npol
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1 ! single-iteration kernel: scalar copy on the device
ew_d(kbnd) = eig_d(ibnd)
END DO
last_unconverged = ibnd
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END IF
END DO
DO ibnd = nbnd+1, nbase ! add extra vectors, if any
!$cuf kernel do(1)
DO ii = 1, npwx*npol
psi_d (ii,nbase+kbnd) = psi_d(ii,ibnd)
hpsi_d(ii,nbase+kbnd) = hpsi_d(ii,ibnd)
spsi_d(ii,nbase+kbnd) = spsi_d(ii,ibnd)
END DO
!$cuf kernel do(1)
DO ii = 1, 1 ! single-iteration kernel: scalar copy on the device
ew_d(kbnd) = eig_d(last_unconverged)
END DO
lbnd=lbnd+1 ; kbnd=kbnd+recv_counts(mod(lbnd-2,nbgrp)+1); if (kbnd>nactive) kbnd=kbnd+1-nactive
END DO
!$cuf kernel do(2)
DO jj = 1, how_many
kk = jj + ibnd_start - 1
DO ii = 1, npwx*npol
psi_d (ii,nbase+jj) = psi_d (ii,nbase+kk)
hpsi_d(ii,nbase+jj) = hpsi_d(ii,nbase+kk)
spsi_d(ii,nbase+jj) = spsi_d(ii,nbase+kk)
END DO
END DO
!$cuf kernel do(1)
DO ii = 1, how_many
ew_d(ii) = ew_d(ii+ibnd_start-1)
END DO
CALL stop_clock( 'paro:pack' );
!write (6,*) ' check nactive = ', lbnd, nactive, nconv
if (lbnd .ne. nactive+1 ) stop ' nactive check FAILED '
CALL bpcg_k_gpu(hs_psi_gpu, g_1psi_gpu, psi_d, spsi_d, npw, npwx, nbnd, npol, how_many, &
psi_d(:,nbase+1), hpsi_d(:,nbase+1), spsi_d(:,nbase+1), ethr, ew_d(1), nhpsi)
!
CALL start_clock( 'paro:mp_bar' );
CALL mp_barrier(inter_bgrp_comm)
CALL stop_clock( 'paro:mp_bar' );
CALL start_clock( 'paro:mp_sum' );
!$cuf kernel do(2)
DO ii = 1, npwx*npol
DO jj = nbase+1, nbase+how_many
kk = jj + ibnd_start - 1
psi_d (ii,kk) = psi_d (ii,jj)
hpsi_d(ii,kk) = hpsi_d(ii,jj)
spsi_d(ii,kk) = spsi_d(ii,jj)
END DO
END DO
CALL mp_allgather(psi_d (:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(hpsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL mp_allgather(spsi_d(:,nbase+1:ndiag), column_type, recv_counts, displs, inter_bgrp_comm)
CALL stop_clock( 'paro:mp_sum' );
#if defined(__MPI)
IF ( nproc_ortho == 1 ) THEN
#endif
CALL rotate_HSpsi_k_gpu ( npwx, npw, ndiag, ndiag, npol, psi_d, hpsi_d, overlap, spsi_d, ew_d )
#if defined(__MPI)
ELSE
!civn 2fix
ALLOCATE ( psi(npwx*npol,nvecx), hpsi(npwx*npol,nvecx), spsi(npwx*npol,nvecx), ew(nvecx) )
psi = psi_d
hpsi = hpsi_d
spsi = spsi_d
ew = ew_d
CALL protate_HSpsi_k( npwx, npw, ndiag, ndiag, npol, psi, hpsi, overlap, spsi, ew )
psi_d = psi
hpsi_d = hpsi
spsi_d = spsi
ew_d = ew
DEALLOCATE ( psi, hpsi, spsi, ew )
ENDIF
#endif
!write (6,*) ' ew : ', ew(1:nbnd)
! only the first nbnd eigenvalues are relevant for convergence
! but only those that have actually been corrected should be trusted
conv(1:nbnd) = .FALSE.
!$cuf kernel do(1)
do ii = 1, nbnd
conv_d(ii) = .FALSE.
end do
!$cuf kernel do(1)
DO ii = 1, ntrust
conv_d(ii) = ABS(ew_d(ii) - eig_d(ii)).LT.ethr
END DO
conv = conv_d
nconv = COUNT(conv(1:ntrust)) ; notconv = nbnd - nconv
!$cuf kernel do(1)
DO ii = 1, nbnd
eig_d(ii) = ew_d(ii)
END DO
IF ( nconv == nbnd ) EXIT ParO_loop
END DO ParO_loop
!$cuf kernel do(2)
DO ii = 1, npwx*npol
DO jj = 1, nbnd
evc_d(ii,jj) = psi_d(ii,jj)
END DO
END DO
CALL mp_sum(nhpsi,inter_bgrp_comm)
DEALLOCATE ( ew_d, conv )
DEALLOCATE ( conv_d )
DEALLOCATE ( psi_d, hpsi_d, spsi_d )
CALL mp_type_free( column_type )
CALL stop_clock( 'paro_k' ); !write (6,*) ' exit paro diag'
END SUBROUTINE paro_k_new_gpu

KS_Solvers/make.gpu Normal file

@ -0,0 +1,18 @@
# Makefile additions for the GPU versions of the iterative solvers (Davidson, CG, PPCG, ParO)
ALLOBJS += \
Davidson/cegterg_gpu.o \
Davidson/regterg_gpu.o \
DENSE/rotate_wfc_k_gpu.o \
DENSE/rotate_wfc_gamma_gpu.o \
CG/rcgdiagg_gpu.o \
CG/ccgdiagg_gpu.o \
PPCG/generic_cublas.o \
PPCG/ppcg_gamma_gpu.o \
PPCG/ppcg_k_gpu.o \
ParO/bpcg_gamma_gpu.o \
ParO/bpcg_k_gpu.o \
ParO/paro_gamma_new_gpu.o \
ParO/paro_k_new_gpu.o \
DENSE/rotate_HSpsi_gamma_gpu.o \
DENSE/rotate_HSpsi_k_gpu.o


@ -2,10 +2,13 @@
include ../make.inc
EXTLIBS=$(CUDA_LIBS) $(SCALAPACK_LIBS) $(LAPACK_LIBS) $(BLAS_LIBS) $(MPI_LIBS)
LAX = la_types.o \
la_error.o \
la_module.o \
la_helper.o \
cdiaghg.o \
rdiaghg.o \

LAXlib/la_module.f90 Normal file

@ -0,0 +1,442 @@
MODULE LAXlib
#ifdef __CUDA
USE cudafor
#endif
IMPLICIT NONE
!
INTERFACE diaghg
MODULE PROCEDURE cdiaghg_cpu_, rdiaghg_cpu_
#ifdef __CUDA
MODULE PROCEDURE cdiaghg_gpu_, rdiaghg_gpu_
#endif
END INTERFACE
!
INTERFACE pdiaghg
MODULE PROCEDURE pcdiaghg_, prdiaghg_
#ifdef __CUDA
MODULE PROCEDURE pcdiaghg__gpu, prdiaghg__gpu
#endif
END INTERFACE
!
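! A minimal usage sketch (illustrative only; the argument list follows the
! specific procedures declared below, and the optional last argument requests
! GPU offload when compiled with CUDA support):
!
!    CALL diaghg ( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm )
!    CALL diaghg ( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload=.true. )
!    CALL pdiaghg( n, h, s, ldh, e, v, idesc )   ! distributed-data driver
!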
CONTAINS
!
!----------------------------------------------------------------------------
SUBROUTINE cdiaghg_cpu_( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... LAPACK version - uses both ZHEGV and ZHEGVX
!
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
COMPLEX(DP), INTENT(OUT) :: v(ldh,m)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
!
LOGICAL, OPTIONAL :: offload
! optionally solve the eigenvalue problem on the GPU
LOGICAL :: loffload
!
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: v_d(:,:), h_d(:,:), s_d(:,:)
REAL(DP), ALLOCATABLE, DEVICE :: e_d(:)
INTEGER :: info
#endif
!
loffload = .false.
!
! the following ifdef ensures no offload when not compiling for GPU
#if defined(__CUDA)
IF (PRESENT(offload)) loffload = offload
#endif
!
! ... always false when compiling without CUDA support
!
IF ( loffload ) THEN
#if defined(__CUDA)
!
ALLOCATE(s_d, source=s); ALLOCATE(h_d, source=h)
ALLOCATE(e_d(n), v_d(ldh,n))
!
CALL laxlib_cdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e = e_d
v(1:ldh,1:m) = v_d(1:ldh,1:m)
!
DEALLOCATE(h_d, s_d, e_d, v_d)
#endif
ELSE
CALL laxlib_cdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE cdiaghg_cpu_
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE cdiaghg_gpu_( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, onhost )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... LAPACK version - uses both ZHEGV and ZHEGVX
!
USE cudafor
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), DEVICE, INTENT(INOUT) :: h_d(ldh,n), s_d(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), DEVICE, INTENT(OUT) :: e_d(n)
! eigenvalues
COMPLEX(DP), DEVICE, INTENT(OUT) :: v_d(ldh,n)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
! communicators
LOGICAL, OPTIONAL :: onhost
! optionally solve the eigenvalue problem on the CPU
LOGICAL :: lonhost
!
COMPLEX(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
!
INTEGER :: info
!
lonhost = .false.
!
IF (PRESENT(onhost)) lonhost = onhost
!
!
IF ( lonhost ) THEN
!
ALLOCATE(s, source=s_d); ALLOCATE(h, source=h_d)
ALLOCATE(e(n), v(ldh,m))
!
CALL laxlib_cdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e_d = e
v_d(1:ldh,1:m) = v(1:ldh,1:m)
!
DEALLOCATE(h, s, e, v)
ELSE
CALL laxlib_cdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE cdiaghg_gpu_
#endif
!
!----------------------------------------------------------------------------
SUBROUTINE rdiaghg_cpu_( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, offload )
!----------------------------------------------------------------------------
!
! ... general interface for rdiaghg
!
#if defined(__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
REAL(DP), INTENT(OUT) :: v(ldh,m)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
! communicators
LOGICAL, OPTIONAL :: offload
! optionally solve the eigenvalue problem on the GPU
LOGICAL :: loffload
!
#if defined(__CUDA)
REAL(DP), ALLOCATABLE, DEVICE :: v_d(:,:), h_d(:,:), s_d(:,:)
REAL(DP), ALLOCATABLE, DEVICE :: e_d(:)
INTEGER :: info
#endif
!
loffload = .false.
!
! the following ifdef ensures no offload when not compiling for GPU
#if defined(__CUDA)
IF (PRESENT(offload)) loffload = offload
#endif
!
! ... always false when compiling without CUDA support
!
IF ( loffload ) THEN
#if defined(__CUDA)
!
ALLOCATE(s_d, source=s); ALLOCATE(h_d, source=h)
ALLOCATE(e_d(n), v_d(ldh,n))
!
CALL laxlib_rdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e = e_d
v(1:ldh,1:m) = v_d(1:ldh,1:m)
!
DEALLOCATE(h_d, s_d, e_d, v_d)
#endif
ELSE
CALL laxlib_rdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE rdiaghg_cpu_
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE rdiaghg_gpu_( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, onhost )
!----------------------------------------------------------------------------
!
! ... General interface to rdiaghg_gpu
!
USE cudafor
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
INTEGER, INTENT(IN) :: n, m, ldh
! dimension of the matrix to be diagonalized
! number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), DEVICE, INTENT(INOUT) :: h_d(ldh,n), s_d(ldh,n)
! actually intent(in) but compilers don't know and complain
! matrix to be diagonalized
! overlap matrix
REAL(DP), DEVICE, INTENT(OUT) :: e_d(n)
! eigenvalues
REAL(DP), DEVICE, INTENT(OUT) :: v_d(ldh,n)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: me_bgrp, root_bgrp, intra_bgrp_comm
! communicators
LOGICAL, OPTIONAL :: onhost
! optionally solve the eigenvalue problem on the CPU
LOGICAL :: lonhost
!
REAL(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
!
INTEGER :: info
!
lonhost = .false.
!
IF (PRESENT(onhost)) lonhost = onhost
!
!
IF ( lonhost ) THEN
!
ALLOCATE(s, source=s_d); ALLOCATE(h, source=h_d)
ALLOCATE(e(n), v(ldh,m))
!
CALL laxlib_rdiaghg(n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm)
!
e_d = e
v_d(1:ldh,1:m) = v(1:ldh,1:m)
!
DEALLOCATE(h, s, e, v)
ELSE
CALL laxlib_rdiaghg_gpu(n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm)
END IF
!
RETURN
!
END SUBROUTINE rdiaghg_gpu_
#endif
!
! === Parallel diagonalization interface subroutines
!
!
!----------------------------------------------------------------------------
SUBROUTINE prdiaghg_( n, h, s, ldh, e, v, idesc, offload )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), INTENT(INOUT) :: h(ldh,ldh), s(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
REAL(DP), INTENT(OUT) :: v(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: offload
! place-holder, offloading on GPU not implemented yet
LOGICAL :: loffload
CALL laxlib_prdiaghg( n, h, s, ldh, e, v, idesc)
END SUBROUTINE
!----------------------------------------------------------------------------
SUBROUTINE pcdiaghg_( n, h, s, ldh, e, v, idesc, offload )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), INTENT(INOUT) :: h(ldh,ldh), s(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT) :: e(n)
! eigenvalues
COMPLEX(DP), INTENT(OUT) :: v(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: offload
! place-holder, offloading on GPU not implemented yet
LOGICAL :: loffload
CALL laxlib_pcdiaghg( n, h, s, ldh, e, v, idesc)
END SUBROUTINE
!
#if defined(__CUDA)
!----------------------------------------------------------------------------
SUBROUTINE prdiaghg__gpu( n, h_d, s_d, ldh, e_d, v_d, idesc, onhost )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
REAL(DP), INTENT(INOUT), DEVICE :: h_d(ldh,ldh), s_d(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT), DEVICE :: e_d(n)
! eigenvalues
REAL(DP), INTENT(OUT), DEVICE :: v_d(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: onhost
! place-holder, prdiaghg on GPU not implemented yet
LOGICAL :: lonhost
!
REAL(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
ALLOCATE(h(ldh,ldh), s(ldh,ldh), e(n), v(ldh,ldh))
h = h_d; s = s_d;
CALL laxlib_prdiaghg( n, h, s, ldh, e, v, idesc)
e_d = e; v_d = v
DEALLOCATE(h,s,v,e)
!
END SUBROUTINE
!----------------------------------------------------------------------------
SUBROUTINE pcdiaghg__gpu( n, h_d, s_d, ldh, e_d, v_d, idesc, onhost )
!----------------------------------------------------------------------------
!
! ... calculates eigenvalues and eigenvectors of the generalized problem
! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
! ... On output both matrices are unchanged
!
! ... Parallel version with full data distribution
!
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
!
INTEGER, INTENT(IN) :: n, ldh
! dimension of the matrix to be diagonalized and number of eigenstates to be calculated
! leading dimension of h, as declared in the calling pgm unit
COMPLEX(DP), INTENT(INOUT), DEVICE :: h_d(ldh,ldh), s_d(ldh,ldh)
! matrix to be diagonalized
! overlap matrix
!
REAL(DP), INTENT(OUT), DEVICE :: e_d(n)
! eigenvalues
COMPLEX(DP), INTENT(OUT), DEVICE :: v_d(ldh,ldh)
! eigenvectors (column-wise)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
LOGICAL, OPTIONAL :: onhost
! place-holder, pcdiaghg on GPU not implemented yet
LOGICAL :: lonhost
!
COMPLEX(DP), ALLOCATABLE :: v(:,:), h(:,:), s(:,:)
REAL(DP), ALLOCATABLE :: e(:)
ALLOCATE(h(ldh,ldh), s(ldh,ldh), e(n), v(ldh,ldh))
h = h_d; s = s_d;
CALL laxlib_pcdiaghg( n, h, s, ldh, e, v, idesc)
e_d = e; v_d = v
DEALLOCATE(h,s,v,e)
!
END SUBROUTINE
#endif
END MODULE LAXlib


@ -395,4 +395,16 @@ SUBROUTINE redist_row2col_x( n, a, b, ldx, nx, idesc )
REAL(DP) :: a(ldx,nx), b(ldx,nx)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
END SUBROUTINE
#if defined (__CUDA)
SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
IMPLICIT NONE
include 'laxlib_kinds.fh'
include 'laxlib_param.fh'
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: ldx, nx
REAL(DP), DEVICE :: a(:,:)
REAL(DP), DEVICE :: b(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
END SUBROUTINE
#endif
END INTERFACE


@ -7,6 +7,7 @@ la_error.o : la_param.o
la_helper.o : dspev_drv.o
la_helper.o : la_param.o
la_helper.o : la_types.o
la_module.o : la_param.o
la_helper.o : mp_diag.o
mp_diag.o : la_param.o
ptoolkit.o : la_param.o


@ -3719,13 +3719,10 @@ SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
INTEGER, INTENT(IN) :: n
INTEGER, INTENT(IN) :: ldx, nx
REAL(DP), DEVICE :: a(:,:)
REAL(DP) :: b(:,:)
REAL(DP), DEVICE :: b(:,:)
INTEGER, INTENT(IN) :: idesc(LAX_DESC_SIZE)
!
REAL(DP), ALLOCATABLE :: wrk(:,:)
#if defined(__GPU_MPI)
ATTRIBUTES(DEVICE) :: wrk
#endif
REAL(DP), ALLOCATABLE :: a_h(:,:), b_h(:,:)
INTEGER :: ierr
INTEGER :: np, rowid, colid
INTEGER :: comm
@ -3778,30 +3775,30 @@ SUBROUTINE redist_row2col_gpu_x( n, a, b, ldx, nx, idesc )
CALL lax_error__( " redist_row2col_gpu ", " in MPI_BARRIER ", ABS( ierr ) )
!
#if defined(__GPU_MPI)
ALLOCATE( wrk(SIZE(b,1),SIZE(b,2)), STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating wrk ", ABS( ierr ) )
!
ierr = cudaDeviceSynchronize()
CALL MPI_SENDRECV(a, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
wrk, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
b = wrk
!
DEALLOCATE( wrk )
#else
ALLOCATE( wrk, SOURCE=a, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating wrk ", ABS( ierr ) )
!
CALL MPI_SENDRECV(wrk, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
b, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
#else
ALLOCATE( a_h, SOURCE=a, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating a_h ", ABS( ierr ) )
ALLOCATE( b_h, MOLD=b, STAT=ierr )
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " allocating b_h ", ABS( ierr ) )
!
DEALLOCATE( wrk )
CALL MPI_SENDRECV(a_h, ldx*nx, MPI_DOUBLE_PRECISION, idest, np+np+1, &
b_h, ldx*nx, MPI_DOUBLE_PRECISION, isour, np+np+1, comm, istatus, ierr)
IF( ierr /= 0 ) &
CALL lax_error__( " redist_row2col_gpu ", " in MPI_SENDRECV ", ABS( ierr ) )
!
b = b_h
!
DEALLOCATE( a_h, b_h )
#endif
!
#else

LAXlib/tests/.gitattributes vendored Normal file

@ -0,0 +1 @@
*.bin binary

LAXlib/tests/Makefile Normal file

@ -0,0 +1,38 @@
# Makefile for the LAXlib tests
include ../../make.inc
MODFLAGS= $(MOD_FLAG)../../UtilXlib $(MOD_FLAG)../../ELPA/src $(MOD_FLAG).. $(MOD_FLAG).
SRCS = test_diaghg_1.f90 \
test_diaghg_2.f90 \
test_diaghg_3.f90 \
test_diaghg_4.f90 \
test_diaghg_gpu_1.f90 \
test_diaghg_gpu_2.f90 \
test_diaghg_gpu_3.f90 \
test_diaghg_gpu_4.f90
EXECS = $(SRCS:.f90=.x)
UTILS = tester.o mp_world.o utils.o test_io.o test_helpers.o
all: common $(EXECS)
common: $(UTILS)
%.x: %.o
$(LD) $(LDFLAGS) $< $(UTILS) -o $@ ../libqela.a \
../../UtilXlib/libutil.a $(LIBOBJS) $(QELIBS)
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name of the rule.
.PHONY: all clean

LAXlib/tests/README.md Normal file

@ -0,0 +1,10 @@
# LAXlib testing suite
To run the tests, first run `./configure` in the QE top-level directory to
generate a valid `make.inc`.
You may also download large eigenvalue problems (into this directory) with:

    wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1EAB3xkoD-i9p4nW6NJDED3WaEK8ZCcf4' -O SiGeK1.bin
    wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13lFkDbv99V8fqiXER1N2IzoJ_EhuGtt9' -O SiGeK2.bin
Finally `make` and run all `.x` executable files.
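
For example (a sketch only; executable names follow the `SRCS` list in the
Makefile, and the MPI launcher may differ on your system):

    make
    ./test_diaghg_1.x
    mpirun -np 4 ./test_diaghg_4.x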

LAXlib/tests/ZnOG1.bin Normal file

Binary file not shown.

LAXlib/tests/ZnOG2.bin Normal file

Binary file not shown.

LAXlib/tests/ZnOK1.bin Normal file

Binary file not shown.

LAXlib/tests/ZnOK2.bin Normal file

Binary file not shown.

LAXlib/tests/mp_world.f90 Normal file

@ -0,0 +1,89 @@
!----------------------------------------------------------------------------
MODULE mp_world
!----------------------------------------------------------------------------
!
#if defined(__MPI)
USE mpi
#endif
USE mp, ONLY : mp_barrier, mp_start, mp_end, mp_stop
USE mp, ONLY : mp_count_nodes
!
IMPLICIT NONE
SAVE
!
! ... World group - all QE routines using mp_world_start to start MPI
! ... will work in the communicator passed as input to mp_world_start
!
INTEGER :: nnode = 1 ! number of nodes
INTEGER :: nproc = 1 ! number of processors
INTEGER :: mpime = 0 ! processor index (starts from 0 to nproc-1)
INTEGER :: root = 0 ! index of the root processor
INTEGER :: world_comm = 0 ! communicator
!
! ... library_mode =.true. if QE is called as a library by an external code
! ... if true, MPI_Init() is not called when starting MPI,
! ... MPI_Finalize() is not called when stopping MPI
!
!
#if defined(__MPI)
LOGICAL :: library_mode = .FALSE.
#endif
PRIVATE
PUBLIC :: nnode, nproc, mpime, root, world_comm, mp_world_start, mp_world_end
!
CONTAINS
!
!-----------------------------------------------------------------------
SUBROUTINE mp_world_start ( my_world_comm )
!-----------------------------------------------------------------------
!
IMPLICIT NONE
INTEGER, INTENT(IN) :: my_world_comm
#if defined(__MPI)
INTEGER :: ierr
#endif
#if defined(_OPENMP)
INTEGER :: PROVIDED
#endif
!
world_comm = my_world_comm
!
! ... check if mpi is already initialized (library mode) or not
!
#if defined(__MPI)
CALL MPI_Initialized ( library_mode, ierr)
IF (ierr/=0) CALL mp_stop( 8000 )
IF (.NOT. library_mode ) THEN
#if defined(_OPENMP)
CALL MPI_Init_thread(MPI_THREAD_FUNNELED, PROVIDED, ierr)
#else
CALL MPI_Init(ierr)
#endif
IF (ierr/=0) CALL mp_stop( 8001 )
END IF
#endif
!
CALL mp_start( nproc, mpime, world_comm )
!CALL mp_count_nodes ( nnode, world_comm )
!
RETURN
!
END SUBROUTINE mp_world_start
!
!-----------------------------------------------------------------------
SUBROUTINE mp_world_end ( )
!-----------------------------------------------------------------------
#if defined(__MPI)
INTEGER :: ierr
#endif
!
CALL mp_barrier( world_comm )
CALL mp_end ( world_comm )
#if defined(__MPI)
CALL mpi_finalize(ierr)
IF (ierr/=0) CALL mp_stop( 8002 )
#endif
!
END SUBROUTINE mp_world_end
!
END MODULE mp_world


@ -0,0 +1,122 @@
program test_diaghg
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(tester_t) :: test
! real variables
real(DP) :: h(2,2)
real(DP) :: s(2,2)
real(DP) :: e(2)
real(DP) :: v(2,2)
h = 0.d0
h(1,1) = 1.d0
h(2,2) = 1.d0
s = 0.d0
s(1,1) = 1.d0
s(2,2) = 1.d0
!
v = 0.d0
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .false. )
!
CALL test%assert_close( e, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
v = 0.d0
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .true. )
!
CALL test%assert_close( e, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(tester_t) :: test
! complex variables
complex(DP) :: h(2,2)
complex(DP) :: h_save(2,2)
complex(DP) :: s(2,2)
complex(DP) :: s_save(2,2)
real(DP) :: e(2)
complex(DP) :: v(2,2)
!
h = 0.d0
h(1,1) = (1.d0, 0.d0)
h(1,2) = (0.d0, -2.d0)
h(2,1) = ( 0.d0, 2.d0)
h(2,2) = ( 5.d0, 0.d0)
s = 0.d0
s(1,1) = (1.d0, 0.d0)
s(2,2) = (1.d0, 0.d0)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .false. )
! 0.1715728752538099, 5.82842712474619
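! (quick check: trace = 6 and det = 1*5 - (-2i)*(2i) = 1, hence
!  lambda = 3 -+ sqrt(3**2 - 1) = 3 -+ 2*sqrt(2), matching the values above)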
CALL test%assert_close( e, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s, [4]), RESHAPE(s_save, [4]))
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( 2, 2, h, s, 2, e, v, .true. )
!
CALL test%assert_close( e, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s, [4]), RESHAPE(s_save, [4]))
!
END SUBROUTINE complex_1
end program test_diaghg


@ -0,0 +1,110 @@
program test_diaghg_2
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(tester_t) :: test
!
integer, parameter :: m_size=1024
complex(DP) :: h(m_size,m_size)
complex(DP) :: h_save(m_size,m_size)
complex(DP) :: s(m_size,m_size)
complex(DP) :: s_save(m_size,m_size)
real(DP) :: e(m_size)
complex(DP) :: v(m_size,m_size)
real(DP) :: e_save(m_size)
complex(DP) :: v_save(m_size,m_size)
!
CALL hermitian(m_size, h)
CALL hermitian(m_size, s)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( m_size, m_size, h, s, m_size, e, v, .false. )
!
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
!
e_save = e
v_save = v
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( m_size, m_size, h, s, m_size, e, v, .true. )
!
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
test%tolerance32=1.d-5
test%tolerance64=1.d-14
CALL test%assert_close( e, e_save)
!
END SUBROUTINE complex_1
!
SUBROUTINE hermitian(mSize, M)
USE la_param, ONLY : DP
IMPLICIT NONE
integer, intent(in) :: msize
complex(dp), intent(out) :: M(:,:)
!
real(dp), allocatable :: rnd(:)
complex(dp), allocatable :: tmp(:,:)
INTEGER :: h, k, j
!
ALLOCATE(rnd(mSize*(mSize+1)))
CALL RANDOM_NUMBER(rnd)
rnd = 1.d0*rnd - 5.d-1
!
M = (0.d0, 0.d0)
j = 1
DO k=1,mSize
DO h=1,mSize
IF(h>k) THEN
M(h,k) = CMPLX(rnd(j), rnd(j+1), kind=DP) ! kind=DP avoids silent loss of precision
M(k,h) = CONJG(M(h,k))
j=j+2;
ELSE IF(k == h) THEN
M(k,h) = CMPLX(mSize, 0.d0, kind=DP)
END IF
END DO
END DO
!
END SUBROUTINE hermitian
end program test_diaghg_2


@ -0,0 +1,174 @@
program test_diaghg_3
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
real(DP), allocatable :: h(:,:)
real(DP), allocatable :: h_save(:,:)
real(DP), allocatable :: s(:,:)
real(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
real(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
real(DP), allocatable :: v_save(:,:)
!
character(len=20) :: inputs(2)
integer :: i, j, info
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
test%tolerance64=1.d-6 ! check this
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
test%tolerance64=1.d-8 ! check this
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
h = h_save
s = s_save
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
END DO
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:)
complex(DP), allocatable :: h_save(:,:)
complex(DP), allocatable :: s(:,:)
complex(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
complex(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
complex(DP), allocatable :: v_save(:,:)
!
character(len=20) :: inputs(4)
integer :: i, j, info
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
h = h_save
s = s_save
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
END DO
!
END SUBROUTINE complex_1
end program test_diaghg_3


@ -0,0 +1,312 @@
! This test uses the internal parallel diagonalization algorithm of LAXlib
! to solve the problems stored in binary files:
!
! - ZnOG1.bin
! - ZnOG2.bin
! - ZnOK1.bin
! - ZnOK2.bin
! - SiGeK1.bin
! - SiGeK2.bin
!
! If the ScaLAPACK or ELPA driver is used, the test is skipped.
!
#if ! defined(__SCALAPACK)
program test_diaghg_4
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL parallel_real_1(test)
!
CALL parallel_complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE parallel_real_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
TYPE(la_descriptor) :: desc
integer :: ldh, n, m
real(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
real(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistency across calls
real(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
real(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistency across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
real(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
real(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
!
character(len=20) :: inputs(2)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO l=1, SIZE(inputs)
!
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 ) ! initialize in both cases, not only when active
nrdst = desc%nrcx
IF (.not. la_proc) nrdst = 1
!
v = (0.d0, 0.d0)
e = 0.d0
print *, nrdst, n, m
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block
DO i = 1, desc%nr ! number of rows in the local block
ii = i + desc%ir - 1 ! global row index of local row i
jj = j + desc%ic - 1 ! global column index of local column j
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
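! Illustrative example of the mapping above (assumed values, not from the
! inputs): with n=1000 on a 2x2 ortho grid, nrcx=500, and the task in grid
! row 2, column 1 has desc%ir=501 and desc%ic=1, i.e. it stores the global
! block h(501:1000, 1:500) in hdst(1:500, 1:500).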
!
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .false. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v = (0.d0, 0.d0)
e = 0.d0
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
END DO
!
END SUBROUTINE parallel_real_1
!
SUBROUTINE parallel_complex_1(test)
USE mp_world, ONLY : mpime
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
complex(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistency across calls
complex(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
complex(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistency across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
complex(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
complex(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
TYPE(la_descriptor) :: desc
!
character(len=20) :: inputs(4)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO l=1, SIZE(inputs)
!
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 ) ! initialize in both cases, not only when active
nrdst = desc%nrcx
IF (.not. la_proc) nrdst = 1
!
v = (0.d0, 0.d0)
e = 0.d0
print *, nrdst, n, m
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block
DO i = 1, desc%nr ! number of rows in the local block
ii = i + desc%ir - 1 ! global row index of local row i
jj = j + desc%ic - 1 ! global column index of local column j
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
!
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .false. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v = (0.d0, 0.d0)
e = 0.d0
CALL pdiaghg( n, hdst, sdst, nrdst, e, vdst, desc, .true. )
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
END DO
!
END SUBROUTINE parallel_complex_1
!
SUBROUTINE init_parallel_diag(desc, n)
USE mp_world, ONLY : mpime, nproc, world_comm
USE mp_diag, ONLY : ortho_parent_comm
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(la_descriptor) :: desc
INTEGER :: n ! global dimension of the matrix
!
INTEGER :: ierr = 0
INTEGER :: color, key
!
INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho
INTEGER :: me_ortho(2) = 0 ! coordinates of the processors
INTEGER :: me_ortho1 = 0 ! task id for the ortho group
INTEGER :: nproc_ortho = 1 ! size of the ortho group:
INTEGER :: ortho_comm = 0 ! communicator for the ortho group
INTEGER :: ortho_row_comm = 0 ! communicator for the ortho row group
INTEGER :: ortho_col_comm = 0 ! communicator for the ortho col group
INTEGER :: ortho_comm_id = 0 ! id of the ortho_comm
!
ortho_parent_comm = world_comm
!
#if defined __MPI
!
CALL grid2d_dims( 'S', nproc, np_ortho(1), np_ortho(2) )
!
nproc_ortho = np_ortho(1) * np_ortho(2)
!
! here we choose the first "nproc_ortho" processors
!
color = 0
IF( mpime < nproc_ortho ) color = 1
!
key = mpime
!
! initialize the communicator for the new group by splitting the input
! communicator
!
CALL mpi_comm_split( MPI_COMM_WORLD , color, key, ortho_comm, ierr )
!
! Compute the coordinates of the processors, in row-major order
!
CALL mpi_comm_rank( ortho_comm, me_ortho1, ierr)
!
IF( mpime == 0 .AND. me_ortho1 /= 0 ) &
CALL lax_error__( " init_ortho_group ", " wrong root task in ortho group ", ierr )
!
if( color == 1 ) then
! this task belongs to the ortho group: compute its coordinates
ortho_comm_id = 1
CALL GRID2D_COORDS( 'R', me_ortho1, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
CALL GRID2D_RANK( 'R', np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2), ierr )
IF( ierr /= me_ortho1 ) &
CALL lax_error__( " init_ortho_group ", " wrong task coordinates in ortho group ", ierr )
IF( me_ortho1 /= mpime ) &
CALL lax_error__( " init_ortho_group ", " wrong rank assignment in ortho group ", ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(2), me_ortho(1), ortho_col_comm, ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(1), me_ortho(2), ortho_row_comm, ierr )
else
! this task does NOT belong to the ortho group: set dummy values
ortho_comm_id = 0
me_ortho(1) = me_ortho1
me_ortho(2) = me_ortho1
endif
#else
ortho_comm_id = 1
#endif
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, -1, ortho_comm_id )
END SUBROUTINE init_parallel_diag
end program test_diaghg_4
#else
program test_diaghg_4
end program test_diaghg_4
#endif


@ -0,0 +1,175 @@
#if defined(__CUDA)
program test_diaghg_gpu
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE LAXlib
USE la_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
! variables on device
real(DP), device :: h_d(2,2)
real(DP), device :: s_d(2,2)
real(DP), device :: e_d(2)
real(DP), device :: v_d(2,2)
! variables on host
real(DP) :: h_h(2,2)
real(DP) :: s_h(2,2)
real(DP) :: e_h(2)
real(DP) :: v_h(2,2)
h_h = 0.d0
h_h(1,1) = 1.d0
h_h(2,2) = 1.d0
h_d = h_h
s_h = 0.d0
s_h(1,1) = 1.d0
s_h(2,2) = 1.d0
s_d = s_h
!
v_d = 0.d0
e_d = 0.d0
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .false. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
CALL test%assert_close( e_h, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
v_d = 0.d0
e_d = 0.d0
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .true. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
CALL test%assert_close( e_h, [1.d0, 1.d0] )
CALL test%assert_close( RESHAPE(v_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(h_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
CALL test%assert_close( RESHAPE(s_h, [4]), [1.d0, 0.d0, 0.d0, 1.d0] )
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
! variables on device
complex(DP), device :: h_d(2,2)
complex(DP), device :: s_d(2,2)
complex(DP), device :: v_d(2,2)
real(DP), device :: e_d(2)
! variables on host
complex(DP) :: h_h(2,2)
complex(DP) :: s_h(2,2)
complex(DP) :: v_h(2,2)
real(DP) :: e_h(2)
complex(DP) :: s_save(2,2)
complex(DP) :: h_save(2,2)
!
h_h = 0.d0
h_h(1,1) = (1.d0, 0.d0)
h_h(1,2) = (0.d0, -2.d0)
h_h(2,1) = ( 0.d0, 2.d0)
h_h(2,2) = ( 5.d0, 0.d0)
s_h = 0.d0
s_h(1,1) = (1.d0, 0.d0)
s_h(2,2) = (1.d0, 0.d0)
!
! save for later comparison
h_save = h_h
s_save = s_h
!
! Update device
h_d = h_h
s_d = s_h
!
v_h = (0.d0, 0.d0)
e_h = 0.d0
v_d = v_h; e_d = e_h
!
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .false. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
! 0.1715728752538099, 5.82842712474619
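! (quick check: trace = 6 and det = 1*5 - (-2i)*(2i) = 1, hence
!  lambda = 3 -+ sqrt(3**2 - 1) = 3 -+ 2*sqrt(2), matching the values above)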
CALL test%assert_close( e_h, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v_h(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v_h(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h_h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s_h, [4]), RESHAPE(s_save, [4]))
!
v_h = (0.d0, 0.d0)
e_h = 0.d0
!
! Update device
h_d = h_h
s_d = s_h
v_d = v_h; e_d = e_h
CALL diaghg( 2, 2, h_d, s_d, 2, e_d, v_d, .true. )
v_h = v_d
e_h = e_d
h_h = h_d
s_h = s_d
!
CALL test%assert_close( e_h, [0.1715728752538099d0, 5.82842712474619d0] )
CALL test%assert_close( v_h(:,1), [( 0.d0, -0.9238795325112867d0), (-0.3826834323650898d0, 0.d0)] )
CALL test%assert_close( v_h(:,2), [( 0.d0, -0.3826834323650898d0), ( 0.9238795325112867d0, 0.d0)] )
CALL test%assert_close( RESHAPE(h_h, [4]), RESHAPE(h_save, [4]))
CALL test%assert_close( RESHAPE(s_h, [4]), RESHAPE(s_save, [4]))
!
END SUBROUTINE complex_1
end program test_diaghg_gpu
#else
program test_diaghg_gpu
end program test_diaghg_gpu
#endif


@ -0,0 +1,151 @@
#if defined(__CUDA)
program test_diaghg_gpu_2
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE complex_1(test)
USE LAXlib
USE la_param, ONLY : DP
USE cudafor
implicit none
!
TYPE(tester_t) :: test
!
integer, parameter :: m_size=1024
integer :: i
complex(DP) :: h(m_size,m_size)
complex(DP), DEVICE :: h_d(m_size,m_size)
complex(DP) :: h_save(m_size,m_size)
complex(DP) :: s(m_size,m_size)
complex(DP), DEVICE :: s_d(m_size,m_size)
complex(DP) :: s_save(m_size,m_size)
real(DP) :: e(m_size), e_(m_size)
real(DP), DEVICE :: e_d(m_size)
complex(DP) :: v(m_size,m_size)
complex(DP), DEVICE :: v_d(m_size,m_size)
real(DP) :: e_save(m_size)
complex(DP) :: v_save(m_size,m_size)
!
CALL hermitian(m_size, h)
CALL hermitian(m_size, s)
!
h_save = h
s_save = s
h_d = h ! copy H and S to device
s_d = s ! <----------|
!
v = (0.d0, 0.d0)
e = 0.d0
v_d = (0.d0, 0.d0)
e_d = 0.d0
!
! 1. Compare same algorithm starting from data on device ...
CALL diaghg( m_size, m_size-1, h_d, s_d, m_size, e_d, v_d )
e_save = e_d
v_save = v_d
! 2. ... and on the host, this will trigger the same subroutine used above
CALL diaghg( m_size, m_size-1, h, s, m_size, e, v, .true. )
!
CALL test%assert_close( RESHAPE(h, [m_size*m_size]), RESHAPE(h_save, [m_size*m_size]))
CALL test%assert_close( RESHAPE(s, [m_size*m_size]), RESHAPE(s_save, [m_size*m_size]))
test%tolerance32=1.d-5
test%tolerance64=1.d-10
DO i=1, m_size-1
CALL test%assert_close( v(1:m_size,i), v_save(1:m_size,i))
CALL test%assert_close( e(i), e_save(i) )
END DO
!
! reset data
h = h_save
s = s_save
v = (0.d0, 0.d0)
e = 0.d0
!
! 3. repeat the same task but with CPU subroutine now.
! note that it uses a different algorithm and produces slightly
! different eigenvectors.
!
CALL diaghg( m_size, m_size-1, h, s, m_size, e, v )
h = h_save; s = s_save
!
! Solve again with the same algorithm used in the GPU version;
! this is needed to compare eigenvectors.
CALL solve_with_zhegvd(m_size, h, s, m_size, e_)
!
test%tolerance32=1.d-5
test%tolerance64=1.d-10
DO i=1, m_size-1
! compare eigenvectors obtained in 1. with LAPACK zhegvd
CALL test%assert_close( v_save(1:m_size,i), h(1:m_size,i) )
! compare eigenvalues obtained with zhegvd, 1. and 3.
CALL test%assert_close( e(i), e_save(i) )
CALL test%assert_close( e_(i), e_save(i) )
END DO
!
END SUBROUTINE complex_1
!
SUBROUTINE hermitian(mSize, M)
USE la_param, ONLY : DP
IMPLICIT NONE
integer, intent(in) :: msize
complex(dp), intent(out) :: M(:,:)
!
real(dp), allocatable :: rnd(:)
complex(dp), allocatable :: tmp(:,:)
INTEGER :: h, k, j
!
ALLOCATE(rnd(mSize*(mSize+1)))
CALL RANDOM_NUMBER(rnd)
rnd = 1.d0*rnd - 5.d-1
!
M = (0.d0, 0.d0)
j = 1
DO k=1,mSize
DO h=1,mSize
IF(h>k) THEN
M(h,k) = CMPLX(rnd(j), rnd(j+1), kind=DP) ! kind=DP avoids silent loss of precision
M(k,h) = CONJG(M(h,k))
j=j+2;
ELSE IF(k == h) THEN
M(k,h) = CMPLX(mSize, 0.d0, kind=DP)
END IF
END DO
END DO
!
END SUBROUTINE hermitian
!
end program test_diaghg_gpu_2
#else
program test_diaghg_gpu_2
end program test_diaghg_gpu_2
#endif


@ -0,0 +1,283 @@
#ifdef __CUDA
program test_diaghg_gpu_3
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
real(DP), allocatable :: h(:,:)
real(DP), allocatable :: h_save(:,:)
real(DP), allocatable :: s(:,:)
real(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
real(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
real(DP), allocatable :: v_save(:,:)
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
real(DP), allocatable :: v_dc(:,:) !
! device copies
real(DP), allocatable, device :: h_d(:,:)
real(DP), allocatable, device :: s_d(:,:)
real(DP), allocatable, device :: e_d(:)
real(DP), allocatable, device :: v_d(:,:)
!
!
character(len=20) :: inputs(2)
integer :: i, j, info
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
test%tolerance64=1.d-6 ! <- check this
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
test%tolerance64=1.d-8 ! <- check this
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
ALLOCATE(v_dc, SOURCE=h_save)
ALLOCATE(e_dc(n))
s = s_save
CALL solve_with_dsygvd(n, v_dc, s, ldh, e_dc)
s = s_save
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
! GPU data & subroutines
!
v = (0.d0, 0.d0)
e = 0.d0
s = s_save
h = h_save
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
!
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .false. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v_d = (0.d0, 0.d0)
e_d = 0.d0
s_d = s_save
h_d = h_save
!
! Start from data on the GPU and diagonalize on the CPU
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .true. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
!
test%tolerance64=1.d-6 ! <- check this
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
test%tolerance64=1.d-8 ! <- check this
!
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h_d, s_d, e_d, v_d)
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, v_dc, e_dc)
END DO
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:)
complex(DP), allocatable :: h_save(:,:)
complex(DP), allocatable :: s(:,:)
complex(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
complex(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
complex(DP), allocatable :: v_save(:,:)
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
complex(DP), allocatable :: v_dc(:,:) !
! device copies
complex(DP), allocatable, device :: h_d(:,:)
complex(DP), allocatable, device :: s_d(:,:)
real(DP), allocatable, device :: e_d(:)
complex(DP), allocatable, device :: v_d(:,:)
!
!
character(len=20) :: inputs(4)
integer :: i, j, info
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
ALLOCATE(e_dc(n))
ALLOCATE(v_dc, SOURCE=h)
!
h_save = h
s_save = s
!
! == Check CPU interface without and with offloading ==
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .false. )
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_save(1:n, j) )
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
h = h_save; s = s_save
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, .true. )
!
! N.B.: GPU eigensolver uses a different algorithm: zhegvd
s = s_save; e_dc = 0.d0
CALL solve_with_zhegvd(n, v_dc, s, ldh, e_dc)
!
DO j = 1, m
CALL test%assert_close( v(1:n, j), v_dc(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
! GPU data & subroutines
!
v = (0.d0, 0.d0)
e = 0.d0
s = s_save
h = h_save
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
!
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .false. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v_d = (0.d0, 0.d0)
e_d = 0.d0
s_d = s_save
h_d = h_save
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, .true. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h_d, s_d, e_d, v_d)
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
DEALLOCATE(e_dc, v_dc)
END DO
!
END SUBROUTINE complex_1
!
end program test_diaghg_gpu_3
#else
program test_diaghg_gpu_3
end program test_diaghg_gpu_3
#endif


@ -0,0 +1,345 @@
! This test uses the internal parallel diagonalization algorithm of LAXlib
! (GPU interface) to solve the problems stored in binary files:
!
! - ZnOG1.bin
! - ZnOG2.bin
! - ZnOK1.bin
! - ZnOK2.bin
! - SiGeK1.bin
! - SiGeK2.bin
!
! If the ScaLAPACK or ELPA driver is used, the test is skipped.
!
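! For reference, each *.bin file is a sequential unformatted file whose records
! are, in order: n, m, ldh, then the arrays h(ldh,n), s(ldh,n), e(n), v(ldh,m),
! as read by read_problem in the test_io module.
!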
#if ( ! defined(__SCALAPACK) ) && defined(__CUDA)
program test_diaghg_gpu_4
#if defined(__MPI)
USE MPI
#endif
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL parallel_real_1(test)
!
CALL parallel_complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE parallel_real_1(test)
USE cudafor
USE mp_world, ONLY : mpime
USE LAXlib
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
TYPE(la_descriptor) :: desc
integer :: ldh, n, m
real(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
real(DP), allocatable, device :: hdst_d(:,:) !< distributed Hpsi on device
real(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
real(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
real(DP), allocatable, device :: sdst_d(:,:) !< distributed Spsi on device
real(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
real(DP), allocatable, device :: e_d(:) !< full set of eigenvalues
real(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
real(DP), allocatable, device :: vdst_d(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
real(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
!
character(len=20) :: inputs(2)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO l=1, SIZE(inputs)
!
! Read the problem from the specified input file.
! This will also allocate h, s, e and v.
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 ) ! selects processors involved in the diagonalization
nrdst = desc%nrcx ! stores the square distributed matrix size
IF (.not. la_proc) nrdst = 1 ! dummy value to avoid zero-sized allocations
!
v = 0.d0
e = 0.d0
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block of lambda
DO i = 1, desc%nr ! number of rows in the local block of lambda
ii = i + desc%ir - 1 ! global index of row i (desc%ir is the global index of the first local row)
jj = j + desc%ic - 1 ! global index of column j (desc%ic is the global index of the first local column)
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
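!
! Illustrative example: for n = 4 on a 2x2 grid with nrcx = 2, the task whose
! block starts at (ir, ic) = (3, 1) owns global rows 3:4 and columns 1:2,
! so hdst(1,1) = h(3,1) and hdst(2,2) = h(4,2).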
!
ALLOCATE(hdst_d, SOURCE=hdst)
ALLOCATE(sdst_d, SOURCE=sdst)
ALLOCATE(vdst_d( nrdst , nrdst ), e_d(n))
e(1:n) = 0.d0
e_d(1:n) = 0.d0
!
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .false. )
!
e(1:n) = e_d
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
e(1:n) = 0.d0
e_d(1:n) = 0.d0
!
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .true. )
!
e(1:n) = e_d
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
DEALLOCATE(e_d, hdst_d, sdst_d, vdst_d)
END DO
!
END SUBROUTINE parallel_real_1
!
SUBROUTINE parallel_complex_1(test)
USE cudafor
USE mp_world, ONLY : mpime
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:), hdst(:,:) !< full and distributed Hpsi
complex(DP), allocatable, device :: hdst_d(:,:) !< distributed Hpsi on device
complex(DP), allocatable :: h_save(:,:) !< full Hpsi, used to check consistence across calls
complex(DP), allocatable :: s(:,:), sdst(:,:) !< full and distributed Spsi
complex(DP), allocatable, device :: sdst_d(:,:) !< distributed Spsi on device
complex(DP), allocatable :: s_save(:,:) !< full Spsi, used to check consistence across calls
real(DP), allocatable :: e(:) !< full set of eigenvalues
real(DP), allocatable, device :: e_d(:) !< full set of eigenvalues
complex(DP), allocatable :: v(:,:), vdst(:,:) !< full and distributed eigenvectors
complex(DP), allocatable, device :: vdst_d(:,:) !< full and distributed eigenvectors
real(DP), allocatable :: e_save(:) !< full set of eigenvalues, used for checks
complex(DP), allocatable :: v_save(:,:) !< full set of eigenvectors, used for checks
TYPE(la_descriptor) :: desc
!
character(len=20) :: inputs(4)
integer :: l, i, j, ii, jj, info, nrdst
logical :: la_proc
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO l=1, SIZE(inputs)
!
! Read the problem from the specified input file.
! This will also allocate h, s, e and v.
CALL read_problem(inputs(l), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(l), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
CALL init_parallel_diag(desc, n)
!
la_proc = ( desc%active_node > 0 )
nrdst = desc%nrcx
IF (.not. la_proc) nrdst = 1
!
v = (0.d0, 0.d0)
e = 0.d0
!
ALLOCATE( hdst( nrdst , nrdst ), STAT=info )
ALLOCATE( sdst( nrdst , nrdst ), STAT=info )
ALLOCATE( vdst( nrdst , nrdst ), STAT=info )
!
IF (la_proc) THEN
DO j = 1, desc%nc ! number of columns in the local block of lambda
DO i = 1, desc%nr ! number of rows in the local block of lambda
ii = i + desc%ir - 1 ! global index of row i (desc%ir is the global index of the first local row)
jj = j + desc%ic - 1 ! global index of column j (desc%ic is the global index of the first local column)
hdst(i, j) = h(ii, jj)
sdst(i, j) = s(ii, jj)
END DO
END DO
END IF
!
ALLOCATE(hdst_d, SOURCE=hdst)
ALLOCATE(sdst_d, SOURCE=sdst)
ALLOCATE(vdst_d( nrdst , nrdst ), e_d(n))
!
e_d(1:n) = 0.d0
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .false. )
e = e_d
!
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
e_d(1:n) = 0.d0
CALL pdiaghg( n, hdst_d, sdst_d, nrdst, e_d, vdst_d, desc, .true. )
!
e = e_d
DO j = 1, m
!CALL test%assert_close( v(1:n, j), v_save(1:n, j))
END DO
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, hdst, sdst, vdst)
DEALLOCATE(e_d, hdst_d, sdst_d, vdst_d)
END DO
!
END SUBROUTINE parallel_complex_1
!
SUBROUTINE init_parallel_diag(desc, n)
USE mp_world, ONLY : mpime, nproc, world_comm
USE mp_diag, ONLY : ortho_parent_comm
USE descriptors, ONLY : la_descriptor, descla_init, descla_local_dims
USE LAXlib
USE la_param, ONLY : DP
implicit none
!
TYPE(la_descriptor) :: desc
INTEGER :: n ! global dimension of the matrix
!
INTEGER :: ierr = 0
INTEGER :: color, key
!
INTEGER :: np_ortho(2) = 1 ! size of the processor grid used in ortho
INTEGER :: me_ortho(2) = 0 ! coordinates of the processors
INTEGER :: me_ortho1 = 0 ! task id for the ortho group
INTEGER :: nproc_ortho = 1 ! size of the ortho group:
INTEGER :: ortho_comm = 0 ! communicator for the ortho group
INTEGER :: ortho_row_comm = 0 ! communicator for the ortho row group
INTEGER :: ortho_col_comm = 0 ! communicator for the ortho col group
INTEGER :: ortho_comm_id = 0 ! id of the ortho_comm
!
ortho_parent_comm = world_comm
!
#if defined __MPI
!
CALL grid2d_dims( 'S', nproc, np_ortho(1), np_ortho(2) )
!
nproc_ortho = np_ortho(1) * np_ortho(2)
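!
! Illustrative example (assuming grid2d_dims('S', ...) returns the most nearly
! square grid that fits nproc): with nproc = 6 one gets np_ortho = (/ 2, 3 /)
! and nproc_ortho = 6, so every task joins the ortho group; with nproc = 5 a
! 2x2 grid leaves one task outside the group.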
!
! here we choose the first "nproc_ortho" processors
!
color = 0
IF( mpime < nproc_ortho ) color = 1
!
key = mpime
!
! initialize the communicator for the new group by splitting the input
! communicator
!
CALL mpi_comm_split( MPI_COMM_WORLD , color, key, ortho_comm, ierr )
!
! Compute the coordinates of the processors, in row-major order
!
CALL mpi_comm_rank( ortho_comm, me_ortho1, ierr)
!
IF( mpime == 0 .AND. me_ortho1 /= 0 ) &
CALL lax_error__( " init_ortho_group ", " wrong root task in ortho group ", ierr )
!
if( color == 1 ) then
! this task belongs to the ortho group: compute its coordinates
ortho_comm_id = 1
CALL GRID2D_COORDS( 'R', me_ortho1, np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2) )
CALL GRID2D_RANK( 'R', np_ortho(1), np_ortho(2), me_ortho(1), me_ortho(2), ierr )
IF( ierr /= me_ortho1 ) &
CALL lax_error__( " init_ortho_group ", " wrong task coordinates in ortho group ", ierr )
IF( me_ortho1 /= mpime ) &
CALL lax_error__( " init_ortho_group ", " wrong rank assignment in ortho group ", ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(2), me_ortho(1), ortho_col_comm, ierr )
CALL mpi_comm_split( ortho_comm , me_ortho(1), me_ortho(2), ortho_row_comm, ierr )
else
! this task does NOT belong to the ortho group: set dummy values
ortho_comm_id = 0
me_ortho(1) = me_ortho1
me_ortho(2) = me_ortho1
endif
#else
ortho_comm_id = 1
#endif
CALL descla_init( desc, n, n, np_ortho, me_ortho, ortho_comm, -1, ortho_comm_id )
END SUBROUTINE init_parallel_diag
end program test_diaghg_gpu_4
#else
program test_diaghg_gpu_4
end program test_diaghg_gpu_4
#endif


@ -0,0 +1,76 @@
!
SUBROUTINE solve_with_zhegvd(n, v, s, ldh, e)
USE la_param, ONLY : DP
IMPLICIT NONE
!
complex(DP) :: v(ldh,n)
complex(DP) :: s(ldh,n)
real(DP) :: e(n)
INTEGER :: n
!
INTEGER :: lwork, lrwork, liwork, info, ldh
!
REAL(DP) :: abstol
INTEGER, ALLOCATABLE :: iwork(:), ifail(:)
REAL(DP), ALLOCATABLE :: rwork(:)
COMPLEX(DP), ALLOCATABLE :: work(:)
! various work space
!
ALLOCATE(work(1), rwork(1), iwork(1))
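! Workspace query (standard LAPACK convention): calling ZHEGVD with
! lwork = lrwork = liwork = -1 performs no diagonalization and returns the
! optimal workspace sizes in work(1), rwork(1) and iwork(1).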
CALL ZHEGVD( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, -1, rwork, -1, iwork, -1, info )
IF (info /= 0) print *, "Workspace not computed!"
lwork = INT(work(1))
lrwork = INT(rwork(1))
liwork = iwork(1)
DEALLOCATE(work, rwork, iwork)
ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))
CALL ZHEGVD( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, lwork, rwork, lrwork, iwork, liwork, info )
DEALLOCATE(work, rwork, iwork)
!
END SUBROUTINE solve_with_zhegvd
!
SUBROUTINE solve_with_dsygvd(n, v, s, ldh, e)
USE la_param, ONLY : DP
IMPLICIT NONE
!
REAL(DP) :: v(ldh,n)
REAL(DP) :: s(ldh,n)
real(DP) :: e(n)
INTEGER :: n
!
INTEGER :: lwork, liwork, info, ldh
!
REAL(DP) :: abstol
INTEGER, ALLOCATABLE :: iwork(:), ifail(:)
REAL(DP), ALLOCATABLE :: work(:)
! various work space
!
ALLOCATE(work(1), iwork(1))
CALL dsygvd( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, -1, iwork, -1, info )
IF (info /= 0) print *, "Workspace not computed!"
lwork = INT(work(1))
liwork = iwork(1)
DEALLOCATE(work, iwork)
ALLOCATE(work(lwork), iwork(liwork))
!
CALL dsygvd( 1, 'V', 'U', n, v, ldh, &
s, ldh, e, work, lwork, iwork, liwork, info )
!
DEALLOCATE(work, iwork)
!
END SUBROUTINE solve_with_dsygvd
!
! TODO: add check for the eigenvalue problem

LAXlib/tests/test_io.f90 Normal file

@ -0,0 +1,104 @@
MODULE test_io
!
IMPLICIT NONE
!
INTERFACE read_problem
MODULE PROCEDURE read_cmplx_problem, read_real_problem
END INTERFACE
!
CONTAINS
!
SUBROUTINE read_cmplx_problem(fname, ldh, n, m, h, s, e, v, info)
USE la_param, ONLY : DP
IMPLICIT NONE
character(len=*), intent(in) :: fname
integer, intent(out) :: ldh, n, m
complex(dp), allocatable, intent(inout) :: h(:,:)
complex(dp), allocatable, intent(inout) :: s(:,:)
real(dp), allocatable, intent(inout) :: e(:)
complex(dp), allocatable, intent(inout) :: v(:,:)
integer, intent(out) :: info
!
real(dp) :: aux1, aux2
integer :: i, j, t
logical :: exist
!
character(len=20):: fname_
!
info = 0
fname_ = trim(fname)
!
print *, "reading ", fname_
inquire(file=fname_, exist=exist)
!
if (.not. exist ) then
info=1
return
end if
!
open (unit=15, file=fname_, status='old', form='unformatted', action='read')
!
t = 0
read (15) n
read (15) m
read (15) ldh
!
t = t + 3
!
ALLOCATE(h(ldh, n), s(ldh, n), e(n), v(ldh, m))
!
READ(15) h
READ(15) s
READ(15) e
READ(15) v
!
close(15)
END SUBROUTINE read_cmplx_problem
!
SUBROUTINE read_real_problem(fname, ldh, n, m, h, s, e, v, info)
USE la_param, ONLY : DP
IMPLICIT NONE
character(len=*), intent(in) :: fname
integer, intent(out) :: ldh, n, m
real(dp), allocatable, intent(inout) :: h(:,:)
real(dp), allocatable, intent(inout) :: s(:,:)
real(dp), allocatable, intent(inout) :: e(:)
real(dp), allocatable, intent(inout) :: v(:,:)
integer, intent(out) :: info
!
real(dp) :: aux1, aux2
integer :: i, j, t
logical :: exist
!
character(len=20):: fname_
!
info = 0
fname_ = trim(fname)
!
print *, "reading ", fname_
inquire(file=fname_, exist=exist)
!
if (.not. exist ) then
info=1
return
end if
!
open (unit=15, file=fname_, status='old', form='unformatted', action='read')
!
t = 0
read (15) n
read (15) m
read (15) ldh
!
t = t + 3
!
ALLOCATE(h(ldh, n), s(ldh, n), e(n), v(ldh, m))
!
READ(15) h
READ(15) s
READ(15) e
READ(15) v
!
close(15)
END SUBROUTINE read_real_problem
END MODULE test_io

LAXlib/tests/tester.f90 Normal file

@ -0,0 +1,862 @@
! This file is part of fortran_tester
! Copyright 2015 Pierre de Buyl and Peter Colberg
! 2016 Pierre de Buyl and Stefano Szaghi
! 2018 Pierre de Buyl and Pietro Bonfa
! License: BSD
!> Routines to test Fortran programs
!!
!! fortran_tester is a pure-Fortran module. It provides a datatype to hold test results and
!! routines to test for equality, closeness, and positivity of variables. The routines are
!! overloaded and the resulting interface consists of a small number of names.
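!!
!! Minimal usage sketch (illustrative):
!!
!!   type(tester_t) :: test
!!   call test%init()
!!   call test%assert_equal(1, 1)
!!   call test%assert_close(1.0d0, 1.0d0)
!!   call test%print()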
module tester
use, intrinsic :: iso_fortran_env, only : int8, int16, int32, int64, real32, real64
implicit none
private
public :: tester_t
!> The main **tester** class.
type :: tester_t
integer(int32) :: n_errors=0_int32 !< Number of errors.
integer(int32) :: n_tests=0_int32 !< Number of tests.
real(real32) :: tolerance32=2._real32*epsilon(1._real32) !< Real tolerance, 32 bits.
real(real64) :: tolerance64=2._real64*epsilon(1._real64) !< Real tolerance, 64 bits.
contains
procedure :: init !< Initialize the tester.
procedure :: print !< Print tests results.
generic, public :: assert_equal => &
assert_equal_i8, &
assert_equal_i16, &
assert_equal_i32, &
assert_equal_i64, &
assert_equal_r32, &
assert_equal_r64, &
assert_equal_c32, &
assert_equal_c64, &
assert_equal_l, &
assert_equal_i8_1, &
assert_equal_i16_1, &
assert_equal_i32_1, &
assert_equal_i64_1, &
assert_equal_r32_1, &
assert_equal_r64_1, &
assert_equal_c32_1, &
assert_equal_c64_1, &
assert_equal_l_1 !< Check if two values (integer, real, complex or logical) are equal.
procedure, private :: assert_equal_i8 !< Check if two integers (8 bits) are equal.
procedure, private :: assert_equal_i16 !< Check if two integers (16 bits) are equal.
procedure, private :: assert_equal_i32 !< Check if two integers (32 bits) are equal.
procedure, private :: assert_equal_i64 !< Check if two integers (64 bits) are equal.
procedure, private :: assert_equal_r32 !< Check if two reals (32 bits) are equal.
procedure, private :: assert_equal_r64 !< Check if two reals (64 bits) are equal.
procedure, private :: assert_equal_c32 !< Check if two complex numbers (32 bits) are equal.
procedure, private :: assert_equal_c64 !< Check if two complex numbers (64 bits) are equal.
procedure, private :: assert_equal_l !< Check if two logicals are equal.
procedure, private :: assert_equal_i8_1 !< Check if two integer (8 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i16_1 !< Check if two integer (16 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i32_1 !< Check if two integer (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_i64_1 !< Check if two integer (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r32_1 !< Check if two real (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_r64_1 !< Check if two real (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are equal.
procedure, private :: assert_equal_l_1 !< Check if two logical arrays (rank 1) are equal.
generic, public :: assert_positive => &
assert_positive_i8, &
assert_positive_i16, &
assert_positive_i32, &
assert_positive_i64, &
assert_positive_r32, &
assert_positive_r64, &
assert_positive_i8_1, &
assert_positive_i16_1, &
assert_positive_i32_1, &
assert_positive_i64_1, &
assert_positive_r32_1, &
assert_positive_r64_1 !< Check if a number (integer or real) is positive.
procedure, private :: assert_positive_i8 !< Check if an integer (8 bits) is positive.
procedure, private :: assert_positive_i16 !< Check if an integer (16 bits) is positive.
procedure, private :: assert_positive_i32 !< Check if an integer (32 bits) is positive.
procedure, private :: assert_positive_i64 !< Check if an integer (64 bits) is positive.
procedure, private :: assert_positive_r32 !< Check if a real (32 bits) is positive.
procedure, private :: assert_positive_r64 !< Check if a real (64 bits) is positive.
procedure, private :: assert_positive_i8_1 !< Check if an integer (8 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i16_1 !< Check if an integer (16 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i32_1 !< Check if an integer (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_i64_1 !< Check if an integer (64 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r32_1 !< Check if a real (32 bits) array (rank 1) is positive.
procedure, private :: assert_positive_r64_1 !< Check if a real (64 bits) array (rank 1) is positive.
generic, public :: assert_close => &
assert_close_r32, &
assert_close_r64, &
assert_close_c32, &
assert_close_c64, &
assert_close_r32_1, &
assert_close_r64_1, &
assert_close_c32_1, &
assert_close_c64_1 !< Check if two values (real or complex) are close with respect to a tolerance.
procedure, private :: assert_close_r32 !< Check if two reals (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r64 !< Check if two reals (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c32 !< Check if two complex numbers (32 bits) are close with respect to a tolerance.
procedure, private :: assert_close_c64 !< Check if two complex numbers (64 bits) are close with respect to a tolerance.
procedure, private :: assert_close_r32_1 !< Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_r64_1 !< Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c32_1 !< Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
procedure, private :: assert_close_c64_1 !< Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
end type tester_t
contains
!> Initialize the tester.
subroutine init(this, tolerance32, tolerance64)
class(tester_t), intent(out) :: this !< The tester.
real(real32), intent(in), optional :: tolerance32 !< Real tolerance, 32 bits.
real(real64), intent(in), optional :: tolerance64 !< Real tolerance, 64 bits.
this% n_errors = 0
this% n_tests = 0
if (present(tolerance64)) then
this% tolerance64 = tolerance64
else
this% tolerance64 = 2._real64*epsilon(1._real64)
end if
if (present(tolerance32)) then
this% tolerance32 = tolerance32
else
this% tolerance32 = 2._real32*epsilon(1._real32)
end if
end subroutine init
!> Print tests results.
subroutine print(this, errorstop)
class(tester_t), intent(in) :: this !< The tester.
logical, intent(in), optional :: errorstop !< Flag to activate error stop if one test fails.
logical :: do_errorstop
if (present(errorstop)) then
do_errorstop = errorstop
else
do_errorstop = .true.
end if
write(*,*) 'fortran_tester:', this% n_errors, ' error(s) for', this% n_tests, 'test(s)'
if (this% n_errors == 0) then
write(*,*) 'fortran_tester: all tests succeeded'
else
write(*,*) 'fortran_tester: tests failed'
if (do_errorstop) then
stop 1
end if
end if
end subroutine print
!> Check if two integers (8 bits) are equal.
subroutine assert_equal_i8(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i1 !< Value to compare.
integer(int8), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i8
!> Check if two integers (16 bits) are equal.
subroutine assert_equal_i16(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i1 !< Value to compare.
integer(int16), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i16
!> Check if two integers (32 bits) are equal.
subroutine assert_equal_i32(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i1 !< Value to compare.
integer(int32), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i32
!> Check if two integers (64 bits) are equal.
subroutine assert_equal_i64(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i1 !< Value to compare.
integer(int64), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i1 .ne. i2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_i64
!> Check if two reals (32 bits) are equal.
subroutine assert_equal_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r32
!> Check if two reals (64 bits) are equal.
subroutine assert_equal_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r1 .ne. r2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_r64
!> Check if two complex numbers (32 bits) are equal.
subroutine assert_equal_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c32
!> Check if two complex numbers (64 bits) are equal.
subroutine assert_equal_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (c1 .ne. c2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_c64
!> Check if two logicals are equal.
subroutine assert_equal_l(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in) :: l1 !< Value to compare.
logical, intent(in) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (l1 .neqv. l2) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_equal_l
!> Check if two integer (8 bits) arrays (rank 1) are equal.
subroutine assert_equal_i8_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int8), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i8_1
!> Check if two integer (16 bits) arrays (rank 1) are equal.
subroutine assert_equal_i16_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int16), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i16_1
!> Check if two integer (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_i32_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int32), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i32_1
!> Check if two integer (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_i64_1(this, i1, i2, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i1 !< Value to compare.
integer(int64), dimension(:), intent(in) :: i2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(i1) .ne. size(i2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(i1-i2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_i64_1
!> Check if two real (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r1 !< Value to compare.
real(real32), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r32_1
!> Check if two real (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r1 !< Value to compare.
real(real64), dimension(:), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_r64_1
!> Check if two complex (32 bits) arrays (rank 1) are equal.
subroutine assert_equal_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real32), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are equal.
subroutine assert_equal_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), dimension(:), intent(in) :: c1 !< Value to compare.
complex(real64), dimension(:), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_equal_c64_1
!> Check if two logical arrays (rank 1) are equal.
subroutine assert_equal_l_1(this, l1, l2, fail)
class(tester_t), intent(inout) :: this !< The tester.
logical, intent(in), dimension(:) :: l1 !< Value to compare.
logical, intent(in), dimension(:) :: l2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
integer :: k
this% n_tests = this% n_tests + 1
if ( size(l1) .ne. size(l2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
do k = 1, size(l1)
if (l1(k) .neqv. l2(k)) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
exit
end if
end do
end if
end subroutine assert_equal_l_1
!> Check if an integer (8 bits) is positive.
subroutine assert_positive_i8(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8
!> Check if an integer (16 bits) is positive.
subroutine assert_positive_i16(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16
!> Check if an integer (32 bits) is positive.
subroutine assert_positive_i32(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32
!> Check if an integer (64 bits) is positive.
subroutine assert_positive_i64(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (i < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64
!> Check if a real (32 bits) is positive.
subroutine assert_positive_r32(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32
!> Check if a real (64 bits) is positive.
subroutine assert_positive_r64(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if (r < 0) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64
!> Check if an integer (8 bits) array (rank 1) is positive.
subroutine assert_positive_i8_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int8), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i8_1
!> Check if an integer (16 bits) array (rank 1) is positive.
subroutine assert_positive_i16_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int16), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i16_1
!> Check if an integer (32 bits) array (rank 1) is positive.
subroutine assert_positive_i32_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int32), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i32_1
!> Check if an integer (64 bits) array (rank 1) is positive.
subroutine assert_positive_i64_1(this, i, fail)
class(tester_t), intent(inout) :: this !< The tester.
integer(int64), dimension(:), intent(in) :: i !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(i) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_i64_1
!> Check if a real (32 bits) array (rank 1) is positive.
subroutine assert_positive_r32_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r32_1
!> Check if a real (64 bits) array (rank 1) is positive.
subroutine assert_positive_r64_1(this, r, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), dimension(:), intent(in) :: r !< Value to check.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( minval(r) < 0 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_positive_r64_1
!> Check if two reals (32 bits) are close with respect to a tolerance.
subroutine assert_close_r32(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in) :: r1 !< Value to compare.
real(real32), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r32
!> Check if two reals (64 bits) are close with respect to a tolerance.
subroutine assert_close_r64(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in) :: r1 !< Value to compare.
real(real64), intent(in) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(r1-r2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_r64
!> Check if two real (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r32_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real32), intent(in), dimension(:) :: r1 !< Value to compare.
real(real32), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r32_1
!> Check if two real (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_r64_1(this, r1, r2, fail)
class(tester_t), intent(inout) :: this !< The tester.
real(real64), intent(in), dimension(:) :: r1 !< Value to compare.
real(real64), intent(in), dimension(:) :: r2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(r1) .ne. size(r2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(r1-r2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_r64_1
!> Check if two complex numbers (32 bits) are close with respect to a tolerance.
subroutine assert_close_c32(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in) :: c1 !< Value to compare.
complex(real32), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c32
!> Check if two complex numbers (64 bits) are close with respect to a tolerance.
subroutine assert_close_c64(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in) :: c1 !< Value to compare.
complex(real64), intent(in) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( abs(c1-c2) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end subroutine assert_close_c64
!> Check if two complex (32 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c32_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real32), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real32), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance32 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c32_1
!> Check if two complex (64 bits) arrays (rank 1) are close with respect to a tolerance.
subroutine assert_close_c64_1(this, c1, c2, fail)
class(tester_t), intent(inout) :: this !< The tester.
complex(real64), intent(in), dimension(:) :: c1 !< Value to compare.
complex(real64), intent(in), dimension(:) :: c2 !< Value to compare.
logical, intent(in), optional :: fail !< Fail flag.
this% n_tests = this% n_tests + 1
if ( size(c1) .ne. size(c2) ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
else
if ( maxval(abs(c1-c2)) > this% tolerance64 ) then
if (.not. present(fail) .or. (present(fail) .and. fail .eqv. .false.)) then
this% n_errors = this% n_errors + 1
end if
end if
end if
end subroutine assert_close_c64_1
end module tester

LAXlib/tests/utils.f90 Normal file

@ -0,0 +1,53 @@
SUBROUTINE collect_results(test)
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!
TYPE(tester_t) :: test
INTEGER :: itottests, itoterr, ierr, me
!
#if defined(__MPI)
!
CALL MPI_REDUCE(test%n_errors, itoterr, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_REDUCE(test%n_tests, itottests, 1, MPI_INTEGER, MPI_SUM, &
0, MPI_COMM_WORLD, ierr)
! Fail in case MPI fails...
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
test%n_tests = itottests
test%n_errors = itoterr
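!
! Note: MPI_REDUCE defines the reduced totals only on rank 0, so the counters
! on the other ranks are not meaningful from here on; only rank 0 prints them.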
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
CALL MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
!
IF (ierr /= 0) CALL test%assert_equal(0, ierr)
!
#endif
END SUBROUTINE collect_results
SUBROUTINE no_test
#if defined(__MPI)
USE mpi
#endif
USE tester
IMPLICIT NONE
!TYPE(tester_t) :: test
INTEGER :: ierr
!
#if defined(__MPI)
CALL MPI_Init(ierr)
#endif
!CALL test%init()
!CALL print_results(test)
#if defined(__MPI)
CALL mpi_finalize(ierr)
#endif
END SUBROUTINE no_test


@ -141,6 +141,8 @@ w0gauss.o \
w1gauss.o \
deviatoric.o
-include make.gpu
TLDEPS= libfox libutil libla libfft librxc
all : libqemod.a

Modules/becmod_gpu.f90 Normal file

@ -0,0 +1,387 @@
!
! Copyright (C) 2002-2011 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define DIMS1D(my_array) lbound(my_array,1):ubound(my_array,1)
#define DIMS2D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2)
#define DIMS3D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2),lbound(my_array,3):ubound(my_array,3)
#define DIMS4D(my_array) lbound(my_array,1):ubound(my_array,1),lbound(my_array,2):ubound(my_array,2),lbound(my_array,3):ubound(my_array,3),lbound(my_array,4):ubound(my_array,4)
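! These macros expand to explicit bounds taken from the source array, so a
! device mirror allocated with them keeps the same index ranges as its host
! counterpart (used instead of MOLD=, which not all supported compilers accept).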
!=----------------------------------------------------------------------------=!
MODULE becmod_gpum
!=----------------------------------------------------------------------------=!
#if defined(__CUDA)
USE cudafor
#endif
IMPLICIT NONE
SAVE
INTEGER, PARAMETER :: DP = selected_real_kind(14,200)
INTEGER, PARAMETER :: sgl = selected_real_kind(6,30)
INTEGER, PARAMETER :: i4b = selected_int_kind(9)
INTEGER, PARAMETER :: i8b = selected_int_kind(18)
#if defined(__DEBUG)
INTEGER :: iverbosity = 1
#else
INTEGER :: iverbosity = 0
#endif
!
TYPE bec_type_d
#if defined(__CUDA)
REAL(DP), ALLOCATABLE, DEVICE :: r_d(:, :)
#else
REAL(DP), ALLOCATABLE :: r_d(:, :)
#endif
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: k_d(:, :)
#else
COMPLEX(DP), ALLOCATABLE :: k_d(:, :)
#endif
#if defined(__CUDA)
COMPLEX(DP), ALLOCATABLE, DEVICE :: nc_d(:, :, :)
#else
COMPLEX(DP), ALLOCATABLE :: nc_d(:, :, :)
#endif
INTEGER :: comm
INTEGER :: nbnd
INTEGER :: nproc
INTEGER :: mype
INTEGER :: nbnd_loc
INTEGER :: ibnd_begin
END TYPE bec_type_d
!
TYPE (bec_type_d), TARGET :: becp_d ! <beta|psi>
!
LOGICAL :: becp_r_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_r_d_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_k_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_k_d_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_nc_ood = .false. ! used to flag out of date variables
LOGICAL :: becp_d_nc_d_ood = .false. ! used to flag out of date variables
!
CONTAINS
!
SUBROUTINE using_becp_r(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
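!
! Typical call pattern (illustrative): CALL using_becp_r(0) before reading
! becp%r on the host, CALL using_becp_r(2) before overwriting it completely;
! using_becp_r_d plays the symmetric role for the device copy becp_d%r_d.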
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (becp_r_ood) THEN
IF (.not. allocated(becp_d%r_d)) THEN
CALL errore('using_r_d', 'PANIC: sync of becp%r from becp_d%r_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%r)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%r with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%r D->H"
becp%r = becp_d%r_d
END IF
! ALWAYS update auxiliary variables
!IF ( becp%comm /= becp_d%comm ) &
! print *, "WARNING: auxiliary variable becp%comm changed"
becp%comm = becp_d%comm
!IF ( becp%nbnd /= becp_d%nbnd ) &
! print *, "WARNING: auxiliary variable becp%nbnd changed"
becp%nbnd = becp_d%nbnd
!IF ( becp%nproc /= becp_d%nproc ) &
! print *, "WARNING: auxiliary variable becp%nproc changed"
becp%nproc = becp_d%nproc
!IF ( becp%mype /= becp_d%mype ) &
! print *, "WARNING: auxiliary variable becp%mype changed"
becp%mype = becp_d%mype
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
becp%nbnd_loc = becp_d%nbnd_loc
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
becp%ibnd_begin = becp_d%ibnd_begin
!
becp_r_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_r_d_ood = .true.
#endif
END SUBROUTINE using_becp_r
!
SUBROUTINE using_becp_r_d(intento, debug_info)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (.not. allocated(becp%r)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%r_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%r_d)) DEALLOCATE(becp_d%r_d)
becp_d_r_d_ood = .false.
RETURN
END IF
! here we know that r is allocated, check if size is 0
IF ( SIZE(becp%r) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%r_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_r_d_ood) THEN
IF ( allocated(becp_d%r_d) .and. (SIZE(becp_d%r_d)/=SIZE(becp%r))) deallocate(becp_d%r_d)
IF (.not. allocated(becp_d%r_d)) ALLOCATE(becp_d%r_d(DIMS2D(becp%r))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%r H->D"
becp_d%r_d = becp%r
END IF
! ALWAYS update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
becp_d_r_d_ood = .false.
ENDIF
IF (intento > 0) becp_r_ood = .true.
#else
CALL errore('using_becp_d%r_d', 'Trying to use device data without device-compiled code!', 1)
#endif
END SUBROUTINE using_becp_r_d
!
SUBROUTINE using_becp_k(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (becp_k_ood) THEN
IF (.not. allocated(becp_d%k_d)) THEN
CALL errore('using_k_d', 'PANIC: sync of becp%k from becp_d%k_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%k)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%k with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%k D->H"
becp%k = becp_d%k_d
END IF
! ALWAYS update auxiliary variables
!IF ( becp%comm /= becp_d%comm ) &
! print *, "WARNING: auxiliary variable becp%comm changed"
becp%comm = becp_d%comm
!IF ( becp%nbnd /= becp_d%nbnd ) &
! print *, "WARNING: auxiliary variable becp%nbnd changed"
becp%nbnd = becp_d%nbnd
!IF ( becp%nproc /= becp_d%nproc ) &
! print *, "WARNING: auxiliary variable becp%nproc changed"
becp%nproc = becp_d%nproc
!IF ( becp%mype /= becp_d%mype ) &
! print *, "WARNING: auxiliary variable becp%mype changed"
becp%mype = becp_d%mype
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
becp%nbnd_loc = becp_d%nbnd_loc
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
becp%ibnd_begin = becp_d%ibnd_begin
!
becp_k_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_k_d_ood = .true.
#endif
END SUBROUTINE using_becp_k
!
SUBROUTINE using_becp_k_d(intento, debug_info)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (.not. allocated(becp%k)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%k_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%k_d)) DEALLOCATE(becp_d%k_d)
becp_d_k_d_ood = .false.
RETURN
END IF
! here we know that k is allocated, check if size is 0
IF ( SIZE(becp%k) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%k_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_k_d_ood) THEN
IF ( allocated(becp_d%k_d) .and. (SIZE(becp_d%k_d)/=SIZE(becp%k))) deallocate(becp_d%k_d)
IF (.not. allocated(becp_d%k_d)) ALLOCATE(becp_d%k_d(DIMS2D(becp%k))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%k H->D"
becp_d%k_d = becp%k
END IF
! ALWAYS update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
becp_d_k_d_ood = .false.
ENDIF
IF (intento > 0) becp_k_ood = .true.
#else
CALL errore('using_becp_d%k_d', 'Trying to use device data without device-compiled code!', 1)
#endif
END SUBROUTINE using_becp_k_d
!
SUBROUTINE using_becp_nc(intento, debug_info)
!
! intento is used to specify what the variable will be used for :
! 0 -> in , the variable needs to be synchronized but won't be changed
! 1 -> inout , the variable needs to be synchronized AND will be changed
! 2 -> out , NO NEED to synchronize the variable, everything will be overwritten
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
INTEGER :: intento_
intento_ = intento
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (becp_nc_ood) THEN
IF (.not. allocated(becp_d%nc_d)) THEN
CALL errore('using_nc_d', 'PANIC: sync of becp%nc from becp_d%nc_d with unallocated array. Bye!!', 1)
stop
END IF
IF (.not. allocated(becp%nc)) THEN
IF (intento_ /= 2) THEN
print *, "WARNING: sync of becp%nc with unallocated array and intento /= 2? Changed to 2!"
intento_ = 2
END IF
END IF
IF (intento_ < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%nc D->H"
becp%nc = becp_d%nc_d
END IF
! ALWAYS update auxiliary variables
!IF ( becp%comm /= becp_d%comm ) &
! print *, "WARNING: auxiliary variable becp%comm changed"
becp%comm = becp_d%comm
!IF ( becp%nbnd /= becp_d%nbnd ) &
! print *, "WARNING: auxiliary variable becp%nbnd changed"
becp%nbnd = becp_d%nbnd
!IF ( becp%nproc /= becp_d%nproc ) &
! print *, "WARNING: auxiliary variable becp%nproc changed"
becp%nproc = becp_d%nproc
!IF ( becp%mype /= becp_d%mype ) &
! print *, "WARNING: auxiliary variable becp%mype changed"
becp%mype = becp_d%mype
!IF ( becp%nbnd_loc /= becp_d%nbnd_loc ) &
! print *, "WARNING: auxiliary variable becp%nbnd_loc changed"
becp%nbnd_loc = becp_d%nbnd_loc
!IF ( becp%ibnd_begin /= becp_d%ibnd_begin ) &
! print *, "WARNING: auxiliary variable becp%ibnd_begin changed"
becp%ibnd_begin = becp_d%ibnd_begin
!
becp_nc_ood = .false.
ENDIF
IF (intento_ > 0) becp_d_nc_d_ood = .true.
#endif
END SUBROUTINE using_becp_nc
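  !
  ! A minimal, hypothetical usage sketch of the intento protocol documented
  ! above; the routine name and the host-side update are illustrative only,
  ! not part of the original module.
  SUBROUTINE example_becp_nc_host_update()
     USE kinds,  ONLY : DP
     USE becmod, ONLY : becp
     IMPLICIT NONE
     ! intento = 1 (inout): refresh the host copy if stale and mark the
     ! device copy out of date, since the host data is modified below.
     CALL using_becp_nc(1, 'example_becp_nc_host_update')
     IF ( allocated(becp%nc) ) becp%nc = becp%nc * (0.5_DP, 0.0_DP)
  END SUBROUTINE example_becp_nc_host_update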
!
SUBROUTINE using_becp_nc_d(intento, debug_info)
!
USE becmod, ONLY : becp
implicit none
INTEGER, INTENT(IN) :: intento
CHARACTER(len=*), INTENT(IN), OPTIONAL :: debug_info
!
#if defined(__CUDA) || defined(__CUDA_GNU)
!
IF (PRESENT(debug_info) ) print *, debug_info
!
IF (.not. allocated(becp%nc)) THEN
IF (intento /= 2) print *, "WARNING: sync of becp%nc_d with unallocated array and intento /= 2?"
IF (allocated(becp_d%nc_d)) DEALLOCATE(becp_d%nc_d)
becp_d_nc_d_ood = .false.
RETURN
END IF
! here we know that nc is allocated, check if size is 0
IF ( SIZE(becp%nc) == 0 ) THEN
print *, "Refusing to allocate 0 dimensional array becp_d%nc_d. If used, code will crash."
RETURN
END IF
!
IF (becp_d_nc_d_ood) THEN
IF ( allocated(becp_d%nc_d) .and. (SIZE(becp_d%nc_d)/=SIZE(becp%nc))) deallocate(becp_d%nc_d)
IF (.not. allocated(becp_d%nc_d)) ALLOCATE(becp_d%nc_d(DIMS3D(becp%nc))) ! MOLD does not work on all compilers
IF (intento < 2) THEN
IF ( iverbosity > 0 ) print *, "Really copied becp%nc H->D"
becp_d%nc_d = becp%nc
END IF
! ALWAYS update auxiliary variables
becp_d%comm = becp%comm
becp_d%nbnd = becp%nbnd
becp_d%nproc = becp%nproc
becp_d%mype = becp%mype
becp_d%nbnd_loc = becp%nbnd_loc
becp_d%ibnd_begin = becp%ibnd_begin
!
becp_d_nc_d_ood = .false.
ENDIF
IF (intento > 0) becp_nc_ood = .true.
#else
     CALL errore('using_becp_d%nc_d', 'Trying to use device data without device-compiled code!', 1)
#endif
END SUBROUTINE using_becp_nc_d
!
SUBROUTINE deallocate_becmod_gpu
IF( ALLOCATED( becp_d%r_d ) ) DEALLOCATE( becp_d%r_d )
IF( ALLOCATED( becp_d%k_d ) ) DEALLOCATE( becp_d%k_d )
IF( ALLOCATED( becp_d%nc_d ) ) DEALLOCATE( becp_d%nc_d )
END SUBROUTINE deallocate_becmod_gpu
!=----------------------------------------------------------------------------=!
END MODULE becmod_gpum
!=----------------------------------------------------------------------------=!

609
Modules/becmod_subs_gpu.f90 Normal file
View File

@ -0,0 +1,609 @@
!
! Copyright (C) 2001-2007 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
!
MODULE becmod_subs_gpum
! NOTA BENE : THE SUBROUTINES IN THIS FILE ARE ONLY PARTIALLY TESTED!
!
! ... *bec* contain <beta|psi> - used in h_psi, s_psi, many other places
! ... calbec( npw, beta, psi, betapsi [, nbnd ] ) is an interface calculating
! ... betapsi(i,j) = <beta(i)|psi(j)> (the sum is over npw components)
! ... or betapsi(i,s,j)= <beta(i)|psi(s,j)> (s=polarization index)
!
USE kinds, ONLY : DP
USE control_flags, ONLY : gamma_only, smallmem
USE gvect, ONLY : gstart
USE noncollin_module, ONLY : noncolin, npol
USE becmod_gpum, ONLY : bec_type_d
!
SAVE
!
PRIVATE
!
INTERFACE calbec_gpu
!
MODULE PROCEDURE calbec_k_gpu, calbec_gamma_gpu, calbec_gamma_nocomm_gpu, calbec_nc_gpu, calbec_bec_type_gpu
!
END INTERFACE
INTERFACE becscal_gpu
!
MODULE PROCEDURE becscal_nck_gpu, becscal_gamma_gpu
!
END INTERFACE
!
PUBLIC :: allocate_bec_type_gpu, deallocate_bec_type_gpu, calbec_gpu, &
beccopy_gpu, becscal_gpu, is_allocated_bec_type_gpu, &
synchronize_bec_type_gpu, &
using_becp_auto, using_becp_d_auto
!
CONTAINS
!-----------------------------------------------------------------------
SUBROUTINE calbec_bec_type_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
    !
USE mp_bands, ONLY: intra_bgrp_comm
USE mp, ONLY: mp_get_comm_null
!
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
TYPE (bec_type_d), TARGET, INTENT (inout) :: betapsi_d ! NB: must be INOUT otherwise
                                                            ! the allocated array is lost
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d
#endif
!
INTEGER :: local_nbnd
INTEGER, EXTERNAL :: ldim_block, gind_block
INTEGER :: m_loc, m_begin, ip
REAL(DP), ALLOCATABLE :: dtmp_d(:,:) ! replace this with buffers !
INTEGER :: i, j, nkb
REAL(DP), POINTER :: betapsi_d_r_d(:,:)
#if defined(__CUDA)
attributes(DEVICE) :: dtmp_d, betapsi_d_r_d
#endif
!
IF ( present (nbnd) ) THEN
local_nbnd = nbnd
ELSE
local_nbnd = size ( psi_d, 2)
ENDIF
IF ( gamma_only ) THEN
!
IF( betapsi_d%comm == mp_get_comm_null() ) THEN
!
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d%r_d, local_nbnd, intra_bgrp_comm )
!
ELSE
!
ALLOCATE( dtmp_d( SIZE( betapsi_d%r_d, 1 ), SIZE( betapsi_d%r_d, 2 ) ) )
!
DO ip = 0, betapsi_d%nproc - 1
m_loc = ldim_block( betapsi_d%nbnd , betapsi_d%nproc, ip )
m_begin = gind_block( 1, betapsi_d%nbnd, betapsi_d%nproc, ip )
IF( ( m_begin + m_loc - 1 ) > local_nbnd ) m_loc = local_nbnd - m_begin + 1
IF( m_loc > 0 ) THEN
CALL calbec_gamma_gpu ( npw, beta_d, psi_d(:,m_begin:m_begin+m_loc-1), dtmp_d, m_loc, betapsi_d%comm )
IF( ip == betapsi_d%mype ) THEN
nkb = SIZE( betapsi_d%r_d, 1 )
betapsi_d_r_d => betapsi_d%r_d
!$cuf kernel do(2) <<<*,*>>>
DO j=1,m_loc
DO i=1, nkb
betapsi_d_r_d(i,j) = dtmp_d(i,j)
END DO
END DO
END IF
END IF
END DO
DEALLOCATE( dtmp_d )
!
END IF
!
ELSEIF ( noncolin) THEN
!
CALL calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d%nc_d, local_nbnd )
!
ELSE
!
CALL calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d%k_d, local_nbnd )
!
ENDIF
!
RETURN
!
END SUBROUTINE calbec_bec_type_gpu
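    !-----------------------------------------------------------------------
    ! A hedged usage sketch of the calbec_gpu generic interface; the routine
    ! name is illustrative, not part of the original file. It assumes beta_d
    ! and psi_d were filled on the device by the caller and that betapsi_d
    ! was allocated with allocate_bec_type_gpu, so that r_d/k_d/nc_d is
    ! consistent with gamma_only and noncolin.
    SUBROUTINE example_calbec_gpu( npw, beta_d, psi_d, betapsi_d )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: npw
      COMPLEX(DP), INTENT(IN) :: beta_d(:,:), psi_d(:,:)
      TYPE(bec_type_d), TARGET, INTENT(INOUT) :: betapsi_d
#if defined(__CUDA)
      attributes(DEVICE) :: beta_d, psi_d
#endif
      ! the generic name dispatches on the type of betapsi_d and on
      ! gamma_only / noncolin, exactly as calbec_bec_type_gpu above
      CALL calbec_gpu( npw, beta_d, psi_d, betapsi_d )
    END SUBROUTINE example_calbec_gpu
    !-----------------------------------------------------------------------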
!-----------------------------------------------------------------------
SUBROUTINE calbec_gamma_nocomm_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
USE mp_bands, ONLY: intra_bgrp_comm
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
REAL (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
INTEGER :: m
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
#endif
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
CALL calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, m, intra_bgrp_comm )
RETURN
!
END SUBROUTINE calbec_gamma_nocomm_gpu
!-----------------------------------------------------------------------
SUBROUTINE calbec_gamma_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd, comm )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k=1,npw) running on
! ... half of the G-vectors or PWs - assuming k=0 is the G=0 component:
    ! ... betapsi(i,j) = 2Re(\sum_k beta^*(i,k)psi(k,j)) + beta^*(i,0)psi(0,j)
    ! ... (a host-side reference sketch follows this subroutine)
!
USE mp, ONLY : mp_sum, mp_size
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
REAL (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, INTENT (in) :: nbnd
INTEGER, INTENT (in) :: comm
!
#if defined(__CUDA)
attributes(DEVICE) :: beta_d, psi_d, betapsi_d
#endif
INTEGER :: nkb, npwx, m
INTEGER :: i,j
!
m = nbnd
!
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock( 'calbec' )
IF ( npw == 0 ) betapsi_d(:,:)=0.0_DP
npwx= size (beta_d, 1)
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
#if defined(DEBUG)
WRITE (*,*) 'calbec gamma'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
IF ( m == 1 ) THEN
!
CALL cudaDGEMV( 'C', 2*npw, nkb, 2.0_DP, beta_d, 2*npwx, psi_d, 1, 0.0_DP, &
betapsi_d, 1 )
IF ( gstart == 2 ) THEN
!betapsi_d(:,1) = betapsi_d(:,1) - beta_d(1,:)*psi_d(1,1)
!$cuf kernel do(1) <<<*,*>>>
DO i=1, nkb
betapsi_d(i,1) = betapsi_d(i,1) - DBLE(beta_d(1,i)*psi_d(1,1))
END DO
END IF
!
ELSE
!
CALL DGEMM( 'C', 'N', nkb, m, 2*npw, 2.0_DP, beta_d, 2*npwx, psi_d, &
2*npwx, 0.0_DP, betapsi_d, nkb )
IF ( gstart == 2 ) &
CALL cudaDGER( nkb, m, -1.0_DP, beta_d, 2*npwx, psi_d, 2*npwx, betapsi_d, nkb )
!
ENDIF
!
IF (mp_size(comm) > 1) CALL mp_sum( betapsi_d( :, 1:m ), comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_gamma_gpu
!
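    !-----------------------------------------------------------------------
    ! A hedged, host-side reference for the formula documented in
    ! calbec_gamma_gpu; illustrative, not part of the original file. DGEMM
    ! accumulates 2*Re(beta^H psi) over the stored PWs and, when gstart == 2,
    ! cudaDGER removes the double-counted real G = 0 (k = 1) term once.
    SUBROUTINE calbec_gamma_reference( npw, beta, psi, betapsi )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: npw
      COMPLEX(DP), INTENT(IN) :: beta(:,:), psi(:,:)
      REAL(DP), INTENT(OUT) :: betapsi(:,:)
      INTEGER :: i, j, k
      DO j = 1, SIZE(betapsi,2)
         DO i = 1, SIZE(betapsi,1)
            betapsi(i,j) = 0.0_DP
            DO k = 1, npw
               betapsi(i,j) = betapsi(i,j) + 2.0_DP*DBLE( CONJG(beta(k,i))*psi(k,j) )
            END DO
            IF ( gstart == 2 ) &
               betapsi(i,j) = betapsi(i,j) - DBLE( CONJG(beta(1,i))*psi(1,j) )
         END DO
      END DO
    END SUBROUTINE calbec_gamma_reference
    !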
!-----------------------------------------------------------------------
SUBROUTINE calbec_k_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k=1,npw) running on
! ... G-vectors or PWs : betapsi(i,j) = \sum_k beta^*(i,k) psi(k,j)
!
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum, mp_size
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
!
INTEGER :: nkb, npwx, m
!
#if defined(__CUDA)
attributes(device) :: beta_d, psi_d, betapsi_d
#endif
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock( 'calbec' )
IF ( npw == 0 ) betapsi_d(:,:)=(0.0_DP,0.0_DP)
npwx= size (beta_d, 1)
IF ( npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
#if defined(DEBUG)
WRITE (*,*) 'calbec k'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 2)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 2) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
IF ( m == 1 ) THEN
!
CALL ZGEMV( 'C', npw, nkb, (1.0_DP,0.0_DP), beta_d, npwx, psi_d, 1, &
(0.0_DP, 0.0_DP), betapsi_d, 1 )
!
ELSE
!
CALL ZGEMM( 'C', 'N', nkb, m, npw, (1.0_DP,0.0_DP), &
beta_d, npwx, psi_d, npwx, (0.0_DP,0.0_DP), betapsi_d, nkb )
!
ENDIF
!
IF (mp_size(intra_bgrp_comm) > 1) CALL mp_sum( betapsi_d( :, 1:m ), intra_bgrp_comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_k_gpu
!
!-----------------------------------------------------------------------
SUBROUTINE calbec_nc_gpu ( npw, beta_d, psi_d, betapsi_d, nbnd )
!-----------------------------------------------------------------------
!
! ... matrix times matrix with summation index (k below) running on
! ... G-vectors or PWs corresponding to two different polarizations:
! ... betapsi(i,1,j) = \sum_k=1,npw beta^*(i,k) psi(k,j)
    ! ... betapsi(i,2,j) = \sum_k=1,npw beta^*(i,k) psi(k+npwx,j)
    ! ... (a host-side reference sketch follows this subroutine)
!
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum, mp_size
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
COMPLEX (DP), INTENT (in) :: beta_d(:,:), psi_d(:,:)
COMPLEX (DP), INTENT (out) :: betapsi_d(:,:,:)
INTEGER, INTENT (in) :: npw
INTEGER, OPTIONAL :: nbnd
!
INTEGER :: nkb, npwx, npol, m
!
#if defined(__CUDA)
attributes(device) :: beta_d, psi_d, betapsi_d
#endif
nkb = size (beta_d, 2)
IF ( nkb == 0 ) RETURN
!
CALL start_clock ('calbec')
IF ( npw == 0 ) betapsi_d(:,:,:)=(0.0_DP,0.0_DP)
npwx= size (beta_d, 1)
IF ( 2*npwx /= size (psi_d, 1) ) CALL errore ('calbec', 'size mismatch', 1)
IF ( npwx < npw ) CALL errore ('calbec', 'size mismatch', 2)
IF ( present (nbnd) ) THEN
m = nbnd
ELSE
m = size ( psi_d, 2)
ENDIF
npol= size (betapsi_d, 2)
#if defined(DEBUG)
WRITE (*,*) 'calbec nc'
WRITE (*,*) nkb, size (betapsi_d,1) , m , size (betapsi_d, 3)
#endif
IF ( nkb /= size (betapsi_d,1) .or. m > size (betapsi_d, 3) ) &
CALL errore ('calbec', 'size mismatch', 3)
!
CALL ZGEMM ('C', 'N', nkb, m*npol, npw, (1.0_DP, 0.0_DP), beta_d, &
npwx, psi_d, npwx, (0.0_DP, 0.0_DP), betapsi_d, nkb)
!
IF (mp_size(intra_bgrp_comm) > 1) CALL mp_sum( betapsi_d( :, :, 1:m ), intra_bgrp_comm )
!
CALL stop_clock( 'calbec' )
!
RETURN
!
END SUBROUTINE calbec_nc_gpu
!
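    !-----------------------------------------------------------------------
    ! A hedged reference for the noncollinear layout documented in
    ! calbec_nc_gpu; illustrative, not part of the original file. psi stores
    ! the two spin components stacked along the first dimension, so column j
    ! holds psi(1:npw,j) and psi(npwx+1:npwx+npw,j), and a single ZGEMM with
    ! m*npol columns produces betapsi(:,1:npol,1:m).
    SUBROUTINE calbec_nc_reference( npw, npwx, beta, psi, betapsi )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: npw, npwx
      COMPLEX(DP), INTENT(IN)  :: beta(:,:), psi(:,:)
      COMPLEX(DP), INTENT(OUT) :: betapsi(:,:,:)
      INTEGER :: i, s, j, k
      DO j = 1, SIZE(betapsi,3)
         DO s = 1, SIZE(betapsi,2)
            DO i = 1, SIZE(betapsi,1)
               betapsi(i,s,j) = (0.0_DP, 0.0_DP)
               DO k = 1, npw
                  betapsi(i,s,j) = betapsi(i,s,j) + CONJG(beta(k,i)) * psi((s-1)*npwx+k, j)
               END DO
            END DO
         END DO
      END DO
    END SUBROUTINE calbec_nc_reference
    !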
!
!-----------------------------------------------------------------------
FUNCTION is_allocated_bec_type_gpu (bec_d) RESULT (isalloc)
!-----------------------------------------------------------------------
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
LOGICAL :: isalloc
isalloc = (allocated(bec_d%r_d) .or. allocated(bec_d%nc_d) .or. allocated(bec_d%k_d))
RETURN
!
!-----------------------------------------------------------------------
END FUNCTION is_allocated_bec_type_gpu
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
SUBROUTINE allocate_bec_type_gpu ( nkb, nbnd, bec_d, comm )
!-----------------------------------------------------------------------
USE mp, ONLY: mp_size, mp_rank, mp_get_comm_null
USE device_memcpy_m, ONLY : dev_memset
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
INTEGER, INTENT (in) :: nkb, nbnd
INTEGER, INTENT (in), OPTIONAL :: comm
INTEGER :: ierr, nbnd_siz
INTEGER, EXTERNAL :: ldim_block, gind_block
!
nbnd_siz = nbnd
bec_d%comm = mp_get_comm_null()
bec_d%nbnd = nbnd
bec_d%mype = 0
bec_d%nproc = 1
bec_d%nbnd_loc = nbnd
bec_d%ibnd_begin = 1
!
IF( PRESENT( comm ) .AND. gamma_only .AND. smallmem ) THEN
bec_d%comm = comm
bec_d%nproc = mp_size( comm )
IF( bec_d%nproc > 1 ) THEN
nbnd_siz = nbnd / bec_d%nproc
IF( MOD( nbnd, bec_d%nproc ) /= 0 ) nbnd_siz = nbnd_siz + 1
bec_d%mype = mp_rank( bec_d%comm )
bec_d%nbnd_loc = ldim_block( bec_d%nbnd , bec_d%nproc, bec_d%mype )
bec_d%ibnd_begin = gind_block( 1, bec_d%nbnd, bec_d%nproc, bec_d%mype )
END IF
END IF
!
IF ( gamma_only ) THEN
!
ALLOCATE( bec_d%r_d( nkb, nbnd_siz ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%r ', ABS(ierr) )
!
CALL dev_memset(bec_d%r_d, 0.0D0, (/1,nkb/), 1, (/1, nbnd_siz/), 1)
!
ELSEIF ( noncolin) THEN
!
ALLOCATE( bec_d%nc_d( nkb, npol, nbnd_siz ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%nc ', ABS(ierr) )
!
CALL dev_memset(bec_d%nc_d, (0.0D0,0.0D0), (/1, nkb/), 1, (/1, npol/), 1, (/1, nbnd_siz/), 1)
!
ELSE
!
ALLOCATE( bec_d%k_d( nkb, nbnd_siz ), STAT=ierr )
IF( ierr /= 0 ) &
CALL errore( ' allocate_bec_type ', ' cannot allocate bec_d%k ', ABS(ierr) )
!
       CALL dev_memset(bec_d%k_d, (0.0D0,0.0D0), (/1, nkb/), 1, (/1, nbnd_siz/), 1)
!
ENDIF
!
RETURN
!
END SUBROUTINE allocate_bec_type_gpu
!-----------------------------------------------------------------------
!
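    ! A minimal, hypothetical lifetime sketch; the routine name is
    ! illustrative, not part of the original file. nkb and nbnd are assumed
    ! to come from the caller's basis and band setup.
    SUBROUTINE example_bec_type_gpu_lifetime( nkb, nbnd )
      IMPLICIT NONE
      INTEGER, INTENT(IN) :: nkb, nbnd
      TYPE(bec_type_d) :: bec_d
      CALL allocate_bec_type_gpu( nkb, nbnd, bec_d )  ! device array zero-initialized
      ! ... fill bec_d with calbec_gpu and consume it on the device ...
      CALL deallocate_bec_type_gpu( bec_d )
    END SUBROUTINE example_bec_type_gpu_lifetime
    !-----------------------------------------------------------------------
    !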
!-----------------------------------------------------------------------
SUBROUTINE deallocate_bec_type_gpu (bec_d)
!-----------------------------------------------------------------------
!
USE mp, ONLY: mp_get_comm_null
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
!
bec_d%comm = mp_get_comm_null()
bec_d%nbnd = 0
!
IF (allocated(bec_d%r_d)) DEALLOCATE(bec_d%r_d)
IF (allocated(bec_d%nc_d)) DEALLOCATE(bec_d%nc_d)
IF (allocated(bec_d%k_d)) DEALLOCATE(bec_d%k_d)
!
RETURN
!
END SUBROUTINE deallocate_bec_type_gpu
!-----------------------------------------------------------------------
!
!-----------------------------------------------------------------------
SUBROUTINE synchronize_bec_type_gpu (bec_d, bec, what)
!-----------------------------------------------------------------------
!
! ... Updates a device or host version of a bec_type variable.
! ... Direction 'h' updates host version, direction 'd' the device
    ! ... version. A hedged usage sketch follows this subroutine.
!
USE mp, ONLY: mp_get_comm_null
USE becmod, ONLY : bec_type
IMPLICIT NONE
TYPE (bec_type_d) :: bec_d
TYPE (bec_type) :: bec
CHARACTER, INTENT(IN) :: what
!
IF ( gamma_only ) THEN
!
IF (.not. (allocated(bec_d%r_d) .and. allocated(bec%r))) &
CALL errore('becmod_gpu', 'Unallocated array',1)
SELECT CASE(what)
CASE('d')
bec_d%r_d = bec%r
CASE('h')
bec%r = bec_d%r_d
CASE DEFAULT
CALL errore('becmod_gpu', 'Invalid command',2)
END SELECT
!
ELSEIF ( noncolin) THEN
!
IF (.not. (allocated(bec_d%nc_d) .and. allocated(bec%nc))) &
CALL errore('becmod_gpu', 'Unallocated array',3)
SELECT CASE(what)
CASE('d')
bec_d%nc_d = bec%nc
CASE('h')
bec%nc = bec_d%nc_d
CASE DEFAULT
CALL errore('becmod_gpu', 'Invalid command',4)
END SELECT
!
ELSE
!
IF (.not. (allocated(bec_d%k_d) .and. allocated(bec%k))) &
CALL errore('becmod_gpu', 'Unallocated array',5)
SELECT CASE(what)
CASE('d')
bec_d%k_d = bec%k
CASE('h')
bec%k = bec_d%k_d
CASE DEFAULT
CALL errore('becmod_gpu', 'Invalid command',6)
END SELECT
!
ENDIF
!
RETURN
!
END SUBROUTINE synchronize_bec_type_gpu
!-----------------------------------------------------------------------
!
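    ! A hedged usage sketch for synchronize_bec_type_gpu; illustrative, not
    ! part of the original file. It assumes bec and bec_d were allocated
    ! consistently (same shape, same gamma_only/noncolin branch).
    SUBROUTINE example_synchronize_bec( bec_d, bec )
      USE becmod, ONLY : bec_type
      IMPLICIT NONE
      TYPE(bec_type_d) :: bec_d
      TYPE(bec_type)   :: bec
      CALL synchronize_bec_type_gpu( bec_d, bec, 'h' )  ! device -> host
      ! ... host-side work on bec ...
      CALL synchronize_bec_type_gpu( bec_d, bec, 'd' )  ! host -> device
    END SUBROUTINE example_synchronize_bec
    !-----------------------------------------------------------------------
    !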
!-----------------------------------------------------------------------
SUBROUTINE beccopy_gpu(bec, bec1, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(in) :: bec
TYPE(bec_type_d) :: bec1
INTEGER, INTENT(in) :: nkb, nbnd
IF (gamma_only) THEN
CALL dcopy(nkb*nbnd, bec%r_d, 1, bec1%r_d, 1)
ELSEIF (noncolin) THEN
CALL zcopy(nkb*npol*nbnd, bec%nc_d, 1, bec1%nc_d, 1)
ELSE
CALL zcopy(nkb*nbnd, bec%k_d, 1, bec1%k_d, 1)
ENDIF
RETURN
END SUBROUTINE beccopy_gpu
SUBROUTINE becscal_nck_gpu(alpha, bec_d, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
COMPLEX(DP), INTENT(IN) :: alpha
INTEGER, INTENT(IN) :: nkb, nbnd
IF (gamma_only) THEN
CALL errore('becscal_nck','called in the wrong case',1)
ELSEIF (noncolin) THEN
CALL zscal(nkb*npol*nbnd, alpha, bec_d%nc_d, 1)
ELSE
CALL zscal(nkb*nbnd, alpha, bec_d%k_d, 1)
ENDIF
RETURN
END SUBROUTINE becscal_nck_gpu
SUBROUTINE becscal_gamma_gpu(alpha, bec_d, nkb, nbnd)
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
IMPLICIT NONE
TYPE(bec_type_d), INTENT(INOUT) :: bec_d
REAL(DP), INTENT(IN) :: alpha
INTEGER, INTENT(IN) :: nkb, nbnd
IF (gamma_only) THEN
CALL dscal(nkb*nbnd, alpha, bec_d%r_d, 1)
ELSE
CALL errore('becscal_gamma','called in the wrong case',1)
ENDIF
RETURN
END SUBROUTINE becscal_gamma_gpu
!
SUBROUTINE using_becp_auto(intento)
USE becmod_gpum, ONLY : using_becp_r
USE becmod_gpum, ONLY : using_becp_k
USE becmod_gpum, ONLY : using_becp_nc
IMPLICIT NONE
INTEGER, INTENT(IN) :: intento
!
!
IF ( gamma_only ) THEN
!
CALL using_becp_r(intento)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc(intento)
!
ELSE
!
CALL using_becp_k(intento)
!
ENDIF
END SUBROUTINE using_becp_auto
!
SUBROUTINE using_becp_d_auto(intento)
USE becmod_gpum, ONLY : using_becp_r_d
USE becmod_gpum, ONLY : using_becp_k_d
USE becmod_gpum, ONLY : using_becp_nc_d
IMPLICIT NONE
INTEGER, INTENT(IN) :: intento
!
!
IF ( gamma_only ) THEN
!
CALL using_becp_r_d(intento)
!
ELSEIF ( noncolin) THEN
!
CALL using_becp_nc_d(intento)
!
ELSE
!
CALL using_becp_k_d(intento)
!
ENDIF
END SUBROUTINE using_becp_d_auto
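  !
  ! A minimal, hypothetical sketch of the intended call pattern for the two
  ! wrappers above; the routine name is illustrative, not part of the
  ! original module.
  SUBROUTINE example_becp_auto_pattern()
    IMPLICIT NONE
    CALL using_becp_auto(1)    ! host copy inout: sync D->H, device copy marked stale
    ! ... modify becp on the host here ...
    CALL using_becp_d_auto(0)  ! device copy in: refresh H->D before device use
  END SUBROUTINE example_becp_auto_pattern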
END MODULE becmod_subs_gpum

View File

@ -38,6 +38,7 @@ MODULE command_line_options
INTEGER :: nargs = 0
! ... QE arguments read from command line
INTEGER :: nimage_= 1, npool_= 1, ndiag_ = 0, nband_= 1, ntg_= 1, nyfft_ = 1, nmany_ = 1
LOGICAL :: pencil_decomposition_ = .false.
! ... Indicate if using library init
LOGICAL :: library_init = .FALSE.
! ... input file name read from command line
@ -125,6 +126,14 @@ CONTAINS
READ ( arg, *, ERR = 15, END = 15) ntg_ ! read the argument as ntg_
nyfft_ = ntg_ ! set nyfft_ equal to ntg_
narg = narg + 1
         CASE ( '-pd', '-use_pd', '-pencil_decomposition', '-use_pencil_decomposition' )
IF (read_string) THEN
CALL my_getarg ( input_command_line, narg, arg )
ELSE
CALL get_command_argument ( narg, arg )
ENDIF
READ ( arg, *, ERR = 15, END = 15) pencil_decomposition_
narg = narg + 1
CASE ( '-nb', '-nband', '-nbgrp', '-nband_group')
IF (read_string) THEN
CALL my_getarg ( input_command_line, narg, arg )
@ -170,6 +179,7 @@ CONTAINS
CALL mp_bcast( nyfft_ , root, world_comm )
CALL mp_bcast( nband_ , root, world_comm )
CALL mp_bcast( ndiag_ , root, world_comm )
CALL mp_bcast( pencil_decomposition_ , root, world_comm )
END SUBROUTINE get_command_line
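   ! A sketch of the accepted syntax (the executable name is illustrative):
   !     pw.x -pd .true.
   ! READ(arg,*) parses the logical value, and the mp_bcast in
   ! get_command_line propagates pencil_decomposition_ to all ranks.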
!
@ -216,17 +226,18 @@ CONTAINS
END SUBROUTINE my_getarg
SUBROUTINE set_command_line ( nimage, npool, ntg, nmany, nyfft, nband, ndiag)
SUBROUTINE set_command_line ( nimage, npool, ntg, nmany, nyfft, nband, ndiag, pencil_decomposition)
! directly set command line options without going through the command line
IMPLICIT NONE
INTEGER, INTENT(IN), OPTIONAL :: nimage, npool, ntg, nmany, nyfft, nband, ndiag
   INTEGER, INTENT(IN), OPTIONAL :: nimage, npool, ntg, nmany, nyfft, nband, ndiag
   LOGICAL, INTENT(IN), OPTIONAL :: pencil_decomposition
!
IF ( PRESENT(nimage) ) nimage_ = nimage
IF ( PRESENT(npool) ) npool_ = npool
IF ( PRESENT(nyfft) ) nyfft_ = nyfft
IF ( PRESENT(nband) ) nband_ = nband
IF ( PRESENT(ndiag) ) ndiag_ = ndiag
IF ( PRESENT(pencil_decomposition) ) pencil_decomposition_ = pencil_decomposition
IF ( PRESENT(ntg) .and. PRESENT(nmany) ) THEN
! ERROR!!!!
ELSEIF ( PRESENT(ntg) ) THEN

View File

@ -250,6 +250,12 @@ MODULE control_flags
LOGICAL, PUBLIC :: &
do_makov_payne = .FALSE. ! if .TRUE. makov-payne correction for isolated
! system is used
LOGICAL, PUBLIC :: &
use_gpu = .FALSE. ! if .TRUE. selects the accelerated version of the subroutines
! when available
INTEGER, PUBLIC :: &
many_fft = 16 ! the size of FFT batches in vloc_psi and
                             ! sum_band. Only used in accelerated subroutines.
!
INTEGER :: ortho_max = 0 ! maximum number of iterations in routine ortho
REAL(DP) :: ortho_eps = 0.0_DP ! threshold for convergence in routine ortho
@ -276,6 +282,7 @@ MODULE control_flags
LOGICAL, PUBLIC :: treinit_gvecs = .FALSE.
LOGICAL, PUBLIC :: diagonalize_on_host = .FALSE.
!
! ... end of module-scope declarations
!

View File

@ -0,0 +1,39 @@
!----------------------------------------------
! ... this file contains a number of subroutines optionally interfaced
! ... to cublas
!----------------------------------------------
SUBROUTINE cudaDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cudafor
use cublas
#endif
implicit none
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
!
call DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
!
END SUBROUTINE cudaDGEMV
SUBROUTINE cudaDGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
#if defined(__CUDA)
use cudafor
use cublas
#endif
! .. Scalar Arguments ..
DOUBLE PRECISION :: ALPHA
INTEGER :: INCX, INCY, LDA, M, N
! .. Array Arguments ..
DOUBLE PRECISION :: A( LDA, * ), X( * ), Y( * )
#if defined(__CUDA)
attributes(device) :: A, X, Y
#endif
CALL DGER ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA )
END SUBROUTINE cudaDGER
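!
! A hedged usage sketch for the wrappers above; illustrative, not part of the
! original file. Without __CUDA the arrays live on the host and the call
! reduces to plain DGEMV; with __CUDA the caller's arrays would carry the
! device attribute, as in the wrapper interfaces.
SUBROUTINE example_cudaDGEMV
  implicit none
  DOUBLE PRECISION :: A(4,3), X(4), Y(3)
  A = 1.0D0 ; X = 1.0D0 ; Y = 0.0D0
  ! Y = 2 * A^T * X  ->  every Y(i) = 8
  CALL cudaDGEMV( 'T', 4, 3, 2.0D0, A, 4, X, 1, 0.0D0, Y, 1 )
END SUBROUTINE example_cudaDGEMV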

111
Modules/dylmr2_gpu.f90 Normal file
View File

@ -0,0 +1,111 @@
!
! Copyright (C) 2001 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-----------------------------------------------------------------------
SUBROUTINE dylmr2_gpu( nylm, ngy, g_d, gg_d, dylm_d, ipol )
!-----------------------------------------------------------------------
  !! Compute \( \partial Y_{lm}(G) / \partial G_{ipol} \)
!! using simple numerical derivation (SdG).
!! The spherical harmonics are calculated in ylmr2.
!
USE kinds, ONLY : DP
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: nylm
! input: number of spherical harmonics
INTEGER, INTENT(IN) :: ngy
! input: the number of g vectors to compute
INTEGER, INTENT(IN) :: ipol
! input: desired polarization
REAL(DP), INTENT(IN) :: g_d(3,ngy)
!! input: the coordinates of g vectors
REAL(DP), INTENT(IN) :: gg_d(ngy)
!! input: the moduli of g vectors
REAL(DP), INTENT(OUT) :: dylm_d(ngy,nylm)
!! output: the spherical harmonics derivatives
!
! ... local variables
!
INTEGER :: ig, lm, i
! counter on g vectors
! counter on l,m component
!
INTEGER :: apol, bpol
!
REAL(DP), PARAMETER :: delta = 1.E-6_DP
REAL(DP), ALLOCATABLE :: dg_d(:), dgi_d(:), gx_d(:,:)
REAL(DP), ALLOCATABLE :: ggx_d(:), ylmaux_d(:,:)
! dg is the finite increment for numerical derivation:
! dg = delta |G| = delta * sqrt(gg)
! dgi= 1 /(delta * sqrt(gg))
! gx = g +/- dg
! ggx = gx^2
!
#if defined(__CUDA)
attributes(DEVICE) :: g_d, gg_d, dylm_d, gx_d, ggx_d, dg_d, &
dgi_d, ylmaux_d
#endif
!
!
ALLOCATE( gx_d(3,ngy), ggx_d(ngy), dg_d(ngy) )
ALLOCATE( dgi_d(ngy), ylmaux_d(ngy,nylm) )
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, ngy
dg_d(ig) = delta * SQRT(gg_d(ig) )
IF (gg_d(ig) > 1.E-9_DP) THEN
dgi_d(ig) = 1._DP / dg_d(ig)
ELSE
dgi_d(ig) = 0._DP
ENDIF
ENDDO
!
IF ( ipol==1 ) THEN
apol = 2 ; bpol = 3
ELSEIF ( ipol==2 ) THEN
apol = 1 ; bpol = 3
ELSEIF ( ipol==3 ) THEN
apol = 1 ; bpol = 2
ENDIF
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, ngy
gx_d(apol,ig) = g_d(apol,ig)
gx_d(bpol,ig) = g_d(bpol,ig)
gx_d(ipol,ig) = g_d(ipol,ig) + dg_d(ig)
ggx_d(ig) = gx_d(1,ig) * gx_d(1,ig) + &
gx_d(2,ig) * gx_d(2,ig) + &
gx_d(3,ig) * gx_d(3,ig)
ENDDO
!
CALL ylmr2_gpu( nylm, ngy, gx_d, ggx_d, dylm_d )
!
!$cuf kernel do (1) <<<*,*>>>
DO ig = 1, ngy
gx_d(ipol,ig) = g_d(ipol,ig) - dg_d(ig)
ggx_d(ig) = gx_d(1,ig) * gx_d(1,ig) + &
gx_d(2,ig) * gx_d(2,ig) + &
gx_d(3,ig) * gx_d(3,ig)
ENDDO
!
CALL ylmr2_gpu( nylm, ngy, gx_d, ggx_d, ylmaux_d )
!
!$cuf kernel do (2) <<<*,*>>>
DO lm = 1, nylm
DO ig = 1, ngy
dylm_d(ig,lm) = (dylm_d(ig,lm)-ylmaux_d(ig,lm)) * 0.5_DP * dgi_d(ig)
ENDDO
ENDDO
!
DEALLOCATE( gx_d, ggx_d, dg_d, dgi_d, ylmaux_d )
!
RETURN
!
END SUBROUTINE dylmr2_gpu
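!
! The two ylmr2_gpu evaluations above realize the symmetric difference quotient
!
!    dY_lm/dG_ipol ~ [ Y_lm(G + dg*e_ipol) - Y_lm(G - dg*e_ipol) ] / (2*dg) ,
!
! with dg = delta*|G| and e_ipol the Cartesian unit vector selected by ipol;
! the factor 0.5_DP * dgi_d(ig) in the last kernel is precisely 1/(2*dg).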

View File

@ -10,6 +10,9 @@
!------------------------------------------------------------------------------!
USE kinds, ONLY: DP
#if defined (__CUDA)
USE cudafor
#endif
!
IMPLICIT NONE
SAVE
@ -43,9 +46,13 @@
REAL(DP) :: qbac = 0.0_DP ! background neutralizing charge
INTEGER, ALLOCATABLE :: ispin(:) ! spin of each state
REAL(DP), ALLOCATABLE :: f_bgrp(:) ! occupation numbers ( at gamma )
REAL(DP), ALLOCATABLE :: f_d(:) ! occupation numbers ( at gamma )
INTEGER, ALLOCATABLE :: ispin_bgrp(:) ! spin of each state
INTEGER, ALLOCATABLE :: ibgrp_g2l(:) ! local index of the i-th global band index
#if defined (__CUDA)
ATTRIBUTES( DEVICE ) :: f_d
#endif
!
!------------------------------------------------------------------------------!
CONTAINS
@ -397,6 +404,7 @@
IF( ALLOCATED( f ) ) DEALLOCATE( f )
IF( ALLOCATED( ispin ) ) DEALLOCATE( ispin )
IF( ALLOCATED( f_bgrp ) ) DEALLOCATE( f_bgrp )
IF( ALLOCATED( f_d ) ) DEALLOCATE( f_d )
IF( ALLOCATED( ispin_bgrp ) ) DEALLOCATE( ispin_bgrp )
IF( ALLOCATED( ibgrp_g2l ) ) DEALLOCATE( ibgrp_g2l )
telectrons_base_initval = .FALSE.
@ -455,6 +463,10 @@
END DO
END DO
#if defined (__CUDA)
ALLOCATE( f_d, SOURCE = f_bgrp )
#endif
RETURN
END SUBROUTINE distribute_bands

View File

@ -174,6 +174,8 @@ CONTAINS
& "395502 (2009);", &
&/9X,"""P. Giannozzi et al., J. Phys.:Condens. Matter 29 ",&
& "465901 (2017);", &
&/9X,"""P. Giannozzi et al., J. Chem. Phys. 152 ",&
& "154105 (2020);", &
&/9X," URL http://www.quantum-espresso.org"", ", &
&/5X,"in publications or presentations arising from this work. More details at",&
&/5x,"http://www.quantum-espresso.org/quote")' )

Some files were not shown because too many files have changed in this diff.