diff --git a/PW/src/c_bands.f90 b/PW/src/c_bands.f90
index b09f115fc..25e681a3b 100644
--- a/PW/src/c_bands.f90
+++ b/PW/src/c_bands.f90
@@ -111,13 +111,14 @@ SUBROUTINE c_bands( iter )
        !
        ! ... More stuff needed by the hamiltonian: nonlocal projectors
        !
-       IF ( use_gpu ) THEN
-          IF ( nkb > 0 ) CALL using_vkb_d(2)
-          IF ( nkb > 0 ) CALL init_us_2_gpu( ngk(ik), igk_k_d(1,ik), xk(1,ik), vkb_d )
-       ELSE
-          IF ( nkb > 0 ) CALL using_vkb(2)
-          IF ( nkb > 0 ) CALL init_us_2( ngk(ik), igk_k(1,ik), xk(1,ik), vkb )
+       ! ... The projectors are always requested through init_us_2 on the
+       ! ... host-side vkb: init_us_2 itself calls the GPU routine when the
+       ! ... code is built with __CUDA, so no use_gpu branch is needed here.
+       !
+       IF ( nkb > 0 ) THEN
+          CALL using_vkb(2)
+          CALL init_us_2( ngk(ik), igk_k(1,ik), xk(1,ik), vkb )
        END IF
        !
        ! ... read in wavefunctions from the previous iteration
        !
diff --git a/PW/src/init_us_2.f90 b/PW/src/init_us_2.f90
index 149503266..65768c5f3 100644
--- a/PW/src/init_us_2.f90
+++ b/PW/src/init_us_2.f90
@@ -33,9 +33,24 @@ SUBROUTINE init_us_2( npw_, igk_, q_, vkb_ )
   !
   CALL start_clock( 'init_us_2' )
   !
+  ! ... With __CUDA the GPU routine is called inside an OpenACC data region:
+  ! ... the input arrays are listed in copyin, vkb_ in copy, and host_data
+  ! ... passes their device addresses to init_us_2_base_gpu.
+  ! ... Sections are written as lower:upper ranges because a bare "a(n)" is
+  ! ... an array-element reference in Fortran, not an n-element extent.
+  !
+#if defined(__CUDA)
+!$acc data copyin(igk_(1:npw_), eigts1(:,:), eigts2(:,:), eigts3(:,:), mill(:,:), g(:,:)) copy(vkb_(1:npwx,1:nkb))
+!$acc host_data use_device(eigts1, eigts2, eigts3, mill, g, igk_, vkb_)
+  CALL init_us_2_base_gpu(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega,&
+    dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,&
+    vkb_ )
+!$acc end host_data
+!$acc end data
+#else
   CALL init_us_2_base(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, &
     dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,&
     vkb_ )
+#endif
   !
   CALL stop_clock( 'init_us_2' )
   !
diff --git a/upflib/init_us_2_base.f90 b/upflib/init_us_2_base.f90
index a8b121daa..483909a65 100644
--- a/upflib/init_us_2_base.f90
+++ b/upflib/init_us_2_base.f90
@@ -67,6 +67,11 @@ SUBROUTINE init_us_2_base( npw_, npwx, igk_, q_, nat, tau, ityp, &
   INTEGER, PARAMETER :: blocksize = 256
   INTEGER :: iblock, numblock, realblocksize
   !
+  ! Trace of the code path taken (CPU routine). Compiled in only when
+  ! __DEBUG is defined, so regular builds do not print on every call.
+#if defined(__DEBUG)
+  write(*,*) "using init_us_2_base"
+#endif
   IF (lmaxkb < 0) RETURN
   !
   ! setting cache blocking size
diff --git a/upflib/init_us_2_base_gpu.f90 b/upflib/init_us_2_base_gpu.f90
index 09b24ee17..e02b780c6 100644
--- a/upflib/init_us_2_base_gpu.f90
+++ b/upflib/init_us_2_base_gpu.f90
@@ -82,6 +82,11 @@ SUBROUTINE init_us_2_base_gpu( npw_, npwx, igk__d, q_, nat, tau, ityp, &
 #endif
   !
   !
+  ! Trace of the code path taken (GPU routine). Compiled in only when
+  ! __DEBUG is defined, so regular builds do not print on every call.
+#if defined(__DEBUG)
+  write(*,*) "using init_us_2_base_gpu"
+#endif
   if (lmaxkb<0) return
 
   ! JR Eventually replace with smarter allocation/deallocation of GPU temp arrays