diff --git a/CPV/src/init.f90 b/CPV/src/init.f90 index c79de107e..c9de049a4 100644 --- a/CPV/src/init.f90 +++ b/CPV/src/init.f90 @@ -184,9 +184,9 @@ CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms ) ! END IF -!NOTE g and mill already allocate in the device they are initialized below. -!$acc data present(g, mill) -!$acc update device(g,mill) +!NOTE g, gg and mill already allocate in the device they are initialized below. +!$acc data present(g,gg,mill) +!$acc update device(g,gg,mill) !$acc end data ! CALL gshells (.TRUE.) @@ -429,13 +429,13 @@ ! re-calculate G-vectors and kinetic energy ! dfftp_ngm = dfftp%ngm -!$acc parallel loop present(g, mill) copyin(bg) copyout(gg) +!$acc parallel loop present(g,gg,mill) copyin(bg) do ig = 1, dfftp_ngm g(:,ig)= mill(1,ig)*bg(:,1) + mill(2,ig)*bg(:,2) + mill(3,ig)*bg(:,3) gg(ig)=g(1,ig)**2 + g(2,ig)**2 + g(3,ig)**2 enddo -!$acc end parallel loop -!$acc update host(g) +!$acc end parallel loop +!$acc update host(g,gg) ! call g2kin_init ( gg, tpiba2 ) ! diff --git a/CPV/src/potentials.f90 b/CPV/src/potentials.f90 index b8fafbca0..e702d9b8a 100644 --- a/CPV/src/potentials.f90 +++ b/CPV/src/potentials.f90 @@ -357,7 +357,7 @@ ! -DEV_ACC data present(rhoeg, rhops, mill,g ) copy(fion) create(rp(1:s_ngm_)) copyin(sfac, screen_coul, gg, vps, ityp,ei1, ei2, ei3) +DEV_ACC data present(rhoeg, rhops, mill,g,gg ) copy(fion) create(rp(1:s_ngm_)) copyin(sfac, screen_coul, vps, ityp,ei1, ei2, ei3) ! DEV_OMP parallel default(none) & DEV_OMP shared(gstart, dffts,sfac, rhops, screen_coul, rhoeg, nsp, gg, tpiba2, tpiba, mill, g, & diff --git a/CPV/src/vofrho.f90 b/CPV/src/vofrho.f90 index cdd828852..7d13d7f93 100644 --- a/CPV/src/vofrho.f90 +++ b/CPV/src/vofrho.f90 @@ -212,7 +212,8 @@ DEV_ACC enter data create(drhot(1:p_ngm_, 1:6)) ! zpseu = 0.0_DP ! - DEV_ACC data copyin(rhog,drhog,ht,sfac,vps,gg,rhops) copyout(vtemp) + DEV_ACC update device(gg) + DEV_ACC data copyin(rhog,drhog,ht,sfac,vps,rhops) copyout(vtemp) DEV_OMP parallel default(shared), private(ig,is,ij,i,j,k) ! DEV_OMP do @@ -337,7 +338,7 @@ DEV_ACC loop vector reduction(+:x_tmp) DEV_ACC end parallel ! DEV_OMP do -DEV_ACC parallel loop present(rhotmp) +DEV_ACC parallel loop present(rhotmp,gg) DO ig = gstart, p_ngm_ vtemp(ig) = CONJG( rhotmp( ig ) ) * rhotmp( ig ) / gg( ig ) END DO @@ -402,7 +403,7 @@ DEV_ACC kernels DEV_ACC end kernels ! -DEV_ACC parallel loop present(rhotmp) +DEV_ACC parallel loop present(rhotmp,gg) ! DEV_OMP parallel default(shared), private(ig,is) DEV_OMP do diff --git a/Modules/recvec.f90 b/Modules/recvec.f90 index 25f4807db..7f284d962 100644 --- a/Modules/recvec.f90 +++ b/Modules/recvec.f90 @@ -120,7 +120,7 @@ ALLOCATE( igtongl_d(ngm) ) ALLOCATE( gl_d(ngm) ) ENDIF - !$acc enter data create( mill(1:3, 1:ngm), g(1:3, 1:ngm) ) + !$acc enter data create( mill(1:3,1:ngm), g(1:3,1:ngm), gg(1:ngm) ) ! RETURN ! @@ -142,7 +142,10 @@ END IF ! ! - IF( ALLOCATED( gg ) ) DEALLOCATE( gg ) + IF( ALLOCATED( gg ) ) THEN +!$acc exit data delete(gg) + DEALLOCATE( gg ) + END IF IF( ALLOCATED( g ) ) THEN !$acc exit data delete(g) DEALLOCATE( g ) diff --git a/PHonon/PH/drhodv.f90 b/PHonon/PH/drhodv.f90 index 88d3dade4..cf3b9a9ca 100644 --- a/PHonon/PH/drhodv.f90 +++ b/PHonon/PH/drhodv.f90 @@ -29,11 +29,7 @@ subroutine drhodv (nu_i0, nper, drhoscf) USE cell_base, ONLY : tpiba USE lsda_mod, ONLY : current_spin, lsda, isk, nspin USE wvfct, ONLY : npwx, nbnd -#if defined(__CUDA) - USE uspp, ONLY : nkb, vkb, deeq_nc, deeq_nc_d, okvan -#else USE uspp, ONLY : nkb, vkb, deeq_nc, okvan -#endif USE becmod, ONLY : calbec, bec_type, becscal, allocate_bec_type, & deallocate_bec_type USE fft_base, ONLY : dfftp @@ -142,18 +138,14 @@ subroutine drhodv (nu_i0, nper, drhoscf) ELSE IF (okvan) THEN deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,2) -#if defined(__CUDA) - deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:) -#endif + !$acc update device(deeq_nc) int1_nc(:,:,:,:,:)=int1_nc_save(:,:,:,:,:,2) ENDIF call drhodvnl (ik, ikk, nper, nu_i0, dynwrk, becpt, alphapt, & dbecq, dalpq) IF (okvan) THEN deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,1) -#if defined(__CUDA) - deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:) -#endif + !$acc update device(deeq_nc) int1_nc(:,:,:,:,:)=int1_nc_save(:,:,:,:,:,1) ENDIF ENDIF diff --git a/PHonon/PH/phq_setup.f90 b/PHonon/PH/phq_setup.f90 index de6b1cb86..50b494ae9 100644 --- a/PHonon/PH/phq_setup.f90 +++ b/PHonon/PH/phq_setup.f90 @@ -65,7 +65,7 @@ subroutine phq_setup USE symm_base, ONLY : nrot, nsym, s, irt, t_rev, time_reversal, & sr, invs, inverse_s, d1, d2, d3, check_grid_sym USE uspp_param, ONLY : upf - USE uspp, ONLY : nlcc_any, deeq_nc, deeq_nc_d, okvan + USE uspp, ONLY : nlcc_any, deeq_nc, okvan USE noncollin_module, ONLY : noncolin, domag, m_loc, angle1, angle2, ux USE nlcc_ph, ONLY : drc USE control_ph, ONLY : rec_code, lgamma_gamma, search_sym, start_irr, & @@ -183,9 +183,7 @@ subroutine phq_setup v%of_r(:,2:4)=-v%of_r(:,2:4) deeq_nc_save(:,:,:,:,2)=deeq_nc(:,:,:,:) deeq_nc(:,:,:,:)=deeq_nc_save(:,:,:,:,1) -#if defined(__CUDA) - deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:) -#endif + !$acc update device(deeq_nc) ENDIF ENDIF ! diff --git a/PHonon/PH/solve_linter.f90 b/PHonon/PH/solve_linter.f90 index b2490ef86..9e31873fe 100644 --- a/PHonon/PH/solve_linter.f90 +++ b/PHonon/PH/solve_linter.f90 @@ -47,10 +47,8 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf) USE scf, ONLY : rho, vrs #if defined(__CUDA) USE scf_gpum, ONLY : vrs_d - USE uspp, ONLY : okvan, vkb, deeq_nc, deeq_nc_d -#else - USE uspp, ONLY : okvan, vkb, deeq_nc #endif + USE uspp, ONLY : okvan, vkb, deeq_nc USE uspp_param, ONLY : nhm USE noncollin_module, ONLY : noncolin, domag, npol, nspin_mag USE paw_variables, ONLY : okpaw @@ -282,22 +280,18 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf) ! Hubbard potential dvbare_hub_q * psi_kpoint ! is calculated and added to dvpsi. ! - IF (lda_plus_u) CALL dvqhub_barepsi_us(ik, u(1, mode)) + IF (lda_plus_u) CALL dvqhub_barepsi_us(ik, u(1,mode)) ! ELSE IF (okvan) THEN deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2) -#if defined(__CUDA) - deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:) -#endif + !$acc update device(deeq_nc) int1_nc(:,:,:,:,:) = int1_nc_save(:,:,:,:,:,2) ENDIF CALL dvqpsi_us(ik, u(1, mode), .FAlSE., becpt, alphapt) IF (okvan) THEN deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1) -#if defined(__CUDA) - deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:) -#endif + !$acc update device(deeq_nc) int1_nc(:,:,:,:,:) = int1_nc_save(:,:,:,:,:,1) ENDIF ENDIF @@ -342,9 +336,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf) #endif IF (okvan) THEN deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,2) -#if defined(__CUDA) - deeq_nc_d(:,:,:,:) = deeq_nc(:,:,:,:) -#endif + !$acc update device(deeq_nc) ENDIF ENDIF ! @@ -375,9 +367,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf) #endif IF (okvan) THEN deeq_nc(:,:,:,:) = deeq_nc_save(:,:,:,:,1) -#if defined(__CUDA) - deeq_nc_d(:,:,:,:)=deeq_nc(:,:,:,:) -#endif + !$acc update device(deeq_nc) ENDIF ENDIF ! diff --git a/PW/CMakeLists.txt b/PW/CMakeLists.txt index c638ef9e1..4c3280305 100644 --- a/PW/CMakeLists.txt +++ b/PW/CMakeLists.txt @@ -260,32 +260,22 @@ set(src_pw src/force_corr_gpu.f90 src/orthoatwfc_gpu.f90 src/wfcinit_gpu.f90 - src/stres_loc_gpu.f90 src/add_paw_to_deeq_gpu.f90 src/addusdens_gpu.f90 src/addusforce_gpu.f90 - src/deriv_drhoc_gpu.f90 src/s_psi_gpu.f90 src/rotate_wfc_gpu.f90 src/usnldiag_gpu.f90 src/add_vuspsi_gpu.f90 src/hs_1psi_gpu.f90 src/g_psi_gpu.f90 - src/stres_mgga_gpu.f90 src/atomic_wfc_gpu.f90 - src/dvloc_of_g_gpu.f90 - src/stres_ewa_gpu.f90 - src/stres_knl_gpu.f90 - src/compute_deff_gpu.f90 src/add_vhub_to_deeq_gpu.f90 src/s_1psi_gpu.f90 src/h_psi_gpu.f90 - src/stres_us_gpu.f90 - src/stres_cc_gpu.f90 src/utils_gpu.f90 src/vhpsi_gpu.f90 src/drhoc_gpu.f90 - src/stres_har_gpu.f90 src/vloc_psi_gpu.f90 src/hs_psi_gpu.f90) qe_enable_cuda_fortran("${src_pw}") diff --git a/PW/src/Coul_cut_2D.f90 b/PW/src/Coul_cut_2D.f90 index b8f9b711e..6deb10da3 100644 --- a/PW/src/Coul_cut_2D.f90 +++ b/PW/src/Coul_cut_2D.f90 @@ -352,7 +352,7 @@ END SUBROUTINE cutoff_force_lc ! ! !---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc ) +SUBROUTINE cutoff_stres_evloc( rho_G, strf, evloc ) !---------------------------------------------------------------------- !! This subroutine adds the contribution from the cutoff long-range part !! of the local part of the ionic potential to \(\text{evloc}\). @@ -363,14 +363,14 @@ SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc ) !! to re-compute it here for the stress. ! USE kinds - USE ions_base, ONLY : ntyp => nsp - USE gvect, ONLY : ngm, gstart - USE io_global, ONLY : stdout - USE fft_base, ONLY : dfftp + USE ions_base, ONLY: ntyp => nsp + USE gvect, ONLY: ngm, gstart + USE io_global, ONLY: stdout + USE fft_base, ONLY: dfftp ! IMPLICIT NONE ! - COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr) + COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr) !! charge density in G space COMPLEX(DP), INTENT(IN) :: strf(ngm,ntyp) !! the structure factor @@ -381,71 +381,28 @@ SUBROUTINE cutoff_stres_evloc( psic_G, strf, evloc ) ! INTEGER :: ng, nt ! - ! If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0 - ! So we start at gstart. + !$acc data present_or_copyin(rho_G,strf) + ! + ! ... If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0 + ! So we start at gstart. + ! + !$acc parallel loop collapse(2) reduction(+:evloc) copyin(lr_Vloc) DO nt = 1, ntyp - DO ng = gstart, ngm - evloc = evloc + DBLE( CONJG(psic_G(ng)) * strf(ng,nt) ) & - * lr_Vloc(ng,nt) - ENDDO + DO ng = gstart, ngm + evloc = evloc + DBLE( CONJG(rho_G(ng)) * strf(ng,nt) ) & + * lr_Vloc(ng,nt) + ENDDO ENDDO ! + !$acc end data + ! RETURN ! END SUBROUTINE cutoff_stres_evloc ! -!---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_evloc_gpu( psicG_d, strf_d, evloc ) - !---------------------------------------------------------------------- - !! cutoff_stres_evloc - gpu version - ! - USE kinds - USE ions_base, ONLY : ntyp => nsp - !USE vlocal, ONLY : strf - USE gvect, ONLY : ngm, gstart - USE io_global, ONLY : stdout - USE fft_base, ONLY : dfftp - ! - IMPLICIT NONE - ! - COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr) - !! charge density in G space - COMPLEX(DP), INTENT(IN) :: strf_d(ngm,ntyp) - REAL(DP), INTENT(INOUT) :: evloc - !! the energy of the electrons in the local ionic potential - ! - ! ... local variables - ! - REAL(DP), ALLOCATABLE :: lrVloc_d(:,:) - INTEGER :: ng, nt - ! -#if defined(__CUDA) - attributes(DEVICE) :: psicG_d, strf_d, lrVloc_d -#endif - ! - ALLOCATE( lrVloc_d(ngm,ntyp) ) - lrVloc_d = lr_Vloc - ! - ! If gstart=2, it means g(1) is G=0, but we have nothing to add for G=0 - ! So we start at gstart. - ! - !$cuf kernel do (2) - DO nt = 1, ntyp - DO ng = gstart, ngm - evloc = evloc + DBLE( CONJG(psicG_d(ng)) * strf_d(ng,nt) ) & - * lrVloc_d(ng,nt) - ENDDO - ENDDO - ! - DEALLOCATE( lrVloc_d ) - ! - RETURN - ! -END SUBROUTINE cutoff_stres_evloc_gpu -! ! !---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_sigmaloc( psic_G, strf, sigmaloc ) +SUBROUTINE cutoff_stres_sigmaloc( rho_G, strf, sigmaloc ) !---------------------------------------------------------------------- !! This subroutine adds the contribution from the cutoff long-range part !! of the local part of the ionic potential to the rest of the @@ -454,141 +411,67 @@ SUBROUTINE cutoff_stres_sigmaloc( psic_G, strf, sigmaloc ) USE kinds USE ions_base, ONLY : ntyp => nsp USE constants, ONLY : eps8 - USE gvect, ONLY : ngm, g, gg, gstart + USE gvect, ONLY : ngm, gstart, g, gg USE cell_base, ONLY : tpiba, tpiba2, alat, omega USE io_global, ONLY : stdout USE fft_base, ONLY : dfftp ! IMPLICIT NONE ! - COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr) + COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr) !! charge density in G space COMPLEX(DP), INTENT(IN) :: strf(ngm,ntyp) - !! the structure factor REAL(DP), INTENT(INOUT) :: sigmaloc(3,3) !! stress contribution for the local ionic potential ! ! ... local variables ! - INTEGER :: ng, nt, l, m - REAL(DP) :: Gp, G2lzo2Gp, beta, dlr_Vloc - ! - ! no G=0 contribution - DO nt = 1, ntyp - DO ng = gstart, ngm - ! - Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba - ! below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448 - IF (Gp < eps8) THEN - ! G^2*lz/2|Gp| - G2lzo2Gp = 0.0d0 - beta = 0.0d0 - ELSE - G2lzo2Gp = gg(ng)*tpiba2*lz/2.0d0/Gp - beta = G2lzo2Gp*(1.0d0-cutoff_2D(ng))/cutoff_2D(ng) - ENDIF - ! dlr_vloc corresponds to the derivative of the long-range local ionic potential - ! with respect to G - DO l = 1, 3 - IF (l == 3) THEN - dlr_Vloc = - 1.0d0/(gg(ng)*tpiba2) * lr_Vloc(ng,nt) & - * (1.0d0+ gg(ng)*tpiba2/4.0d0) - ELSE - dlr_Vloc = - 1.0d0/ (gg(ng)*tpiba2) * lr_Vloc(ng,nt) & - * (1.0d0- beta + gg(ng)*tpiba2/4.0d0) - ENDIF - ! - DO m = 1, l - sigmaloc(l,m) = sigmaloc(l,m) + DBLE( CONJG( psic_G(ng) ) & - * strf(ng,nt) ) * 2.0d0 * dlr_Vloc & - * tpiba2 * g(l,ng) * g(m,ng) - ENDDO - ENDDO - ! - ENDDO - ENDDO - ! - RETURN - ! -END SUBROUTINE cutoff_stres_sigmaloc -! -! -!---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_sigmaloc_gpu( psicG_d, strf_d, sigmaloc ) - !---------------------------------------------------------------------- - !! This subroutine adds the contribution from the cutoff long-range part - !! of the local part of the ionic potential to the rest of the - !! \(\text{sigmaloc}\). That is, the rest of Eq. (63) of PRB 96, 075448. - ! - USE kinds - USE ions_base, ONLY : ntyp => nsp - USE vlocal, ONLY : strf - USE constants, ONLY : eps8 - USE gvect, ONLY : ngm, gstart, g_d, gg_d - USE cell_base, ONLY : tpiba, tpiba2, alat, omega - USE io_global, ONLY : stdout - USE fft_base, ONLY : dfftp - ! - IMPLICIT NONE - ! - COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr) - !! charge density in G space - COMPLEX(DP), INTENT(IN) :: strf_d(ngm,ntyp) - REAL(DP), INTENT(INOUT) :: sigmaloc(3,3) - !! stress contribution for the local ionic potential - ! - ! ... local variables - ! - INTEGER :: ng, nt, l, m - REAL(DP), ALLOCATABLE :: lrVloc_d(:,:), cutoff2D_d(:) + INTEGER :: ng, nt, l, m REAL(DP) :: Gp, G2lzo2Gp, beta, dlr_Vloc1, dlr_Vloc2, dlr_Vloc3, & no_lm_dep REAL(DP) :: sigmaloc11, sigmaloc31, sigmaloc21, sigmaloc32, & sigmaloc22, sigmaloc33 -#if defined(__CUDA) - attributes(DEVICE) :: psicG_d, strf_d, lrVloc_d, cutoff2D_d -#endif - ! - ALLOCATE( lrVloc_d(ngm,ntyp), cutoff2D_d(ngm) ) - lrVloc_d = lr_Vloc - cutoff2D_d = cutoff_2D + ! + !$acc data present_or_copyin(rho_G,strf) ! sigmaloc11 = 0._DP ; sigmaloc31 = 0._DP sigmaloc21 = 0._DP ; sigmaloc32 = 0._DP sigmaloc22 = 0._DP ; sigmaloc33 = 0._DP ! - ! no G=0 contribution + ! ... no G=0 contribution ! - !$cuf kernel do (2) <<<*,*>>> + !$acc parallel loop collapse(2) copyin(lr_Vloc,cutoff_2D) & + !$acc reduction(+:sigmaloc11,sigmaloc21,sigmaloc22,sigmaloc31,& + !$acc sigmaloc32,sigmaloc33) DO nt = 1, ntyp DO ng = gstart, ngm ! - Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba - ! below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448 + Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba + ! ... below is a somewhat cumbersome way to define beta of Eq. (61) of PRB 96, 075448 IF (Gp < eps8) THEN - ! G^2*lz/2|Gp| + ! ... G^2*lz/2|Gp| G2lzo2Gp = 0._DP beta = 0._DP ELSE - G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp - beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng) + G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp + beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng) ENDIF - ! dlrVloc corresponds to the derivative of the long-range local ionic potential - ! with respect to G - dlr_Vloc1 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) & - * (1._DP- beta + gg_d(ng)*tpiba2/4._DP) - dlr_Vloc2 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) & - * (1._DP- beta + gg_d(ng)*tpiba2/4._DP) - dlr_Vloc3 = - 1._DP/ (gg_d(ng)*tpiba2) * lrVloc_d(ng,nt) & - * (1._DP+ gg_d(ng)*tpiba2/4._DP) - no_lm_dep = DBLE( CONJG( psicG_d(ng) ) & - * strf_d(ng,nt) ) * 2._DP * tpiba2 - sigmaloc11 = sigmaloc11 + no_lm_dep * dlr_Vloc1 * g_d(1,ng) * g_d(1,ng) - sigmaloc21 = sigmaloc21 + no_lm_dep * dlr_Vloc2 * g_d(2,ng) * g_d(1,ng) - sigmaloc22 = sigmaloc22 + no_lm_dep * dlr_Vloc2 * g_d(2,ng) * g_d(2,ng) - sigmaloc31 = sigmaloc31 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(1,ng) - sigmaloc32 = sigmaloc32 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(2,ng) - sigmaloc33 = sigmaloc33 + no_lm_dep * dlr_Vloc3 * g_d(3,ng) * g_d(3,ng) + ! ... dlrVloc corresponds to the derivative of the long-range local ionic potential + ! with respect to G + dlr_Vloc1 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) & + * (1._DP- beta + gg(ng)*tpiba2/4._DP) + dlr_Vloc2 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) & + * (1._DP- beta + gg(ng)*tpiba2/4._DP) + dlr_Vloc3 = -1._DP / (gg(ng)*tpiba2) * lr_Vloc(ng,nt) & + * (1._DP+ gg(ng)*tpiba2/4._DP) + no_lm_dep = DBLE( CONJG( rho_G(ng) ) & + * strf(ng,nt) ) * 2._DP * tpiba2 + sigmaloc11 = sigmaloc11 + no_lm_dep * dlr_Vloc1 * g(1,ng) * g(1,ng) + sigmaloc21 = sigmaloc21 + no_lm_dep * dlr_Vloc2 * g(2,ng) * g(1,ng) + sigmaloc22 = sigmaloc22 + no_lm_dep * dlr_Vloc2 * g(2,ng) * g(2,ng) + sigmaloc31 = sigmaloc31 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(1,ng) + sigmaloc32 = sigmaloc32 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(2,ng) + sigmaloc33 = sigmaloc33 + no_lm_dep * dlr_Vloc3 * g(3,ng) * g(3,ng) ! ENDDO ENDDO @@ -600,70 +483,15 @@ SUBROUTINE cutoff_stres_sigmaloc_gpu( psicG_d, strf_d, sigmaloc ) sigmaloc(3,2) = sigmaloc(3,2) + sigmaloc32 sigmaloc(3,3) = sigmaloc(3,3) + sigmaloc33 ! - DEALLOCATE( lrVloc_d, cutoff2D_d ) + !$acc end data ! RETURN ! -END SUBROUTINE cutoff_stres_sigmaloc_gpu +END SUBROUTINE cutoff_stres_sigmaloc ! ! !---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_sigmahar( psic_G, sigmahar ) - !---------------------------------------------------------------------- - !! This subroutine cuts off the Hartree part of the stress. - !! See Eq. (62) of PRB 96, 075448. - ! - USE kinds - USE gvect, ONLY : ngm, g, gg, gstart - USE constants, ONLY : eps8 - USE cell_base, ONLY : tpiba2, alat, tpiba - USE io_global, ONLY : stdout - USE fft_base, ONLY : dfftp - ! - IMPLICIT NONE - ! - COMPLEX(DP), INTENT(IN) :: psic_G(dfftp%nnr) - !! charge density in G-space - REAL(DP), INTENT(INOUT) :: sigmahar(3,3) - !! hartree contribution to stress - ! - ! ... local variables - ! - INTEGER :: ng, nt, l, m - REAL(DP) :: Gp, G2lzo2Gp, beta, shart, g2, fact - ! - DO ng = gstart, ngm - Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba - IF (Gp < eps8) THEN - G2lzo2Gp = 0.0d0 - beta = 0.0d0 - ELSE - G2lzo2Gp = gg(ng)*tpiba2*lz/2.0d0/Gp - beta = G2lzo2Gp*(1.0d0-cutoff_2D(ng))/cutoff_2D(ng) - ENDIF - g2 = gg (ng) * tpiba2 - shart = psic_G(ng) * CONJG(psic_G(ng)) / g2 * cutoff_2D(ng) - DO l = 1, 3 - IF (l == 3) THEN - fact = 1.0d0 - ELSE - fact = 1.0d0 - beta - ENDIF - DO m = 1, l - sigmahar(l,m) = sigmahar(l,m) + shart * tpiba2 * 2 * & - g(l,ng) * g(m,ng) / g2 * fact - ENDDO - ENDDO - ENDDO - !sigma is multiplied by 0.5*fpi*e2 after - ! - RETURN - ! -END SUBROUTINE cutoff_stres_sigmahar -! -! -!---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar ) +SUBROUTINE cutoff_stres_sigmahar( rho_G, sigmahar ) !---------------------------------------------------------------------- !! This subroutine cuts off the Hartree part of the stress. !! See Eq. (62) of PRB 96, 075448. @@ -674,14 +502,14 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar ) USE cell_base, ONLY: tpiba2, alat, tpiba USE io_global, ONLY: stdout USE fft_base, ONLY: dfftp - USE gvect, ONLY: g_d, gg_d + USE gvect, ONLY: g, gg ! IMPLICIT NONE ! - COMPLEX(DP), INTENT(IN) :: psicG_d(dfftp%nnr) + COMPLEX(DP), INTENT(IN) :: rho_G(dfftp%nnr) !! charge density in G-space REAL(DP), INTENT(INOUT) :: sigmahar(3,3) - !! hartree contribution to stress + !! Hartree contribution to stress ! ! ... local variables ! @@ -689,50 +517,45 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar ) REAL(DP) :: Gp, G2lzo2Gp, beta, shart, g2, fact REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, & sigmahar32, sigmahar22, sigmahar33 - REAL(DP), ALLOCATABLE :: cutoff2D_d(:) ! -#if defined(__CUDA) - attributes(DEVICE) :: psicG_d, cutoff2D_d -#endif - ! - ALLOCATE( cutoff2D_d(ngm) ) - cutoff2D_d = cutoff_2D + !$acc data present_or_copyin(rho_G) ! sigmahar11 = 0._DP ; sigmahar31 = 0._DP sigmahar21 = 0._DP ; sigmahar32 = 0._DP sigmahar22 = 0._DP ; sigmahar33 = 0._DP ! - !$cuf kernel do (1) <<<*,*>>> + !$acc parallel loop copyin(cutoff_2D) & + !$acc reduction(+:sigmahar11,sigmahar21,sigmahar22,sigmahar31,& + !$acc sigmahar32,sigmahar33) DO ng = gstart, ngm - Gp = SQRT(g_d(1,ng)**2 + g_d(2,ng)**2)*tpiba + Gp = SQRT(g(1,ng)**2 + g(2,ng)**2)*tpiba IF (Gp < eps8) THEN G2lzo2Gp = 0._DP beta = 0._DP ELSE - G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp - beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng) + G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp + beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng) ENDIF ! - g2 = gg_d(ng) * tpiba2 + g2 = gg(ng) * tpiba2 ! - shart = DBLE(psicG_d(ng)*CONJG(psicG_d(ng))) /& - g2 * cutoff2D_d(ng) + shart = DBLE(rho_G(ng)*CONJG(rho_G(ng))) / & + g2 * cutoff_2D(ng) ! fact = 1._DP - beta ! sigmahar11 = sigmahar11 + shart *tpiba2*2._DP * & - g_d(1,ng) * g_d(1,ng) / g2 * fact + g(1,ng) * g(1,ng) / g2 * fact sigmahar21 = sigmahar21 + shart *tpiba2*2._DP * & - g_d(2,ng) * g_d(1,ng) / g2 * fact + g(2,ng) * g(1,ng) / g2 * fact sigmahar22 = sigmahar22 + shart *tpiba2*2._DP * & - g_d(2,ng) * g_d(2,ng) / g2 * fact + g(2,ng) * g(2,ng) / g2 * fact sigmahar31 = sigmahar31 + shart *tpiba2*2._DP * & - g_d(3,ng) * g_d(1,ng) / g2 + g(3,ng) * g(1,ng) / g2 sigmahar32 = sigmahar32 + shart *tpiba2*2._DP * & - g_d(3,ng) * g_d(2,ng) / g2 + g(3,ng) * g(2,ng) / g2 sigmahar33 = sigmahar33 + shart *tpiba2*2._DP * & - g_d(3,ng) * g_d(3,ng) / g2 - ! + g(3,ng) * g(3,ng) / g2 ENDDO ! sigmahar(1,1) = sigmahar(1,1) + sigmahar11 @@ -743,11 +566,11 @@ SUBROUTINE cutoff_stres_sigmahar_gpu( psicG_d, sigmahar ) sigmahar(3,3) = sigmahar(3,3) + sigmahar33 !sigma is multiplied by 0.5*fpi*e2 after ! - DEALLOCATE( cutoff2D_d ) + !$acc end data ! RETURN ! -END SUBROUTINE cutoff_stres_sigmahar_gpu +END SUBROUTINE cutoff_stres_sigmahar ! ! !---------------------------------------------------------------------- @@ -759,87 +582,7 @@ SUBROUTINE cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa ) USE kinds USE ions_base, ONLY : nat, zv, tau, ityp USE constants, ONLY : e2, eps8 - USE gvect, ONLY : ngm, g, gg, gstart - USE cell_base, ONLY : tpiba2, alat, omega, tpiba - USE io_global, ONLY : stdout - ! - IMPLICIT NONE - ! - REAL(DP), INTENT(IN) :: alpha - !! tuning param for LR/SR separation - REAL(DP), INTENT(INOUT) :: sigmaewa(3,3) - !! ewald contribution to stress - REAL(DP), INTENT(INOUT) :: sdewald - !! constant and diagonal terms - ! - ! ... local variables - ! - INTEGER :: ng, na, l, m - REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact - COMPLEX(DP) :: rhostar - ! - ! g(1) is a problem if it's G=0, because we divide by G^2. - ! So start at gstart. - ! fact=1.0d0, gamma_only not implemented - ! G=0 componenent of the long-range part of the local part of the - ! pseudopotminus the Hartree potential is set to 0. - ! in other words, sdewald=0. - ! sdewald is the last term in equation B1 of PRB 32 3792. - ! See also similar comment for ewaldg in cutoff_ewald routine - ! - sdewald = 0._DP - DO ng = gstart, ngm - Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba - IF (Gp < eps8) THEN - G2lzo2Gp = 0._DP - beta = 0._DP - ELSE - G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp - beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng) - ENDIF - g2 = gg(ng) * tpiba2 - g2a = g2 / 4._DP / alpha - rhostar = (0._DP,0._DP) - DO na = 1, nat - arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + & - g(3,ng) * tau(3,na) ) * tpi - rhostar = rhostar + zv (ityp(na) ) * CMPLX(COS(arg), SIN(arg), KIND=DP) - ENDDO - rhostar = rhostar / omega - sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2 - ! ... sewald is an other diagonal term that is similar to the diagonal terms - ! in the other stress contributions. It basically gives a term prop to - ! the ewald energy - sdewald = sdewald-sewald - DO l = 1, 3 - IF (l == 3) THEN - fact = (g2a + 1.0d0) - ELSE - fact = (1.0d0+g2a-beta) - ENDIF - ! - DO m = 1, l - sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * & - g(l,ng) * g(m,ng) / g2 * fact - ENDDO - ENDDO - ! - ENDDO - ! - RETURN - ! -END SUBROUTINE cutoff_stres_sigmaewa -! -!---------------------------------------------------------------------- -SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) - !---------------------------------------------------------------------- - !! This subroutine cuts off the Ewald part of the stress. - !! See Eq. (64) in PRB 96 075448 - ! - USE kinds - USE ions_base, ONLY : nat, zv, tau, ityp - USE constants, ONLY : e2, eps8 - USE gvect, ONLY : ngm, gstart, g_d, gg_d + USE gvect, ONLY : ngm, gstart, g, gg USE cell_base, ONLY : tpiba2, alat, omega, tpiba USE io_global, ONLY : stdout ! @@ -858,72 +601,62 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) REAL(DP) :: Gp, G2lzo2Gp, beta, sewald, g2, g2a, arg, fact REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33 COMPLEX(DP) :: rhostar - ! - INTEGER , ALLOCATABLE :: ityp_d(:) - REAL(DP), ALLOCATABLE :: cutoff2D_d(:), tau_d(:,:), zv_d(:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: cutoff2D_d, tau_d, zv_d, ityp_d -#endif ! ntyp = SIZE(zv) - ALLOCATE( cutoff2D_d(ngm), tau_d(3,nat), zv_d(ntyp) ) - ALLOCATE( ityp_d(nat) ) - cutoff2D_d = cutoff_2D - tau_d = tau - zv_d = zv - ityp_d = ityp - ! g(1) is a problem if it's G=0, because we divide by G^2. - ! So start at gstart. - ! fact=1.0d0, gamma_only not implemented - ! G=0 componenent of the long-range part of the local part of the - ! pseudopotminus the Hartree potential is set to 0. - ! in other words, sdewald=0. - ! sdewald is the last term in equation B1 of PRB 32 3792. - ! See also similar comment for ewaldg in cutoff_ewald routine + ! + ! ... g(1) is a problem if it's G=0, because we divide by G^2. + ! So start at gstart. + ! fact=1.0d0, gamma_only not implemented + ! G=0 componenent of the long-range part of the local part of the + ! pseudopotminus the Hartree potential is set to 0. + ! in other words, sdewald=0. + ! sdewald is the last term in equation B1 of PRB 32 3792. + ! See also similar comment for ewaldg in cutoff_ewald routine ! sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP ! sdewald = 0._DP ! - !$cuf kernel do (1) <<<*,*>>> + !$acc parallel loop copyin(cutoff_2D,tau,zv,ityp) & + !$acc& reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32, & + !$acc& sigma33) DO ng = gstart, ngm - Gp = SQRT( g_d(1,ng)**2 + g_d(2,ng)**2 )*tpiba + Gp = SQRT( g(1,ng)**2 + g(2,ng)**2 )*tpiba IF (Gp < eps8) THEN G2lzo2Gp = 0._DP beta = 0._DP ELSE - G2lzo2Gp = gg_d(ng)*tpiba2*lz/2._DP/Gp - beta = G2lzo2Gp*(1._DP-cutoff2D_d(ng))/cutoff2D_d(ng) + G2lzo2Gp = gg(ng)*tpiba2*lz/2._DP/Gp + beta = G2lzo2Gp*(1._DP-cutoff_2D(ng))/cutoff_2D(ng) ENDIF - g2 = gg_d(ng) * tpiba2 + g2 = gg(ng) * tpiba2 g2a = g2 / 4._DP / alpha rhostar = (0._DP,0._DP) DO na = 1, nat - arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + & - g_d(3,ng) * tau_d(3,na) ) * tpi - rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP) + arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + & + g(3,ng) * tau(3,na) ) * tpi + rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg),SIN(arg),KIND=DP) ENDDO rhostar = rhostar / CMPLX(omega) - sewald = tpi * e2 * EXP(-g2a) / g2* cutoff2D_d(ng) * ABS(rhostar)**2 + sewald = tpi * e2 * EXP(-g2a) / g2* cutoff_2D(ng) * ABS(rhostar)**2 ! ... sewald is an other diagonal term that is similar to the diagonal terms - ! in the other stress contributions. It basically gives a term prop to - ! the ewald energy + ! in the other stress contributions. It basically gives a term prop to + ! the ewald energy ! sdewald = sdewald - sewald sigma11 = sigma11 + sewald * tpiba2 * 2._DP * & - g_d(1,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta) + g(1,ng) * g(1,ng) / g2 * (1._DP+g2a-beta) sigma21 = sigma21 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(1,ng) / g2 * (1._DP+g2a-beta) + g(2,ng) * g(1,ng) / g2 * (1._DP+g2a-beta) sigma22 = sigma22 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(2,ng) / g2 * (1._DP+g2a-beta) + g(2,ng) * g(2,ng) / g2 * (1._DP+g2a-beta) sigma31 = sigma31 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(1,ng) / g2 * (g2a+1._DP) + g(3,ng) * g(1,ng) / g2 * (g2a+1._DP) sigma32 = sigma32 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(2,ng) / g2 * (g2a+1._DP) + g(3,ng) * g(2,ng) / g2 * (g2a+1._DP) sigma33 = sigma33 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(3,ng) / g2 * (g2a+1._DP) + g(3,ng) * g(3,ng) / g2 * (g2a+1._DP) ! ENDDO ! @@ -934,11 +667,9 @@ SUBROUTINE cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) sigmaewa(3,2) = sigmaewa(3,2) + sigma32 sigmaewa(3,3) = sigmaewa(3,3) + sigma33 ! - DEALLOCATE( cutoff2D_d, tau_d, zv_d ) - DEALLOCATE( ityp_d ) - ! RETURN ! -END SUBROUTINE cutoff_stres_sigmaewa_gpu +END SUBROUTINE cutoff_stres_sigmaewa +! ! END MODULE Coul_cut_2D diff --git a/PW/src/Makefile b/PW/src/Makefile index 0016fd01d..37e0fbc54 100644 --- a/PW/src/Makefile +++ b/PW/src/Makefile @@ -290,16 +290,6 @@ PWLIBS += \ drhoc_gpu.o \ addusforce_gpu.o \ force_corr_gpu.o \ - stres_loc_gpu.o \ - dvloc_of_g_gpu.o \ - stres_har_gpu.o \ - stres_mgga_gpu.o \ - stres_cc_gpu.o \ - deriv_drhoc_gpu.o \ - stres_ewa_gpu.o \ - stres_knl_gpu.o \ - stres_us_gpu.o \ - compute_deff_gpu.o \ orthoatwfc_gpu.o \ atomic_wfc_gpu.o diff --git a/PW/src/add_paw_to_deeq_gpu.f90 b/PW/src/add_paw_to_deeq_gpu.f90 index 281ed437c..e529bf515 100644 --- a/PW/src/add_paw_to_deeq_gpu.f90 +++ b/PW/src/add_paw_to_deeq_gpu.f90 @@ -19,7 +19,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d) ! IMPLICIT NONE ! - REAL(KIND=DP), INTENT(INOUT) :: deeq_d( nhm, nhm, nat, nspin ) + REAL(DP), INTENT(INOUT) :: deeq_d(nhm,nhm,nat,nspin) !! integral of the perturbed potential ! ! ... local variables @@ -27,7 +27,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d) INTEGER :: na, nb, nab, nt, ih, jh, ijh, nhnt, is REAL(DP), ALLOCATABLE :: ddd_paw_d(:,:,:) #if defined(__CUDA) - attributes(DEVICE) :: deeq_d, ddd_paw_d + attributes(DEVICE) :: ddd_paw_d, deeq_d #endif ! OPTIMIZE HERE: squeeze loop on atoms having PAW pseudo @@ -39,7 +39,7 @@ SUBROUTINE add_paw_to_deeq_gpu(deeq_d) nt = ityp(na) IF (.not.upf(nt)%tpawp) CYCLE nhnt = nh(nt) -!$cuf kernel do(3) + !$acc parallel loop collapse(3) DO is=1,nspin DO ih=1,nhnt DO jh=1,nhnt diff --git a/PW/src/add_vhub_to_deeq_gpu.f90 b/PW/src/add_vhub_to_deeq_gpu.f90 index 02a88f202..8d5710d03 100644 --- a/PW/src/add_vhub_to_deeq_gpu.f90 +++ b/PW/src/add_vhub_to_deeq_gpu.f90 @@ -7,8 +7,8 @@ ! SUBROUTINE add_vhub_to_deeq_gpu( deeq_d ) !----------------------------------------------------------------- - !! Add Hubbard contributions to the integral of V_eff and Q_{nm} when - !! Hubbard_projectors is pseudo. + !! Add Hubbard contributions to the integral of \(V_\text{eff}\) + !! and \(Q_{nm}\) when Hubbard_projectors is pseudo. ! USE kinds, ONLY : DP USE ions_base, ONLY : nat, ntyp => nsp, ityp @@ -21,27 +21,23 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d ) ! IMPLICIT NONE ! - REAL(KIND=DP), INTENT(INOUT) :: deeq_d( nhm, nhm, nat, nspin ) - !! integral of V_eff and Q_{nm} - ! -#if defined(__CUDA) - attributes(DEVICE) :: deeq_d -#endif + REAL(KIND=DP), INTENT(INOUT) :: deeq_d(nhm,nhm,nat,nspin) + !! integral of \(V_\text{eff}\) and \(Q_{nm}\) ! ! ... local variables ! - REAL(KIND=DP), ALLOCATABLE :: deeq_aux_h( :, :, : ) - REAL(KIND=DP), ALLOCATABLE :: deeq_aux_d( :, :, : ) + REAL(KIND=DP), ALLOCATABLE :: deeq_aux_h(:,:,:) + REAL(KIND=DP), ALLOCATABLE :: deeq_aux_d(:,:,:) #if defined(__CUDA) - attributes(DEVICE) :: deeq_aux_d + attributes(DEVICE) :: deeq_aux_d, deeq_d #endif INTEGER :: na, nt, ih, jh, ijh, m1, m2, ow1, ow2, is, nhnt ! - ! ! (maybe) OPTIMIZE here ... reorder the loop ? ! - ALLOCATE(deeq_aux_h( nhm, nhm, nspin )) - ALLOCATE(deeq_aux_d( nhm, nhm, nspin )) + ALLOCATE( deeq_aux_h(nhm,nhm,nspin) ) + ALLOCATE( deeq_aux_d(nhm,nhm,nspin) ) + ! DO na = 1, nat ! nt = ityp(na) @@ -68,11 +64,12 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d ) ! ENDDO ENDDO - CALL dev_memcpy(deeq_aux_d, deeq_aux_h) + ! + CALL dev_memcpy( deeq_aux_d, deeq_aux_h ) ! HERE USE devXlib nhnt = nh(nt) - !$cuf kernel do(3) - DO is=1, nspin + !$acc parallel loop collapse(3) + DO is = 1, nspin DO ih = 1, nhnt DO jh = 1, nhnt deeq_d(jh,ih,na,is) = deeq_d(jh,ih,na,is) + deeq_aux_d(jh,ih,is) @@ -81,7 +78,7 @@ SUBROUTINE add_vhub_to_deeq_gpu( deeq_d ) END DO ! ENDDO - + ! DEALLOCATE(deeq_aux_h, deeq_aux_d) ! END SUBROUTINE add_vhub_to_deeq_gpu diff --git a/PW/src/add_vuspsi_gpu.f90 b/PW/src/add_vuspsi_gpu.f90 index d9fc8cbcf..80be98262 100644 --- a/PW/src/add_vuspsi_gpu.f90 +++ b/PW/src/add_vuspsi_gpu.f90 @@ -19,7 +19,7 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d ) USE lsda_mod, ONLY: current_spin USE control_flags, ONLY: gamma_only USE noncollin_module - USE uspp, ONLY: ofsbeta, nkb, vkb, deeq_d, deeq_nc_d + USE uspp, ONLY: ofsbeta, nkb, vkb, deeq, deeq_nc USE uspp_param, ONLY: nh, nhm USE becmod_gpum, ONLY: bec_type_d, becp_d, using_becp_r_d, & using_becp_k_d, using_becp_nc_d @@ -133,10 +133,12 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d ) ! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt)) ! IF ( m_loc > 0 ) THEN + !$acc host_data use_device(deeq) CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, & - deeq_d(1,1,na,current_spin), nhm, & + deeq(1,1,na,current_spin), nhm, & becp_d%r_d(ofsbeta(na)+1,1), nkb, 0.0_dp, & ps_d(ofsbeta(na)+1,1), nkb ) + !$acc end host_data END IF ! END IF @@ -240,12 +242,12 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d ) ! deeq is real: copy it into a complex variable to perform ! a zgemm - simple but sub-optimal solution ! - !deeaux_d(:,:) = CMPLX(deeq_d(1:nh(nt),1:nh(nt),na,current_spin), 0.0_dp, KIND=dp ) + !deeaux_d(:,:) = CMPLX(deeq(1:nh(nt),1:nh(nt),na,current_spin), 0.0_dp, KIND=dp ) ! -!$cuf kernel do(2) <<<*,*>>> + !$acc parallel loop collapse(2) present(deeq) DO j = 1, nhnt DO k = 1, nhnt - deeaux_d(k,j) = CMPLX(deeq_d(k,j,na,current_spin), 0.0_dp, KIND=DP ) + deeaux_d(k,j) = CMPLX(deeq(k,j,na,current_spin), 0.0_dp, KIND=DP ) END DO END DO ! @@ -313,29 +315,31 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d ) ! IF ( ityp(na) == nt ) THEN ! + !$acc host_data use_device(deeq_nc) CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - deeq_nc_d(1,1,na,1), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, & + deeq_nc(1,1,na,1), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, & (0.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,1,1), 2*nkb ) CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - deeq_nc_d(1,1,na,2), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, & + deeq_nc(1,1,na,2), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, & (1.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,1,1), 2*nkb ) CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - deeq_nc_d(1,1,na,3), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, & + deeq_nc(1,1,na,3), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), 2*nkb, & (0.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,2,1), 2*nkb ) CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - deeq_nc_d(1,1,na,4), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, & + deeq_nc(1,1,na,4), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), 2*nkb, & (1.0_dp, 0.0_dp), ps_d(ofsbeta(na)+1,2,1), 2*nkb ) - + !$acc end host_data + ! ! DO ibnd = 1, m ! ! ! DO jh = 1, nh(nt) ! ! -!!$cuf kernel do(1) <<<*,*>>> +!!$acc parallel loop present(deeq_nc) ! DO ih = 1, nh(nt) ! ! ! ikb = ijkb0 + ih @@ -344,11 +348,11 @@ SUBROUTINE add_vuspsi_gpu( lda, n, m, hpsi_d ) ! becpdn_jkb = becp_nc_d(jkb,2,ibnd) ! ! ! ps_d(ikb,1,ibnd) = ps_d(ikb,1,ibnd) + & -! deeq_nc_d(ih,jh,na,1)*becpup_jkb + & -! deeq_nc_d(ih,jh,na,2)*becpdn_jkb +! deeq_nc(ih,jh,na,1)*becpup_jkb + & +! deeq_nc(ih,jh,na,2)*becpdn_jkb ! ps_d(ikb,2,ibnd) = ps_d(ikb,2,ibnd) + & -! deeq_nc_d(ih,jh,na,3)*becpup_jkb + & -! deeq_nc_d(ih,jh,na,4)*becpdn_jkb +! deeq_nc(ih,jh,na,3)*becpup_jkb + & +! deeq_nc(ih,jh,na,4)*becpdn_jkb ! ! ! END DO ! ! diff --git a/PW/src/addusstress.f90 b/PW/src/addusstress.f90 index fb5db59f6..42e713aa7 100644 --- a/PW/src/addusstress.f90 +++ b/PW/src/addusstress.f90 @@ -23,7 +23,7 @@ SUBROUTINE addusstress( sigmanlc ) ! ... local variables ! REAL(DP) :: sigma_r(3,3), sigma_g(3,3) - INTEGER :: na,ijh, ipol,jpol + INTEGER :: na, ijh, ipol, jpol ! IF ( tqr ) THEN sigma_r(:,:) = 0.d0 @@ -56,7 +56,7 @@ SUBROUTINE addusstress_g( sigmanlc ) !! routine). ! USE kinds, ONLY : DP - USE ions_base, ONLY : nat, ntyp => nsp, ityp + USE ions_base, ONLY : nat, ntyp=>nsp, ityp USE cell_base, ONLY : omega, tpiba USE fft_base, ONLY : dfftp USE fft_rho, ONLY : rho_r2g @@ -71,7 +71,7 @@ SUBROUTINE addusstress_g( sigmanlc ) ! IMPLICIT NONE ! - REAL(DP), INTENT(INOUT) :: sigmanlc(3, 3) + REAL(DP), INTENT(INOUT) :: sigmanlc(3,3) !! the nonlocal stress ! ! ... local variables @@ -82,18 +82,18 @@ SUBROUTINE addusstress_g( sigmanlc ) ! counters COMPLEX(DP), ALLOCATABLE :: aux1(:,:), aux2(:,:), vg(:,:), qgm(:,:) ! work space (complex) - COMPLEX(DP) :: cfac - REAL(DP) :: fac(3,nspin), sus(3,3) + COMPLEX(DP) :: cfac + REAL(DP) :: fac(3,nspin), sus(3,3) ! auxiliary variables REAL(DP) , ALLOCATABLE :: qmod(:), ylmk0(:,:), dylmk0(:,:), tbecsum(:,:) ! work space (real) ! - ! sus(:,:) = 0.d0 ! - ! Fourier transform of the total effective potential + ! ... Fourier transform of the total effective potential ! - ALLOCATE( vg(ngm,nspin) ) + ALLOCATE( vg(ngm,nspin) ) + !$acc data create( vg ) ! DO is = 1, nspin IF ( nspin == 4 .and. is /= 1 ) THEN @@ -103,10 +103,10 @@ SUBROUTINE addusstress_g( sigmanlc ) ENDIF ENDDO ! - ! With k-point parallelization, distribute G-vectors across processors - ! ngm_s = index of first G-vector for this processor - ! ngm_e = index of last G-vector for this processor - ! ngm_l = local number of G-vectors + ! ... With k-point parallelization, distribute G-vectors across processors: + ! ngm_s = index of first G-vector for this processor + ! ngm_e = index of last G-vector for this processor + ! ngm_l = local number of G-vectors ! CALL divide( inter_pool_comm, ngm, ngm_s, ngm_e ) ngm_l = ngm_e-ngm_s+1 @@ -115,80 +115,132 @@ SUBROUTINE addusstress_g( sigmanlc ) ! ALLOCATE( aux1(ngm_l,3), aux2(ngm_l,nspin), qmod(ngm_l) ) ALLOCATE( ylmk0(ngm_l,lmaxq*lmaxq), dylmk0(ngm_l,lmaxq*lmaxq) ) + !$acc data create(aux1,aux2,qmod) + !$acc data create(ylmk0,dylmk0) ! - CALL ylmr2( lmaxq * lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 ) +#if defined(__CUDA) + !$acc host_data use_device(g,gg,ylmk0) + CALL ylmr2_gpu( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 ) + !$acc end host_data +#else + CALL ylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), ylmk0 ) + !$acc update device(ylmk0) +#endif ! + !$acc parallel loop DO ig = 1, ngm_l qmod(ig) = SQRT( gg(ngm_s+ig-1) ) * tpiba ENDDO ! - ! here we compute the integral Q*V for each atom, + ! ... here we compute the integral Q*V for each atom, ! I = sum_G i G_a exp(-iR.G) Q_nm v^* - ! (no contribution from G=0) + ! (no contribution from G=0) ! DO ipol = 1, 3 - CALL dylmr2( lmaxq * lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol ) + ! +#if defined(__CUDA) + !$acc host_data use_device(g,gg,dylmk0) + CALL dylmr2_gpu( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol ) + !$acc end host_data +#else + CALL dylmr2( lmaxq*lmaxq, ngm_l, g(1,ngm_s), gg(ngm_s), dylmk0, ipol ) + !$acc update device(dylmk0) +#endif + ! DO nt = 1, ntyp + ! IF ( upf(nt)%tvanp ) THEN nij = nh(nt)*(nh(nt)+1)/2 ALLOCATE( qgm(ngm_l,nij), tbecsum(nij,nspin) ) + !$acc data create(qgm,tbecsum) ijh = 0 DO ih = 1, nh(nt) DO jh = ih, nh(nt) ijh = ijh + 1 CALL dqvan2( ih, jh, nt, ipol, ngm_l, g(1,ngm_s), tpiba, & - qmod, ylmk0, dylmk0, qgm(1,ijh) ) + qmod, ylmk0, dylmk0, qgm(1,ijh) ) ENDDO ENDDO ! DO na = 1, nat IF (ityp(na) == nt) THEN ! - tbecsum(:,:) = becsum( 1:nij, na, 1:nspin ) + !$acc kernels + tbecsum(:,:) = becsum(1:nij,na,1:nspin) + !$acc end kernels ! - CALL dgemm( 'N', 'N', 2*ngm_l, nspin, nij, 1.0_dp, & - qgm, 2*ngm_l, tbecsum, nij, 0.0_dp, aux2, 2*ngm_l ) + !$acc host_data use_device(qgm,tbecsum,aux2) + CALL MYDGEMM( 'N', 'N', 2*ngm_l, nspin, nij, 1.0_dp, & + qgm, 2*ngm_l, tbecsum, nij, 0.0_dp, aux2, 2*ngm_l ) + !$acc end host_data ! +#if defined(_OPENACC) +!$acc parallel loop collapse(2) +#else !$omp parallel do default(shared) private(is, ig) +#endif DO is = 1, nspin DO ig = 1, ngm_l aux2(ig,is) = aux2(ig,is) * CONJG(vg (ngm_s+ig-1, is)) - END DO - END DO + ENDDO + ENDDO +#if defined(_OPENACC) +!$acc parallel loop +#else !$omp end parallel do !$omp parallel do default(shared) private(ig, cfac) +#endif DO ig = 1, ngm_l - cfac = CONJG( eigts1(mill (1,ngm_s+ig-1), na) * & - eigts2(mill (2,ngm_s+ig-1), na) * & - eigts3(mill (3,ngm_s+ig-1), na) ) * tpiba - aux1(ig,1) = cfac * g(1,ngm_s+ig-1) - aux1(ig,2) = cfac * g(2,ngm_s+ig-1) - aux1(ig,3) = cfac * g(3,ngm_s+ig-1) - ENDDO + cfac = CONJG( eigts1(mill(1,ngm_s+ig-1),na) * & + eigts2(mill(2,ngm_s+ig-1),na) * & + eigts3(mill(3,ngm_s+ig-1),na) ) * tpiba + aux1(ig,1) = cfac * g(1,ngm_s+ig-1) + aux1(ig,2) = cfac * g(2,ngm_s+ig-1) + aux1(ig,3) = cfac * g(3,ngm_s+ig-1) + ENDDO +#if !defined(_OPENACC) !$omp end parallel do - CALL DGEMM('T','N', 3, nspin, 2*ngm_l, 1.0_dp, aux1, 2*ngm_l, & - aux2, 2*ngm_l, 0.0_dp, fac, 3 ) - DO is = 1, nspin - DO jpol = 1, 3 - sus(ipol, jpol) = sus(ipol, jpol) - omega * fac(jpol, is) - ENDDO - ENDDO +#endif + ! + !$acc data copyout( fac ) + !$acc host_data use_device(aux1,aux2,fac) + CALL MYDGEMM( 'T','N', 3, nspin, 2*ngm_l, 1.0_dp, aux1, 2*ngm_l, & + aux2, 2*ngm_l, 0.0_dp, fac, 3 ) + !$acc end host_data + !$acc end data + ! + DO is = 1, nspin + DO jpol = 1, 3 + sus(ipol,jpol) = sus(ipol,jpol) - omega * fac(jpol,is) + ENDDO + ENDDO + ! ENDIF ENDDO + !$acc end data DEALLOCATE( tbecsum, qgm ) ENDIF ENDDO - + ! ENDDO + ! + !$acc end data + !$acc end data + DEALLOCATE( ylmk0, dylmk0 ) + DEALLOCATE( aux1, aux2, qmod ) + ! 10 CONTINUE + ! CALL mp_sum( sus, inter_pool_comm ) + ! IF (gamma_only) THEN sigmanlc(:,:) = sigmanlc(:,:) + 2.0_dp*sus(:,:) ELSE sigmanlc(:,:) = sigmanlc(:,:) + sus(:,:) ENDIF - DEALLOCATE( ylmk0, dylmk0 ) - DEALLOCATE( aux1, aux2, vg, qmod ) + ! + !$acc end data + DEALLOCATE( vg ) ! RETURN ! diff --git a/PW/src/compute_deff.f90 b/PW/src/compute_deff.f90 index 0b63d2655..878ddb9d7 100644 --- a/PW/src/compute_deff.f90 +++ b/PW/src/compute_deff.f90 @@ -5,16 +5,17 @@ ! in the root directory of the present distribution, ! or http://www.gnu.org/copyleft/gpl.txt . ! +! !------------------------------------------------------------------------- SUBROUTINE compute_deff( deff, et ) !----------------------------------------------------------------------- - !! This routine computes the effective value of the D-eS coefficients - !! which appear often in many expressions in the US or PAW case. - !! This routine is for the collinear case. + !! This routine computes the effective value of the D-eS coefficients + !! which appear often in many expressions in the US or PAW case. + !! This routine is for the collinear case. ! USE kinds, ONLY: DP - USE ions_base, ONLY: nsp, nat, ityp - USE uspp, ONLY: deeq, qq_at, okvan + USE ions_base, ONLY: nat + USE uspp, ONLY: okvan, deeq, qq_at USE uspp_param, ONLY: nhm USE lsda_mod, ONLY: current_spin ! @@ -27,24 +28,21 @@ SUBROUTINE compute_deff( deff, et ) ! ! ... local variables ! - INTEGER :: nt, na, is + INTEGER :: nt, na, is, i, j ! - deff(:,:,:) = deeq(:,:,:,current_spin) + !$acc kernels present_or_copyout(deff) ! - IF (okvan) THEN + IF (.NOT. okvan) THEN ! - DO nt = 1, nsp - DO na = 1, nat - ! - IF ( ityp(na) == nt ) THEN - deff(:,:,na) = deff(:,:,na) - et*qq_at(:,:,na) - ENDIF - ! - ENDDO - ENDDO + deff(:,:,:) = deeq(:,:,:,current_spin) + ! + ELSE + ! + deff(:,:,:) = deeq(:,:,:,current_spin) - et*qq_at(:,:,:) ! ENDIF ! + !$acc end kernels ! RETURN ! @@ -52,7 +50,7 @@ END SUBROUTINE compute_deff ! ! !--------------------------------------------------------------------------- -SUBROUTINE compute_deff_nc( deff, et ) +SUBROUTINE compute_deff_nc( deff_nc, et ) !------------------------------------------------------------------------- !! This routine computes the effective value of the D-eS coefficients !! which appears often in many expressions. This routine is for the @@ -60,8 +58,8 @@ SUBROUTINE compute_deff_nc( deff, et ) ! USE kinds, ONLY: DP USE ions_base, ONLY: nsp, nat, ityp - USE noncollin_module, ONLY: npol, lspinorb - USE uspp, ONLY: deeq_nc, qq_at, qq_so, okvan + USE noncollin_module, ONLY: noncolin, npol, lspinorb + USE uspp, ONLY: okvan, deeq_nc, qq_so, qq_at USE uspp_param, ONLY: nhm USE lsda_mod, ONLY: nspin ! @@ -69,42 +67,71 @@ SUBROUTINE compute_deff_nc( deff, et ) ! REAL(DP), INTENT(IN) :: et !! The eigenvalues of the hamiltonian - COMPLEX(DP), INTENT(OUT) :: deff(nhm,nhm,nat,nspin) + COMPLEX(DP), INTENT(OUT) :: deff_nc(nhm,nhm,nat,nspin) !! Effective values of the D-eS coefficients ! ! ... local variables ! - INTEGER :: nt, na, is, js, ijs + INTEGER :: nt, na, is, ijs, i, j, ias + INTEGER :: nt_v(nat), na_v(nat) ! - deff=deeq_nc - IF (okvan) THEN - ! - DO nt = 1, nsp - DO na = 1, nat - ! - IF ( ityp(na) == nt ) THEN - IF (lspinorb) THEN - deff(:,:,na,:) = deff(:,:,na,:) - et * qq_so(:,:,:,nt) - ELSE - ijs=0 - ! - DO is=1,npol - DO js=1,npol - ! - ijs=ijs+1 - IF (is==js) deff(:,:,na,ijs)=deff(:,:,na,ijs)-et*qq_at(:,:,na) - ! - ENDDO - ENDDO - ! - ENDIF - ENDIF - ! + !$acc data present_or_copyout( deff_nc ) + ! + !$acc kernels + deff_nc(:,:,:,:) = deeq_nc(:,:,:,:) + !$acc end kernels + ! + IF (okvan) then + ! + ! ... set up index arrays to fill 'deff' in on gpu + i = 0 + DO nt = 1, nsp + DO na = 1, nat + IF ( ityp(na)/=nt ) CYCLE + i = i + 1 + nt_v(i) = nt + na_v(i) = na + ENDDO + ENDDO + ! + !$acc data copyin( nt_v, na_v ) + ! + IF (lspinorb) THEN + ! + !$acc parallel loop collapse(3) + DO ias = 1, nat + DO i = 1, nhm + DO j = 1, nhm + na = na_v(ias) + nt = nt_v(ias) + deff_nc(i,j,na,:) = deeq_nc(i,j,na,:) - CMPLX(et)*qq_so(i,j,:,nt) + ENDDO ENDDO - ENDDO - ! - ENDIF + ENDDO + ! + ELSE + ! + !$acc parallel loop collapse(3) + DO ias = 1, nat + DO i = 1, nhm + DO j = 1, nhm + na = na_v(ias) + !$acc loop seq + DO is = 1, npol + ijs = (is-1)*npol + is + deff_nc(i,j,na,ijs) = deeq_nc(i,j,na,ijs) - CMPLX(et*qq_at(i,j,na)) + ENDDO + ENDDO + ENDDO + ENDDO + ! + ENDIF + ! + !$acc end data + ! + ENDIF ! + !$acc end data ! RETURN ! diff --git a/PW/src/compute_deff_gpu.f90 b/PW/src/compute_deff_gpu.f90 deleted file mode 100644 index 7d8d39ed9..000000000 --- a/PW/src/compute_deff_gpu.f90 +++ /dev/null @@ -1,155 +0,0 @@ -! -! Copyright (C) 2009-2010 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -!------------------------------------------------------------------------- -SUBROUTINE compute_deff_gpu( deff_d, et ) - !----------------------------------------------------------------------- - !! This routine computes the effective value of the D-eS coefficients - !! which appear often in many expressions in the US or PAW case. - !! This routine is for the collinear case. - ! - USE kinds, ONLY: DP - USE ions_base, ONLY: nat - USE uspp, ONLY: okvan, deeq_d, qq_at_d - USE uspp_param, ONLY: nhm - USE lsda_mod, ONLY: current_spin - ! - IMPLICIT NONE - ! - REAL(DP), INTENT(IN) :: et - !! The eigenvalues of the hamiltonian - COMPLEX(DP), INTENT(OUT) :: deff_d(nhm,nhm,nat) - !! Effective values of the D-eS coefficients - ! - ! ... local variables - ! - INTEGER :: nt, na, is, i, j - ! -#if defined(__CUDA) - attributes(DEVICE) :: deff_d -#endif - ! - IF (.NOT. okvan) THEN - ! - !$cuf kernel do (3) <<<*,*>>> - DO na = 1, nat - DO i = 1, nhm - DO j = 1,nhm - deff_d(i,j,na) = CMPLX(deeq_d(i,j,na,current_spin)) - ENDDO - ENDDO - ENDDO - ! - ELSE - ! - !$cuf kernel do (3) <<<*,*>>> - DO na = 1, nat - DO i = 1, nhm - DO j = 1,nhm - deff_d(i,j,na) = CMPLX(deeq_d(i,j,na,current_spin) - et*qq_at_d(i,j,na)) - ENDDO - ENDDO - ENDDO - ! - ENDIF - ! - ! - RETURN - ! -END SUBROUTINE compute_deff_gpu -! -! -!--------------------------------------------------------------------------- -SUBROUTINE compute_deff_nc_gpu( deff_d, et ) - !------------------------------------------------------------------------- - !! This routine computes the effective value of the D-eS coefficients - !! which appears often in many expressions. This routine is for the - !! noncollinear case. - ! - USE kinds, ONLY: DP - USE ions_base, ONLY: nsp, nat, ityp - USE noncollin_module, ONLY: noncolin, npol, lspinorb - USE uspp, ONLY: okvan, deeq_nc_d, qq_so_d, qq_at_d - USE uspp_param, ONLY: nhm - USE lsda_mod, ONLY: nspin - USE device_memcpy_m, ONLY: dev_memcpy - ! - IMPLICIT NONE - ! - REAL(DP), INTENT(IN) :: et - !! The eigenvalues of the hamiltonian - COMPLEX(DP), INTENT(OUT) :: deff_d(nhm,nhm,nat,nspin) - !! Effective values of the D-eS coefficients - ! - ! ... local variables - ! - INTEGER :: nt, na, is, js, ijs, i, j, ias - INTEGER :: na_v(nat), nt_v(nat) - INTEGER, ALLOCATABLE :: na_d(:), nt_d(:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: deff_d, na_d, nt_d -#endif - ! - CALL dev_memcpy( deff_d, deeq_nc_d ) - ! - IF ( okvan ) THEN - ! - ALLOCATE( nt_d(nat), na_d(nat) ) - ! - i = 0 - DO nt = 1, nsp - DO na = 1, nat - IF ( ityp(na)/=nt ) CYCLE - i = i + 1 - nt_v(i) = nt - na_v(i) = na - ENDDO - ENDDO - ! - nt_d = nt_v ; na_d = na_v - ! - IF (lspinorb) THEN - ! - !$cuf kernel do (3) <<<*,*>>> - DO ias = 1, nat - DO i = 1, nhm - DO j = 1, nhm - na = na_d(ias) - nt = nt_d(ias) - deff_d(i,j,na,:) = deeq_nc_d(i,j,na,:) - et*qq_so_d(i,j,:,nt) - ENDDO - ENDDO - ENDDO - ! - ELSE - ! - !$cuf kernel do (3) <<<*,*>>> - DO ias = 1, nat - DO i = 1, nhm - DO j = 1, nhm - na = na_d(ias) - nt = nt_d(ias) - DO is = 1, npol - ijs = (is-1)*npol+is - deff_d(i,j,na,ijs) = deeq_nc_d(i,j,na,ijs) - & - et*qq_at_d(i,j,na) - ENDDO - ENDDO - ENDDO - ENDDO - ! - ENDIF - ! - DEALLOCATE( nt_d, na_d ) - ! - ENDIF - ! - ! - RETURN - ! -END SUBROUTINE compute_deff_nc_gpu diff --git a/PW/src/deriv_drhoc.f90 b/PW/src/deriv_drhoc.f90 index fef8f69d7..f2961e2a6 100644 --- a/PW/src/deriv_drhoc.f90 +++ b/PW/src/deriv_drhoc.f90 @@ -11,7 +11,8 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg ) !! Calculates the Fourier transform of \(d\text{Rho}_c/dG\). ! USE kinds - USE constants, ONLY : pi, fpi + USE constants, ONLY : pi, fpi + USE upf_acc_interfaces, ONLY : simpson ! IMPLICIT NONE ! @@ -46,10 +47,14 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg ) ! counter on g shells ! lower limit for loop on ngl ! + !$acc data present_or_copyin(gl,r,rab,rhoc) present_or_copyout(drhocg) + ! ! G=0 term ! IF (gl(1) < 1.0d-8) THEN + !$acc kernels drhocg(1) = 0.0d0 + !$acc end kernels igl0 = 2 ELSE igl0 = 1 @@ -57,12 +62,19 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg ) ! ! G <> 0 term ! -!$omp parallel private(aux, gx, rhocg1) +#if !defined(_OPENACC) +!$omp parallel private(aux,gx,rhocg1) +#endif ! ALLOCATE( aux(mesh) ) +#if defined(_OPENACC) +!$acc parallel loop gang private(aux) +#else !$omp do +#endif DO igl = igl0, ngl gx = SQRT( gl(igl) * tpiba2 ) + !$acc loop vector DO ir = 1, mesh aux(ir) = r(ir)*rhoc(ir)*( r(ir) * COS(gx*r(ir)) / & gx - SIN(gx*r(ir)) / gx**2 ) @@ -70,10 +82,16 @@ SUBROUTINE deriv_drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, drhocg ) CALL simpson( mesh, aux, rab, rhocg1 ) drhocg(igl) = fpi / omega * rhocg1 ENDDO +#if !defined(_OPENACC) !$omp end do nowait - DEALLOCATE( aux ) +#endif ! + DEALLOCATE( aux ) + !$acc end data + ! +#if !defined(_OPENACC) !$omp end parallel +#endif ! RETURN ! diff --git a/PW/src/deriv_drhoc_gpu.f90 b/PW/src/deriv_drhoc_gpu.f90 deleted file mode 100644 index 6f2bc6805..000000000 --- a/PW/src/deriv_drhoc_gpu.f90 +++ /dev/null @@ -1,229 +0,0 @@ -! -! Copyright (C) 2001-2007 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -MODULE compute_drhocg_gpu_m -#if defined(__CUDA) - ! - USE cudafor - ! - IMPLICIT NONE - TYPE(dim3) :: drhocg_threads = dim3(32,8,1) - PUBLIC :: drhocg_threads - CONTAINS - ATTRIBUTES(global) SUBROUTINE compute_drhocg_gpu( n, tpiba2, omega, gl, r, rhoc, rab, mesh, drhocg ) - ! - USE cudafor - USE kinds, ONLY: DP - USE constants, ONLY: pi, fpi, eps14 - ! - IMPLICIT NONE - ! - INTEGER, VALUE :: n, mesh - REAL(DP),VALUE :: tpiba2, omega - REAL(DP),DEVICE,INTENT(IN) :: gl(n), r(mesh), rhoc(mesh), rab(mesh) - REAL(DP),DEVICE,INTENT(OUT) :: drhocg(n) - ! - INTEGER :: tx, ty, igl, ir - REAL(DP) :: mysum, val, gx, x - ! - tx = threadIdx%x - ty = threadIdx%y - ! - igl = (blockIdx%x - 1) * blockDim%y + ty - ! - IF (igl > n ) RETURN - ! - gx = SQRT(gl(igl) * tpiba2) - mysum = 0_dp - ! - DO ir = tx, mesh, blockDim%x - ! - val = r(ir)*rhoc(ir)*( r(ir) * COS(gx*r(ir)) / & - gx - SIN(gx*r(ir)) / gx**2 ) * rab(ir) - ! - IF (ir == 1 .OR. ir == mesh) THEN - mysum = mysum + val - ELSEIF (MOD(ir,2)) THEN - mysum = mysum + 2._dp*val - ELSE - mysum = mysum + 4._dp*val - ENDIF - ENDDO - ! - ! Reduce by warp - val = __shfl_down(mysum,1) - mysum = mysum + val - val = __shfl_down(mysum,2) - mysum = mysum + val - val = __shfl_down(mysum,4) - mysum = mysum + val - val = __shfl_down(mysum,8) - mysum = mysum + val - val = __shfl_down(mysum,16) - mysum = mysum + val - ! - IF (tx == 1) THEN - drhocg(igl) = fpi * mysum / (3._dp * omega) - ENDIF - ! - END SUBROUTINE compute_drhocg_gpu - ! -#endif -END MODULE compute_drhocg_gpu_m -! -#if defined(__CUDA) -!---------------------------------------------------------------------------- -SUBROUTINE deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, mesh, r_d, rab_d, rhoc_d, & - drhocg_d ) - !-------------------------------------------------------------------------- - !! Calculates the Fourier transform of \(d\text{Rho}_c/dG\). - !---cuda kernel version - ! - USE cudafor - USE kinds - USE constants, ONLY : pi, fpi - USE compute_drhocg_gpu_m - ! - IMPLICIT NONE - ! - INTEGER :: ngl - !! input: the number of g shell - INTEGER :: mesh - !! input: the number of radial mesh points - REAL(DP), INTENT(IN), DEVICE :: gl_d(ngl) - !! input: the number of G shells - REAL(DP), INTENT(IN), DEVICE :: r_d(mesh) - !! input: the radial mesh - REAL(DP), INTENT(IN), DEVICE :: rab_d(mesh) - !! input: the derivative of the radial mesh - REAL(DP), INTENT(IN), DEVICE :: rhoc_d(mesh) - !! input: the radial core charge - REAL(DP), INTENT(IN) :: omega - !! input: the volume of the unit cell - REAL(DP), INTENT(IN) :: tpiba2 - !! input: 2 times pi / alat - REAL(DP), INTENT(OUT), DEVICE :: drhocg_d(ngl) - !! Fourier transform of d Rho_c/dG - ! - ! ... local variables - ! - REAL(DP) :: gx, gl1 - ! the modulus of g for a given shell - INTEGER :: igl, blocks, igl0 - TYPE(dim3) :: threads - ! counter on radial mesh points - ! counter on g shells - ! lower limit for loop on ngl - ! - ! G=0 term - ! - gl1 = gl_d(1) - IF (gl1 < 1.0d-8) THEN - drhocg_d(1) = 0.0_DP - igl0 = 2 - ELSE - igl0 = 1 - ENDIF - ! - ! G <> 0 term - ! - threads = dim3(32,8,1) - blocks = CEILING(REAL(ngl-igl0+1)/8) - CALL compute_drhocg_gpu<<>>( ngl-igl0+1, tpiba2, omega, gl_d(igl0), r_d, & - rhoc_d, rab_d, mesh, drhocg_d(igl0) ) - ! - RETURN - ! -END SUBROUTINE deriv_drhoc_gpu -! -! -#else -! -! -SUBROUTINE deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, mesh, r_d, rab_d, rhoc_d, & - drhocg_d ) - !-------------------------------------------------------------------------- - !! Calculates the Fourier transform of \(d\text{Rho}_c/dG\). - !---cuf kernel loop version--- - ! - USE kinds - USE constants, ONLY: pi, fpi - ! -#if defined(__CUDA) - USE cudafor -#endif - USE simpsn_gpum, ONLY: simpsn_gpu_dev - USE device_fbuff_m, ONLY: dev_buf - ! - IMPLICIT NONE - ! - INTEGER :: ngl - !! input: the number of g shell - INTEGER :: mesh - !! input: the number of radial mesh points - REAL(DP), INTENT(IN) :: gl_d(ngl) - !! input: the number of G shells - REAL(DP), INTENT(IN) :: r_d(mesh) - !! input: the radial mesh - REAL(DP), INTENT(IN) :: rab_d(mesh) - !! input: the derivative of the radial mesh - REAL(DP), INTENT(IN) :: rhoc_d(mesh) - !! input: the radial core charge - REAL(DP), INTENT(IN) :: omega - !! input: the volume of the unit cell - REAL(DP), INTENT(IN) :: tpiba2 - !! input: 2 times pi / alat - REAL(DP), INTENT(OUT) :: drhocg_d(ngl) - !! Fourier transform of d Rho_c/dG - ! - ! ... local variables - ! - REAL(DP) :: gx, gl1 - ! the modulus of g for a given shell - REAL(DP), ALLOCATABLE :: aux_d(:,:) - ! auxiliary memory for integration - INTEGER :: ir, igl, igl0 - ! counter on radial mesh points - ! counter on g shells - ! -#if defined(__CUDA) - attributes(DEVICE) :: gl_d, r_d, rab_d, rhoc_d, drhocg_d, aux_d -#endif - ! - ! G=0 term - ! - gl1 = gl_d(1) - IF (gl1 < 1.0d-8) THEN - drhocg_d(1) = 0.0_DP - igl0 = 2 - ELSE - igl0 = 1 - ENDIF - ! - ! G <> 0 term - ! - ALLOCATE( aux_d(mesh,ngl) ) - ! - !$cuf kernel do (1) - DO igl = igl0, ngl - gx = SQRT( gl_d(igl) * tpiba2 ) - DO ir = 1, mesh - aux_d(ir,igl) = r_d(ir)*rhoc_d(ir)*( r_d(ir) * COS(gx*r_d(ir)) / & - gx - SIN(gx*r_d(ir)) / gx**2 ) - ENDDO - CALL simpsn_gpu_dev( mesh, aux_d(:,igl), rab_d, drhocg_d(igl) ) - drhocg_d(igl) = fpi / omega * drhocg_d(igl) - ENDDO - ! - DEALLOCATE( aux_d ) - ! - RETURN - ! -END SUBROUTINE deriv_drhoc_gpu -! -#endif diff --git a/PW/src/drhoc.f90 b/PW/src/drhoc.f90 index 09e248af4..457a0b13e 100644 --- a/PW/src/drhoc.f90 +++ b/PW/src/drhoc.f90 @@ -12,7 +12,8 @@ SUBROUTINE drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, rhocg ) !! Calculates the Fourier transform of the core charge. ! USE kinds - USE constants, ONLY: pi, fpi + USE constants, ONLY: pi, fpi + USE upf_acc_interfaces, ONLY: simpson, sph_bes_acc ! IMPLICIT NONE ! @@ -40,51 +41,83 @@ SUBROUTINE drhoc( ngl, gl, omega, tpiba2, mesh, r, rab, rhoc, rhocg ) REAL(DP) :: gx, rhocg1 ! the modulus of g for a given shell ! the Fourier transform - REAL(DP), ALLOCATABLE :: aux(:) + REAL(DP), ALLOCATABLE :: aux(:), gxv(:) ! auxiliary memory for integration INTEGER :: ir, igl, igl0 ! counter on radial mesh points ! counter on g shells ! lower limit for loop on ngl ! + !--------PROVISIONAL------ gl def needs to be adjusted--------- + ALLOCATE( gxv(ngl) ) + DO igl = 1, ngl + gxv(igl) = SQRT(gl(igl) * tpiba2) + ENDDO + !-------------------------------------------------------------- ! -!$omp parallel private(aux, gx, rhocg1) +#if !defined(_OPENACC) +!$omp parallel private(aux,gx,rhocg1) +#endif ! ALLOCATE( aux(mesh) ) +#if defined(_OPENACC) + !$acc data present_or_copyin(gl,r,rab,rhoc) present_or_copyout(rhocg) + !$acc data create(aux) +#endif ! ! G=0 term ! +#if !defined(_OPENACC) !$omp single +#endif IF ( gl(1) < 1.0d-8 ) THEN + !$acc parallel loop DO ir = 1, mesh aux(ir) = r(ir)**2 * rhoc(ir) ENDDO + !$acc kernels CALL simpson( mesh, aux, rab, rhocg1 ) rhocg(1) = fpi * rhocg1 / omega + !$acc end kernels igl0 = 2 ELSE igl0 = 1 ENDIF +#if !defined(_OPENACC) !$omp end single +#endif +!$acc end data ! ! G <> 0 term ! +#if defined(_OPENACC) +!$acc parallel loop gang private(aux) copyin(gxv) +#else !$omp do +#endif DO igl = igl0, ngl - gx = SQRT(gl(igl) * tpiba2) - CALL sph_bes( mesh, r, gx, 0, aux ) + gx = gxv(igl) + CALL sph_bes_acc( mesh, r, gx, 0, aux ) + !$acc loop vector DO ir = 1, mesh - aux(ir) = r(ir)**2 * rhoc(ir) * aux(ir) + aux(ir) = r(ir)**2 * rhoc(ir) * aux(ir) ENDDO CALL simpson( mesh, aux, rab, rhocg1 ) rhocg(igl) = fpi * rhocg1 / omega ENDDO +#if !defined(_OPENACC) !$omp end do nowait - DEALLOCATE( aux ) +#endif ! -!$omp end parallel + !$acc end data + ! + DEALLOCATE( aux ) +#if !defined(_OPENACC) + !$omp end parallel +#endif + ! + DEALLOCATE( gxv ) ! RETURN ! END SUBROUTINE drhoc - diff --git a/PW/src/dvloc_of_g.f90 b/PW/src/dvloc_of_g.f90 index 7249b8657..0b1a6a1c5 100644 --- a/PW/src/dvloc_of_g.f90 +++ b/PW/src/dvloc_of_g.f90 @@ -7,142 +7,192 @@ ! ! !---------------------------------------------------------------------- -subroutine dvloc_of_g (mesh, msh, rab, r, vloc_at, zp, tpiba2, ngl, gl, & - omega, dvloc) +SUBROUTINE dvloc_of_g( mesh, msh, rab, r, vloc_at, zp, tpiba2, ngl, gl, & + omega, dvloc ) !---------------------------------------------------------------------- - ! - ! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g + !! This routine gives: + !! \[ \text{dvloc} = D\text{Vloc}(g^2)/Dg^2 = (1/2g)\ D\text{Vloc}(g)/Dg + !! \] ! USE kinds - USE constants , ONLY : pi, fpi, e2, eps8 + USE constants, ONLY: pi, fpi, e2, eps8 USE Coul_cut_2D, ONLY: do_cutoff_2D - USE esm, ONLY : do_comp_esm, esm_bc - implicit none + USE esm, ONLY: do_comp_esm, esm_bc ! - ! first the dummy variables + IMPLICIT NONE ! - integer, intent(in) :: ngl, mesh, msh - ! the number of shell of G vectors - ! max number of mesh points - ! number of mesh points for radial integration - - real(DP), intent(in) :: zp, rab (mesh), r (mesh), vloc_at (mesh), & - tpiba2, omega, gl (ngl) - ! valence pseudocharge - ! the derivative of the radial grid - ! the radial grid - ! the pseudo on the radial grid - ! 2 pi / alat - ! the volume of the unit cell - ! the moduli of g vectors for each s + INTEGER, INTENT(IN) :: ngl + !! the number of shell of G vectors + INTEGER, INTENT(IN) :: mesh + !! max number of mesh points + INTEGER, INTENT(IN) :: msh + !! number of mesh points for radial integration + REAL(DP), INTENT(IN) :: zp + !! valence pseudocharge + REAL(DP), INTENT(IN) :: rab(mesh) + !! the derivative of the radial grid + REAL(DP), INTENT(IN) :: r(mesh) + !! the radial grid + REAL(DP), INTENT(IN) :: vloc_at(mesh) + !! the pseudo on the radial grid + REAL(DP), INTENT(IN) :: tpiba2 + !! 2 pi / alat + REAL(DP), INTENT(IN) :: omega + !! the volume of the unit cell + REAL(DP), INTENT(IN) :: gl(ngl) + !! the moduli of g vectors for each s + REAL(DP), INTENT(OUT) :: dvloc(ngl) + !! the Fourier transform dVloc/dG ! - real(DP), intent(out) :: dvloc (ngl) - ! the fourier transform dVloc/dG + ! ... local variables ! - real(DP) :: vlcp, g2a, gx - real(DP), allocatable :: aux (:), aux1 (:) - - integer :: i, igl, igl0 + REAL(DP) :: vlcp, g2a, gx, vlcp_0, vlcp_1 + REAL(DP), ALLOCATABLE :: aux(:,:), aux1(:) + INTEGER :: i, igl, igl0 ! counter on erf functions or gaussians ! counter on g shells vectors ! first shell with g != 0 - + REAL(DP), PARAMETER :: r12=1.0d0/3.0d0 + ! + !$acc data present( dvloc, gl ) + ! ! the G=0 component is not computed - if (gl (1) < eps8) then - dvloc (1) = 0.0d0 + IF (gl(1) < eps8) THEN + !$acc kernels + dvloc(1) = 0.0d0 + !$acc end kernels igl0 = 2 - else + ELSE igl0 = 1 - endif - + ENDIF + ! ! Pseudopotentials in numerical form (Vloc contains the local part) ! In order to perform the Fourier transform, a term erf(r)/r is ! subtracted in real space and added again in G space - - allocate (aux1( mesh)) ! - ! This is the part of the integrand function - ! indipendent of |G| in real space + ALLOCATE( aux1(mesh) ) ! - do i = 1, msh - aux1 (i) = r (i) * vloc_at (i) + zp * e2 * erf (r (i) ) - enddo + ! This is the part of the integrand function + ! indipendent of |G| in real space ! -!$omp parallel private(aux, gx, vlcp, g2a) + ALLOCATE( aux(mesh,ngl) ) ! - allocate (aux( mesh)) + !$acc data copyin(r,rab) create(aux1,aux) ! + !$acc parallel loop copyin(vloc_at) + DO i = 1, msh + aux1(i) = r(i)*vloc_at(i) + zp*e2*ERF(r(i)) + ENDDO + ! +#if defined(_OPENACC) +!$acc parallel loop gang present(aux,aux1,rab,r,dvloc) +#else +!$omp parallel private( gx, vlcp, vlcp_1, vlcp_0, g2a ) !$omp do - do igl = igl0, ngl - gx = sqrt (gl (igl) * tpiba2) +#endif + DO igl = igl0, ngl + ! + gx = SQRT(gl(igl)*tpiba2) ! ! and here we perform the integral, after multiplying for the |G| ! dependent part ! ! DV(g)/Dg = Integral of r (Dj_0(gr)/Dg) V(r) dr - do i = 1, msh - aux (i) = aux1 (i) * (r (i) * cos (gx * r (i) ) / gx - sin (gx & - * r (i) ) / gx**2) - enddo - call simpson (msh, aux, rab, vlcp) + ! + !$acc loop seq + DO i = 1, msh + aux(i,igl) = aux1(i)*(r(i)*COS(gx*r(i))/gx - SIN(gx*r(i))/gx**2) + ENDDO + ! + !----Simpson int.--- + vlcp_0 = 0.0d0 + !$acc loop seq reduction(+:vlcp_0) + DO i = 2, msh-1, 2 + vlcp_0 = vlcp_0 + ( aux(i-1,igl)*rab(i-1) + 4.0d0*aux(i,igl)*rab(i) + & + aux(i+1,igl)*rab(i+1) )*r12 + ENDDO + !------ + vlcp_1 = vlcp_0 * fpi / omega / 2.0d0 / gx + ! ! DV(g^2)/Dg^2 = (DV(g)/Dg)/2g - vlcp = fpi / omega / 2.0d0 / gx * vlcp - - ! for ESM stress + !vlcp = fpi / omega / 2.0d0 / gx * vlcp + ! + ! for ESM stress ! In ESM, vloc and dvloc have only short term. - IF ( ( .not. do_comp_esm ) .or. ( esm_bc .eq. 'pbc' ) ) THEN + IF ( (( .NOT. do_comp_esm ) .OR. ( esm_bc .EQ. 'pbc' )) .AND. & + .NOT.do_cutoff_2D ) THEN ! subtract the long-range term - IF (.not.do_cutoff_2D) then ! 2D cutoff: do not re-add LR part here (re-added later in stres_loc) - g2a = gl (igl) * tpiba2 / 4.d0 - vlcp = vlcp + fpi / omega * zp * e2 * exp ( - g2a) * (g2a + & - 1.d0) / (gl (igl) * tpiba2) **2 - ENDIF - END IF - dvloc (igl) = vlcp - enddo + ! 2D cutoff: do not re-add LR part here re-added later in stres_loc) + g2a = gl(igl) * tpiba2 / 4.d0 + vlcp = vlcp_1 + fpi / omega * zp * e2 * EXP(-g2a) * (g2a + 1.d0) / & + (gl(igl)*tpiba2)**2 + ELSE + vlcp = vlcp_1 + ENDIF + dvloc(igl) = vlcp + ENDDO +#if !defined(_OPENACC) !$omp end do nowait - deallocate (aux) !$omp end parallel +#else +!$acc end data +!$acc end data +#endif ! - deallocate (aux1) - - return -end subroutine dvloc_of_g + DEALLOCATE( aux ) + DEALLOCATE( aux1 ) + ! + RETURN + ! +END SUBROUTINE dvloc_of_g ! !---------------------------------------------------------------------- -subroutine dvloc_coul (zp, tpiba2, ngl, gl, omega, dvloc) +SUBROUTINE dvloc_coul( zp, tpiba2, ngl, gl, omega, dvloc ) !---------------------------------------------------------------------- - ! - ! Fourier transform of the Coulomb potential - For all-electron - ! calculations, in specific cases only, for testing purposes + !! Fourier transform of the Coulomb potential - For all-electron + !! calculations, in specific cases only, for testing purposes. ! USE kinds - USE constants , ONLY : fpi, e2, eps8 - implicit none + USE constants, ONLY : fpi, e2, eps8 ! - integer, intent(in) :: ngl - ! the number of shell of G vectors - real(DP), intent(in) :: zp, tpiba2, omega, gl (ngl) - ! valence pseudocharge - ! 2 pi / alat - ! the volume of the unit cell - ! the moduli of g vectors for each s - real(DP), intent(out) :: dvloc (ngl) - ! fourier transform: dvloc = D Vloc (g^2) / D g^2 = 4pi e^2/omegai /G^4 + IMPLICIT NONE ! - integer :: igl0 + INTEGER, INTENT(IN) :: ngl + !! the number of shell of G vectors + REAL(DP), INTENT(IN) :: zp + !! valence pseudocharge + REAL(DP), INTENT(IN) :: tpiba2 + !! 2 pi / alat + REAL(DP), INTENT(IN) :: omega + !! the volume of the unit cell + REAL(DP), INTENT(IN) :: gl(ngl) + !! the moduli of g vectors for each s + REAL(DP), INTENT(OUT) :: dvloc(ngl) + !! Fourier transform: + !! dvloc = D Vloc (g^2) / D g^2 = 4pi e^2/omegai /G^4 + ! + INTEGER :: igl0 ! first shell with g != 0 - + ! + !$acc data present( dvloc, gl ) + ! ! the G=0 component is 0 - if (gl (1) < eps8) then - dvloc (1) = 0.0d0 + IF (gl(1) < eps8) THEN + !$acc kernels + dvloc(1) = 0.0d0 + !$acc end kernels igl0 = 2 - else + ELSE igl0 = 1 - endif - - dvloc (igl0:ngl) = fpi * zp * e2 / omega / ( tpiba2 * gl (igl0:ngl) ) ** 2 - -return -end subroutine dvloc_coul + ENDIF + ! + !$acc kernels + dvloc(igl0:ngl) = fpi*zp*e2 / omega / (tpiba2*gl(igl0:ngl))**2 + !$acc end kernels + ! + !$acc end data + ! + RETURN + ! +END SUBROUTINE dvloc_coul diff --git a/PW/src/dvloc_of_g_gpu.f90 b/PW/src/dvloc_of_g_gpu.f90 deleted file mode 100644 index b5ac70e42..000000000 --- a/PW/src/dvloc_of_g_gpu.f90 +++ /dev/null @@ -1,252 +0,0 @@ -! -! Copyright (C) 2001-2007 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -MODULE compute_dvloc_gpum -#if defined(__CUDA) - USE cudafor - IMPLICIT NONE - TYPE(dim3) :: dvloc_threads = dim3(32,8,1) - PUBLIC :: dvloc_threads - CONTAINS - ATTRIBUTES( global ) SUBROUTINE compute_dvloc_gpu( mesh, n, tpiba2, omega, zp, gl, aux1, r, rab ,dvloc, do_cutoff_2D, do_comp_esm, esm_bc_l ) - ! - ! - USE cudafor - USE kinds - USE constants, ONLY : pi, fpi, e2, eps8 - ! - IMPLICIT NONE - ! - INTEGER, VALUE :: n, mesh - REAL(DP), VALUE :: zp, tpiba2, omega - LOGICAL, VALUE :: do_cutoff_2D, do_comp_esm, esm_bc_l - REAL(DP), DEVICE, INTENT(IN) :: rab(mesh), r(mesh), gl(n) - REAL(DP), DEVICE, INTENT(IN) :: aux1(mesh) - ! - REAL(DP), DEVICE, INTENT(OUT) :: dvloc(n) - ! the Fourier transform dVloc/dG - ! - REAL(DP) :: val, mysum - REAL(DP) :: g2a, gx - ! - INTEGER :: tx, ty, ir, igl - ! - tx = threadIdx%x - ty = threadIdx%y - ! - igl = (blockIdx%x - 1) * blockDim%y + ty - ! - IF (igl > n ) RETURN - ! - gx = SQRT(gl(igl) * tpiba2) - mysum = 0.d0 - ! - DO ir = tx, mesh, blockDim%x - ! - val = aux1(ir) * ( r(ir) * COS(gx*r(ir)) / gx - SIN(gx*r(ir)) / gx**2 ) - val = val * rab(ir) - ! - IF (ir == 1 .OR. ir == mesh) THEN - mysum = mysum + val - ELSEIF ( MOD(ir,2) ) THEN - mysum = mysum + 2.d0*val - ELSE - mysum = mysum + 4.d0*val - ENDIF - ! - ENDDO - ! - ! - ! Reduce by warp - val = __shfl_down(mysum,1) - mysum = mysum + val - val = __shfl_down(mysum,2) - mysum = mysum + val - val = __shfl_down(mysum,4) - mysum = mysum + val - val = __shfl_down(mysum,8) - mysum = mysum + val - val = __shfl_down(mysum,16) - mysum = mysum + val - ! - IF (tx == 1) THEN - ! - mysum = mysum * fpi / omega / 2.0d0 / gx / 3.d0 - ! - IF ( (.NOT. do_comp_esm) .OR. esm_bc_l ) THEN - IF (.NOT. do_cutoff_2D) THEN - g2a = gl(igl) * tpiba2 / 4.d0 - mysum = mysum + fpi / omega * zp * e2 * EXP(-g2a) * (g2a + & - 1.d0) / (gl(igl)*tpiba2)**2 - ENDIF - ENDIF - ! - dvloc(igl) = mysum - ! - ENDIF - ! - ! - RETURN - ! -END SUBROUTINE compute_dvloc_gpu -! -#endif -END MODULE compute_dvloc_gpum - - -!---------------------------------------------------------------------- -SUBROUTINE dvloc_of_g_gpu( mesh, msh, rab_d, r_d, vloc_at_d, zp, tpiba2, & - ngl, gl_d, omega, dvloc_d ) !, igl0 ) - !---------------------------------------------------------------------- - ! - ! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g - ! -#if defined(__CUDA) - USE cudafor -#endif - USE kinds - USE constants, ONLY : pi, fpi, e2, eps8 - USE Coul_cut_2D, ONLY : do_cutoff_2D - USE esm, ONLY : do_comp_esm, esm_bc -#if defined(__CUDA) - USE compute_dvloc_gpum, ONLY : compute_dvloc_gpu -#endif - ! - IMPLICIT NONE - ! - ! first the dummy variables - ! - INTEGER, INTENT(IN) :: mesh, msh, ngl !, igl0 - ! the number of shell of G vectors - ! max number of mesh points - ! number of mesh points for radial integration - ! - REAL(DP), INTENT(IN) :: rab_d(mesh), r_d(mesh), vloc_at_d(mesh), gl_d(ngl) - REAL(DP), INTENT(IN) :: zp, tpiba2, omega - ! - REAL(DP), INTENT(OUT) :: dvloc_d(ngl) - ! the Fourier transform dVloc/dG - ! -#if defined(__CUDA) - attributes(DEVICE) :: dvloc_d, rab_d, r_d , gl_d, vloc_at_d -#endif - ! - ! valence pseudocharge - ! the derivative of the radial grid - ! the radial grid - ! the pseudo on the radial grid - ! 2 pi / alat - ! the volume of the unit cell - ! the moduli of g vectors for each s - ! - LOGICAL :: esm_bc_lg - ! - REAL(DP), ALLOCATABLE :: aux1_d(:) -#if defined(__CUDA) - attributes(DEVICE) :: aux1_d - ! - TYPE(dim3) :: threads -#endif - ! - REAL(DP) :: gl1 - INTEGER :: i, igl, blocks, igl0 - ! - ! counter on erf functions or gaussians - ! counter on g shells vectors - ! first shell with g != 0 - ! - ! the G=0 component is not computed - gl1 = gl_d(1) - IF (gl1 < eps8) THEN - dvloc_d(1) = 0.0_DP - igl0 = 2 - ELSE - igl0 = 1 - ENDIF - ! - ! Pseudopotentials in numerical form (Vloc contains the local part) - ! In order to perform the Fourier transform, a term erf(r)/r is - ! subtracted in real space and added again in G space - ! - ! - ALLOCATE( aux1_d(mesh) ) - ! - ! This is the part of the integrand function - ! indipendent of |G| in real space - ! -#if defined(__CUDA) - !$cuf kernel do(1)<<<*,*>>> - DO i = 1, msh - aux1_d(i) = r_d(i) * vloc_at_d(i) + zp * e2 * erf(r_d(i)) - ENDDO - ! - esm_bc_lg = ( esm_bc == 'pbe' ) - ! - threads = dim3(32,8,1) - blocks = CEILING(REAL(ngl-igl0+1)/8) - CALL compute_dvloc_gpu<<>>( msh, ngl-igl0+1, tpiba2, omega, zp, gl_d(igl0), & - aux1_d, r_d, rab_d, dvloc_d(igl0), do_cutoff_2D, & - do_comp_esm, esm_bc_lg ) - ! -#else - CALL errore( 'dvloc_of_g_gpu' , 'GPU version of dvloc_of_g called but not compiled.', 0 ) -#endif - DEALLOCATE( aux1_d ) - ! - ! - RETURN - ! -END SUBROUTINE dvloc_of_g_gpu -! -! -!---------------------------------------------------------------------- -SUBROUTINE dvloc_coul_gpu( zp, tpiba2, ngl, gl_d, omega, dvloc_d ) - !---------------------------------------------------------------------- - !! Fourier transform of the Coulomb potential - For all-electron - !! calculations, in specific cases only, for testing purposes. - ! - USE kinds - USE constants, ONLY : fpi, e2, eps8 - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: ngl - ! the number of shell of G vectors - ! first shell with g != 0 - REAL(DP), INTENT(IN) :: zp, tpiba2, omega - ! valence pseudocharge - ! 2 pi / alat - ! the volume of the unit cell - REAL(DP), INTENT(IN) :: gl_d(ngl) - ! the moduli of g vectors for each s - REAL(DP), INTENT(OUT) :: dvloc_d(ngl) - ! fourier transform: dvloc = D Vloc (g^2) / D g^2 = 4pi e^2/omegai /G^4 - INTEGER :: j, igl0 - REAL(DP) :: gl1 - ! -#if defined(__CUDA) - attributes(DEVICE) :: dvloc_d, gl_d -#endif - ! - ! the G=0 component is 0 - gl1 = gl_d(1) - IF (gl1 < eps8) then - dvloc_d(1) = 0.0_DP - igl0 = 2 - ELSE - igl0 = 1 - ENDIF - ! - !$cuf kernel do(1) <<<*,*>>> - DO j = igl0, ngl - dvloc_d(j) = fpi * zp * e2 / omega / ( tpiba2 * gl_d(j) )**2 - ENDDO - ! - RETURN - ! -END SUBROUTINE dvloc_coul_gpu diff --git a/PW/src/force_hub_gpu.f90 b/PW/src/force_hub_gpu.f90 index 44c14c08d..bffde5d50 100644 --- a/PW/src/force_hub_gpu.f90 +++ b/PW/src/force_hub_gpu.f90 @@ -1134,7 +1134,7 @@ SUBROUTINE dprojdtau_k_gpu( spsi_d, alpha, na, ijkb0, ipol, ik, nb_s, nb_e, myke offsetU_back1, ldim_u, backall, lda_plus_u_kind, & Hubbard_projectors, oatwfc USE wvfct, ONLY : nbnd, npwx, wg - USE uspp, ONLY : okvan, nkb, qq_at_d + USE uspp, ONLY : okvan, nkb USE uspp_param, ONLY : nh USE becmod_subs_gpum, ONLY : calbec_gpu USE mp_bands, ONLY : intra_bgrp_comm @@ -1530,7 +1530,7 @@ SUBROUTINE matrix_element_of_dSdtau_gpu (alpha, ipol, ik, ijkb0, lA, A, lB, B, A USE ions_base, ONLY : nat, ntyp => nsp, ityp USE cell_base, ONLY : tpiba USE wvfct, ONLY : npwx, wg - USE uspp, ONLY : nkb, okvan, vkb, qq_at_d + USE uspp, ONLY : nkb, okvan, vkb, qq_at USE uspp_param, ONLY : nh USE klist, ONLY : igk_k_d, ngk USE becmod_subs_gpum, ONLY : calbec_gpu @@ -1585,10 +1585,10 @@ SUBROUTINE matrix_element_of_dSdtau_gpu (alpha, ipol, ik, ijkb0, lA, A, lB, B, A CALL dev_buf%lock_buffer(qq , [nh(nt),nh(nt)], ierr) ! ALLOCATE ( qq(nh(nt),nh(nt)) ) IF ( ierr /= 0 ) CALL errore('matrix_element_of_dSdtau_gpu','Buffers allocation failed',ierr) ! - !$cuf kernel do(2) + !$acc parallel loop collapse(2) present(qq_at) DO jh=1,nh_nt DO ih=1,nh_nt - qq(ih,jh) = CMPLX(qq_at_d(ih,jh,alpha), 0.0d0, kind=DP) + qq(ih,jh) = CMPLX(qq_at(ih,jh,alpha), 0.0d0, kind=DP) ENDDO ENDDO ! @@ -1703,7 +1703,7 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, & offsetU_back, offsetU_back1, ldim_u, backall, & Hubbard_projectors USE wvfct, ONLY : nbnd, npwx, wg - USE uspp, ONLY : nkb, vkb, qq_at_d + USE uspp, ONLY : nkb, vkb, qq_at USE uspp_param, ONLY : nh USE wavefunctions, ONLY : evc USE becmod_gpum, ONLY : bec_type_d, becp_d @@ -1900,12 +1900,12 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, & CALL dev_memset ( betapsi_d, 0.0_dp ) ! ! here starts band parallelization -!$cuf kernel do(2) +!$acc parallel loop collapse(2) present(qq_at) DO ih = 1, nh_nt DO ibnd = nb_s, nb_e DO jh = 1, nh_nt betapsi_d(ih,ibnd) = betapsi_d(ih,ibnd) + & - qq_at_d(ih,jh,alpha) * dbetapsi_d(jh,ibnd) + qq_at(ih,jh,alpha) * dbetapsi_d(jh,ibnd) ENDDO ENDDO ENDDO @@ -1918,12 +1918,12 @@ SUBROUTINE dprojdtau_gamma_gpu( spsi_d, alpha, ijkb0, ipol, ik, nb_s, nb_e, & CALL dev_memset ( betapsi_d, 0.0_dp ) ! becpr_d => becp_d%r_d -!$cuf kernel do(2) +!$acc parallel loop collapse(2) present(qq_at) DO ih = 1, nh_nt DO ibnd = nb_s, nb_e DO jh = 1, nh_nt betapsi_d(ih,ibnd) = betapsi_d(ih,ibnd) + & - qq_at_d(ih,jh,alpha) * betapsi0_d(jh,ibnd) + qq_at(ih,jh,alpha) * betapsi0_d(jh,ibnd) ENDDO ENDDO ENDDO diff --git a/PW/src/force_us_gpu.f90 b/PW/src/force_us_gpu.f90 index fbd6a0309..cd782649e 100644 --- a/PW/src/force_us_gpu.f90 +++ b/PW/src/force_us_gpu.f90 @@ -182,7 +182,7 @@ SUBROUTINE force_us_gpu( forcenl ) #if defined(__CUDA) USE cublas #endif - USE uspp, ONLY : qq_at_d, deeq_d + USE uspp, ONLY : qq_at, deeq USE wvfct_gpum, ONLY : wg_d, using_wg_d, et_d, using_et_d ! IMPLICIT NONE @@ -230,9 +230,11 @@ SUBROUTINE force_us_gpu( forcenl ) IF ( ityp(na) == nt ) THEN ijkb0 = ofsbeta(na) ! this is \sum_j q_{ij} + !$acc host_data use_device(qq_at) CALL DGEMM ('N','N', nh(nt), becp_d%nbnd_loc, nh(nt), & - 1.0_dp, qq_at_d(1,1,na), nhm, becp_d%r_d(ijkb0+1,1),& + 1.0_dp, qq_at(1,1,na), nhm, becp_d%r_d(ijkb0+1,1),& nkb, 0.0_dp, aux_d, nh(nt) ) + !$acc end host_data ! multiply by -\epsilon_n !$cuf kernel do(2) DO ih = 1, nh_nt @@ -243,10 +245,12 @@ SUBROUTINE force_us_gpu( forcenl ) END DO ! add \sum_j d_{ij} + !$acc host_data use_device(deeq) CALL DGEMM ('N','N', nh(nt), becp_d%nbnd_loc, nh(nt), & - 1.0_dp, deeq_d(1,1,na,current_spin), nhm, & + 1.0_dp, deeq(1,1,na,current_spin), nhm, & becp_d%r_d(ijkb0+1,1), nkb, 1.0_dp, aux_d, nh(nt) ) - + !$acc end host_data + ! ! Auxiliary variable to perform the reduction with cuf kernels forcenl_ipol = 0.0_dp !$cuf kernel do(2) diff --git a/PW/src/init_run.f90 b/PW/src/init_run.f90 index d8e5e2737..b10bcf4b7 100644 --- a/PW/src/init_run.f90 +++ b/PW/src/init_run.f90 @@ -98,7 +98,7 @@ SUBROUTINE init_run() gg_d = gg END IF #endif - !$acc update device(mill, g) + !$acc update device(mill, g, gg) ! IF (do_comp_esm) CALL esm_init(.NOT. lrism) ! diff --git a/PW/src/init_us_2.f90 b/PW/src/init_us_2.f90 index 0449fe77f..49c18d1be 100644 --- a/PW/src/init_us_2.f90 +++ b/PW/src/init_us_2.f90 @@ -9,13 +9,13 @@ MODULE uspp_init ! PRIVATE PUBLIC :: init_us_2, gen_us_dj, gen_us_dy - PUBLIC :: gen_us_dj_gpu, gen_us_dy_gpu ! CONTAINS + ! !---------------------------------------------------------------------- - SUBROUTINE init_us_2( npw_, igk_, q_, vkb_, run_on_gpu_) + SUBROUTINE init_us_2( npw_, igk_, q_, vkb_, run_on_gpu_ ) !---------------------------------------------------------------------- - !! wrapper to call init_us_2_base + !! Wrapper to call init_us_2_base. !! Calculates beta functions (Kleinman-Bylander projectors), with !! structure factor, for all atoms, in reciprocal space. ! @@ -40,38 +40,42 @@ CONTAINS !! beta functions (npw_ <= npwx) LOGICAL, OPTIONAL, INTENT(IN) :: run_on_gpu_ !! whether you wish to run on gpu in case use_gpu is true - !! + ! LOGICAL :: run_on_gpu ! ! - run_on_gpu = .false. - if(present(run_on_gpu_)) run_on_gpu = run_on_gpu_ + run_on_gpu = .FALSE. + IF (PRESENT(run_on_gpu_)) run_on_gpu = run_on_gpu_ ! CALL start_clock( 'init_us_2' ) - if(use_gpu.and.run_on_gpu) then + ! + IF (use_gpu .AND. run_on_gpu) THEN ! - !$acc data present(igk_(1:npw_), mill(:,:), g(:,:), vkb_(1:npwx,1:nkb), eigts1(:,:), eigts2(:,:), eigts3(:,:)) - !$acc host_data use_device(eigts1, eigts2, eigts3, mill, g, igk_, vkb_) - CALL init_us_2_base_gpu(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega,& - dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,& - vkb_ ) + !$acc data present( igk_(1:npw_), mill(:,:), g(:,:), vkb_(1:npwx,1:nkb), & + !$acc eigts1(:,:), eigts2(:,:), eigts3(:,:) ) + !$acc host_data use_device( eigts1, eigts2, eigts3, mill, g, igk_, vkb_ ) + CALL init_us_2_base_gpu( npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, & + dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, & + eigts3, mill, g, vkb_ ) !$acc end host_data !$acc end data ! - else - CALL init_us_2_base(npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, & - dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, eigts3, mill, g,& - vkb_ ) - end if + ELSE + ! + CALL init_us_2_base( npw_, npwx, igk_, q_, nat, tau, ityp, tpiba, omega, & + dfftp%nr1, dfftp%nr2, dfftp%nr3, eigts1, eigts2, & + eigts3, mill, g, vkb_ ) + ENDIF + ! CALL stop_clock( 'init_us_2' ) ! ! END SUBROUTINE init_us_2 ! !---------------------------------------------------------------------- - SUBROUTINE gen_us_dj ( ik, dvkb ) + SUBROUTINE gen_us_dj( ik, dvkb ) !---------------------------------------------------------------------- - !! wrapper to call gen_us_dj: same as init_us_2, but with the derivative + !! Wrapper to call gen_us_dj: same as init_us_2, but with the derivative !! of the Bessel functions dj_l/dq instead of j_l(qr) in the integral ! USE kinds, ONLY : DP @@ -80,7 +84,7 @@ CONTAINS USE gvect, ONLY : eigts1, eigts2, eigts3, mill, g USE wvfct, ONLY : npwx USE uspp, ONLY : nkb - USE fft_base , ONLY : dfftp + USE fft_base, ONLY : dfftp USE klist, ONLY : xk, ngk, igk_k ! IMPLICIT NONE @@ -92,16 +96,18 @@ CONTAINS ! ! CALL start_clock( 'gen_us_dj' ) ! - CALL gen_us_dj_base (ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, & - ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, & - eigts1, eigts2, eigts3, mill, g, dvkb ) + !$acc data present_or_copyout( dvkb ) + CALL gen_us_dj_base( ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, & + ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, & + dfftp%nr3, eigts1, eigts2, eigts3, mill, g, dvkb ) + !$acc end data ! ! CALL stop_clock( 'gen_us_dj' ) ! END SUBROUTINE gen_us_dj ! !---------------------------------------------------------------------- - SUBROUTINE gen_us_dy ( ik, u, dvkb ) + SUBROUTINE gen_us_dy( ik, u, dvkb ) !---------------------------------------------------------------------- !! wrapper to call gen_us_dj: same as init_us_2, but with the derivative !! of the spherical harmonics dY_lm/dq instead of Y_lm(q) in the integral @@ -126,86 +132,14 @@ CONTAINS ! ! CALL start_clock( 'gen_us_dy' ) ! - CALL gen_us_dy_base (ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, & - ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, & - eigts1, eigts2, eigts3, mill, g, u, dvkb ) + !$acc data present_or_copyout( dvkb ) + CALL gen_us_dy_base( ngk(ik), npwx, igk_k(1,ik), xk(1,ik), nat, tau, & + ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, & + dfftp%nr3, eigts1, eigts2, eigts3, mill, g, u, dvkb ) + !$acc end data ! ! CALL stop_clock( 'gen_us_d' ) ! END SUBROUTINE gen_us_dy ! - !---------------------------------------------------------------------- - SUBROUTINE gen_us_dj_gpu ( ik, dvkb ) - !---------------------------------------------------------------------- - !! wrapper to call gen_us_dj: same as init_us_2, but with the derivative - !! of the Bessel functions dj_l/dq instead of j_l(qr) in the integral - !! (GPU version, FIXME: to be merged) - ! - USE kinds, ONLY : dp - USE ions_base, ONLY : nat, ntyp=>nsp, ityp, tau - USE cell_base, ONLY : tpiba, omega - USE gvect, ONLY : eigts1_d, eigts2_d, eigts3_d, mill_d, g_d - USE wvfct, ONLY : npwx - USE uspp, ONLY : nkb - USE fft_base , ONLY : dfftp - USE klist, ONLY : xk, ngk, igk_k_d - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: ik - !! k-point index - COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb) - !! beta functions computed with dj_l/dq -#if defined(__CUDA) - attributes(DEVICE) :: dvkb -#endif - ! - ! CALL start_clock( 'gen_us_dj' ) - ! - CALL gen_us_dj_gpu_ (ngk(ik), npwx, igk_k_d(1,ik), xk(1,ik), nat, tau, & - ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, & - eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, dvkb ) - ! - ! CALL stop_clock( 'gen_us_dj' ) - ! - END SUBROUTINE gen_us_dj_gpu - ! - !---------------------------------------------------------------------- - SUBROUTINE gen_us_dy_gpu ( ik, u, dvkb ) - !---------------------------------------------------------------------- - !! wrapper to call gen_us_dj: same as init_us_2, but with the derivative - !! of the spherical harmonics dY_lm/dq instead of Y_lm(q) in the integral - !! (GPU version, FIXME: to be merged) - ! - USE kinds, ONLY : DP - USE ions_base, ONLY : nat, ntyp=>nsp, ityp, tau - USE cell_base, ONLY : tpiba, omega - USE gvect, ONLY : eigts1_d, eigts2_d, eigts3_d, mill_d, g_d - USE wvfct, ONLY : npwx - USE uspp, ONLY : nkb - USE fft_base , ONLY : dfftp - USE klist, ONLY : xk, ngk, igk_k_d - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: ik - !! k-point index - REAL(dp), INTENT(IN) :: u(3) - !! k-point index - COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb) - !! beta functions computed with dY_lm/dq -#if defined(__CUDA) - attributes(DEVICE) :: dvkb -#endif - ! - ! CALL start_clock( 'gen_us_dy' ) - ! - CALL gen_us_dy_gpu_ (ngk(ik), npwx, igk_k_d(1,ik), xk(1,ik), nat, tau, & - ityp, ntyp, tpiba, omega, dfftp%nr1, dfftp%nr2, dfftp%nr3, & - eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, u, dvkb ) - ! - ! CALL stop_clock( 'gen_us_dy' ) - ! - END SUBROUTINE gen_us_dy_gpu - ! END MODULE diff --git a/PW/src/newd.f90 b/PW/src/newd.f90 index 16935d634..4590449c7 100644 --- a/PW/src/newd.f90 +++ b/PW/src/newd.f90 @@ -184,8 +184,8 @@ SUBROUTINE newq( vr, deeq, skip_vltot ) ENDDO ! DEALLOCATE( qmod, ylmk0, vaux ) - CALL mp_sum( deeq( :, :, :, 1:nspin_mag ), inter_pool_comm ) - CALL mp_sum( deeq( :, :, :, 1:nspin_mag ), intra_bgrp_comm ) + CALL mp_sum( deeq(:,:,:,1:nspin_mag ), inter_pool_comm ) + CALL mp_sum( deeq(:,:,:,1:nspin_mag ), intra_bgrp_comm ) ! END SUBROUTINE newq ! @@ -199,7 +199,7 @@ SUBROUTINE newd( ) USE kinds, ONLY : DP USE ions_base, ONLY : nat, ntyp => nsp, ityp USE lsda_mod, ONLY : nspin - USE uspp, ONLY : deeq, deeq_d, dvan, deeq_nc, deeq_nc_d, dvan_so, okvan + USE uspp, ONLY : deeq, dvan, deeq_nc, dvan_so, okvan USE uspp_param, ONLY : upf, lmaxq, nh, nhm USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb USE uspp, ONLY : nhtol, nhtolm @@ -248,13 +248,15 @@ SUBROUTINE newd( ) ! ENDDO ! -#if defined __CUDA - if (noncolin) then - if (nhm>0) deeq_nc_d=deeq_nc - else - if (nhm>0) deeq_d=deeq - endif -#endif + IF (noncolin) THEN + IF (nhm>0) THEN + !$acc update device(deeq_nc) + ENDIF + ELSE + IF (nhm>0) THEN + !$acc update device(deeq) + ENDIF + ENDIF ! ! ... early return ! @@ -311,13 +313,15 @@ SUBROUTINE newd( ) IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) CALL add_vhub_to_deeq( deeq ) ! ! sync with GPUs -#if defined __CUDA - if (noncolin) then - if (nhm>0) deeq_nc_d=deeq_nc - else - if (nhm>0) deeq_d=deeq - endif -#endif + IF (noncolin) THEN + IF (nhm>0) THEN + !$acc update device(deeq_nc) + ENDIF + ELSE + IF (nhm>0) THEN + !$acc update device(deeq) + ENDIF + ENDIF ! CALL stop_clock( 'newd' ) ! diff --git a/PW/src/newd_gpu.f90 b/PW/src/newd_gpu.f90 index a50b941ac..6befc3272 100644 --- a/PW/src/newd_gpu.f90 +++ b/PW/src/newd_gpu.f90 @@ -50,9 +50,6 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot) ! Input: potential , output: contribution to integral REAL(kind=dp), intent(in) :: vr(dfftp%nnr,nspin) REAL(kind=dp), intent(out) :: deeq_d( nhm, nhm, nat, nspin ) -#if defined(__CUDA) - attributes(DEVICE) :: deeq_d -#endif LOGICAL, intent(in) :: skip_vltot !If .false. vltot is added to vr when necessary ! INTERNAL INTEGER :: ngm_s, ngm_e, ngm_l @@ -70,7 +67,7 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot) ! workaround for cuf kernel limitations ! #if defined(__CUDA) - attributes(DEVICE) :: vaux_d, aux_d, qgm_d, ylmk0_d, qmod_d, deeaux_d, dfftp_nl_d + attributes(DEVICE) :: deeq_d, vaux_d, aux_d, qgm_d, ylmk0_d, qmod_d, deeaux_d, dfftp_nl_d #endif ! variable to map index of atoms of the same type INTEGER, ALLOCATABLE :: na_to_nab_h(:) @@ -87,7 +84,9 @@ SUBROUTINE newq_gpu(vr,deeq_d,skip_vltot) fact = 1.0_dp ENDIF ! + !$acc kernels deeq_d(:,:,:,:) = 0.D0 + !$acc end kernels ! ! With k-point parallelization, distribute G-vectors across processors ! ngm_s = index of first G-vector for this processor @@ -237,7 +236,7 @@ SUBROUTINE newd_gpu( ) USE kinds, ONLY : DP USE ions_base, ONLY : nat, ntyp => nsp, ityp USE lsda_mod, ONLY : nspin - USE uspp, ONLY : deeq, okvan, deeq, deeq_d, deeq_nc, deeq_nc_d, dvan_d, dvan_so_d + USE uspp, ONLY : okvan, deeq, deeq_nc, dvan_d, dvan_so_d USE uspp_param, ONLY : upf, lmaxq, nh, nhm USE noncollin_module, ONLY : noncolin, domag, nspin_mag, lspinorb USE uspp, ONLY : nhtol, nhtolm @@ -273,12 +272,12 @@ SUBROUTINE newd_gpu( ) ! IF ( lspinorb ) THEN ! - !$cuf kernel do(4) + !$acc parallel loop collapse(4) DO is = 1, nspin DO na = 1, nat DO jh = 1, nht DO ih = 1, nht - IF ( ityp_d(na) == nt ) deeq_nc_d(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt) + IF ( ityp_d(na) == nt ) deeq_nc(ih,jh,na,is) = dvan_so_d(ih,jh,is,nt) END DO END DO END DO @@ -286,15 +285,15 @@ SUBROUTINE newd_gpu( ) ! ELSE IF ( noncolin ) THEN ! - !$cuf kernel do(3) + !$acc parallel loop collapse(3) DO na = 1, nat DO jh = 1, nht DO ih = 1, nht IF ( ityp_d(na) == nt ) THEN - deeq_nc_d(ih,jh,na,1) = dvan_d(ih,jh,nt) - deeq_nc_d(ih,jh,na,2) = ( 0.D0, 0.D0 ) - deeq_nc_d(ih,jh,na,3) = ( 0.D0, 0.D0 ) - deeq_nc_d(ih,jh,na,4) = dvan_d(ih,jh,nt) + deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt) + deeq_nc(ih,jh,na,2) = ( 0.D0, 0.D0 ) + deeq_nc(ih,jh,na,3) = ( 0.D0, 0.D0 ) + deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt) END IF END DO END DO @@ -303,13 +302,13 @@ SUBROUTINE newd_gpu( ) ELSE ! if ( nht > 0 ) THEN - !$cuf kernel do(4) + !$acc parallel loop collapse(4) DO is = 1, nspin DO na = 1, nat DO jh = 1, nht DO ih = 1, nht ! - IF ( ityp_d(na) == nt ) deeq_d(ih,jh,na,is) = dvan_d(ih,jh,nt) + IF ( ityp_d(na) == nt ) deeq(ih,jh,na,is) = dvan_d(ih,jh,nt) ! END DO END DO @@ -328,9 +327,9 @@ SUBROUTINE newd_gpu( ) ! ! ... sync with CPU if (noncolin) then - deeq_nc=deeq_nc_d + !$acc update self(deeq_nc) else - deeq=deeq_d + !$acc update self(deeq) endif ! RETURN @@ -346,12 +345,18 @@ SUBROUTINE newd_gpu( ) ! IF (tqr) THEN CALL newq_r(v%of_r,deeq,.false.) - deeq_d=deeq + !$acc update device(deeq) ELSE - CALL newq_gpu(v%of_r,deeq_d,.false.) + !$acc host_data use_device(deeq) + CALL newq_gpu(v%of_r,deeq,.false.) + !$acc end host_data END IF ! - IF (noncolin) call add_paw_to_deeq_gpu(deeq_d) + IF (noncolin) THEN + !$acc host_data use_device(deeq) + call add_paw_to_deeq_gpu(deeq) + !$acc end host_data + ENDIF ! types : & DO nt = 1, ntyp @@ -372,13 +377,13 @@ SUBROUTINE newd_gpu( ) ELSE if_noncolin ! nht = nh(nt) - !$cuf kernel do(4) + !$acc parallel loop collapse(4) DO is = 1, nspin DO na = 1, nat DO ih = 1, nht DO jh = 1, nht IF ( ityp_d(na) == nt ) THEN - deeq_d(ih,jh,na,is) = deeq_d(ih,jh,na,is) + dvan_d(ih,jh,nt) + deeq(ih,jh,na,is) = deeq(ih,jh,na,is) + dvan_d(ih,jh,nt) END IF END DO END DO @@ -389,17 +394,25 @@ SUBROUTINE newd_gpu( ) ! END DO types ! - IF (.NOT.noncolin) CALL add_paw_to_deeq_gpu(deeq_d) + IF (.NOT.noncolin) THEN + !$acc host_data use_device(deeq) + CALL add_paw_to_deeq_gpu(deeq) + !$acc end host_data + ENDIF ! - IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) CALL add_vhub_to_deeq_gpu(deeq_d) + IF (lda_plus_U .AND. (Hubbard_projectors == 'pseudo')) THEN + !$acc host_data use_device(deeq) + CALL add_vhub_to_deeq_gpu(deeq) + !$acc end host_data + ENDIF ! CALL buffer%release_buffer(ityp_d, ierr) CALL stop_clock_gpu( 'newd' ) ! if (noncolin) then - deeq_nc=deeq_nc_d + !$acc update self(deeq_nc) else - deeq=deeq_d + !$acc update self(deeq) endif ! RETURN @@ -429,7 +442,7 @@ SUBROUTINE newd_gpu( ) ijs = ijs + 1 ! IF (domag) THEN - !$cuf kernel do(3) + !$acc parallel loop collapse(3) present(deeq_nc,deeq) DO na = 1, nat ! DO ih = 1, nhnt @@ -438,24 +451,24 @@ SUBROUTINE newd_gpu( ) ! IF ( ityp_d(na) == nt ) THEN ! - deeq_nc_d(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt) + deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt) ! DO kh = 1, nhnt ! DO lh = 1, nhnt ! - deeq_nc_d(ih,jh,na,ijs) = deeq_nc_d(ih,jh,na,ijs) + & - deeq_d (kh,lh,na,1)* & - (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + & + deeq_nc(ih,jh,na,ijs) = deeq_nc(ih,jh,na,ijs) + & + deeq(kh,lh,na,1)* & + (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + & fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt)) + & - deeq_d (kh,lh,na,2)* & - (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) + & + deeq(kh,lh,na,2)* & + (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) + & fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,1,is2,nt)) + & - (0.D0,-1.D0)*deeq_d (kh,lh,na,3)* & - (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) - & + (0.D0,-1.D0)*deeq(kh,lh,na,3)* & + (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,2,is2,nt) - & fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,1,is2,nt)) + & - deeq_d (kh,lh,na,4)* & - (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) - & + deeq(kh,lh,na,4)* & + (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) - & fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt)) ! END DO @@ -471,7 +484,7 @@ SUBROUTINE newd_gpu( ) ! ELSE ! - !$cuf kernel do(3) <<<*,*>>> + !$acc parallel loop collapse(3) present(deeq_nc,deeq) DO na = 1, nat ! DO ih = 1, nhnt @@ -480,15 +493,15 @@ SUBROUTINE newd_gpu( ) ! IF ( ityp_d(na) == nt ) THEN ! - deeq_nc_d(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt) + deeq_nc(ih,jh,na,ijs) = dvan_so_d(ih,jh,ijs,nt) ! DO kh = 1, nhnt ! DO lh = 1, nhnt ! - deeq_nc_d(ih,jh,na,ijs) = deeq_nc_d(ih,jh,na,ijs) + & - deeq_d (kh,lh,na,1)* & - (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + & + deeq_nc(ih,jh,na,ijs) = deeq_nc(ih,jh,na,ijs) + & + deeq(kh,lh,na,1)* & + (fcoef_d(ih,kh,is1,1,nt)*fcoef_d(lh,jh,1,is2,nt) + & fcoef_d(ih,kh,is1,2,nt)*fcoef_d(lh,jh,2,is2,nt) ) ! END DO @@ -526,42 +539,38 @@ SUBROUTINE newd_gpu( ) ! nhnt = nh(nt) ! - !$cuf kernel do(3) + !$acc parallel loop collapse(3) present(deeq_nc,deeq) DO na = 1, nat - ! DO ih = 1, nhnt - ! DO jh = 1, nhnt ! IF ( ityp_d(na) == nt ) THEN ! IF (lspinorb) THEN - deeq_nc_d(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + & - deeq_d(ih,jh,na,1) + deeq_d(ih,jh,na,4) - ! - deeq_nc_d(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + & - deeq_d(ih,jh,na,1) - deeq_d(ih,jh,na,4) + deeq_nc(ih,jh,na,1) = dvan_so_d(ih,jh,1,nt) + & + deeq(ih,jh,na,1) + deeq(ih,jh,na,4) + ! + deeq_nc(ih,jh,na,4) = dvan_so_d(ih,jh,4,nt) + & + deeq(ih,jh,na,1) - deeq(ih,jh,na,4) ! ELSE - deeq_nc_d(ih,jh,na,1) = dvan_d(ih,jh,nt) + & - deeq_d(ih,jh,na,1) + deeq_d(ih,jh,na,4) - ! - deeq_nc_d(ih,jh,na,4) = dvan_d(ih,jh,nt) + & - deeq_d(ih,jh,na,1) - deeq_d(ih,jh,na,4) + deeq_nc(ih,jh,na,1) = dvan_d(ih,jh,nt) + & + deeq(ih,jh,na,1) + deeq(ih,jh,na,4) + ! + deeq_nc(ih,jh,na,4) = dvan_d(ih,jh,nt) + & + deeq(ih,jh,na,1) - deeq(ih,jh,na,4) ! END IF - deeq_nc_d(ih,jh,na,2) = deeq_d(ih,jh,na,2) - & - ( 0.D0, 1.D0 ) * deeq_d(ih,jh,na,3) - ! - deeq_nc_d(ih,jh,na,3) = deeq_d(ih,jh,na,2) + & - ( 0.D0, 1.D0 ) * deeq_d(ih,jh,na,3) + deeq_nc(ih,jh,na,2) = deeq(ih,jh,na,2) - & + ( 0.D0, 1.D0 ) * deeq(ih,jh,na,3) + ! + deeq_nc(ih,jh,na,3) = deeq(ih,jh,na,2) + & + ( 0.D0, 1.D0 ) * deeq(ih,jh,na,3) ! END IF ! END DO - ! END DO - ! END DO ! RETURN diff --git a/PW/src/read_file_new.f90 b/PW/src/read_file_new.f90 index d71f746cf..7e964c9b2 100644 --- a/PW/src/read_file_new.f90 +++ b/PW/src/read_file_new.f90 @@ -230,10 +230,10 @@ SUBROUTINE post_xml_init ( ) g_d = g gg_d = gg #endif - !$acc update device(mill, g) + !$acc update device(mill, g, gg) ! CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms ) - CALL gshells ( lmovecell ) + CALL gshells ( lmovecell ) ! IF (do_comp_esm) CALL esm_init() IF (do_cutoff_2D) CALL cutoff_fact() diff --git a/PW/src/realus.f90 b/PW/src/realus.f90 index df79b8304..b95f529e6 100644 --- a/PW/src/realus.f90 +++ b/PW/src/realus.f90 @@ -228,7 +228,7 @@ MODULE realus USE constants, ONLY : pi, fpi, eps16, eps6 USE ions_base, ONLY : nat, nsp, ityp, tau USE cell_base, ONLY : at, bg, alat - USE uspp, ONLY : okvan, qq_at, qq_at_d, qq_nt, nhtol + USE uspp, ONLY : okvan, qq_at, qq_at, qq_nt, nhtol USE uspp_param, ONLY : upf, nh USE atom, ONLY : rgrid USE fft_types, ONLY : fft_type_descriptor @@ -404,9 +404,8 @@ MODULE realus ! and sync on GPUs ! CALL mp_sum( qq_at, intra_bgrp_comm ) -#if defined __CUDA - qq_at_d=qq_at -#endif + ! + !$acc update device(qq_at) ! ! and test that they don't differ too much ! from the result computed on the atomic grid @@ -455,7 +454,7 @@ MODULE realus !! Sync with GPU memory is performed outside ! USE constants, ONLY : eps16, eps6 - USE uspp, ONLY : indv, nhtolm, ap, qq_at, qq_at_d + USE uspp, ONLY : indv, nhtolm, ap, qq_at USE uspp_param, ONLY : upf, lmaxq, nh USE atom, ONLY : rgrid USE splinelib, ONLY : spline, splint diff --git a/PW/src/s_psi_gpu.f90 b/PW/src/s_psi_gpu.f90 index 6071e6931..37a0131a6 100644 --- a/PW/src/s_psi_gpu.f90 +++ b/PW/src/s_psi_gpu.f90 @@ -216,7 +216,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d ) ! USE mp, ONLY : mp_get_comm_null, mp_circular_shift_left USE device_fbuff_m, ONLY : dev_buf - USE uspp, ONLY : qq_at_d + USE uspp, ONLY : qq_at ! IMPLICIT NONE ! @@ -273,9 +273,12 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d ) ! (l'=l+ijkb0, m'=m+ijkb0, indices run from 1 to nh(nt)) ! IF ( m_loc > 0 ) THEN + !$acc host_data use_device(qq_at) CALL DGEMM('N', 'N', nh(nt), m_loc, nh(nt), 1.0_dp, & - qq_at_d(1,1,na), nhm, becp_d%r_d(ofsbeta(na)+1,1),& + qq_at(1,1,na), nhm, becp_d%r_d(ofsbeta(na)+1,1),& nkb, 0.0_dp, ps_d(ofsbeta(na)+1,1), nkb ) + !$acc end host_data + END IF END IF END DO @@ -340,7 +343,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d ) !! k-points version of \(\textrm{s_psi}\) routine. ! USE device_fbuff_m, ONLY : dev_buf - USE uspp, ONLY : qq_at_d + USE uspp, ONLY : qq_at ! IMPLICIT NONE ! @@ -367,17 +370,17 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d ) ! qq is real: copy it into a complex variable to perform ! a zgemm - simple but sub-optimal solution ! - ! here we need to use qq_at_d instead of qq_nt_d otherwise real space augmentation brakes! + ! here we need to use qq_at (device) instead of qq_nt_d otherwise real space augmentation brakes! ! qq_nt_d would be much faster and works for calculations without real space augmentation CALL dev_buf%lock_buffer( qqc_d, (/ nhm, nhm, nat/), ierr ) IF( ierr /= 0 ) & CALL errore( ' s_psi_k_gpu ', ' cannot allocate buffer (qqc_d) ', ABS(ierr) ) -!$cuf kernel do(3) <<<*,*>>> + !$acc parallel loop collapse(3) present(qq_at) DO na = 1, nat DO jh = 1, nhm DO ih = 1, nhm - qqc_d(ih,jh, na) = CMPLX ( qq_at_d(ih,jh, na), 0.0_dp, KIND=dp ) + qqc_d(ih,jh, na) = CMPLX ( qq_at(ih,jh, na), 0.0_dp, KIND=dp ) END DO END DO END DO @@ -427,7 +430,7 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d ) !! k-points noncolinear/spinorbit version of \(\textrm{s_psi}\) routine. ! USE device_fbuff_m, ONLY : dev_buf - USE uspp, ONLY : qq_at_d, qq_so_d + USE uspp, ONLY : qq_at, qq_so ! IMPLICIT NONE ! @@ -455,11 +458,11 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d ) IF( ierr /= 0 .and. ierr /= -1 ) & CALL errore( ' s_psi_nc_gpu ', ' cannot allocate buffer (qqc_d) ', ABS(ierr) ) ! Possibly convert only what's needed?? -!$cuf kernel do(3) <<<*,*>>> + !$acc parallel loop collapse(3) present(qq_at) DO na = 1, nat DO jh = 1, nhm DO ih = 1, nhm - qqc_d(ih, jh, na) = CMPLX ( qq_at_d(ih,jh, na), 0.0_dp, KIND=dp ) + qqc_d(ih, jh, na) = CMPLX ( qq_at(ih,jh, na), 0.0_dp, KIND=dp ) END DO END DO END DO @@ -482,19 +485,21 @@ SUBROUTINE s_psi__gpu( lda, n, m, psi_d, spsi_d ) ELSE DO na = 1, nat IF ( ityp(na) == nt ) THEN + !$acc host_data use_device(qq_so) CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - qq_so_d(1,1,1,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, & + qq_so(1,1,1,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, & (0.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,1,1), nkb*npol ) CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - qq_so_d(1,1,2,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, & + qq_so(1,1,2,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, & (1.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,1,1), nkb*npol ) ! CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - qq_so_d(1,1,3,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, & + qq_so(1,1,3,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,1,1), nkb*npol, & (0.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,2,1), nkb*npol ) CALL ZGEMM('N','N', nh(nt), m, nh(nt), (1.0_dp,0.0_dp), & - qq_so_d(1,1,4,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, & + qq_so(1,1,4,nt), nhm, becp_d%nc_d(ofsbeta(na)+1,2,1), nkb*npol, & (1.0_dp,0.0_dp), ps_d(ofsbeta(na)+1,2,1), nkb*npol ) + !$acc end host_data END IF END DO END IF diff --git a/PW/src/stres_cc.f90 b/PW/src/stres_cc.f90 index d29113f65..4b7406e5f 100644 --- a/PW/src/stres_cc.f90 +++ b/PW/src/stres_cc.f90 @@ -7,8 +7,9 @@ ! ! !----------------------------------------------------------------------- -subroutine stres_cc( sigmaxcc ) +SUBROUTINE stres_cc( sigmaxcc ) !----------------------------------------------------------------------- + !! Core correction term of the stress. ! USE kinds, ONLY : DP USE atom, ONLY : rgrid, msh @@ -17,7 +18,7 @@ subroutine stres_cc( sigmaxcc ) USE cell_base, ONLY : alat, omega, tpiba, tpiba2 USE fft_base, ONLY : dfftp USE fft_rho, ONLY : rho_r2g - USE gvect, ONLY : ngm, gstart, g, gg, ngl, gl, igtongl + USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, g, gg USE ener, ONLY : etxc, vtxc USE lsda_mod, ONLY : nspin USE scf, ONLY : rho, rho_core, rhog_core @@ -26,81 +27,135 @@ subroutine stres_cc( sigmaxcc ) USE mp_bands, ONLY : intra_bgrp_comm USE mp, ONLY : mp_sum ! - implicit none - ! output - real(DP) :: sigmaxcc(3,3) - ! local variables + IMPLICIT NONE ! - integer :: nt, ng, l, m, ir - ! counters - real(DP) :: fact, sigmadiag - real(DP), allocatable:: rhocg(:), vxc(:,:) - complex(DP), allocatable :: vaux(:,:) + REAL(DP), INTENT(OUT) :: sigmaxcc(3,3) ! - sigmaxcc(:,:) = 0.d0 - if ( ANY (upf(1:ntyp)%nlcc) ) goto 15 + ! ... local variables ! - return + INTEGER :: nt, ng, l, m, ir + REAL(DP) :: fact + REAL(DP), ALLOCATABLE :: rhocg(:), vxc(:,:) + COMPLEX(DP), ALLOCATABLE :: vaux(:,:) ! -15 continue + REAL(DP) :: rhocg1, sigma_rid, sigmadiag + REAL(DP) :: sigma1, sigma2, sigma3, & + sigma4, sigma5, sigma6 ! - ! recalculate the exchange-correlation potential + sigmaxcc(:,:) = 0._DP ! - allocate( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) ) + IF ( .NOT. ANY( upf(1:ntyp)%nlcc ) ) RETURN ! - call v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc ) + ! ... recalculate the exchange-correlation potential ! - if ( nspin==2 ) then - do ir = 1, dfftp%nnr + ALLOCATE( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) ) + ! + CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc ) + ! + !$acc data create( vaux ) + !$acc data copyin( vxc ) + IF ( nspin==2 ) then + !$acc parallel loop + DO ir = 1, dfftp%nnr vxc(ir,1) = 0.5d0 * ( vxc(ir,1) + vxc(ir,2) ) - enddo - endif + ENDDO + ENDIF ! - call rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) ) + CALL rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) ) ! - ! vaux contains now Vxc(G) + !$acc end data + DEALLOCATE( vxc ) ! - allocate(rhocg(ngl)) - sigmadiag = 0.0d0 - if (gamma_only) then - fact = 2.d0 - else - fact = 1.d0 - end if - do nt = 1, ntyp - if ( upf(nt)%nlcc ) then - call drhoc( ngl, gl, omega, tpiba2, msh(nt), rgrid(nt)%r, & - rgrid(nt)%rab, upf(nt)%rho_atc, rhocg ) - ! diagonal term - if (gstart==2) then - sigmadiag = sigmadiag + CONJG(vaux(1,1)) * strf(1,nt) * rhocg(igtongl(1)) - endif - do ng = gstart, ngm - sigmadiag = sigmadiag + CONJG(vaux(ng,1)) * & - strf(ng,nt) * rhocg(igtongl(ng)) * fact - enddo + ! ... vaux contains now Vxc(G) + ! + sigmadiag = 0._DP + ! + fact = 1._DP + IF (gamma_only) fact = 2._DP + ! + !$acc data copyin( gl, strf, igtongl ) + ! + ALLOCATE( rhocg(ngl) ) + !$acc data create( rhocg ) + ! + sigma1 = 0._DP ; sigma4 = 0._DP + sigma2 = 0._DP ; sigma5 = 0._DP + sigma3 = 0._DP ; sigma6 = 0._DP + ! + ! + DO nt = 1, ntyp + IF ( upf(nt)%nlcc ) THEN ! - call deriv_drhoc( ngl, gl, omega, tpiba2, msh(nt), & - rgrid(nt)%r, rgrid(nt)%rab, upf(nt)%rho_atc, rhocg ) - ! non diagonal term (g=0 contribution missing) - do ng = gstart, ngm - do l = 1, 3 - do m = 1, 3 - sigmaxcc(l,m) = sigmaxcc(l,m) + CONJG(vaux(ng,1)) * strf(ng,nt) * & - rhocg(igtongl(ng)) * tpiba * g(l,ng) * g(m,ng) / & - sqrt(gg(ng)) * fact - enddo - enddo - enddo - endif - enddo + !$acc data copyin(rgrid(nt:nt),upf(nt:nt)) + !$acc data copyin(rgrid(nt)%r,rgrid(nt)%rab,upf(nt)%rho_atc) + + CALL drhoc( ngl, gl, omega, tpiba2, msh(nt), rgrid(nt)%r, & + rgrid(nt)%rab, upf(nt)%rho_atc, rhocg ) + ! + ! ... diagonal term + IF (gstart==2) THEN + !$acc kernels + rhocg1 = rhocg(igtongl(1)) + sigmadiag = sigmadiag + DBLE(CONJG(vaux(1,1)) * & + strf(1,nt)) * rhocg1 + !$acc end kernels + ENDIF + ! + !$acc parallel loop reduction(+:sigmadiag) + DO ng = gstart, ngm + sigmadiag = sigmadiag + DBLE(CONJG(vaux(ng,1)) * & + strf(ng,nt)) * rhocg(igtongl(ng)) * fact + ENDDO + ! + CALL deriv_drhoc( ngl, gl, omega, tpiba2, msh(nt), & + rgrid(nt)%r, rgrid(nt)%rab, upf(nt)%rho_atc, & + rhocg ) + ! + ! ... non diagonal term (g=0 contribution missing) + ! + !$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6) + DO ng = gstart, ngm + ! + sigma_rid = DBLE(CONJG(vaux(ng,1)) * strf(ng,nt)) * & + rhocg(igtongl(ng)) * tpiba / SQRT(gg(ng)) * fact + ! + sigma1 = sigma1 + sigma_rid * g(1,ng)*g(1,ng) + sigma2 = sigma2 + sigma_rid * g(1,ng)*g(2,ng) + sigma3 = sigma3 + sigma_rid * g(1,ng)*g(3,ng) + sigma4 = sigma4 + sigma_rid * g(2,ng)*g(2,ng) + sigma5 = sigma5 + sigma_rid * g(3,ng)*g(2,ng) + sigma6 = sigma6 + sigma_rid * g(3,ng)*g(3,ng) + ! + ENDDO + ! + !$acc end data + !$acc end data + ! + ENDIF + ! + ENDDO ! - do l = 1, 3 + !$acc end data + ! + sigmaxcc(1,1) = sigma1 ; sigmaxcc(2,3) = sigma5 + sigmaxcc(1,2) = sigma2 ; sigmaxcc(3,1) = sigma3 + sigmaxcc(1,3) = sigma3 ; sigmaxcc(3,2) = sigma5 + sigmaxcc(2,1) = sigma2 ; sigmaxcc(3,3) = sigma6 + sigmaxcc(2,2) = sigma4 + ! + DO l = 1, 3 sigmaxcc(l,l) = sigmaxcc(l,l) + sigmadiag - enddo - call mp_sum( sigmaxcc, intra_bgrp_comm ) - deallocate( rhocg ) - deallocate( vaux, vxc ) - return -end subroutine stres_cc + ENDDO + ! + CALL mp_sum( sigmaxcc, intra_bgrp_comm ) + ! + !$acc end data + !$acc end data + ! + DEALLOCATE( rhocg ) + DEALLOCATE( vaux ) + ! + RETURN + ! +END SUBROUTINE stres_cc diff --git a/PW/src/stres_cc_gpu.f90 b/PW/src/stres_cc_gpu.f90 deleted file mode 100644 index ffc2f4d23..000000000 --- a/PW/src/stres_cc_gpu.f90 +++ /dev/null @@ -1,178 +0,0 @@ -! -! Copyright (C) 2001-2007 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!----------------------------------------------------------------------- -SUBROUTINE stres_cc_gpu( sigmaxcc ) - !----------------------------------------------------------------------- - ! - USE kinds, ONLY : DP - USE atom, ONLY : rgrid, msh - USE uspp_param, ONLY : upf - USE ions_base, ONLY : ntyp => nsp - USE cell_base, ONLY : alat, omega, tpiba, tpiba2 - USE fft_base, ONLY : dfftp - USE fft_rho, ONLY : rho_r2g - USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, igtongl_d - USE ener, ONLY : etxc, vtxc - USE lsda_mod, ONLY : nspin - USE scf, ONLY : rho, rho_core, rhog_core - USE vlocal, ONLY : strf - USE control_flags, ONLY : gamma_only - USE mp_bands, ONLY : intra_bgrp_comm - USE mp, ONLY : mp_sum - ! - USE gvect, ONLY : g_d, gg_d -#if defined(__CUDA) - USE device_fbuff_m, ONLY : dev_buf - USE device_memcpy_m, ONLY : dev_memcpy -#endif - ! - IMPLICIT NONE - ! - ! output - REAL(DP) :: sigmaxcc(3,3) - ! local variables - ! - INTEGER :: nt, ng, l, m, ir - ! counters - REAL(DP) :: fact - REAL(DP), ALLOCATABLE :: vxc(:,:) - COMPLEX(DP), ALLOCATABLE :: vaux(:,:) - ! - REAL(DP), POINTER :: rhocg_d(:), r_d(:), rab_d(:), rhoc_d(:), gl_d(:) - COMPLEX(DP), POINTER :: strf_d(:) - ! - INTEGER :: maxmesh, ierrs(6) - REAL(DP) :: sigma_rid, sigmadiag - REAL(DP) :: sigma1, sigma2, sigma3, & - sigma4, sigma5, sigma6 - ! -#if defined(__CUDA) - attributes(DEVICE) :: rhocg_d, r_d, rab_d, rhoc_d, gl_d, strf_d - ! - sigmaxcc(:,:) = 0._DP - IF ( ANY( upf(1:ntyp)%nlcc ) ) GOTO 15 - ! - RETURN - ! -15 CONTINUE - ! - ! recalculate the exchange-correlation potential - ! - ALLOCATE( vxc(dfftp%nnr,nspin), vaux(dfftp%nnr,1) ) - ! - CALL v_xc( rho, rho_core, rhog_core, etxc, vtxc, vxc ) - ! - !$acc data copyin(vxc) create(vaux) - ! - IF ( nspin==2 ) THEN - !$acc parallel loop - DO ir = 1, dfftp%nnr - vxc(ir,1) = 0.5d0 * ( vxc(ir,1) + vxc(ir,2) ) - ENDDO - ENDIF - ! - CALL rho_r2g( dfftp, vxc(:,1), vaux(:,1:1) ) - ! - ! vaux contains now Vxc(G) - ! - sigmadiag = 0._DP - ! - fact = 1._DP - IF (gamma_only) fact = 2._DP - ! - maxmesh = MAXVAL(msh(1:ntyp)) - CALL dev_buf%lock_buffer( gl_d, ngl, ierrs(1) ) - CALL dev_memcpy( gl_d, gl, (/ 1, ngl /) ) - CALL dev_buf%lock_buffer( rhocg_d, ngl, ierrs(2) ) - CALL dev_buf%lock_buffer( r_d, maxmesh, ierrs(3) ) - CALL dev_buf%lock_buffer( rab_d, maxmesh, ierrs(4) ) - CALL dev_buf%lock_buffer( rhoc_d,maxmesh, ierrs(5) ) - CALL dev_buf%lock_buffer( strf_d, ngm, ierrs(6) ) - IF (ANY(ierrs /= 0)) CALL errore( 'stres_cc_gpu', 'cannot allocate buffers', -1 ) - ! - sigma1 = 0._DP ; sigma4 = 0._DP - sigma2 = 0._DP ; sigma5 = 0._DP - sigma3 = 0._DP ; sigma6 = 0._DP - ! - DO nt = 1, ntyp - IF ( upf(nt)%nlcc ) THEN - ! - CALL dev_memcpy( strf_d, strf(:,nt), (/1, ngm/) ) - CALL dev_memcpy( r_d, rgrid(nt)%r, (/1, msh(nt)/) ) - CALL dev_memcpy( rab_d, rgrid(nt)%rab, (/1, msh(nt)/) ) - CALL dev_memcpy( rhoc_d, upf(nt)%rho_atc, (/1, msh(nt)/) ) - ! - CALL drhoc_gpu( ngl, gl_d, omega, tpiba2, msh(nt), r_d, & - rab_d, rhoc_d, rhocg_d ) - ! - ! diagonal term - IF (gstart==2) THEN - !$acc kernels - sigmadiag = sigmadiag + DBLE(CONJG(vaux(1,1))*strf_d(1)) * & - rhocg_d(igtongl_d(1)) - !$acc end kernels - ENDIF - ! - !$acc parallel loop - DO ng = gstart, ngm - sigmadiag = sigmadiag + DBLE(CONJG(vaux(ng,1)) * strf_d(ng)) * & - rhocg_d(igtongl_d(ng)) * fact - ENDDO - ! - CALL deriv_drhoc_gpu( ngl, gl_d, omega, tpiba2, msh(nt), & - r_d, rab_d, rhoc_d, rhocg_d ) - ! - ! non diagonal term (g=0 contribution missing) - ! - !$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6) - DO ng = gstart, ngm - ! - sigma_rid = DBLE(CONJG(vaux(ng,1)) & - * strf_d(ng)) * rhocg_d(igtongl_d(ng)) * tpiba & - / SQRT(gg_d(ng)) * fact - ! - sigma1 = sigma1 + sigma_rid * g_d(1,ng)*g_d(1,ng) - sigma2 = sigma2 + sigma_rid * g_d(1,ng)*g_d(2,ng) - sigma3 = sigma3 + sigma_rid * g_d(1,ng)*g_d(3,ng) - sigma4 = sigma4 + sigma_rid * g_d(2,ng)*g_d(2,ng) - sigma5 = sigma5 + sigma_rid * g_d(3,ng)*g_d(2,ng) - sigma6 = sigma6 + sigma_rid * g_d(3,ng)*g_d(3,ng) - ! - ENDDO - ! - ENDIF - ! - ENDDO - ! - sigmaxcc(1,1) = sigma1 ; sigmaxcc(2,3) = sigma5 - sigmaxcc(1,2) = sigma2 ; sigmaxcc(3,1) = sigma3 - sigmaxcc(1,3) = sigma3 ; sigmaxcc(3,2) = sigma5 - sigmaxcc(2,1) = sigma2 ; sigmaxcc(3,3) = sigma6 - sigmaxcc(2,2) = sigma4 - ! - DO l = 1, 3 - sigmaxcc(l,l) = sigmaxcc(l,l) + sigmadiag - ENDDO - ! - CALL mp_sum( sigmaxcc, intra_bgrp_comm ) - ! - !$acc end data - DEALLOCATE( vxc, vaux ) - CALL dev_buf%release_buffer( gl_d, ierrs(1) ) - CALL dev_buf%release_buffer( rhocg_d,ierrs(2) ) - CALL dev_buf%release_buffer( r_d, ierrs(3) ) - CALL dev_buf%release_buffer( rab_d, ierrs(4) ) - CALL dev_buf%release_buffer( rhoc_d, ierrs(5) ) - CALL dev_buf%release_buffer( strf_d, ierrs(6) ) -#endif - ! - RETURN - ! -END SUBROUTINE stres_cc_gpu - diff --git a/PW/src/stres_ewa.f90 b/PW/src/stres_ewa.f90 index df3795d4e..ac37ed24f 100644 --- a/PW/src/stres_ewa.f90 +++ b/PW/src/stres_ewa.f90 @@ -53,7 +53,7 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, & REAL(DP), INTENT(IN) :: gcutm !! input: cut-off of g vectors REAL(DP), INTENT(OUT) :: sigmaewa(3,3) - ! output: the ewald stress + !! output: the ewald stress ! ! ... local variables ! @@ -85,6 +85,9 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, & ! diagonal term ! nondiagonal term COMPLEX(DP) :: rhostar + REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33 + ! + !$acc data present( g, gg ) ! tpiba2 = (tpi / alat)**2 sigmaewa(:,:) = 0.d0 @@ -94,85 +97,113 @@ SUBROUTINE stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, tau, & charge = charge + zv(ityp(na)) ENDDO ! - ! choose alpha in order to have convergence in the sum over G - ! upperbound is a safe upper bound for the error ON THE ENERGY + ! ... choose alpha in order to have convergence in the sum over G + ! upperbound is a safe upper bound for the error ON THE ENERGY ! alpha = 2.9d0 12 alpha = alpha - 0.1d0 ! IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 ) upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * & - erfc ( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) ) + ERFC( SQRT(tpiba2 * gcutm / 4.0d0 / alpha) ) ! IF (upperbound > 1d-7) GOTO 12 ! - ! G-space sum here - ! - ! Determine if this processor contains G=0 and set the constant term - ! + ! ... Determine if this processor contains G=0 and set the constant term + ! sdewald is the diagonal term IF (gstart == 2) THEN sdewald = tpi * e2 / 4.d0 / alpha * (charge / omega)**2 ELSE sdewald = 0.d0 ENDIF ! - ! sdewald is the diagonal term IF (gamma_only) THEN - fact = 2.d0 + fact = 2.d0 ELSE fact = 1.d0 ENDIF ! + ! ... G-space sum here below + ! IF (do_cutoff_2D) THEN + ! CALL cutoff_stres_sigmaewa( alpha, sdewald, sigmaewa ) + ! ELSE -!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau, zv, ityp, nat, omega, fact)& -!$omp &private(g2, g2a, rhostar, na, arg, l, m, sewald)& -!$omp &reduction(+:sigmaewa,sdewald) + ! + sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP + sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP + ! +#if !defined(_OPENACC) +!$omp parallel do default(none) shared(gstart, ngm, g, gg, tpiba2, alpha, tau,& +!$omp nat, zv, ityp, omega, fact) private(g2,g2a, rhostar, na, arg,& +!$omp sewald) reduction(+:sdewald,sigma11,sigma21,sigma22,sigma31,& +!$omp sigma32,sigma33) +#else +!$acc parallel loop copyin(tau,zv,ityp) reduction(+:sigma11,sigma21,sigma22,& +!$acc sigma31,sigma32,sigma33) reduction(-:sdewald) +#endif DO ng = gstart, ngm - g2 = gg (ng) * tpiba2 - g2a = g2 / 4.d0 / alpha - rhostar = (0.d0, 0.d0) + g2 = gg(ng) * tpiba2 + g2a = g2 / 4._DP / alpha + rhostar = (0._DP,0._DP) DO na = 1, nat arg = (g(1,ng) * tau(1,na) + g(2,ng) * tau(2,na) + & g(3,ng) * tau(3,na) ) * tpi - rhostar = rhostar + zv(ityp(na)) * CMPLX(COS(arg), SIN(arg), KIND=DP) + rhostar = rhostar + CMPLX(zv(ityp(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP) ENDDO - rhostar = rhostar / omega + rhostar = rhostar / CMPLX(omega) sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2 sdewald = sdewald - sewald - DO l = 1, 3 - DO m = 1, l - sigmaewa(l,m) = sigmaewa(l,m) + sewald * tpiba2 * 2.d0 * & - g(l,ng) * g(m,ng) / g2 * (g2a + 1) - ENDDO - ENDDO ! + sigma11 = sigma11 + sewald * tpiba2 * 2._DP * & + g(1,ng) * g(1,ng) / g2 * (g2a + 1) + sigma21 = sigma21 + sewald * tpiba2 * 2._DP * & + g(2,ng) * g(1,ng) / g2 * (g2a + 1) + sigma22 = sigma22 + sewald * tpiba2 * 2._DP * & + g(2,ng) * g(2,ng) / g2 * (g2a + 1) + sigma31 = sigma31 + sewald * tpiba2 * 2._DP * & + g(3,ng) * g(1,ng) / g2 * (g2a + 1) + sigma32 = sigma32 + sewald * tpiba2 * 2._DP * & + g(3,ng) * g(2,ng) / g2 * (g2a + 1) + sigma33 = sigma33 + sewald * tpiba2 * 2._DP * & + g(3,ng) * g(3,ng) / g2 * (g2a + 1) ENDDO +#if !defined(_OPENACC) !$omp end parallel do +#endif + ! + sigmaewa(1,1) = sigmaewa(1,1) + sigma11 + sigmaewa(2,1) = sigmaewa(2,1) + sigma21 + sigmaewa(2,2) = sigmaewa(2,2) + sigma22 + sigmaewa(3,1) = sigmaewa(3,1) + sigma31 + sigmaewa(3,2) = sigmaewa(3,2) + sigma32 + sigmaewa(3,3) = sigmaewa(3,3) + sigma33 + ! ENDIF ! + !$acc end data + ! DO l = 1, 3 sigmaewa(l,l) = sigmaewa(l,l) + sdewald ENDDO ! - ! R-space sum here (see ewald.f90 for details on parallelization) + ! ... R-space sum here (see ewald.f90 for details on parallelization) ! CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey ) ! IF ( mykey == 0 ) THEN rmax = 4.0d0 / SQRT(alpha) / alat ! - ! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1 + ! ... with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1 ! -!$omp parallel do default(none) shared(na_s, na_e, nat, tau, rmax, at, bg, alat, ityp, alpha, omega, zv)& -!$omp &private(nb, dtau, r, r2, nrm, nr, rr, fac, l, m)& -!$omp &reduction(+:sigmaewa) +!$omp parallel do default(none) shared(na_s,na_e,nat,tau,rmax,at,bg,alat,ityp,alpha,omega,zv)& +!$omp &private(nb,dtau,r,r2,nrm,nr,rr,fac,l,m) reduction(+:sigmaewa) DO na = na_s, na_e DO nb = 1, nat dtau(:) = tau(:,na) - tau(:,nb) ! - ! generates nearest-neighbors shells r(i)=R(i)-dtau(i) + ! ... generates nearest-neighbors shells r(i)=R(i)-dtau(i) ! CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm ) ! diff --git a/PW/src/stres_ewa_gpu.f90 b/PW/src/stres_ewa_gpu.f90 deleted file mode 100644 index 6a28db8e2..000000000 --- a/PW/src/stres_ewa_gpu.f90 +++ /dev/null @@ -1,246 +0,0 @@ -! -! Copyright (C) 2001-2009 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!----------------------------------------------------------------------- -SUBROUTINE stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg, tau, & - omega, g_d, gg_d, ngm, gstart, gamma_only, & - gcutm, sigmaewa ) - !--------------------------------------------------------------------- - !! Ewald contribution. Both real- and reciprocal-space terms are - !! present. - ! - USE kinds - USE constants, ONLY : tpi, e2, eps6 - USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, nproc_bgrp - USE mp, ONLY : mp_sum - USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_sigmaewa_gpu - ! - IMPLICIT NONE - ! - INTEGER :: nat - !! input: number of atoms in the unit cell - INTEGER :: ntyp - !! input: number of different types of atoms - INTEGER :: ityp(nat) - !! input: the type of each atom - INTEGER :: ngm - !! input: number of plane waves for G sum - INTEGER :: gstart - !! input: first nonzero g vector - LOGICAL, INTENT(IN) :: gamma_only - !! gamma point only - REAL(DP), INTENT(IN) :: tau(3,nat) - !! input: the positions of the atoms in the cell - REAL(DP), INTENT(IN) :: g_d(3,ngm) - !! input: the coordinates of G vectors - REAL(DP), INTENT(IN) :: gg_d(ngm) - !! input: the square moduli of G vectors - REAL(DP), INTENT(IN) :: zv(ntyp) - !! input: the charge of each type of atoms - REAL(DP), INTENT(IN) :: at(3,3) - !! input: the direct lattice vectors - REAL(DP), INTENT(IN) :: bg(3,3) - !! input: the reciprocal lattice vectors - REAL(DP), INTENT(IN) :: omega - !! input: the volume of the unit cell - REAL(DP), INTENT(IN) :: alat - !! input: measure of length - REAL(DP), INTENT(IN) :: gcutm - !! input: cut-off of g vectors - REAL(DP), INTENT(OUT) :: sigmaewa(3,3) - ! output: the ewald stress - ! - ! ... local variables - ! - INTEGER, PARAMETER :: mxr = 50 - ! the maximum number of R vectors included in r sum - INTEGER :: ng, nr, na, nb, l, m, nrm - ! counter over reciprocal G vectors - ! counter over direct vectors - ! counter on atoms - ! counter on atoms - ! counter on atoms - ! number of R vectors included in r sum - INTEGER :: na_s, na_e, mykey - ! - REAL(DP) :: charge, arg, tpiba2, dtau(3), alpha, r(3,mxr), & - r2(mxr), rmax, rr, upperbound, fact, fac, g2, g2a, & - sdewald, sewald - ! total ionic charge in the cell - ! the argument of the phase - ! length in reciprocal space - ! the difference tau_s - tau_s' - ! alpha term in ewald sum - ! input of the rgen routine ( not used here ) - ! the square modulus of R_j-tau_s-tau_s' - ! the maximum radius to consider real space sum - ! buffer variable - ! used to optimize alpha - ! auxiliary variables - ! diagonal term - ! nondiagonal term - ! - INTEGER :: ierr(2) - REAL(DP) :: sigma11, sigma21, sigma22, sigma31, sigma32, sigma33 - COMPLEX(DP) :: rhostar - ! - INTEGER, ALLOCATABLE :: ityp_d(:) - REAL(DP), ALLOCATABLE :: zv_d(:), tau_d(:,:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: g_d, gg_d, zv_d, ityp_d, tau_d -#endif - ! - tpiba2 = (tpi / alat)**2 - sigmaewa(:,:) = 0._DP - charge = 0._DP - ! - ALLOCATE( zv_d(ntyp), tau_d(3,nat) ) - zv_d = zv - tau_d = tau - ALLOCATE( ityp_d(nat) ) - ityp_d = ityp - ! - DO na = 1, nat - charge = charge + zv(ityp(na)) - ENDDO - ! - ! choose alpha in order to have convergence in the sum over G - ! upperbound is a safe upper bound for the error ON THE ENERGY - ! - alpha = 2.9_DP -12 alpha = alpha - 0.1_DP - ! - IF (alpha==0.0) CALL errore( 'stres_ew', 'optimal alpha not found', 1 ) - upperbound = e2 * charge**2 * SQRT(2 * alpha / tpi) * & - erfc ( SQRT(tpiba2 * gcutm / 4._DP / alpha) ) - ! - IF (upperbound > 1d-7) GOTO 12 - ! - ! G-space sum here - ! - ! Determine if this processor contains G=0 and set the constant term - ! - IF (gstart == 2) THEN - sdewald = tpi * e2 / 4._DP / alpha * (charge / omega)**2 - ELSE - sdewald = 0._DP - ENDIF - ! - ! sdewald is the diagonal term - IF ( gamma_only ) THEN - fact = 2._DP - ELSE - fact = 1._DP - ENDIF - ! - IF ( do_cutoff_2D ) THEN - ! - CALL cutoff_stres_sigmaewa_gpu( alpha, sdewald, sigmaewa ) - ! - ELSE - ! - sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP - sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP - ! - !$cuf kernel do (1) <<<*,*>>> - DO ng = gstart, ngm - g2 = gg_d(ng) * tpiba2 - g2a = g2 / 4._DP / alpha - rhostar = (0._DP,0._DP) - DO na = 1, nat - arg = (g_d(1,ng) * tau_d(1,na) + g_d(2,ng) * tau_d(2,na) + & - g_d(3,ng) * tau_d(3,na) ) * tpi - rhostar = rhostar + CMPLX(zv_d(ityp_d(na))) * CMPLX(COS(arg), SIN(arg), KIND=DP) - ENDDO - rhostar = rhostar / CMPLX(omega) - sewald = fact * tpi * e2 * EXP(-g2a) / g2 * ABS(rhostar)**2 - sdewald = sdewald - sewald - ! - sigma11 = sigma11 + sewald * tpiba2 * 2._DP * & - g_d(1,ng) * g_d(1,ng) / g2 * (g2a + 1) - sigma21 = sigma21 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(1,ng) / g2 * (g2a + 1) - sigma22 = sigma22 + sewald * tpiba2 * 2._DP * & - g_d(2,ng) * g_d(2,ng) / g2 * (g2a + 1) - sigma31 = sigma31 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(1,ng) / g2 * (g2a + 1) - sigma32 = sigma32 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(2,ng) / g2 * (g2a + 1) - sigma33 = sigma33 + sewald * tpiba2 * 2._DP * & - g_d(3,ng) * g_d(3,ng) / g2 * (g2a + 1) - ! - ENDDO - ! - sigmaewa(1,1) = sigmaewa(1,1) + sigma11 - sigmaewa(2,1) = sigmaewa(2,1) + sigma21 - sigmaewa(2,2) = sigmaewa(2,2) + sigma22 - sigmaewa(3,1) = sigmaewa(3,1) + sigma31 - sigmaewa(3,2) = sigmaewa(3,2) + sigma32 - sigmaewa(3,3) = sigmaewa(3,3) + sigma33 - ! - ENDIF - ! - DO l = 1, 3 - sigmaewa(l,l) = sigmaewa(l,l) + sdewald - ENDDO - ! - ! R-space sum here (see ewald.f90 for details on parallelization) - ! - CALL block_distribute( nat, me_bgrp, nproc_bgrp, na_s, na_e, mykey ) - ! - IF ( mykey == 0 ) THEN - rmax = 4.0d0 / SQRT(alpha) / alat - ! - ! with this choice terms up to ZiZj*erfc(5) are counted (erfc(5)=2x10^-1 - ! - DO na = na_s, na_e - DO nb = 1, nat - dtau(:) = tau(:,na) - tau(:,nb) - ! - ! generates nearest-neighbors shells r(i)=R(i)-dtau(i) - ! - CALL rgen( dtau, rmax, mxr, at, bg, r, r2, nrm ) - ! - DO nr = 1, nrm - rr = SQRT(r2 (nr) ) * alat - fac = - e2 / 2.0_DP/ omega * alat**2 * zv(ityp(na)) * & - zv(ityp(nb)) / rr**3 * (erfc(SQRT(alpha) * rr) + & - rr * SQRT(8.0_DP * alpha / tpi) * EXP( - alpha * rr**2) ) - DO l = 1, 3 - DO m = 1, l - sigmaewa(l,m) = sigmaewa(l,m) + fac * r(l,nr) * r(m,nr) - ENDDO - ENDDO - ENDDO - ! - ENDDO - ENDDO - ENDIF - ! - DO l = 1, 3 - DO m = 1, l - 1 - sigmaewa(m,l) = sigmaewa(l,m) - ENDDO - ENDDO - ! - DO l = 1, 3 - DO m = 1, 3 - sigmaewa(l,m) = - sigmaewa(l,m) - ENDDO - ENDDO - ! - DEALLOCATE( zv_d, tau_d ) - DEALLOCATE( ityp_d ) - ! - CALL mp_sum( sigmaewa, intra_bgrp_comm ) - ! - RETURN - ! -END SUBROUTINE stres_ewa_gpu - diff --git a/PW/src/stres_gradcorr.f90 b/PW/src/stres_gradcorr.f90 index 76f0ab00a..c16f59a9a 100644 --- a/PW/src/stres_gradcorr.f90 +++ b/PW/src/stres_gradcorr.f90 @@ -7,8 +7,8 @@ ! ! !---------------------------------------------------------------------------- -SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, & - dfft, g, alat, omega, sigmaxc ) +SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, nspin, & + dfft, g, alat, omega, sigmaxc, kedtau ) !---------------------------------------------------------------------------- ! USE kinds, ONLY: DP @@ -24,9 +24,10 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, & TYPE(fft_type_descriptor), INTENT(IN):: dfft INTEGER, INTENT(IN) :: nspin REAL(DP), INTENT(IN) :: rho(dfft%nnr,nspin), rho_core(dfft%nnr) - REAL(DP), INTENT(IN) :: g(3,dfft%ngm), alat, omega - REAL(DP), INTENT(INOUT) :: kedtau(dfft%nnr, nspin) ! FIXME: should be INTENT(IN) - COMPLEX(DP), INTENT(IN) :: rhog(dfft%ngm, nspin) + REAL(DP), INTENT(IN) :: g(3,dfft%ngm) + REAL(DP), INTENT(IN) :: alat, omega + REAL(DP), INTENT(IN), OPTIONAL :: kedtau(dfft%nnr,nspin) + COMPLEX(DP), INTENT(IN) :: rhog(dfft%ngm,nspin) COMPLEX(DP), INTENT(IN) :: rhog_core(dfft%ngm) REAL(DP), INTENT(INOUT) :: sigmaxc(3,3) ! @@ -34,30 +35,36 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, & ! INTEGER :: k, l, m, ipol, ir, ig, is, nspin0, np INTEGER :: nr1, nr2, nr3, nrxx, ngm - REAL(DP), ALLOCATABLE :: grho(:,:,:), grho2(:,:), rhoaux(:,:), segni(:) + REAL(DP), ALLOCATABLE :: grho(:,:,:), grho2(:,:), rhoaux(:,:), & + segni(:), kedtaue2(:,:) COMPLEX(DP), ALLOCATABLE :: rhogaux(:,:) ! REAL(DP), ALLOCATABLE :: sx(:), sc(:) REAL(DP), ALLOCATABLE :: v1x(:,:), v2x(:,:), v3x(:,:) - REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:,:), v3c(:,:), v2c_ud(:) + REAL(DP), ALLOCATABLE :: v1c(:,:), v2c(:,:), v3c(:,:), v2c_ud(:), v2cm(:,:,:) ! REAL(DP), PARAMETER :: epsr = 1.0d-6, epsg = 1.0d-10, e2 = 2.d0 - REAL(DP) :: sigma_gradcorr(3, 3) - ! + REAL(DP) :: v2xc, v2xc_uu, v2xc_dd + REAL(DP) :: sigma_gc11, sigma_gc31, sigma_gc21, & + sigma_gc32, sigma_gc22, sigma_gc33 + REAL(DP) :: sigma_gradcorr(3,3) ! IF ( .NOT. xclib_dft_is('gradient') .AND. .NOT. xclib_dft_is('meta') ) RETURN ! - IF ( xclib_dft_is('meta') .and. nspin>1 ) CALL errore('stres_gradcorr', & - 'Meta-GGA stress does not work with spin polarization',1) + IF ( xclib_dft_is('meta') .AND. nspin>1 ) CALL errore( 'stres_gradcorr', & + 'Meta-GGA stress does not work with spin polarization', 1 ) + ! + !$acc data present_or_copyin( rho, rho_core, rhog, rhog_core, g ) ! np = 1 - IF ( nspin==2 .AND. xclib_dft_is('meta') ) np=3 + IF ( nspin==2 .AND. xclib_dft_is('meta') ) np = 3 ! nspin0 = nspin IF (nspin==4) nspin0 = 1 - IF (nspin==4.and.domag) nspin0 = 2 + IF (nspin==4.AND.domag) nspin0 = 2 ! - sigma_gradcorr(:,:) = 0.0_DP + sigma_gc11=0.0_DP ; sigma_gc21=0.0_DP ; sigma_gc22=0.0_DP + sigma_gc31=0.0_DP ; sigma_gc32=0.0_DP ; sigma_gc33=0.0_DP ! nr1 = dfft%nr1 nr2 = dfft%nr2 @@ -66,44 +73,71 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, & ngm = dfft%ngm ! ALLOCATE( grho(3,nrxx,nspin0) ) - ALLOCATE (rhoaux(nrxx,nspin0)) + ALLOCATE( rhoaux(nrxx,nspin0) ) ALLOCATE( rhogaux(ngm,nspin0) ) + IF (xclib_dft_is('meta')) ALLOCATE( kedtaue2(dfft%nnr,nspin) ) + !$acc data create( grho, rhoaux ) + !$acc data create( rhogaux ) ! ! calculate the gradient of rho+rhocore in real space ! For convenience rhoaux is in (up,down) format ! IF ( nspin0 == 1 ) THEN ! - rhoaux(:,1) = rho(:,1) + rho_core(:) - rhogaux(:,1) = rhog(:,1) + rhog_core(:) + !$acc parallel loop + DO k = 1, nrxx + rhoaux(k,1) = rho(k,1) + rho_core(k) + ENDDO + !$acc parallel loop + DO k = 1, ngm + rhogaux(k,1) = rhog(k,1) + rhog_core(k) + ENDDO ! - ELSE IF ( nspin0 == 2 ) THEN + ELSEIF ( nspin0 == 2 ) THEN ! IF ( nspin == 4 .AND. domag ) THEN + ! ALLOCATE( segni( nrxx ) ) + ! + !$acc data copyout( segni ) CALL compute_rho( rho, rhoaux, segni, nrxx ) - DEALLOCATE( segni ) - rhoaux(:,1) = rhoaux(:,1) + rho_core(:) / 2.0_DP - rhoaux(:,2) = rhoaux(:,2) + rho_core(:) / 2.0_DP - CALL rho_r2g ( dfft, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) ) + !$acc parallel loop + DO k = 1, nrxx + rhoaux(k,1) = rhoaux(k,1) + rho_core(k) / 2.0_DP + rhoaux(k,2) = rhoaux(k,2) + rho_core(k) / 2.0_DP + ENDDO + CALL rho_r2g( dfft, rhoaux(:,1:nspin0), rhogaux(:,1:nspin0) ) + !$acc end data ELSE - rhoaux(:,1) = ( rho(:,1) + rho(:,2) + rho_core(:) ) / 2.0_DP - rhoaux(:,2) = ( rho(:,1) - rho(:,2) + rho_core(:) ) / 2.0_DP - rhogaux(:,1) = ( rhog(:,1) + rhog(:,2) + rhog_core(:) ) / 2.0_DP - rhogaux(:,2) = ( rhog(:,1) - rhog(:,2) + rhog_core(:) ) / 2.0_DP - END IF + !$acc parallel loop + DO k = 1, nrxx + rhoaux(k,1) = ( rho(k,1) + rho(k,2) + rho_core(k) ) / 2.0_DP + rhoaux(k,2) = ( rho(k,1) - rho(k,2) + rho_core(k) ) / 2.0_DP + ENDDO + !$acc parallel loop + DO k = 1, ngm + rhogaux(k,1) = ( rhog(k,1) + rhog(k,2) + rhog_core(k) ) / 2.0_DP + rhogaux(k,2) = ( rhog(k,1) - rhog(k,2) + rhog_core(k) ) / 2.0_DP + ENDDO + ENDIF ENDIF ! DO is = 1, nspin0 - CALL fft_gradient_g2r( dfft, rhogaux(1,is), g, grho(1,1,is) ) + CALL fft_gradient_g2r( dfft, rhogaux(:,is), g, grho(:,:,is) ) ENDDO - DEALLOCATE (rhogaux) + ! + !$acc end data + DEALLOCATE( rhogaux ) ! ALLOCATE( grho2(nrxx,nspin0) ) - ALLOCATE( v1x(nrxx,nspin0), v2x(nrxx,nspin0), v3x(nrxx,nspin0) ) - ALLOCATE( v1c(nrxx,nspin0), v2c(np,nrxx,nspin0), v3c(nrxx,nspin0) ) + ALLOCATE( v1x(nrxx,nspin0), v2x(nrxx,nspin0) ) + ALLOCATE( v1c(nrxx,nspin0), v2c(nrxx,nspin0) ) ALLOCATE( sx(nrxx), sc(nrxx) ) ! + IF ( xclib_dft_is('meta') ) & + ALLOCATE( v2cm(np,nrxx,nspin0), v3x(nrxx,nspin0), v3c(nrxx,nspin0) ) + !$acc data create( grho2, sx, sc, v1x, v2x, v1c, v2c ) + ! IF (nspin0==1) THEN ! ! Spin-unpolarized case @@ -113,73 +147,128 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, & ! ! routine computing v1x_v and v2x_v is different for GGA and meta-GGA ! - grho2(:,1) = grho(1,:,1)**2 + grho(2,:,1)**2 + grho(3,:,1)**2 + !$acc parallel loop + DO k = 1, nrxx + grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 + ENDDO ! IF ( xclib_dft_is('meta') .AND. xclib_get_id('MGGA','EXCH') /= 4 ) THEN - kedtau(:,1) = kedtau(:,1) / e2 - CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtau, sx, sc, & - v1x, v2x, v3x, v1c, v2c, v3c ) - kedtau(:,1) = kedtau(:,1) * e2 + !$acc data present_or_copyin(kedtau) create( kedtaue2, v2cm, v3x, v3c ) + !$acc parallel loop + DO k = 1, nrxx + kedtaue2(k,1) = kedtau(k,1) / e2 + ENDDO + CALL xc_metagcx( nrxx, 1, np, rhoaux, grho, kedtaue2, sx, sc, & + v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. ) + !$acc parallel loop + DO k = 1, nrxx + v2c(k,1) = v2cm(1,k,1) + ENDDO + !$acc end data ELSE - CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c(1,:,:) ) + CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, gpu_args_=.TRUE. ) ENDIF ! - DO l = 1, 3 - DO m = 1, l - sigma_gradcorr(l,m) = sigma_gradcorr(l,m) + SUM( grho(l,:,1)*grho(m,:,1)* & - e2 * (v2x(:,1) + v2c(1,:,1)) ) - ENDDO + !$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, & + !$acc& sigma_gc31,sigma_gc32,sigma_gc33) + DO k = 1, nrxx + v2xc = e2 * (v2x(k,1) + v2c(k,1)) + sigma_gc11 = sigma_gc11 + grho(1,k,1)*grho(1,k,1) * v2xc + sigma_gc21 = sigma_gc21 + grho(2,k,1)*grho(1,k,1) * v2xc + sigma_gc22 = sigma_gc22 + grho(2,k,1)*grho(2,k,1) * v2xc + sigma_gc31 = sigma_gc31 + grho(3,k,1)*grho(1,k,1) * v2xc + sigma_gc32 = sigma_gc32 + grho(3,k,1)*grho(2,k,1) * v2xc + sigma_gc33 = sigma_gc33 + grho(3,k,1)*grho(3,k,1) * v2xc ENDDO ! ELSEIF (nspin0 == 2) THEN ! ! Spin-polarized case ! - grho2(:,:) = grho(1,:,:)**2 + grho(2,:,:)**2 + grho(3,:,:)**2 + !$acc parallel loop + DO k = 1, nrxx + grho2(k,1) = grho(1,k,1)**2 + grho(2,k,1)**2 + grho(3,k,1)**2 + grho2(k,2) = grho(1,k,2)**2 + grho(2,k,2)**2 + grho(3,k,2)**2 + ENDDO ! IF ( xclib_dft_is('meta') ) THEN ! - kedtau(:,1:nspin0) = kedtau(:,1:nspin0) / e2 - CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtau, sx, sc, & - v1x, v2x, v3x, v1c, v2c, v3c ) - kedtau(:,1:nspin0) = kedtau(:,1:nspin0) * e2 + !$acc data present(kedtau) create(kedtaue2, v2cm, v3x, v3c) + !$acc parallel loop + DO k = 1, nrxx + kedtaue2(k,1:nspin0) = kedtau(k,1:nspin0) / e2 + ENDDO + CALL xc_metagcx( nrxx, nspin0, np, rhoaux, grho, kedtaue2, sx, sc, & + v1x, v2x, v3x, v1c, v2cm, v3c, gpu_args_=.TRUE. ) + !$acc parallel loop + DO k = 1, nrxx + v2c(k,:) = v2cm(1,k,:) + ENDDO + !$acc end data ! FIXME : what are we supposed to do now? ! ELSE ! ALLOCATE( v2c_ud(nrxx) ) + !$acc data create( v2c_ud ) ! - CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c(1,:,:), v2c_ud ) + CALL xc_gcx( nrxx, nspin0, rhoaux, grho, sx, sc, v1x, v2x, v1c, v2c, v2c_ud, gpu_args_=.TRUE. ) ! - DO l = 1, 3 - DO m = 1, l - ! - ! ... exchange - sigma_gradcorr(l,m) = & - SUM( grho(l,:,1) * grho(m,:,1) * e2 * v2x(:,1) + & - grho(l,:,2) * grho(m,:,2) * e2 * v2x(:,2) ) - ! - ! ... correlation - sigma_gradcorr(l,m) = sigma_gradcorr(l,m) + & - SUM( grho(l,:,1) * grho(m,:,1) * v2c(1,:,1) + & - grho(l,:,2) * grho(m,:,2) * v2c(1,:,2) + & - (grho(l,:,1) * grho(m,:,2) + & - grho(l,:,2) * grho(m,:,1)) * v2c_ud(:) ) * e2 - ENDDO + !$acc parallel loop reduction(+:sigma_gc11,sigma_gc21,sigma_gc22, & + !$acc& sigma_gc31,sigma_gc32,sigma_gc33) + DO k = 1, nrxx + ! + v2xc_uu = e2 * (v2x(k,1)+v2c(k,1)) + v2xc_dd = e2 * (v2x(k,2)+v2c(k,2)) + ! + sigma_gc11 = sigma_gc11 + grho(1,k,1)*grho(1,k,1) * v2xc_uu + & + grho(1,k,2)*grho(1,k,2) * v2xc_dd + & + (grho(1,k,1)*grho(1,k,2) + & + grho(1,k,2)*grho(1,k,1)) * v2c_ud(k) * e2 + sigma_gc21 = sigma_gc21 + grho(2,k,1)*grho(1,k,1) * v2xc_uu + & + grho(2,k,2)*grho(1,k,2) * v2xc_dd + & + (grho(2,k,1)*grho(1,k,2) + & + grho(1,k,2)*grho(2,k,1)) * v2c_ud(k) * e2 + sigma_gc22 = sigma_gc22 + grho(2,k,1)*grho(2,k,1) * v2xc_uu + & + grho(2,k,2)*grho(2,k,2) * v2xc_dd + & + (grho(2,k,1)*grho(2,k,2) + & + grho(2,k,2)*grho(2,k,1)) * v2c_ud(k) * e2 + sigma_gc31 = sigma_gc31 + grho(3,k,1)*grho(1,k,1) * v2xc_uu + & + grho(3,k,2)*grho(1,k,2) * v2xc_dd + & + (grho(3,k,1)*grho(1,k,2) + & + grho(1,k,2)*grho(3,k,1)) * v2c_ud(k) * e2 + sigma_gc32 = sigma_gc32 + grho(3,k,1)*grho(2,k,1) * v2xc_uu + & + grho(3,k,2)*grho(2,k,2) * v2xc_dd + & + (grho(3,k,1)*grho(2,k,2) + & + grho(2,k,2)*grho(3,k,1)) * v2c_ud(k) * e2 + sigma_gc33 = sigma_gc33 + grho(3,k,1)*grho(3,k,1) * v2xc_uu + & + grho(3,k,2)*grho(3,k,2) * v2xc_dd + & + (grho(3,k,1)*grho(3,k,2) + & + grho(3,k,2)*grho(3,k,1)) * v2c_ud(k) * e2 ENDDO ! + !$acc end data DEALLOCATE( v2c_ud ) ! ENDIF ! ENDIF ! + sigma_gradcorr(1,1) = sigma_gc11 + sigma_gradcorr(2,1) = sigma_gc21 + sigma_gradcorr(2,2) = sigma_gc22 + sigma_gradcorr(3,1) = sigma_gc31 + sigma_gradcorr(3,2) = sigma_gc32 + sigma_gradcorr(3,3) = sigma_gc33 + ! + !$acc end data + !$acc end data DEALLOCATE( sc, sx ) - DEALLOCATE( v1c, v2c, v3c ) - DEALLOCATE( v1x, v2x, v3x ) - DEALLOCATE( grho2 ) + DEALLOCATE( v1c, v2c ) + DEALLOCATE( v1x, v2x ) + DEALLOCATE( grho, grho2 ) DEALLOCATE( rhoaux ) - DEALLOCATE( grho ) + IF (xclib_dft_is('meta')) DEALLOCATE( kedtaue2, v2cm, v3x, v3c ) ! DO l = 1, 3 DO m = 1, l - 1 @@ -191,6 +280,8 @@ SUBROUTINE stres_gradcorr( rho, rhog, rho_core, rhog_core, kedtau, nspin, & ! sigmaxc(:,:) = sigmaxc(:,:) + sigma_gradcorr(:,:) / (nr1 * nr2 * nr3) ! + !$acc end data + ! RETURN ! END SUBROUTINE stres_gradcorr diff --git a/PW/src/stres_har.f90 b/PW/src/stres_har.f90 index 0a6719cc5..ffb1fd44a 100644 --- a/PW/src/stres_har.f90 +++ b/PW/src/stres_har.f90 @@ -11,18 +11,18 @@ SUBROUTINE stres_har( sigmahar ) !-------------------------------------------------------------------- !! Calculates the Hartree contribution to the stress ! - USE kinds, ONLY: DP - USE constants, ONLY: e2, fpi - USE cell_base, ONLY: omega, tpiba2 - USE ener, ONLY: ehart - USE fft_base, ONLY: dfftp - USE fft_rho, ONLY: rho_r2g - USE gvect, ONLY: ngm, gstart, g, gg - USE scf, ONLY: rho - USE control_flags, ONLY: gamma_only - USE mp_bands, ONLY: intra_bgrp_comm - USE mp, ONLY: mp_sum - USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar + USE kinds, ONLY: DP + USE constants, ONLY: e2, fpi + USE cell_base, ONLY: omega, tpiba2 + USE ener, ONLY: ehart + USE fft_base, ONLY: dfftp + USE fft_rho, ONLY: rho_r2g + USE gvect, ONLY: ngm, gstart, g, gg + USE scf, ONLY: rho + USE control_flags, ONLY: gamma_only + USE mp_bands, ONLY: intra_bgrp_comm + USE mp, ONLY: mp_sum + USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar ! IMPLICIT NONE ! @@ -31,58 +31,86 @@ SUBROUTINE stres_har( sigmahar ) ! ! ... local variables ! - INTEGER :: ig, l, m REAL(DP) :: shart, g2 - REAL(DP), PARAMETER :: eps = 1.d-8 COMPLEX(DP), ALLOCATABLE :: rhog(:,:) - ! - sigmahar(:,:) = 0.0_DP + REAL(DP), PARAMETER :: eps = 1.E-8_DP + INTEGER :: ig, l, m + REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, & + sigmahar32, sigmahar22, sigmahar33 ! ALLOCATE( rhog(dfftp%nnr,1) ) + !$acc data create(rhog) ! CALL rho_r2g( dfftp, rho%of_r(:,1), rhog ) ! - ! rhog contains now the charge density in G space - ! the G=0 component is not computed - IF (do_cutoff_2D) THEN - CALL cutoff_stres_sigmahar( rhog, sigmahar ) - ELSE - DO ig = gstart, ngm - g2 = gg(ig) * tpiba2 - shart = rhog(ig,1) * CONJG(rhog(ig,1)) / g2 - DO l = 1, 3 - DO m = 1, l - sigmahar(l,m) = sigmahar(l,m) + shart * tpiba2 * 2 * & - g(l,ig) * g(m,ig) / g2 - ENDDO - ENDDO - ENDDO - ENDIF + ! ... the G=0 component is not computed ! + sigmahar(:,:) = 0.0_DP + ! + IF (do_cutoff_2D) THEN + ! + CALL cutoff_stres_sigmahar( rhog(:,1), sigmahar ) + ! + ELSE + ! + sigmahar11 = 0._DP ; sigmahar31 = 0._DP + sigmahar21 = 0._DP ; sigmahar32 = 0._DP + sigmahar22 = 0._DP ; sigmahar33 = 0._DP + ! + !$acc parallel loop reduction(+:sigmahar11,sigmahar21,sigmahar22,& + !$acc& sigmahar31,sigmahar32,sigmahar33) + DO ig = gstart, ngm + ! + g2 = gg(ig) + ! + shart = DBLE(rhog(ig,1)*CONJG(rhog(ig,1))) / g2 + ! + sigmahar11 = sigmahar11 + shart *2._DP * & + g(1,ig) * g(1,ig) / g2 + sigmahar21 = sigmahar21 + shart *2._DP * & + g(2,ig) * g(1,ig) / g2 + sigmahar22 = sigmahar22 + shart *2._DP * & + g(2,ig) * g(2,ig) / g2 + sigmahar31 = sigmahar31 + shart *2._DP * & + g(3,ig) * g(1,ig) / g2 + sigmahar32 = sigmahar32 + shart *2._DP * & + g(3,ig) * g(2,ig) / g2 + sigmahar33 = sigmahar33 + shart *2._DP * & + g(3,ig) * g(3,ig) / g2 + ENDDO + ! + sigmahar(1,1) = sigmahar(1,1) + sigmahar11 / tpiba2 + sigmahar(2,1) = sigmahar(2,1) + sigmahar21 / tpiba2 + sigmahar(2,2) = sigmahar(2,2) + sigmahar22 / tpiba2 + sigmahar(3,1) = sigmahar(3,1) + sigmahar31 / tpiba2 + sigmahar(3,2) = sigmahar(3,2) + sigmahar32 / tpiba2 + sigmahar(3,3) = sigmahar(3,3) + sigmahar33 / tpiba2 + ! + ENDIF + ! + !$acc end data DEALLOCATE( rhog ) ! CALL mp_sum( sigmahar, intra_bgrp_comm ) ! IF (gamma_only) THEN - sigmahar(:,:) = fpi * e2 * sigmahar(:,:) + sigmahar(:,:) = fpi * e2 * sigmahar(:,:) ELSE - sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP + sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP ENDIF ! DO l = 1, 3 - sigmahar(l,l) = sigmahar(l,l) - ehart / omega + sigmahar(l,l) = sigmahar(l,l) - ehart / omega ENDDO ! DO l = 1, 3 - DO m = 1, l-1 - sigmahar(m,l) = sigmahar(l,m) - ENDDO + DO m = 1, l-1 + sigmahar(m,l) = sigmahar(l,m) + ENDDO ENDDO ! sigmahar(:,:) = -sigmahar(:,:) ! - ! RETURN ! END SUBROUTINE stres_har - diff --git a/PW/src/stres_har_gpu.f90 b/PW/src/stres_har_gpu.f90 deleted file mode 100644 index e2ed99738..000000000 --- a/PW/src/stres_har_gpu.f90 +++ /dev/null @@ -1,122 +0,0 @@ -! -! Copyright (C) 2001-2007 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!---------------------------------------------------------------------- -SUBROUTINE stres_har_gpu( sigmahar ) - !-------------------------------------------------------------------- - !! Calculates the Hartree contribution to the stress - ! - USE kinds, ONLY: DP - USE constants, ONLY: e2, fpi - USE cell_base, ONLY: omega, tpiba2 - USE ener, ONLY: ehart - USE fft_base, ONLY: dfftp - USE fft_rho, ONLY: rho_r2g - USE gvect, ONLY: ngm, gstart - USE scf, ONLY: rho - USE control_flags, ONLY: gamma_only - USE mp_bands, ONLY: intra_bgrp_comm - USE mp, ONLY: mp_sum - USE Coul_cut_2D, ONLY: do_cutoff_2D, cutoff_stres_sigmahar_gpu - ! - USE gvect, ONLY: g_d, gg_d - ! - IMPLICIT NONE - ! - REAL(DP) :: sigmahar(3,3) - !! Hartree term of the stress tensor - ! - ! ... local variables - ! - REAL(DP) :: shart, g2 - REAL(DP), PARAMETER :: eps = 1.E-8_DP - INTEGER :: ig, l, m - ! - REAL(DP) :: sigmahar11, sigmahar31, sigmahar21, & - sigmahar32, sigmahar22, sigmahar33 - COMPLEX(DP), ALLOCATABLE :: rhog(:,:) - ! - sigmahar(:,:) = 0.0_DP - ! - ALLOCATE( rhog(dfftp%nnr,1) ) - !$acc data create(rhog) - ! - CALL rho_r2g( dfftp, rho%of_r(:,1), rhog ) - ! - ! rhog contains now the charge density in G space - ! the G=0 component is not computed - ! - IF (do_cutoff_2D) THEN - ! - CALL cutoff_stres_sigmahar_gpu( rhog, sigmahar ) - ! - ELSE - ! - sigmahar11 = 0._DP ; sigmahar31 = 0._DP - sigmahar21 = 0._DP ; sigmahar32 = 0._DP - sigmahar22 = 0._DP ; sigmahar33 = 0._DP - ! - !$acc parallel loop reduction(+:sigmahar11,sigmahar21,sigmahar22, & - !$acc sigmahar31,sigmahar32,sigmahar33) - DO ig = gstart, ngm - ! - g2 = gg_d(ig) - ! - shart = DBLE(rhog(ig,1)*CONJG(rhog(ig,1))) / g2 - ! - sigmahar11 = sigmahar11 + shart *2._DP * & - g_d(1,ig) * g_d(1,ig) / g2 - sigmahar21 = sigmahar21 + shart *2._DP * & - g_d(2,ig) * g_d(1,ig) / g2 - sigmahar22 = sigmahar22 + shart *2._DP * & - g_d(2,ig) * g_d(2,ig) / g2 - sigmahar31 = sigmahar31 + shart *2._DP * & - g_d(3,ig) * g_d(1,ig) / g2 - sigmahar32 = sigmahar32 + shart *2._DP * & - g_d(3,ig) * g_d(2,ig) / g2 - sigmahar33 = sigmahar33 + shart *2._DP * & - g_d(3,ig) * g_d(3,ig) / g2 - ENDDO - ! - sigmahar(1,1) = sigmahar(1,1) + sigmahar11 / tpiba2 - sigmahar(2,1) = sigmahar(2,1) + sigmahar21 / tpiba2 - sigmahar(2,2) = sigmahar(2,2) + sigmahar22 / tpiba2 - sigmahar(3,1) = sigmahar(3,1) + sigmahar31 / tpiba2 - sigmahar(3,2) = sigmahar(3,2) + sigmahar32 / tpiba2 - sigmahar(3,3) = sigmahar(3,3) + sigmahar33 / tpiba2 - ! - ENDIF - ! - !$acc end data - DEALLOCATE( rhog ) - ! - CALL mp_sum( sigmahar, intra_bgrp_comm ) - ! - IF (gamma_only) THEN - sigmahar(:,:) = fpi * e2 * sigmahar(:,:) - ELSE - sigmahar(:,:) = fpi * e2 * sigmahar(:,:) * 0.5_DP - ENDIF - ! - DO l = 1, 3 - sigmahar(l,l) = sigmahar(l,l) - ehart / omega - ENDDO - ! - DO l = 1, 3 - DO m = 1, l-1 - sigmahar(m,l) = sigmahar(l,m) - ENDDO - ENDDO - ! - sigmahar(:,:) = -sigmahar(:,:) - ! - ! - RETURN - ! -END SUBROUTINE stres_har_gpu - diff --git a/PW/src/stres_knl.f90 b/PW/src/stres_knl.f90 index 93ccfcf14..b2e268646 100644 --- a/PW/src/stres_knl.f90 +++ b/PW/src/stres_knl.f90 @@ -9,11 +9,11 @@ !----------------------------------------------------------------------- SUBROUTINE stres_knl( sigmanlc, sigmakin ) !----------------------------------------------------------------------- - !! Computes the kinetic + nonlocal contribuition to the stress + !! Computes the kinetic + nonlocal contribuition to the stress. ! USE kinds, ONLY: DP USE constants, ONLY: pi, e2 - USE cell_base, ONLY: omega, alat, at, bg, tpiba + USE cell_base, ONLY: omega, tpiba USE gvect, ONLY: g USE gvecw, ONLY: qcutz, ecfixed, q2sigma USE klist, ONLY: nks, xk, ngk, igk_k @@ -27,76 +27,114 @@ SUBROUTINE stres_knl( sigmanlc, sigmakin ) USE mp_pools, ONLY: inter_pool_comm USE mp_bands, ONLY: intra_bgrp_comm USE mp, ONLY: mp_sum - USE wavefunctions_gpum, ONLY: using_evc +#if defined(__CUDA) + USE wavefunctions_gpum, ONLY: using_evc, using_evc_d +#endif ! IMPLICIT NONE ! - REAL(DP) :: sigmanlc (3, 3) + REAL(DP) :: sigmanlc(3,3) !! non-local contribution to stress - REAL(DP) :: sigmakin (3, 3) + REAL(DP) :: sigmakin(3,3) !! kinetic contribution to stress ! ! ... local variables ! - REAL(DP), ALLOCATABLE :: gk (:,:), kfac (:) - REAL(DP) :: twobysqrtpi, gk2, arg - INTEGER :: npw, ik, l, m, i, ibnd, is + REAL(DP), ALLOCATABLE :: gk(:,:), kfac(:) + REAL(DP) :: twobysqrtpi, gk2, arg, s11, s21, s31, s22, s32, s33, & + xk1, xk2, xk3, tmpf, wg_nk + INTEGER :: npw, ik, l, m, i, ibnd ! +#if defined(__CUDA) CALL using_evc(0) - ALLOCATE( gk(npwx,3) ) - ALLOCATE( kfac(npwx) ) + CALL using_evc_d(0) +#endif ! - sigmanlc(:,:) = 0.d0 - sigmakin(:,:) = 0.d0 - twobysqrtpi = 2.d0/SQRT(pi) + !$acc enter data create( evc ) ! - kfac(:) = 1.d0 + ALLOCATE( gk(npwx,3), kfac(npwx) ) + !$acc data create( gk, kfac ) + !$acc data copyin( wg ) + ! + sigmanlc(:,:) = 0._DP + sigmakin(:,:) = 0._DP + twobysqrtpi = 2._DP/SQRT(pi) + ! + !$acc kernels + kfac(:) = 1._DP + !$acc end kernels + ! + s11 = 0._DP ; s22 = 0._DP + s21 = 0._DP ; s32 = 0._DP + s31 = 0._DP ; s33 = 0._DP ! DO ik = 1, nks - IF ( nks > 1 ) CALL get_buffer( evc, nwordwfc, iunwfc, ik ) - if ( nks > 1 ) CALL using_evc(2) + IF ( nks > 1 ) THEN + CALL get_buffer( evc, nwordwfc, iunwfc, ik ) +#if defined(__CUDA) + CALL using_evc(2) + CALL using_evc_d(0) +#endif + ENDIF + ! npw = ngk(ik) + ! + xk1 = xk(1,ik) + xk2 = xk(2,ik) + xk3 = xk(3,ik) + ! + !$acc parallel loop DO i = 1, npw - gk(i,1) = ( xk(1,ik) + g(1,igk_k(i,ik)) ) * tpiba - gk(i,2) = ( xk(2,ik) + g(2,igk_k(i,ik)) ) * tpiba - gk(i,3) = ( xk(3,ik) + g(3,igk_k(i,ik)) ) * tpiba - IF (qcutz > 0.d0) THEN + gk(i,1) = ( xk1 + g(1,igk_k(i,ik)) ) * tpiba + gk(i,2) = ( xk2 + g(2,igk_k(i,ik)) ) * tpiba + gk(i,3) = ( xk3 + g(3,igk_k(i,ik)) ) * tpiba + IF (qcutz > 0._DP) THEN gk2 = gk(i,1)**2 + gk(i,2)**2 + gk(i,3)**2 arg = ( (gk2-ecfixed)/q2sigma )**2 - kfac(i) = 1.d0 + qcutz / q2sigma * twobysqrtpi * EXP(-arg) + kfac(i) = 1._DP + qcutz / q2sigma * twobysqrtpi * EXP(-arg) ENDIF ENDDO ! ! ... kinetic contribution ! - DO l = 1, 3 - DO m = 1, l - DO ibnd = 1, nbnd - DO i = 1, npw - IF (noncolin) THEN - sigmakin(l,m) = sigmakin(l,m) + wg(ibnd,ik) * & - gk(i,l) * gk(i, m) * kfac(i) * & - ( DBLE (CONJG(evc( i ,ibnd))*evc( i ,ibnd)) + & - DBLE (CONJG(evc(i+npwx,ibnd))*evc(i+npwx,ibnd))) - ELSE - sigmakin(l,m) = sigmakin(l,m) + wg(ibnd,ik) * & - gk(i,l) * gk(i, m) * kfac(i) * & - DBLE (CONJG(evc(i, ibnd) ) * evc(i, ibnd) ) - ENDIF - ENDDO - ENDDO + !$acc update device(evc) + ! + !$acc parallel loop collapse(2) reduction(+:s11,s21,s31,s22,s32,s33) + DO ibnd = 1, nbnd + DO i = 1, npw + wg_nk = wg(ibnd,ik) + IF (noncolin) THEN + tmpf = wg_nk * kfac(i) * & + ( DBLE(CONJG(evc( i ,ibnd))*evc( i ,ibnd)) + & + DBLE(CONJG(evc(i+npwx,ibnd))*evc(i+npwx,ibnd)) ) + ELSE + tmpf = wg_nk * kfac(i) * & + DBLE( CONJG(evc(i,ibnd) ) * evc(i,ibnd) ) + ENDIF + s11 = s11 + tmpf * gk(i,1)*gk(i,1) + s21 = s21 + tmpf * gk(i,2)*gk(i,1) + s31 = s31 + tmpf * gk(i,3)*gk(i,1) + s22 = s22 + tmpf * gk(i,2)*gk(i,2) + s32 = s32 + tmpf * gk(i,3)*gk(i,2) + s33 = s33 + tmpf * gk(i,3)*gk(i,3) ENDDO - ! ENDDO ! - ! ... contribution from the nonlocal part + ! ... contribution from the nonlocal part ! CALL stres_us( ik, gk, sigmanlc ) ! ENDDO ! - DEALLOCATE( kfac ) - DEALLOCATE( gk ) + sigmakin(:,1) = [s11, s21, s31] + sigmakin(:,2) = [0._DP,s22, s32] + sigmakin(:,3) = [0._DP,0._DP,s33] + ! + !$acc end data + !$acc end data + DEALLOCATE( gk, kfac ) + ! + !$acc exit data delete(evc) ! ! ... the kinetic term must be summed over PW's and over k-points ! @@ -121,11 +159,11 @@ SUBROUTINE stres_knl( sigmanlc, sigmakin ) ENDDO ! IF ( gamma_only ) THEN - sigmakin(:,:) = 2.d0 * e2 / omega * sigmakin(:,:) + sigmakin(:,:) = 2._DP * e2 / omega * sigmakin(:,:) ELSE sigmakin(:,:) = e2 / omega * sigmakin(:,:) ENDIF - sigmanlc(:,:) = -1.d0 / omega * sigmanlc(:,:) + sigmanlc(:,:) = -1._DP / omega * sigmanlc(:,:) ! ! ... symmetrize stress ! diff --git a/PW/src/stres_knl_gpu.f90 b/PW/src/stres_knl_gpu.f90 deleted file mode 100644 index 7b8893ba0..000000000 --- a/PW/src/stres_knl_gpu.f90 +++ /dev/null @@ -1,168 +0,0 @@ -! -! Copyright (C) 2001-2007 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!----------------------------------------------------------------------- -SUBROUTINE stres_knl_gpu( sigmanlc, sigmakin ) - !----------------------------------------------------------------------- - !! Computes the kinetic + nonlocal contribuition to the stress - ! - USE kinds, ONLY: DP - USE constants, ONLY: pi, e2 - USE cell_base, ONLY: omega, alat, at, bg, tpiba - USE gvect, ONLY: g, g_d - USE gvecw, ONLY: qcutz, ecfixed, q2sigma - USE klist, ONLY: nks, xk, ngk, igk_k_d - USE io_files, ONLY: iunwfc, nwordwfc - USE buffers, ONLY: get_buffer - USE symme, ONLY: symmatrix - USE wvfct, ONLY: npwx, nbnd, wg - USE control_flags, ONLY: gamma_only, use_gpu - USE noncollin_module, ONLY: noncolin, npol - USE wavefunctions, ONLY: evc - USE mp_pools, ONLY: inter_pool_comm - USE mp_bands, ONLY: intra_bgrp_comm - USE mp, ONLY: mp_sum -#if defined(__CUDA) - USE wavefunctions_gpum, ONLY: using_evc, using_evc_d, evc_d - USE device_fbuff_m, ONLY : dev_buf -#endif - ! - IMPLICIT NONE - ! - REAL(DP) :: sigmanlc(3,3) - !! non-local contribution to stress - REAL(DP) :: sigmakin(3,3) - !! kinetic contribution to stress - ! - ! ... local variables - ! - REAL(DP), POINTER :: gk_d(:,:), kfac_d (:) - REAL(DP) :: twobysqrtpi, gk2, arg, s11, s21, s31, s22, s32, s33, & - xk1, xk2, xk3, tmpf, wg_nk - INTEGER :: npw, ik, l, m, i, ibnd, is - INTEGER :: ierr(2) - ! -#if defined(__CUDA) - ATTRIBUTES(DEVICE) :: gk_d, kfac_d - ! - CALL using_evc(0) - CALL using_evc_d(0) - ! - CALL dev_buf%lock_buffer( gk_d, [npwx,3], ierr(1) ) - CALL dev_buf%lock_buffer( kfac_d, npwx, ierr(2) ) - ! - sigmanlc(:,:) = 0._DP - sigmakin(:,:) = 0._DP - twobysqrtpi = 2._DP/SQRT(pi) - ! - kfac_d(:) = 1._DP - ! - s11 = 0._DP ; s22 = 0._DP - s21 = 0._DP ; s32 = 0._DP - s31 = 0._DP ; s33 = 0._DP - ! - DO ik = 1, nks - IF ( nks > 1 ) THEN - CALL get_buffer( evc, nwordwfc, iunwfc, ik ) - CALL using_evc(2) - CALL using_evc_d(0) - ENDIF - ! - npw = ngk(ik) - ! - xk1 = xk(1,ik) - xk2 = xk(2,ik) - xk3 = xk(3,ik) - ! - !$cuf kernel do(1) <<<*,*>>> - DO i = 1, npw - gk_d(i,1) = ( xk1 + g_d(1,igk_k_d(i,ik)) ) * tpiba - gk_d(i,2) = ( xk2 + g_d(2,igk_k_d(i,ik)) ) * tpiba - gk_d(i,3) = ( xk3 + g_d(3,igk_k_d(i,ik)) ) * tpiba - IF (qcutz > 0._DP) THEN - gk2 = gk_d(i,1)**2 + gk_d(i,2)**2 + gk_d(i,3)**2 - arg = ( (gk2-ecfixed)/q2sigma )**2 - kfac_d(i) = 1._DP + qcutz / q2sigma * twobysqrtpi * EXP(-arg) - ENDIF - ENDDO - ! - ! ... kinetic contribution - ! - DO ibnd = 1, nbnd - wg_nk = wg(ibnd,ik) - !$cuf kernel do(1) <<<*,*>>> - DO i = 1, npw - IF (noncolin) THEN - tmpf = wg_nk * kfac_d(i) * & - ( DBLE(CONJG(evc_d( i ,ibnd))*evc_d( i ,ibnd)) + & - DBLE(CONJG(evc_d(i+npwx,ibnd))*evc_d(i+npwx,ibnd)) ) - ELSE - tmpf = wg_nk * kfac_d(i) * & - DBLE( CONJG(evc_d(i, ibnd) ) * evc_d(i, ibnd) ) - ENDIF - s11 = s11 + tmpf * gk_d(i,1)*gk_d(i,1) - s21 = s21 + tmpf * gk_d(i,2)*gk_d(i,1) - s31 = s31 + tmpf * gk_d(i,3)*gk_d(i,1) - s22 = s22 + tmpf * gk_d(i,2)*gk_d(i,2) - s32 = s32 + tmpf * gk_d(i,3)*gk_d(i,2) - s33 = s33 + tmpf * gk_d(i,3)*gk_d(i,3) - ENDDO - ENDDO - ! - ! ... contribution from the nonlocal part - ! - CALL stres_us_gpu( ik, gk_d, sigmanlc ) - ! - ENDDO - ! - sigmakin(:,1) = [s11, s21, s31] - sigmakin(:,2) = [0._DP,s22, s32] - sigmakin(:,3) = [0._DP,0._DP,s33] - ! - CALL dev_buf%release_buffer( kfac_d, ierr(2) ) - CALL dev_buf%release_buffer( gk_d, ierr(1) ) - ! - ! ... the kinetic term must be summed over PW's and over k-points - ! - CALL mp_sum( sigmakin, intra_bgrp_comm ) - CALL mp_sum( sigmakin, inter_pool_comm ) - ! - ! ... the nonlocal term is summed here only over k-points, because we add - ! ... to it the US term from augmentation charge derivatives - ! - CALL mp_sum( sigmanlc, inter_pool_comm ) - ! - ! ... add US term from augmentation charge derivatives, sum result over PW's - ! - CALL addusstress( sigmanlc ) - CALL mp_sum( sigmanlc, intra_bgrp_comm ) - ! - DO l = 1, 3 - DO m = 1, l-1 - sigmanlc(m,l) = sigmanlc(l,m) - sigmakin(m,l) = sigmakin(l,m) - ENDDO - ENDDO - ! - IF ( gamma_only ) THEN - sigmakin(:,:) = 2._DP * e2 / omega * sigmakin(:,:) - ELSE - sigmakin(:,:) = e2 / omega * sigmakin(:,:) - ENDIF - sigmanlc(:,:) = -1._DP / omega * sigmanlc(:,:) - ! - ! ... symmetrize stress - ! - CALL symmatrix( sigmakin ) - CALL symmatrix( sigmanlc ) -#endif - ! - RETURN - ! -END SUBROUTINE stres_knl_gpu - diff --git a/PW/src/stres_loc.f90 b/PW/src/stres_loc.f90 index cca8305e3..20487244e 100644 --- a/PW/src/stres_loc.f90 +++ b/PW/src/stres_loc.f90 @@ -1,4 +1,3 @@ - ! Copyright (C) 2001-2007 Quantum ESPRESSO group ! This file is distributed under the terms of the ! GNU General Public License. See the file `License' @@ -7,8 +6,9 @@ ! ! !---------------------------------------------------------------------- -subroutine stres_loc( sigmaloc ) +SUBROUTINE stres_loc( sigmaloc ) !---------------------------------------------------------------------- + !! Calculate the local term of the stress. ! USE kinds, ONLY : DP USE atom, ONLY : msh, rgrid @@ -24,88 +24,125 @@ subroutine stres_loc( sigmaloc ) USE uspp_param, ONLY : upf USE mp_bands, ONLY : intra_bgrp_comm USE mp, ONLY : mp_sum - USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc, cutoff_stres_sigmaloc + USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc, & + cutoff_stres_sigmaloc ! - implicit none + IMPLICIT NONE ! - real(DP) :: sigmaloc(3,3) - real(DP), allocatable :: dvloc(:) - complex(DP), allocatable :: rhog(:,:) - real(DP) :: evloc, fact - integer :: ng, nt, l, m + REAL(DP) :: sigmaloc(3,3) + REAL(DP), ALLOCATABLE :: dvloc(:) + COMPLEX(DP), ALLOCATABLE :: rhog(:,:) + REAL(DP) :: evloc, fact + INTEGER :: ng, nt, l, m ! counter on g vectors ! counter on atomic type ! counter on angular momentum ! counter on spin components + REAL(DP) :: sigma11, sigma21, sigma22, spart, & + sigma31, sigma32, sigma33 ! - allocate( dvloc(ngl), rhog(dfftp%nnr,1) ) + ALLOCATE( dvloc(ngl), rhog(dfftp%nnr,1) ) sigmaloc(:,:) = 0.d0 ! - call rho_r2g( dfftp, rho%of_r(:,1), rhog ) + !$acc data create(rhog) + CALL rho_r2g( dfftp, rho%of_r(:,1), rhog ) ! - ! rhog contains the charge density in G space - if (gamma_only) then + !$acc data copyin(vloc,strf,gl,igtongl) create(dvloc) + ! + IF (gamma_only) THEN fact = 2.d0 - else + ELSE fact = 1.d0 - endif + ENDIF + ! evloc = 0.0d0 - do nt = 1, ntyp - if (gstart==2) evloc = evloc + rhog(1,1) * strf(1,nt) * vloc(igtongl(1),nt) - do ng = gstart, ngm - evloc = evloc + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt) ) & - * vloc(igtongl(ng),nt) * fact - enddo - enddo - ! 2D: add contribution from cutoff long-range part of Vloc - IF (do_cutoff_2D) call cutoff_stres_evloc( rhog(:,1), strf, evloc ) + ! + IF (gstart==2) THEN + !$acc parallel loop reduction(+:evloc) + DO nt = 1, ntyp + evloc = evloc + rhog(1,1)*strf(1,nt)*vloc(igtongl(1),nt) + ENDDO + ENDIF + ! + !$acc parallel loop collapse(2) reduction(+:evloc) + DO nt = 1, ntyp + DO ng = gstart, ngm + evloc = evloc + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt)) & + * vloc(igtongl(ng),nt) * fact + ENDDO + ENDDO + ! + ! ... 2D: add contribution from cutoff long-range part of Vloc + IF (do_cutoff_2D) CALL cutoff_stres_evloc( rhog(:,1), strf, evloc ) ! ! WRITE( 6,*) ' evloc ', evloc, evloc*omega ! DEBUG ! - do nt = 1, ntyp + DO nt = 1, ntyp + ! IF ( upf(nt)%is_gth ) THEN ! ! special case: GTH pseudopotential ! - call dvloc_gth( nt, upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc ) + CALL dvloc_gth( nt, upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc ) ! ELSEIF ( upf(nt)%tcoulombp ) THEN ! ! special case: pseudopotential is coulomb 1/r potential ! - call dvloc_coul( upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc ) + CALL dvloc_coul( upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc ) ! ELSE ! ! normal case: dvloc contains dV_loc(G)/dG ! - call dvloc_of_g( rgrid(nt)%mesh, msh(nt), rgrid(nt)%rab, rgrid(nt)%r, & - upf(nt)%vloc(:), upf(nt)%zp, tpiba2, ngl, gl, omega, dvloc ) + CALL dvloc_of_g( rgrid(nt)%mesh, msh(nt), rgrid(nt)%rab, rgrid(nt)%r, & + upf(nt)%vloc(:), upf(nt)%zp, tpiba2, ngl, gl, omega, & + dvloc ) ! ENDIF - ! no G=0 contribution - do ng = 1, ngm - do l = 1, 3 - do m = 1, l - sigmaloc(l,m) = sigmaloc(l,m) + DBLE(CONJG(rhog(ng,1)) * strf(ng,nt))* & - 2.0d0 * dvloc(igtongl(ng) ) * tpiba2 * & - g(l,ng) * g(m,ng) * fact - enddo - enddo - enddo - enddo - IF (do_cutoff_2D) call cutoff_stres_sigmaloc( rhog(:,1), strf, sigmaloc ) ! 2D: re-add LR Vloc to sigma here + ! ... no G=0 contribution + sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP + sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP + ! + !$acc parallel loop reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32,& + !$acc sigma33) + DO ng = 1, ngm + spart = DBLE(CONJG(rhog(ng,1)) * strf(ng,nt)) * 2.0_DP * & + dvloc(igtongl(ng)) + sigma11 = sigma11 + spart * g(1,ng) * g(1,ng) + sigma21 = sigma21 + spart * g(2,ng) * g(1,ng) + sigma22 = sigma22 + spart * g(2,ng) * g(2,ng) + sigma31 = sigma31 + spart * g(3,ng) * g(1,ng) + sigma32 = sigma32 + spart * g(3,ng) * g(2,ng) + sigma33 = sigma33 + spart * g(3,ng) * g(3,ng) + ENDDO + ! + sigmaloc(1,1) = sigmaloc(1,1) + sigma11 * fact * tpiba2 + sigmaloc(2,1) = sigmaloc(2,1) + sigma21 * fact * tpiba2 + sigmaloc(2,2) = sigmaloc(2,2) + sigma22 * fact * tpiba2 + sigmaloc(3,1) = sigmaloc(3,1) + sigma31 * fact * tpiba2 + sigmaloc(3,2) = sigmaloc(3,2) + sigma32 * fact * tpiba2 + sigmaloc(3,3) = sigmaloc(3,3) + sigma33 * fact * tpiba2 + ! + ENDDO ! - do l = 1, 3 + ! ... 2D: re-add LR Vloc to sigma here + IF (do_cutoff_2D) CALL cutoff_stres_sigmaloc( rhog(:,1), strf, sigmaloc ) + ! + !$acc end data + !$acc end data + ! + DO l = 1, 3 sigmaloc(l,l) = sigmaloc(l,l) + evloc - do m = 1, l-1 + DO m = 1, l-1 sigmaloc(m,l) = sigmaloc(l,m) - enddo - enddo + ENDDO + ENDDO ! - call mp_sum( sigmaloc, intra_bgrp_comm ) + CALL mp_sum( sigmaloc, intra_bgrp_comm ) ! - deallocate( dvloc, rhog ) - return -end subroutine stres_loc - + DEALLOCATE( dvloc, rhog ) + ! + RETURN + ! +END SUBROUTINE stres_loc diff --git a/PW/src/stres_loc_gpu.f90 b/PW/src/stres_loc_gpu.f90 deleted file mode 100644 index a3d341bf0..000000000 --- a/PW/src/stres_loc_gpu.f90 +++ /dev/null @@ -1,199 +0,0 @@ - -! Copyright (C) 2001-2007 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!---------------------------------------------------------------------- -SUBROUTINE stres_loc_gpu( sigmaloc ) - !---------------------------------------------------------------------- - ! - USE kinds, ONLY : DP - USE atom, ONLY : msh, rgrid - USE m_gth, ONLY : dvloc_gth_gpu - USE ions_base, ONLY : ntyp => nsp - USE cell_base, ONLY : omega, tpiba2 - USE fft_base, ONLY : dfftp - USE fft_rho, ONLY : rho_r2g - USE gvect, ONLY : ngm, gstart, ngl, gl, igtongl, & - g_d, gl_d, igtongl_d - USE scf, ONLY : rho - USE vlocal, ONLY : strf, vloc - USE control_flags, ONLY : gamma_only - USE uspp_param, ONLY : upf - USE mp_bands, ONLY : intra_bgrp_comm - USE mp, ONLY : mp_sum - USE Coul_cut_2D, ONLY : do_cutoff_2D, cutoff_stres_evloc_gpu, cutoff_stres_sigmaloc_gpu - ! -#if defined(__CUDA) - USE device_fbuff_m, ONLY : dev_buf - USE device_memcpy_m, ONLY : dev_memcpy -#endif - ! - implicit none - ! - REAL(DP) :: sigmaloc(3,3) - REAL(DP) :: evloc, fact - INTEGER :: ng, nt, l, m - ! counter on g vectors - ! counter on atomic type - ! counter on angular momentum - ! counter on spin components - ! - INTEGER :: ierrs(5) - REAL(DP) :: evloc_d, zp_d - REAL(DP) :: spart, sigma11, sigma21, sigma22, sigma31, sigma32, sigma33 - ! - INTEGER :: mshd - ! - INTEGER, POINTER :: nl_d(:) - REAL(DP), POINTER :: rab_d(:), r_d(:), vloc_d(:,:), dvloc_d(:), & - upfvloc_d(:) - COMPLEX(DP), POINTER :: strf_d(:,:) - COMPLEX(DP), ALLOCATABLE :: rhog(:,:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: vloc_d, strf_d, nl_d, rab_d, r_d, dvloc_d, & - upfvloc_d - ! - nl_d => dfftp%nl_d - ! - sigmaloc(:,:) = 0._DP - ! - ALLOCATE( rhog(dfftp%nnr,1) ) - !$acc data create( rhog ) - ! - CALL rho_r2g( dfftp, rho%of_r(:,1), rhog ) - ! - gl_d = gl ; igtongl_d = igtongl - ! - ! rhog contains now the charge density in G space - fact = 1._DP - IF (gamma_only) fact = 2._DP - ! - evloc = 0.0_DP - ! - CALL dev_buf%lock_buffer( strf_d, (/ ngm, ntyp /), ierrs(1) ) - CALL dev_memcpy( strf_d, strf ) - ! - CALL dev_buf%lock_buffer( vloc_d, (/ ngl, ntyp /), ierrs(2) ) - CALL dev_memcpy( vloc_d, vloc ) - ! - ! - DO nt = 1, ntyp - IF (gstart==2) THEN - !$acc kernels - evloc = evloc + rhog(1,1) * strf_d(1,nt) * vloc_d(igtongl_d(1),nt) - !$acc end kernels - ENDIF - ! - evloc_d = 0._DP - ! - !$acc parallel loop - DO ng = gstart, ngm - evloc_d = evloc_d + DBLE(CONJG(rhog(ng,1)) * strf_d(ng,nt)) & - * vloc_d(igtongl_d(ng),nt) - ENDDO - ! - evloc = evloc + evloc_d * fact - ! - ENDDO - ! - ! - CALL dev_buf%release_buffer( vloc_d, ierrs(2) ) - ! - CALL dev_buf%lock_buffer( dvloc_d, ngl, ierrs(2) ) - mshd = MAXVAL(rgrid(1:ntyp)%mesh) - CALL dev_buf%lock_buffer( rab_d, mshd, ierrs(3) ) - CALL dev_buf%lock_buffer( r_d, mshd, ierrs(4) ) - CALL dev_buf%lock_buffer( upfvloc_d, mshd, ierrs(5) ) - ! - ! - ! 2D: add contribution from cutoff long-range part of Vloc - !$acc host_data use_device(rhog) - IF (do_cutoff_2D) CALL cutoff_stres_evloc_gpu( rhog(:,1), strf_d, evloc ) - !$acc end host_data - ! - ! WRITE( 6,*) ' evloc ', evloc, evloc*omega ! DEBUG - ! - DO nt = 1, ntyp - IF ( upf(nt)%is_gth ) THEN - ! - ! special case: GTH pseudopotential - ! - CALL dvloc_gth_gpu( nt, upf(nt)%zp, tpiba2, ngl, gl_d, omega, dvloc_d ) - ! - ELSEIF ( upf(nt)%tcoulombp ) THEN - ! - ! special case: pseudopotential is coulomb 1/r potential - ! - CALL dvloc_coul_gpu( upf(nt)%zp, tpiba2, ngl, gl_d, omega, dvloc_d ) - ! - ELSE - ! - ! normal case: dvloc contains dV_loc(G)/dG - ! - ! the G=0 component is not computed - rab_d(1:msh(nt)) = rgrid(nt)%rab(1:msh(nt)) - r_d(1:msh(nt)) = rgrid(nt)%r(1:msh(nt)) - upfvloc_d(1:msh(nt)) = upf(nt)%vloc(1:msh(nt)) - zp_d = upf(nt)%zp - ! - CALL dvloc_of_g_gpu( rgrid(nt)%mesh, msh(nt), rab_d(1:rgrid(nt)%mesh), & - r_d(1:rgrid(nt)%mesh), upfvloc_d(1:rgrid(nt)%mesh), & - zp_d, tpiba2, ngl, gl_d, omega, dvloc_d ) - ! - ENDIF - ! - sigma11 = 0._DP ; sigma21 = 0._DP ; sigma22 = 0._DP - sigma31 = 0._DP ; sigma32 = 0._DP ; sigma33 = 0._DP - ! - !$acc parallel loop reduction(+:sigma11,sigma21,sigma22,sigma31,sigma32,sigma33) - DO ng = 1, ngm - spart = DBLE(CONJG(rhog(ng,1)) * strf_d(ng,nt)) * 2.0_DP *& - dvloc_d(igtongl_d(ng)) - sigma11 = sigma11 + spart * g_d(1,ng) * g_d(1,ng) - sigma21 = sigma21 + spart * g_d(2,ng) * g_d(1,ng) - sigma22 = sigma22 + spart * g_d(2,ng) * g_d(2,ng) - sigma31 = sigma31 + spart * g_d(3,ng) * g_d(1,ng) - sigma32 = sigma32 + spart * g_d(3,ng) * g_d(2,ng) - sigma33 = sigma33 + spart * g_d(3,ng) * g_d(3,ng) - ENDDO - ! - sigmaloc(1,1) = sigmaloc(1,1) + sigma11 * fact * tpiba2 - sigmaloc(2,1) = sigmaloc(2,1) + sigma21 * fact * tpiba2 - sigmaloc(2,2) = sigmaloc(2,2) + sigma22 * fact * tpiba2 - sigmaloc(3,1) = sigmaloc(3,1) + sigma31 * fact * tpiba2 - sigmaloc(3,2) = sigmaloc(3,2) + sigma32 * fact * tpiba2 - sigmaloc(3,3) = sigmaloc(3,3) + sigma33 * fact * tpiba2 - ! - ENDDO - ! - !$acc host_data use_device(rhog) - IF (do_cutoff_2D) CALL cutoff_stres_sigmaloc_gpu( rhog(:,1), strf_d, sigmaloc ) ! 2D: re-add LR Vloc to sigma here - !$acc end host_data - ! - DO l = 1, 3 - sigmaloc(l,l) = sigmaloc(l,l) + evloc - DO m = 1, l-1 - sigmaloc(m,l) = sigmaloc(l,m) - ENDDO - ENDDO - ! - CALL mp_sum( sigmaloc, intra_bgrp_comm ) - ! - !$acc end data - DEALLOCATE( rhog ) - CALL dev_buf%release_buffer( strf_d, ierrs(1) ) - CALL dev_buf%release_buffer( dvloc_d, ierrs(2) ) - CALL dev_buf%release_buffer( rab_d, ierrs(3) ) - CALL dev_buf%release_buffer( r_d, ierrs(4) ) - CALL dev_buf%release_buffer( upfvloc_d, ierrs(5) ) - ! -#endif - RETURN - ! -END SUBROUTINE stres_loc_gpu - diff --git a/PW/src/stres_mgga.f90 b/PW/src/stres_mgga.f90 index 587192739..5b4715294 100644 --- a/PW/src/stres_mgga.f90 +++ b/PW/src/stres_mgga.f90 @@ -9,16 +9,16 @@ !---------------------------------------------------------------------------- SUBROUTINE stres_mgga( sigmaxc ) !---------------------------------------------------------------------------- - ! - ! Analytic stress tensor contribution from metagga is added to sigmaxc + !! Analytic stress tensor contribution from metagga is added to sigmaxc. ! USE kinds, ONLY : DP USE control_flags, ONLY : gamma_only USE noncollin_module, ONLY : noncolin - USE cell_base, ONLY : alat, at, bg, omega, tpiba + USE cell_base, ONLY : omega USE gvect, ONLY : g USE scf, ONLY : rho, v USE wavefunctions, ONLY : evc + USE wavefunctions_gpum, ONLY : using_evc USE xc_lib, ONLY : xclib_dft_is USE klist, ONLY : nks, xk, ngk USE buffers, ONLY : get_buffer @@ -26,74 +26,75 @@ SUBROUTINE stres_mgga( sigmaxc ) USE wvfct, ONLY : nbnd, npwx, wg USE lsda_mod, ONLY : lsda, nspin, current_spin, isk USE fft_interfaces, ONLY : fwfft, invfft - USE fft_base, ONLY : dfftp, dffts + USE fft_base, ONLY : dffts USE mp, ONLY : mp_sum USE mp_pools, ONLY : inter_pool_comm USE mp_bands, ONLY : intra_bgrp_comm - USE wavefunctions_gpum, ONLY : using_evc ! IMPLICIT NONE ! - REAL(DP), INTENT(INOUT) :: sigmaxc(3,3) + REAL(DP), INTENT(INOUT) :: sigmaxc(3,3) ! - ! Internal variables + ! ... local variables ! - INTEGER :: ix, iy, ir, iss, ipol, incr, ibnd, ik, npw - INTEGER :: ipol2xy(3,3) - !! ipol2xy(i,j) = ipol2x(j,i) is a collapsed symmetric index - DATA ipol2xy / 1, 2, 3, 2, 4, 5, 3, 5, 6/ - REAL(DP), PARAMETER :: epsr = 1.0d-6, epsg = 1.0d-10, e2 = 2.d0 - COMPLEX(DP), ALLOCATABLE :: gradwfc (:,:), crosstaus(:,:,:) - REAL(DP) :: w1, w2, delta, sigma_mgga(3,3) + INTEGER :: ix, iy, K, ir, ipol, iss, incr, ibnd, ik, npw ! - if ( .not. xclib_dft_is('meta') ) return + REAL(DP), PARAMETER :: epsr = 1.E-6_DP, epsg = 1.E-10_DP, e2 = 2._DP ! - current_spin=1 + COMPLEX(DP), ALLOCATABLE :: gradwfc(:,:), crosstaus(:,:,:) + REAL(DP), ALLOCATABLE :: vkin(:), rhokin(:) + REAL(DP) :: sigma1, sigma2, sigma3, & + sigma4, sigma5, sigma6 + ! + REAL(DP) :: w1, w2, delta, sigma_mgga(3,3) ! ! - ! Stop if something is not yet implemented + IF ( .NOT. xclib_dft_is('meta') ) RETURN ! - if (noncolin) call errore('stres_mgga', & - 'noncollinear stress + meta-GGA not implemented',1) + CALL using_evc(1) ! - ! Initialization of a set of variables + current_spin = 1 ! - allocate (gradwfc( dffts%nnr, 3)) - allocate (crosstaus( dffts%nnr,6,nspin)) + IF ( noncolin ) CALL errore( 'stres_mgga', 'noncollinear stress + meta-GGA & + ¬ implemented', 1 ) ! - ! For gamma_only efficiency + ALLOCATE( gradwfc(dffts%nnr,3) ) + ALLOCATE( crosstaus(dffts%nnr,6,nspin) ) + !$acc data create(crosstaus) + !$acc data create(gradwfc) ! - incr=1 - IF ( gamma_only ) incr=2 + !$acc kernels + crosstaus = 0.d0 + !$acc end kernels ! - crosstaus(:,:,:) = 0.d0 - gradwfc(:,:) = 0.d0 + ! ... For gamma_only efficiency ! - ! Loop over the k points + incr = 1 + IF ( gamma_only ) incr = 2 + ! + ! ... Loop over the k points ! k_loop: DO ik = 1, nks - ! ! IF ( lsda ) current_spin = isk(ik) ! npw = ngk(ik) ! - ! Read the wavefunctions + ! ... Read the wavefunctions ! IF ( nks > 1 ) THEN - ! - CALL get_buffer ( evc, nwordwfc, iunwfc, ik ) - CALL using_evc(2) - ! - END IF + CALL get_buffer( evc, nwordwfc, iunwfc, ik ) + !$acc update device(evc) + ENDIF ! - do ibnd = 1, nbnd, incr + ! + DO ibnd = 1, nbnd, incr ! - ! w1, w2: weights for each k point and band + ! ... w1, w2: weights for each k point and band ! w1 = wg(ibnd,ik) / omega ! - IF ( (ibnd < nbnd) .and. (gamma_only) ) THEN + IF ( (ibnd < nbnd) .AND. (gamma_only) ) THEN ! ! ... two ffts at the same time ! @@ -103,165 +104,223 @@ SUBROUTINE stres_mgga( sigmaxc ) ! w2 = w1 ! - END IF + ENDIF + ! + ! ... Gradient of the wavefunction in real space ! - ! Gradient of the wavefunction in real space - ! CALL wfc_gradient( ibnd, ik, npw, gradwfc ) ! - ! Cross terms of kinetic energy density + ! ... Cross terms of kinetic energy density ! - do ix=1,3 - ! - do iy=1,ix - ! - ipol = ipol2xy(iy,ix) - ! - do ir=1,dffts%nnr - ! - crosstaus(ir,ipol,current_spin) = crosstaus(ir,ipol,current_spin) +& - 2.0_DP*w1*DBLE(gradwfc(ir,ix))*DBLE(gradwfc(ir,iy)) +& - 2.0_DP*w2*AIMAG(gradwfc(ir,ix))*AIMAG(gradwfc(ir,iy)) - ! - end do - ! - end do - ! - end do + !$acc parallel loop collapse(2) + DO ir = 1, dffts%nnr + DO ipol = 1, 6 + ! + ! ... explanation here: https://stackoverflow.com/a/244550 + ! + ! M*(M+1)/ 2 + K = (3.0*4.0)/2.0 - 1 - (ipol - 1) + K = FLOOR((SQRT(FLOAT(8*K+1))-1)/2) + ix = (ipol-1) - (3.0*4.0)/2.0 + (K+1)*(K+2)/2.0 + 1 + (2-K) + iy = 3 - K + ! + crosstaus(ir,ipol,current_spin) = crosstaus(ir,ipol,current_spin) + & + 2.0_DP*w1*DBLE(gradwfc(ir,ix))*DBLE(gradwfc(ir,iy)) + & + 2.0_DP*w2*AIMAG(gradwfc(ir,ix))*AIMAG(gradwfc(ir,iy)) + ENDDO + ENDDO ! - end do + ENDDO !ibnd ! - END DO k_loop + ENDDO k_loop ! - call mp_sum( crosstaus, inter_pool_comm ) + !$acc end data + DEALLOCATE( gradwfc ) ! - ! gradwfc not used anymore + !$acc host_data use_device(crosstaus) + CALL mp_sum( crosstaus, inter_pool_comm ) + !$acc end host_data ! - deallocate (gradwfc) + ALLOCATE( vkin(dffts%nnr) ) + ALLOCATE( rhokin(dffts%nnr) ) + !$acc data create(vkin,rhokin) ! - sigma_mgga(:,:) = 0.D0 + ! ... metagga contribution to the stress tensor + sigma_mgga(:,:) = 0._DP ! - ! metagga contribution to the stress tensor + sigma1 = 0.d0 ; sigma4 = 0.d0 + sigma2 = 0.d0 ; sigma5 = 0.d0 + sigma3 = 0.d0 ; sigma6 = 0.d0 ! - do iss=1,nspin + DO iss = 1, nspin ! - do ix=1,3 - ! - do iy=1,3 - ! - delta=0. - if (ix==iy) delta=1. - ! - do ir=1,dffts%nnr - ! - sigma_mgga(ix,iy) = sigma_mgga(ix,iy) + v%kin_r(ir,iss) & - * ( rho%kin_r(ir,iss) * delta & - + crosstaus(ir,ipol2xy(ix,iy),iss) ) - ! - end do - ! - ! - end do - ! - end do + vkin = v%kin_r(:,iss) + rhokin = rho%kin_r(:,iss) + !$acc update device(vkin,rhokin) ! - end do - deallocate( crosstaus ) + !$acc parallel loop reduction(+:sigma1,sigma2,sigma3,sigma4,sigma5,sigma6) + DO ir = 1, dffts%nnr + ! + sigma1 = sigma1 + vkin(ir) * ( rhokin(ir) & + + DBLE(crosstaus(ir,1,iss)) ) + sigma2 = sigma2 + vkin(ir) * DBLE(crosstaus(ir,2,iss)) + sigma3 = sigma3 + vkin(ir) * DBLE(crosstaus(ir,3,iss)) + sigma4 = sigma4 + vkin(ir) * ( rhokin(ir) & + + DBLE(crosstaus(ir,4,iss)) ) + sigma5 = sigma5 + vkin(ir) * DBLE(crosstaus(ir,5,iss)) + sigma6 = sigma6 + vkin(ir) * ( rhokin(ir) & + + DBLE(crosstaus(ir,6,iss)) ) + ! + ENDDO + ! + ENDDO + ! + sigma_mgga(1,1) = sigma1 ; sigma_mgga(2,3) = sigma5 + sigma_mgga(1,2) = sigma2 ; sigma_mgga(3,1) = sigma3 + sigma_mgga(1,3) = sigma3 ; sigma_mgga(3,2) = sigma5 + sigma_mgga(2,1) = sigma2 ; sigma_mgga(3,3) = sigma6 + sigma_mgga(2,2) = sigma4 + ! + !$acc end data + !$acc end data + DEALLOCATE( vkin, rhokin ) + DEALLOCATE( crosstaus ) + ! + CALL mp_sum( sigma_mgga, intra_bgrp_comm ) ! - call mp_sum( sigma_mgga, intra_bgrp_comm ) sigmaxc(:,:) = sigmaxc(:,:) + sigma_mgga(:,:) / & (dffts%nr1 * dffts%nr2 * dffts%nr3) ! - return - ! + RETURN + ! END SUBROUTINE stres_mgga - -SUBROUTINE wfc_gradient ( ibnd, ik, npw, gradpsi ) +! +! +!---------------------------------------------------------- +SUBROUTINE wfc_gradient( ibnd, ik, npw, gradpsi ) + !---------------------------------------------------------- + !! Returns the gradient of the wavefunction in real space. + !! Slightly adapted from sum_bands.f90 ! - ! Returns the gradient of the wavefunction in real space - ! Slightly adapted from sum_bands.f90 - ! - USE kinds, ONLY : DP - USE control_flags, ONLY : gamma_only - USE wavefunctions, ONLY : psic, evc - USE wvfct, ONLY : npwx, nbnd - USE cell_base, ONLY : omega, tpiba - USE klist, ONLY : xk, igk_k - USE gvect, ONLY : g - USE fft_base, ONLY : dffts - USE fft_interfaces, ONLY : invfft - USE wavefunctions_gpum, ONLY : using_evc + USE kinds, ONLY: DP + USE control_flags, ONLY: gamma_only + USE wvfct, ONLY: npwx, nbnd + USE cell_base, ONLY: omega, tpiba + USE klist, ONLY: xk, igk_k + USE wavefunctions, ONLY: psic, evc + USE fft_base, ONLY: dffts + USE fft_interfaces, ONLY: invfft + USE gvect, ONLY: ngm, g ! IMPLICIT NONE ! - INTEGER :: ibnd, ik, npw - COMPLEX(DP) :: gradpsi(dffts%nnr,3) + INTEGER, INTENT(IN) :: ibnd, ik, npw + COMPLEX(DP), INTENT(OUT) :: gradpsi(dffts%nnr,3) ! - ! Internal variables + ! ... local variables ! - REAL(DP) :: kplusg(npwx) - INTEGER :: ipol + INTEGER :: ipol, j + REAL(DP) :: kplusg + INTEGER, ALLOCATABLE :: nld(:), nlmd(:) + REAL(DP) :: xki(3) ! - CALL using_evc(0) + !$acc data present(gradpsi) copyin(evc) create(psic) ! - ! Compute the gradient of the wavefunction in reciprocal space + ALLOCATE( nld(npw) ) + nld = dffts%nl + ! + xki(1:3) = xk(1:3,ik) + ! + ! ... Compute the gradient of the wavefunction in reciprocal space ! IF ( gamma_only ) THEN ! - DO ipol=1,3 + ALLOCATE( nlmd(npw) ) + nlmd = dffts%nlm + !$acc data copyin(xki,nld,nlmd) + ! + DO ipol = 1, 3 ! - psic(:) = ( 0.D0, 0.D0 ) - ! - kplusg (1:npw) = (xk(ipol,ik)+g(ipol,igk_k(1:npw,ik))) * tpiba + !$acc kernels + psic(:) = (0._DP,0._DP) + !$acc end kernels ! IF ( ibnd < nbnd ) THEN ! ! ... two ffts at the same time ! - psic(dffts%nl(1:npw)) = CMPLX(0d0, kplusg(1:npw),kind=DP)* & - ( evc(1:npw,ibnd) + & - ( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1) ) - ! - psic(dffts%nlm(1:npw)) = CMPLX(0d0,-kplusg(1:npw),kind=DP) * & - CONJG( evc(1:npw,ibnd) - & - ( 0.D0, 1.D0 ) * evc(1:npw,ibnd+1) ) + !$acc parallel loop + DO j = 1, npw + kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba + ! + psic(nld(j)) = CMPLX(0._DP, kplusg, kind=DP) * & + ( evc(j,ibnd) + (0._DP,1._DP) * evc(j,ibnd+1) ) + ! + psic(nlmd(j)) = CMPLX(0._DP,-kplusg, kind=DP) * & + CONJG( evc(j,ibnd) - (0._DP,1._DP) * evc(j,ibnd+1) ) + ENDDO ! ELSE ! - psic(dffts%nl(1:npw)) = CMPLX(0d0, kplusg(1:npw),kind=DP)* & - evc(1:npw,ibnd) + !$acc parallel loop + DO j = 1, npw + kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba + ! + psic(nld(j)) = CMPLX(0._DP, kplusg, kind=DP) * evc(j,ibnd) + ! + psic(nlmd(j)) = CMPLX(0._DP,-kplusg,kind=DP) * CONJG(evc(j,ibnd)) + ENDDO ! - psic(dffts%nlm(1:npw)) = CMPLX(0d0,-kplusg(1:npw),kind=DP) * & - CONJG( evc(1:npw,ibnd) ) - ! - END IF + ENDIF ! - ! Gradient of the wavefunction in real space + ! ... Gradient of the wavefunction in real space ! - CALL invfft ('Wave', psic, dffts) + !$acc host_data use_device(psic) + CALL invfft( 'Wave', psic, dffts ) + !$acc end host_data ! + !$acc kernels gradpsi(:,ipol) = psic + !$acc end kernels ! - END DO + ENDDO + !$acc end data + DEALLOCATE( nlmd ) ! ELSE ! - DO ipol=1,3 - ! - psic(:) = ( 0.D0, 0.D0 ) - ! - kplusg (1:npw) = (xk(ipol,ik)+g(ipol,igk_k(1:npw,ik))) * tpiba - psic(dffts%nl(igk_k(1:npw,ik))) = CMPLX(0d0, kplusg(1:npw),kind=DP)* & - evc(1:npw,ibnd) - ! - ! Gradient of the wavefunction in real space - ! - CALL invfft ('Wave', psic, dffts) - ! - gradpsi(:,ipol) = psic - ! - END DO - ! - END IF + !$acc data copyin(xki,nld) + DO ipol = 1, 3 + ! + !$acc kernels + psic(:) = (0._DP,0._DP) + !$acc end kernels + ! + !$acc parallel loop + DO j = 1, npw + kplusg = (xki(ipol)+g(ipol,igk_k(j,ik))) * tpiba + ! + psic(nld(j)) = CMPLX(0._DP,kplusg,kind=DP)*evc(j,ibnd) + ENDDO + ! + ! ... Gradient of the wavefunction in real space + ! + !$acc host_data use_device(psic) + CALL invfft( 'Wave', psic, dffts ) + !$acc end host_data + ! + !$acc kernels + gradpsi(:,ipol) = psic + !$acc end kernels + ! + ENDDO + !$acc end data + ENDIF + ! + DEALLOCATE( nld ) + ! + !$acc end data + ! + RETURN ! END SUBROUTINE wfc_gradient diff --git a/PW/src/stres_mgga_gpu.f90 b/PW/src/stres_mgga_gpu.f90 deleted file mode 100644 index 1ace83254..000000000 --- a/PW/src/stres_mgga_gpu.f90 +++ /dev/null @@ -1,335 +0,0 @@ -! -! Copyright (C) 2017 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!---------------------------------------------------------------------------- -SUBROUTINE stres_mgga_gpu( sigmaxc ) - !---------------------------------------------------------------------------- - !! Analytic stress tensor contribution from metagga is added to sigmaxc - ! - USE kinds, ONLY : DP - USE control_flags, ONLY : gamma_only - USE noncollin_module, ONLY : noncolin - USE cell_base, ONLY : alat, at, bg, omega, tpiba - USE gvect, ONLY : g - USE scf, ONLY : rho, v - USE wavefunctions, ONLY : evc - USE xc_lib, ONLY : xclib_dft_is - USE klist, ONLY : nks, xk, ngk - USE buffers, ONLY : get_buffer - USE io_files, ONLY : iunwfc, nwordwfc - USE wvfct, ONLY : nbnd, npwx, wg - USE lsda_mod, ONLY : lsda, nspin, current_spin, isk - USE fft_interfaces, ONLY : fwfft, invfft - USE fft_base, ONLY : dfftp, dffts - USE mp, ONLY : mp_sum - USE mp_pools, ONLY : inter_pool_comm - USE mp_bands, ONLY : intra_bgrp_comm - USE wavefunctions_gpum, ONLY : using_evc - ! -#if defined(__CUDA) - USE device_fbuff_m , ONLY : dev_buf - USE device_memcpy_m, ONLY : dev_memcpy, dev_memset -#endif - ! - IMPLICIT NONE - ! - REAL(DP), INTENT(INOUT) :: sigmaxc(3,3) - ! - ! Internal variables - ! - INTEGER :: ix, iy, K, ir, ipol, iss, incr, ibnd, ik, npw - ! - INTEGER :: ierrs(4) - ! - REAL(DP), PARAMETER :: epsr = 1.E-6_DP, epsg = 1.E-10_DP, e2 = 2._DP - ! - COMPLEX(DP), POINTER :: gradwfc_d(:,:), crosstaus_d(:,:,:) - INTEGER :: ix_d(6), iy_d(6) - REAL(DP), POINTER :: vkin_d(:), rhokin_d(:) - REAL(DP) :: sigma1_d, sigma2_d, sigma3_d, & - sigma4_d, sigma5_d, sigma6_d - ! - REAL(DP) :: w1, w2, delta, sigma_mgga(3,3) - ! -#if defined(__CUDA) - attributes(DEVICE) :: gradwfc_d, crosstaus_d, vkin_d, rhokin_d, ix_d, iy_d - ! - if ( .not. xclib_dft_is('meta') ) return - ! - current_spin = 1 - ! - ! Stop if something is not yet implemented - ! - IF ( noncolin ) CALL errore( 'stres_mgga', & - 'noncollinear stress + meta-GGA not implemented', 1 ) - ! - ! Initialization of a set of variables - ! - CALL dev_buf%lock_buffer( gradwfc_d, (/ dffts%nnr, 3 /), ierrs(1) ) - CALL dev_buf%lock_buffer( crosstaus_d, (/ dffts%nnr, 6, nspin /), ierrs(2) ) - IF (ANY(ierrs(1:2) /= 0)) CALL errore( 'stres_mgga_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs(1:2))) ) - ! - CALL dev_memset(crosstaus_d , (0._DP,0._DP) ) - ! - ! For gamma_only efficiency - ! - incr = 1 - IF ( gamma_only ) incr = 2 - ! - !Polarization indexes - ! ix_d(1) = 1 ; iy_d(1) = 1 - ! ix_d(2) = 2 ; iy_d(2) = 1 - ! ix_d(3) = 3 ; iy_d(3) = 1 - ! ix_d(4) = 2 ; iy_d(4) = 2 - ! ix_d(5) = 3 ; iy_d(5) = 2 - ! ix_d(6) = 3 ; iy_d(6) = 3 - ! - ! Loop over the k points - ! - k_loop: DO ik = 1, nks - ! - IF ( lsda ) current_spin = isk(ik) - ! - npw = ngk(ik) - ! - ! Read the wavefunctions - ! - IF ( nks > 1 ) THEN - CALL get_buffer( evc, nwordwfc, iunwfc, ik ) - CALL using_evc(2) - ENDIF - ! - DO ibnd = 1, nbnd, incr - ! - ! w1, w2: weights for each k point and band - ! - w1 = wg(ibnd,ik) / omega - ! - IF ( (ibnd < nbnd) .AND. (gamma_only) ) THEN - ! - ! ... two ffts at the same time - ! - w2 = wg(ibnd+1,ik) / omega - ! - ELSE - ! - w2 = w1 - ! - ENDIF - ! - ! Gradient of the wavefunction in real space - ! - CALL wfc_gradient_gpu( ibnd, ik, npw, gradwfc_d ) - ! - ! Cross terms of kinetic energy density - ! - ! FIX ME: PGI complains here if I set do(2) - !$cuf kernel do (1) <<<*,*>>> - DO ir = 1, dffts%nnr - DO ipol = 1, 6 - ! - ! explenation here: https://stackoverflow.com/a/244550 - ! - ! M*(M+1)/ 2 - K = (3.0*4.0)/2.0 - 1 - (ipol - 1) - K = floor((SQRT(FLOAT(8*K+1))-1)/2) - ix = (ipol-1) - (3.0*4.0)/2.0 + (K+1)*(K+2)/2.0 + 1 + (2-K) - iy = 3 - K - ! - crosstaus_d(ir,ipol,current_spin) = crosstaus_d(ir,ipol,current_spin) + & - 2.0_DP*w1*DBLE(gradwfc_d(ir,ix))*DBLE(gradwfc_d(ir,iy)) + & - 2.0_DP*w2*AIMAG(gradwfc_d(ir,ix))*AIMAG(gradwfc_d(ir,iy)) - ENDDO - ENDDO - ! - !crosstaus(:,:,current_spin) = crosstaus(:,:,current_spin) + & - ! crosstaus_d(:,:,current_spin) - ! - ENDDO !ibnd - ! - ENDDO k_loop - ! - ! - CALL dev_buf%release_buffer( gradwfc_d, ierrs(1) ) - ! - CALL mp_sum( crosstaus_d, inter_pool_comm ) - ! - CALL dev_buf%lock_buffer( vkin_d, dffts%nnr, ierrs(3) ) - CALL dev_buf%lock_buffer( rhokin_d, dffts%nnr, ierrs(4) ) - IF (ANY(ierrs(3:4) /= 0)) CALL errore( 'stres_mgga_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs(3:4))) ) - ! - ! metagga contribution to the stress tensor - sigma_mgga(:,:) = 0._DP - ! - sigma1_d = 0.d0 ; sigma4_d = 0.d0 - sigma2_d = 0.d0 ; sigma5_d = 0.d0 - sigma3_d = 0.d0 ; sigma6_d = 0.d0 - ! - ! - DO iss = 1, nspin - ! - CALL dev_memcpy( vkin_d, v%kin_r(:,iss) ) - CALL dev_memcpy( rhokin_d, rho%kin_r(:,iss) ) - ! - !$cuf kernel do (1) <<<*,*>>> - DO ir = 1, dffts%nnr - ! - sigma1_d = sigma1_d + vkin_d(ir) * (rhokin_d(ir) & - + DBLE(crosstaus_d(ir,1,iss)) ) - sigma2_d = sigma2_d + vkin_d(ir) * DBLE(crosstaus_d(ir,2,iss)) - sigma3_d = sigma3_d + vkin_d(ir) * DBLE(crosstaus_d(ir,3,iss)) - sigma4_d = sigma4_d + vkin_d(ir) * (rhokin_d(ir) & - + DBLE(crosstaus_d(ir,4,iss)) ) - sigma5_d = sigma5_d + vkin_d(ir) * DBLE(crosstaus_d(ir,5,iss)) - sigma6_d = sigma6_d + vkin_d(ir) * (rhokin_d(ir) & - + DBLE(crosstaus_d(ir,6,iss)) ) - ! - ENDDO - ! - ENDDO - ! - sigma_mgga(1,1) = sigma1_d ; sigma_mgga(2,3) = sigma5_d - sigma_mgga(1,2) = sigma2_d ; sigma_mgga(3,1) = sigma3_d - sigma_mgga(1,3) = sigma3_d ; sigma_mgga(3,2) = sigma5_d - sigma_mgga(2,1) = sigma2_d ; sigma_mgga(3,3) = sigma6_d - sigma_mgga(2,2) = sigma4_d - ! - CALL dev_buf%release_buffer( vkin_d, ierrs(3) ) - CALL dev_buf%release_buffer( rhokin_d, ierrs(4) ) - CALL dev_buf%release_buffer( crosstaus_d, ierrs(2) ) - ! - CALL mp_sum( sigma_mgga, intra_bgrp_comm ) - ! - sigmaxc(:,:) = sigmaxc(:,:) + sigma_mgga(:,:) / & - (dffts%nr1 * dffts%nr2 * dffts%nr3) - ! -#endif - RETURN - ! -END SUBROUTINE stres_mgga_gpu -! -! -!---------------------------------------------------------- -SUBROUTINE wfc_gradient_gpu( ibnd, ik, npw, gradpsi_d ) - !---------------------------------------------------------- - !! Returns the gradient of the wavefunction in real space. - !! Slightly adapted from sum_bands.f90 - ! - USE kinds, ONLY: DP - USE control_flags, ONLY: gamma_only - USE wvfct, ONLY: npwx, nbnd - USE cell_base, ONLY: omega, tpiba - USE klist, ONLY: xk, igk_k_d - - USE fft_base, ONLY: dffts - USE fft_interfaces, ONLY: invfft - ! - USE gvect, ONLY: g_d - USE wavefunctions_gpum, ONLY: using_evc, using_evc_d, evc_d, & - using_psic, using_psic_d, psic_d -#if defined(__CUDA) - USE device_fbuff_m, ONLY: dev_buf - USE device_memcpy_m, ONLY: dev_memcpy -#endif - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: ibnd, ik, npw - COMPLEX(DP), INTENT(OUT) :: gradpsi_d(dffts%nnr,3) - ! - ! ... local variables - ! - INTEGER :: ipol, j - REAL(DP) :: kplusg - INTEGER, POINTER :: nl_d(:), nlm_d(:) - REAL(DP) :: xk_d(3) - ! -#if defined(__CUDA) - attributes(DEVICE) :: gradpsi_d, nl_d, nlm_d, xk_d - ! - CALL using_evc_d(0) - CALL using_psic_d(2) - ! - nl_d => dffts%nl_d - nlm_d => dffts%nlm_d - ! - xk_d(1:3) = xk(1:3,ik) - ! - ! Compute the gradient of the wavefunction in reciprocal space - ! - IF ( gamma_only ) THEN - ! - DO ipol = 1, 3 - ! - psic_d(:) = (0._DP,0._DP) - ! - IF ( ibnd < nbnd ) THEN - ! - ! ... two ffts at the same time - ! - !$cuf kernel do (1) <<<*,*>>> - DO j = 1, npw - kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba - ! - psic_d(nl_d(j)) = CMPLX(0._DP, kplusg, kind=DP) * & - ( evc_d(j,ibnd) + & - ( 0._DP, 1._DP ) * evc_d(j,ibnd+1) ) - ! - psic_d(nlm_d(j)) = CMPLX(0._DP,-kplusg, kind=DP) * & - CONJG( evc_d(j,ibnd) - & - ( 0._DP, 1._DP ) * evc_d(j,ibnd+1) ) - ENDDO - ! - ELSE - ! - !$cuf kernel do (1) <<<*,*>>> - DO j = 1, npw - kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba - ! - psic_d(nl_d(j)) = CMPLX(0._DP, kplusg, kind=DP) * & - evc_d(j,ibnd) - ! - psic_d(nlm_d(j)) = CMPLX(0._DP,-kplusg,kind=DP) * & - CONJG( evc_d(j,ibnd) ) - ENDDO - ! - ENDIF - ! - ! Gradient of the wavefunction in real space - ! - CALL invfft( 'Wave', psic_d, dffts ) - ! - CALL dev_memcpy( gradpsi_d(:,ipol), psic_d ) - ! - ENDDO - ! - ELSE - ! - DO ipol = 1, 3 - ! - psic_d(:) = (0._DP,0._DP) - ! - !$cuf kernel do(1) <<<*,*>>> - DO j = 1, npw - kplusg = (xk_d(ipol)+g_d(ipol,igk_k_d(j,ik))) * tpiba - ! - psic_d(nl_d(j)) = CMPLX(0._DP,kplusg,kind=DP)*evc_d(j,ibnd) - ENDDO - ! - ! Gradient of the wavefunction in real space - ! - CALL invfft( 'Wave', psic_d, dffts ) - ! - CALL dev_memcpy( gradpsi_d(:,ipol), psic_d ) - ! - ENDDO - ! - ENDIF -#endif - ! -END SUBROUTINE wfc_gradient_gpu diff --git a/PW/src/stres_us.f90 b/PW/src/stres_us.f90 index b95f88dd7..88f884d02 100644 --- a/PW/src/stres_us.f90 +++ b/PW/src/stres_us.f90 @@ -1,5 +1,5 @@ ! -! Copyright (C) 2001-2012 Quantum ESPRESSO group +! Copyright (C) 2001-2021 Quantum ESPRESSO group ! This file is distributed under the terms of the ! GNU General Public License. See the file `License' ! in the root directory of the present distribution, @@ -9,7 +9,7 @@ !---------------------------------------------------------------------------- SUBROUTINE stres_us( ik, gk, sigmanlc ) !---------------------------------------------------------------------------- - !! nonlocal (separable pseudopotential) contribution to the stress + !! Nonlocal (separable pseudopotential) contribution to the stress !! NOTICE: sum of partial results over procs is performed in calling routine ! USE kinds, ONLY : DP @@ -20,56 +20,124 @@ SUBROUTINE stres_us( ik, gk, sigmanlc ) USE wvfct, ONLY : npwx, nbnd, wg, et USE control_flags, ONLY : gamma_only USE uspp_param, ONLY : upf, lmaxkb, nh, nhm - USE uspp, ONLY : nkb, vkb, deeq, deeq_nc - USE wavefunctions, ONLY : evc + USE uspp, ONLY : nkb, vkb, deeq USE lsda_mod, ONLY : nspin USE noncollin_module, ONLY : noncolin, npol, lspinorb USE mp_pools, ONLY : me_pool, root_pool USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, root_bgrp USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, & bec_type, becp, calbec - USE mp, ONLY : mp_sum, mp_get_comm_null, mp_circular_shift_left - USE wavefunctions_gpum, ONLY : using_evc + USE mp, ONLY : mp_sum, mp_get_comm_null, & + mp_circular_shift_left + USE wavefunctions, ONLY : evc USE wvfct_gpum, ONLY : using_et - USE becmod_subs_gpum, ONLY : using_becp_auto + USE becmod_gpum, ONLY : becp_d, bec_type_d + USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, & + calbec_gpu USE uspp_init, ONLY : init_us_2, gen_us_dj, gen_us_dy ! IMPLICIT NONE ! - INTEGER, INTENT(IN) :: ik + INTEGER, INTENT(IN) :: ik !! k-point index - REAL(DP), INTENT(IN) :: gk(npwx,3) + REAL(DP), INTENT(IN) :: gk(npwx,3) !! wave function components for fixed k-point REAL(DP), INTENT(INOUT) :: sigmanlc(3,3) !! stress tensor, non-local contribution ! - REAL(DP), ALLOCATABLE :: qm1(:) - REAL(DP) :: q - INTEGER :: npw, i + ! ... local variables ! - CALL using_evc(0) + REAL(DP), ALLOCATABLE :: qm1(:) + COMPLEX(DP), ALLOCATABLE :: evcv(:) ! + REAL(DP) :: q + INTEGER :: npw , iu, np + ! + INTEGER :: na1, np1, nh_np1, ijkb01, itot, levc + LOGICAL :: ismulti_np + INTEGER, ALLOCATABLE :: shift(:), na_list(:), nh_list(:), ih_list(:), & + ishift_list(:) + LOGICAL, ALLOCATABLE :: is_multinp(:) ! IF ( nkb == 0 ) RETURN ! IF ( lsda ) current_spin = isk(ik) npw = ngk(ik) - IF ( nks > 1 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb ) ! - CALL allocate_bec_type ( nkb, nbnd, becp, intra_bgrp_comm ) - + IF ( nks > 1 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .TRUE. ) + ! + CALL allocate_bec_type( nkb, nbnd, becp, intra_bgrp_comm ) CALL using_becp_auto(2) +#if defined(__CUDA) + CALL using_becp_d_auto(2) + !$acc host_data use_device(vkb,evc) + CALL calbec_gpu( npw, vkb, evc, becp_d ) + !$acc end host_data +#else + !$acc update self(vkb,evc) CALL calbec( npw, vkb, evc, becp ) +#endif ! - ALLOCATE( qm1( npwx ) ) - DO i = 1, npw - q = SQRT( gk(i, 1)**2 + gk(i, 2)**2 + gk(i, 3)**2 ) + ALLOCATE( qm1(npwx) ) + !$acc data create(qm1) present(gk) + ! + !$acc parallel loop + DO iu = 1, npw + q = SQRT( gk(iu,1)**2 + gk(iu,2)**2 + gk(iu,3)**2 ) IF ( q > eps8 ) THEN - qm1(i) = 1.D0 / q + qm1(iu) = 1._DP / q ELSE - qm1(i) = 0.D0 - END IF - END DO + qm1(iu) = 0._DP + ENDIF + ENDDO + ! + ! ... define index arrays (type, atom, etc.) for kernel loops + ! + ALLOCATE( is_multinp(nat*nhm) ) + ALLOCATE( na_list(nat*nhm), nh_list(nat*nhm), ih_list(nat*nhm), & + ishift_list(nat*nhm) ) + !$acc data create(is_multinp,na_list,nh_list,ih_list,ishift_list) & + !$acc& copyin(ityp,nh) + ! + ALLOCATE( shift(nat) ) + ijkb01 = 0 + DO iu = 1, ntyp + DO na1 = 1, nat + IF (ityp(na1) == iu ) THEN + shift(na1) = ijkb01 + ijkb01 = ijkb01 + nh(iu) + ENDIF + ENDDO + ENDDO + ! + !$acc data copyin(shift) + ! + ijkb01 = 0 + itot=0 + DO np = 1, ntyp + DO na1 = 1, nat + np1 = ityp(na1) + IF (np /= np1) CYCLE + ijkb01 = shift(na1) + nh_np1 = nh(np1) + ismulti_np = upf(np1)%tvanp .OR. upf(np1)%is_multiproj + IF ( .NOT. ismulti_np .AND. noncolin .AND. lspinorb ) & + CALL errore('stres_us','wrong case',1) + !$acc parallel loop + DO iu = itot+1, itot+nh_np1 + na_list(iu) = na1 + ih_list(iu) = iu-itot + nh_list(iu) = nh_np1 + ishift_list(iu) = ijkb01 + is_multinp(iu) = ismulti_np + ENDDO + itot = itot + nh_np1 + ENDDO + ENDDO + ! + levc = SIZE(evc(:,1)) + ALLOCATE( evcv(1:levc) ) + !$acc data create(evcv) ! IF ( gamma_only ) THEN ! @@ -79,9 +147,20 @@ SUBROUTINE stres_us( ik, gk, sigmanlc ) ! CALL stres_us_k() ! - END IF + ENDIF + ! + !$acc end data + DEALLOCATE( evcv ) + ! + !$acc end data + !$acc end data + !$acc end data ! DEALLOCATE( qm1 ) + DEALLOCATE( is_multinp ) + DEALLOCATE( na_list, nh_list, ih_list, ishift_list ) + DEALLOCATE( shift ) + ! CALL deallocate_bec_type( becp ) CALL using_becp_auto(2) ! @@ -92,24 +171,35 @@ SUBROUTINE stres_us( ik, gk, sigmanlc ) !----------------------------------------------------------------------- SUBROUTINE stres_us_gamma() !----------------------------------------------------------------------- - !! nonlocal contribution to the stress - gamma version + !! nonlocal contribution to the stress - gamma version. ! IMPLICIT NONE ! ! ... local variables ! - INTEGER :: na, np, ibnd, ipol, jpol, l, i, & - ikb, jkb, ih, jh, ijkb0, ibnd_loc, & - nproc, nbnd_loc, nbnd_begin, icyc - REAL(DP) :: fac, xyz(3,3), evps, ddot - REAL(DP), ALLOCATABLE :: deff(:,:,:) - COMPLEX(DP), ALLOCATABLE :: work1(:), work2(:), dvkb(:,:) - ! dvkb contains the derivatives of the kb potential - COMPLEX(DP) :: ps + INTEGER :: na, np, nt, ibnd, ipol, jpol, l, i, ikb, & + jkb, ih, jh, ibnd_loc,ijkb0,nh_np, nproc, & + nbnd_loc, nbnd_begin, icyc, ishift + REAL(DP) :: dot11, dot21, dot31, dot22, dot32, dot33, & + qm1i, gk1, gk2, gk3, wg_nk, fac, evps, aux,& + Re_worksum, Im_worksum + COMPLEX(DP) :: worksum, cv, wsum1, wsum2, wsum3, evci + ! + REAL(DP), ALLOCATABLE :: deff(:,:,:) + COMPLEX(DP), ALLOCATABLE :: ps(:), dvkb(:,:,:) + ! + REAL(DP) :: xyz(3,3) + ! ! xyz are the three unit vectors in the x,y,z directions - DATA xyz / 1.0d0, 0.0d0, 0.0d0, 0.0d0, 1.0d0, 0.0d0, 0.0d0, 0.0d0, 1.0d0 / - ! + DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, 0._DP, & + 1._DP / +#if !defined(__CUDA) || !defined(_OPENACC) + REAL(DP), ALLOCATABLE :: becpr(:,:) +#else + REAL(DP), POINTER, DEVICE :: becpr(:,:) ! + CALL using_becp_auto(0) +#endif IF( becp%comm /= mp_get_comm_null() ) THEN nproc = becp%nproc nbnd_loc = becp%nbnd_loc @@ -119,523 +209,729 @@ SUBROUTINE stres_us( ik, gk, sigmanlc ) nproc = 1 nbnd_loc = nbnd nbnd_begin = 1 - END IF - - ALLOCATE( work1( npwx ), work2( npwx ) ) + ENDIF + ! ALLOCATE( deff(nhm,nhm,nat) ) + ALLOCATE( ps(nkb) ) + !$acc data create(deff,ps) ! ! ... diagonal contribution - if the result from "calbec" are not ! ... distributed, must be calculated on a single processor ! - evps = 0.D0 - IF ( nproc == 1 .AND. me_pool /= root_pool ) GO TO 100 + ! ... for the moment when using_gpu is true becp is always fully present + ! in all processors ! CALL using_et(0) ! compute_deff : intent(in) - DO ibnd_loc = 1, nbnd_loc - ibnd = ibnd_loc + becp%ibnd_begin - 1 - CALL compute_deff ( deff, et(ibnd,ik) ) - fac = wg(ibnd,ik) - ijkb0 = 0 - DO np = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == np ) THEN - DO ih = 1, nh(np) - ikb = ijkb0 + ih - evps = evps + fac * deff(ih,ih,na) * & - ABS( becp%r(ikb,ibnd_loc) )**2 - ! - IF ( upf(np)%tvanp .OR. upf(np)%is_multiproj ) THEN - ! - ! ... only in the US case there is a contribution - ! ... for jh<>ih - ! ... we use here the symmetry in the interchange of - ! ... ih and jh - ! - DO jh = ( ih + 1 ), nh(np) - jkb = ijkb0 + jh - evps = evps + deff(ih,jh,na) * fac * 2.D0 * & - becp%r(ikb,ibnd_loc) * becp%r(jkb,ibnd_loc) - END DO - END IF - END DO - ijkb0 = ijkb0 + nh(np) - END IF - END DO - END DO - END DO ! -100 CONTINUE +#if defined(__CUDA) + becpr => becp_d%r_d +#else + ALLOCATE( becpr(nkb,nbnd_loc) ) + becpr = becp%r +#endif ! - ! ... non diagonal contribution - derivative of the bessel function - !------------------------------------ - ALLOCATE( dvkb( npwx, nkb ) ) + evps = 0._DP ! - CALL gen_us_dj( ik, dvkb ) + compute_evps: IF ( .NOT. (nproc==1 .AND. me_pool/=root_pool) ) THEN + ! + DO ibnd_loc = 1, nbnd_loc + ibnd = ibnd_loc+becp%ibnd_begin-1 + CALL compute_deff( deff, et(ibnd,ik) ) + wg_nk = wg(ibnd,ik) + ! + !$acc parallel loop reduction(+:evps) + DO i = 1, itot + ih = ih_list(i) ; na = na_list(i) + ishift = ishift_list(i) ; ikb = ishift + ih + ! + IF (.NOT. is_multinp(i)) THEN + aux = wg_nk * deff(ih,ih,na) * ABS(becpr(ikb,ibnd_loc))**2 + ELSE + nh_np = nh_list(i) + ! + aux = wg_nk * deff(ih,ih,na) * ABS(becpr(ikb,ibnd_loc))**2 & + + becpr(ikb,ibnd_loc)* wg_nk * 2._DP & + * SUM( deff(ih,ih+1:nh_np,na) & + * becpr(ishift+ih+1:ishift+nh_np,ibnd_loc)) + ENDIF + ! + evps = evps + aux + ! + ENDDO + ! + ENDDO + ! + ENDIF compute_evps ! - CALL using_et(0) ! compute_deff : intent(in) - CALL using_evc(0) - DO icyc = 0, nproc -1 + ! ... non diagonal contribution - derivative of the Bessel function + ! + ALLOCATE( dvkb(npwx,nkb,4) ) + !$acc data create(dvkb) + ! + CALL gen_us_dj( ik, dvkb(:,:,4) ) + IF ( lmaxkb > 0 ) THEN + DO ipol = 1, 3 + CALL gen_us_dy( ik, xyz(1,ipol), dvkb(:,:,ipol) ) + ENDDO + ENDIF + ! + DO icyc = 0, nproc-1 ! DO ibnd_loc = 1, nbnd_loc - ! - ibnd = ibnd_loc + becp%ibnd_begin - 1 - CALL compute_deff ( deff, et(ibnd,ik) ) - work2(:) = (0.D0,0.D0) - ijkb0 = 0 - DO np = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == np ) THEN - DO ih = 1, nh(np) - ikb = ijkb0 + ih - IF ( .NOT. ( upf(np)%tvanp .OR. upf(np)%is_multiproj ) ) THEN - ps = becp%r(ikb,ibnd_loc) * deff(ih,ih,na) - ELSE - ! - ! ... in the US case there is a contribution - ! ... also for jh<>ih - ! - ps = (0.D0,0.D0) - DO jh = 1, nh(np) - jkb = ijkb0 + jh - ps = ps + becp%r(jkb,ibnd_loc) * deff(ih,jh,na) - END DO - END IF - CALL zaxpy( npw, ps, dvkb(1,ikb), 1, work2, 1 ) - END DO - ijkb0 = ijkb0 + nh(np) - END IF - END DO - END DO + ! + ibnd = ibnd_loc + becp%ibnd_begin - 1 + CALL compute_deff( deff, et(ibnd,ik) ) + ! + !$acc kernels + evcv(:) = evc(:,ibnd) + !$acc end kernels + ! + !$acc parallel loop + DO i = 1, itot + ih = ih_list(i) ; na = na_list(i) + ishift = ishift_list(i) ; ikb = ishift + ih + ! + IF (.NOT. is_multinp(i)) THEN + ps(ikb) = CMPLX(deff(ih,ih,na) * becpr(ikb,ibnd_loc)) + ELSE + nh_np = nh_list(i) + ! + ps(ikb) = CMPLX( SUM( becpr(ishift+1:ishift+nh_np,ibnd_loc) & + * deff(ih,1:nh_np,na) ) ) + ENDIF + ENDDO + ! + dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP + dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP + ! + !$acc parallel loop collapse(2) reduction(+:dot11,dot21,dot31,& + !$acc& dot22,dot32,dot33) + DO na =1, nat + DO i = 1, npw + np = ityp(na) + ijkb0 = shift(na) + nh_np = nh(np) + worksum = (0._DP,0._DP) + DO ih = 1, nh_np + ikb = ijkb0 + ih + worksum = worksum + ps(ikb) * dvkb(i,ikb,4) + ENDDO + Re_worksum = DBLE(worksum) ; Im_worksum = DIMAG(worksum) + evci = evcv(i) + gk1 = gk(i,1) ; gk2 = gk(i,2) ; gk3 = gk(i,3) + qm1i = qm1(i) + ! + cv = evci * CMPLX(gk1 * gk1 * qm1i) + dot11 = dot11 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * CMPLX(gk2 * gk1 * qm1i) + dot21 = dot21 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * CMPLX(gk3 * gk1 * qm1i) + dot31 = dot31 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * CMPLX(gk2 * gk2 * qm1i) + dot22 = dot22 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * CMPLX(gk3 * gk2 * qm1i) + dot32 = dot32 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * CMPLX(gk3 * gk3 * qm1i) + dot33 = dot33 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ENDDO + ENDDO + ! ... a factor 2 accounts for the other half of the G-vector sphere + sigmanlc(:,1) = sigmanlc(:,1) - 4._DP * wg(ibnd,ik) * [dot11, dot21, dot31] + sigmanlc(:,2) = sigmanlc(:,2) - 4._DP * wg(ibnd,ik) * [0._DP, dot22, dot32] + sigmanlc(:,3) = sigmanlc(:,3) - 4._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33] + ! + ! ... non diagonal contribution - derivative of the spherical harmonics + ! ... (no contribution from l=0) + ! + IF ( lmaxkb == 0 ) CYCLE + ! + dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP + dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP + ! + !$acc parallel loop collapse(2) reduction(+:dot11,dot21,dot31,dot22,dot32,dot33) + DO ikb = 1, nkb + DO i = 1, npw + wsum1 = ps(ikb)*dvkb(i,ikb,1) + wsum2 = ps(ikb)*dvkb(i,ikb,2) + wsum3 = ps(ikb)*dvkb(i,ikb,3) + ! + evci = evcv(i) + gk1 = gk(i,1) + gk2 = gk(i,2) + gk3 = gk(i,3) + ! + cv = evci * CMPLX(gk1) + dot11 = dot11 + DBLE(wsum1)* DBLE(cv) + DIMAG(wsum1)*DIMAG(cv) + dot21 = dot21 + DBLE(wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv) + dot31 = dot31 + DBLE(wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv) + ! + cv = evci * CMPLX(gk2) + dot22 = dot22 + DBLE(wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv) + dot32 = dot32 + DBLE(wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv) + ! + cv = evci * CMPLX(gk3) + dot33 = dot33 + DBLE(wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv) + ENDDO + ENDDO ! ! ... a factor 2 accounts for the other half of the G-vector sphere ! - DO ipol = 1, 3 - DO jpol = 1, ipol - DO i = 1, npw - work1(i) = evc(i,ibnd) * gk(i, ipol) * gk(i, jpol) * qm1(i) - END DO - sigmanlc(ipol,jpol) = sigmanlc(ipol,jpol) - & - 4.D0 * wg(ibnd,ik) * & - ddot( 2 * npw, work1, 1, work2, 1 ) - END DO - END DO - END DO - IF ( nproc > 1 ) THEN - CALL mp_circular_shift_left(becp%r, icyc, becp%comm) - CALL mp_circular_shift_left(becp%ibnd_begin, icyc, becp%comm) - CALL mp_circular_shift_left(nbnd_loc, icyc, becp%comm) - END IF - END DO - ! - ! ... non diagonal contribution - derivative of the spherical harmonics - ! ... (no contribution from l=0) - ! - IF ( lmaxkb == 0 ) GO TO 10 - ! - !------------------------------------ - CALL using_evc(0); CALL using_et(0) ! compute_deff : intent(in) (this is redundant) - DO ipol = 1, 3 - ! - CALL gen_us_dy( ik, xyz(1,ipol), dvkb ) - ! - DO icyc = 0, nproc -1 - ! - DO ibnd_loc = 1, nbnd_loc - ibnd = ibnd_loc + becp%ibnd_begin - 1 - CALL compute_deff ( deff, et(ibnd,ik) ) - work2(:) = (0.D0,0.D0) - ijkb0 = 0 - DO np = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == np ) THEN - DO ih = 1, nh(np) - ikb = ijkb0 + ih - IF ( .NOT. ( upf(np)%tvanp .OR. upf(np)%is_multiproj ) ) THEN - ps = becp%r(ikb,ibnd_loc) * deff(ih,ih,na) - ELSE - ! - ! ... in the US case there is a contribution - ! ... also for jh<>ih - ! - ps = (0.D0,0.D0) - DO jh = 1, nh(np) - jkb = ijkb0 + jh - ps = ps + becp%r(jkb,ibnd_loc)*deff(ih,jh,na) - END DO - END IF - CALL zaxpy( npw, ps, dvkb(1,ikb), 1, work2, 1 ) - END DO - ijkb0 = ijkb0 + nh(np) - END IF - END DO - END DO - ! - ! ... a factor 2 accounts for the other half of the G-vector sphere - ! - DO jpol = 1, ipol - DO i = 1, npw - work1(i) = evc(i,ibnd) * gk(i, jpol) - END DO - sigmanlc(ipol,jpol) = sigmanlc(ipol,jpol) - & - 4.D0 * wg(ibnd,ik) * & - ddot( 2 * npw, work1, 1, work2, 1 ) - END DO - END DO - - IF ( nproc > 1 ) THEN - CALL mp_circular_shift_left(becp%r, icyc, becp%comm) - CALL mp_circular_shift_left(becp%ibnd_begin, icyc, becp%comm) - CALL mp_circular_shift_left(nbnd_loc, icyc, becp%comm) - END IF - + sigmanlc(:,1) = sigmanlc(:,1) -4._DP * wg(ibnd,ik) * [dot11, dot21, dot31] + sigmanlc(:,2) = sigmanlc(:,2) -4._DP * wg(ibnd,ik) * [0._DP, dot22, dot32] + sigmanlc(:,3) = sigmanlc(:,3) -4._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33] ENDDO - END DO - + ! + IF ( nproc > 1 ) THEN +#if defined(__CUDA) || defined(_OPENACC) + CALL errore( 'stres_us_gamma', & + 'unexpected error nproc be 1 with GPU acceleration', 100 ) +#else + CALL mp_circular_shift_left( becp%r, icyc, becp%comm ) + becpr = becp%r + CALL mp_circular_shift_left( becp%ibnd_begin, icyc, becp%comm ) + CALL mp_circular_shift_left( nbnd_loc, icyc, becp%comm ) +#endif + ENDIF + ! + ENDDO + ! 10 CONTINUE ! DO l = 1, 3 sigmanlc(l,l) = sigmanlc(l,l) - evps - END DO + ENDDO ! + !$acc end data + !$acc end data + DEALLOCATE( deff, ps ) DEALLOCATE( dvkb ) - DEALLOCATE( deff, work2, work1 ) +#if !defined(__CUDA) + DEALLOCATE( becpr ) +#endif ! RETURN ! - END SUBROUTINE stres_us_gamma + END SUBROUTINE stres_us_gamma ! ! !---------------------------------------------------------------------- SUBROUTINE stres_us_k() - !---------------------------------------------------------------------- - !! nonlocal contribution to the stress - k-points version + !---------------------------------------------------------------------- + !! nonlocal contribution to the stress - k-points version. ! IMPLICIT NONE ! ! ... local variables ! - INTEGER :: na, np, ibnd, ipol, jpol, l, i, ipw, & - ikb, jkb, ih, jh, ijkb0, is, js, ijs - REAL(DP) :: fac, xyz (3, 3), evps, ddot - COMPLEX(DP), ALLOCATABLE :: work1(:), work2(:), dvkb(:,:) - COMPLEX(DP), ALLOCATABLE :: work2_nc(:,:) - COMPLEX(DP), ALLOCATABLE :: deff_nc(:,:,:,:) + INTEGER :: na, np, ibnd, ipol, jpol, l, i, nt, & + ikb, jkb, ih, jh, is, js, ijs, ishift, nh_np + REAL(DP) :: fac, evps, dot11, dot21, dot31, dot22, dot32, dot33, aux, & + Re_worksum, Re_worksum1, Re_worksum2, Im_worksum, & + Im_worksum1, Im_worksum2 + COMPLEX(DP) :: qm1i, gk1, gk2, gk3, pss + COMPLEX(DP) :: cv, cv1, cv2, worksum, worksum1, worksum2, evci, evc1i, & + evc2i, ps1, ps2, ps1d1, ps1d2, ps1d3, ps2d1, ps2d2, & + ps2d3, psd1, psd2, psd3 + ! REAL(DP), ALLOCATABLE :: deff(:,:,:) - ! dvkb contains the derivatives of the kb potential - COMPLEX(DP) :: ps, ps_nc(2) + COMPLEX(DP), ALLOCATABLE :: deff_nc(:,:,:,:) + COMPLEX(DP), ALLOCATABLE :: ps(:), ps_nc(:,:), dvkb(:,:,:) + ! + REAL(DP) :: xyz(3,3) ! xyz are the three unit vectors in the x,y,z directions - DATA xyz / 1.0d0, 0.0d0, 0.0d0, 0.0d0, 1.0d0, 0.0d0, 0.0d0, 0.0d0, 1.0d0 / + DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, & + 0._DP, 1._DP / ! +#if defined(__CUDA) && defined(_OPENACC) + COMPLEX(DP), POINTER, DEVICE :: becpnc(:,:,:), becpk(:,:) +#else + COMPLEX(DP), ALLOCATABLE :: becpnc(:,:,:), becpk(:,:) +#endif ! - if (noncolin) then - ALLOCATE( work2_nc(npwx,npol) ) - ALLOCATE( deff_nc(nhm,nhm,nat,nspin) ) - else - ALLOCATE( deff(nhm,nhm,nat) ) - endif - ! - ALLOCATE( work1(npwx), work2(npwx) ) - ! - evps = 0.D0 + evps = 0._DP ! ... diagonal contribution ! - IF ( me_bgrp /= root_bgrp ) GO TO 100 + ALLOCATE( dvkb(npwx,nkb,4) ) + !$acc data create( dvkb ) + ! + CALL gen_us_dj( ik, dvkb(:,:,4) ) + IF ( lmaxkb > 0 ) THEN + DO ipol = 1, 3 + CALL gen_us_dy( ik, xyz(1,ipol), dvkb(:,:,ipol) ) + ENDDO + ENDIF + ! + IF (noncolin) THEN + ALLOCATE( ps_nc(nkb,npol) ) + ALLOCATE( deff_nc(nhm,nhm,nat,nspin) ) +#if defined(__CUDA) && defined(_OPENACC) + becpnc => becp_d%nc_d +#else + ALLOCATE( becpnc(nkb,npol,nbnd) ) + becpnc = becp%nc +#endif + ELSE + ALLOCATE( ps(nkb) ) + ALLOCATE( deff(nhm,nhm,nat) ) +#if defined(__CUDA) && defined(_OPENACC) + becpk => becp_d%k_d +#else + ALLOCATE( becpk(nkb,nbnd) ) + becpk = becp%k +#endif + ENDIF + !$acc data create( ps, ps_nc, deff, deff_nc ) + ! + CALL using_et(0) ! ! ... the contribution is calculated only on one processor because ! ... partial results are later summed over all processors ! - CALL using_et(0) ! compute_deff : intent(in) + IF ( me_bgrp /= root_bgrp ) GO TO 100 + ! DO ibnd = 1, nbnd fac = wg(ibnd,ik) IF (ABS(fac) < 1.d-9) CYCLE IF (noncolin) THEN - CALL compute_deff_nc(deff_nc,et(ibnd,ik)) + CALL compute_deff_nc( deff_nc, et(ibnd,ik) ) ELSE - CALL compute_deff(deff,et(ibnd,ik)) + CALL compute_deff( deff, et(ibnd,ik) ) ENDIF - ijkb0 = 0 - DO np = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == np ) THEN - DO ih = 1, nh(np) - ikb = ijkb0 + ih - IF (noncolin) THEN - ijs=0 - DO is=1,npol - DO js=1,npol - ijs=ijs+1 - evps=evps+fac*deff_nc(ih,ih,na,ijs)* & - CONJG(becp%nc(ikb,is,ibnd))* & - becp%nc(ikb,js,ibnd) - END DO - END DO - ELSE - evps = evps+fac*deff(ih,ih,na)*ABS(becp%k(ikb,ibnd) )**2 - END IF - ! - IF ( upf(np)%tvanp .OR. upf(np)%is_multiproj ) THEN - ! - ! ... only in the US case there is a contribution - ! ... for jh<>ih - ! ... we use here the symmetry in the interchange of - ! ... ih and jh - ! - DO jh = ( ih + 1 ), nh(np) - jkb = ijkb0 + jh - IF (noncolin) THEN - ijs=0 - DO is=1,npol - DO js=1,npol - ijs=ijs+1 - evps = evps+2.d0*fac& - *DBLE(deff_nc(ih,jh,na,ijs)* & - (CONJG( becp%nc(ikb,is,ibnd) ) * & - becp%nc(jkb,js,ibnd)) ) - END DO - END DO - ELSE - evps = evps + deff(ih,jh,na) * fac * 2.D0 * & - DBLE( CONJG( becp%k(ikb,ibnd) ) * & - becp%k(jkb,ibnd) ) - END IF - END DO - END IF - END DO - ijkb0 = ijkb0 + nh(np) - END IF - END DO - END DO - END DO + ! + IF (noncolin) THEN + ! +#if defined(_OPENACC) + !$acc parallel loop reduction(+:evps) +#else + !$omp parallel do reduction(+:evps), private(ih,na,ishift,ikb,aux,& + !$omp& ijs,is,js,nh_np,jkb) +#endif + DO i = 1, itot + ! + ih = ih_list(i) ; na = na_list(i) + ishift = ishift_list(i) ; ikb = ishift + ih + aux = 0.d0 + ! + IF (.NOT. is_multinp(i)) THEN + ijs = 0 + !$acc loop seq collapse(2) reduction(+:aux) + DO is = 1, npol + DO js = 1, npol + ijs = ijs + 1 + aux = aux + fac * DBLE(deff_nc(ih,ih,na,ijs) * & + CONJG(becpnc(ikb,is,ibnd)) * & + becpnc(ikb,js,ibnd)) + ENDDO + ENDDO + ELSE + nh_np = nh_list(i) + ijs = 0 + !$acc loop seq collapse(2) reduction(+:aux) + DO is = 1, npol + DO js = 1, npol + ijs = ijs + 1 + aux = aux + fac * DBLE(deff_nc(ih,ih,na,ijs) * & + CONJG(becpnc(ikb,is,ibnd)) * & + becpnc(ikb,js,ibnd)) + ENDDO + ENDDO + !$acc loop seq + DO jh = ih+1, nh_np + jkb = ishift + jh + ijs = 0 + !$acc loop seq collapse(2) reduction(+:aux) + DO is = 1, npol + DO js = 1, npol + ijs = ijs + 1 + aux = aux + 2._DP*fac * & + DBLE(deff_nc(ih,jh,na,ijs) * & + (CONJG(becpnc(ikb,is,ibnd)) * & + becpnc(jkb,js,ibnd))) + ENDDO + ENDDO + ENDDO + ENDIF + ! + evps = evps + aux + ! + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ! + ELSE + ! + aux = 0.d0 +#if defined(_OPENACC) + !$acc parallel loop reduction(+:evps) +#else + !$omp parallel do reduction(+:evps) private(ih,na,ishift,ikb,& + !$omp& aux,nh_np) +#endif + DO i = 1, itot + ! + ih = ih_list(i) ; na = na_list(i) + ishift = ishift_list(i) ; ikb = ishift + ih + ! + IF (.NOT. is_multinp(i)) THEN + aux = fac * deff(ih,ih,na) * & + ABS(becpk(ikb,ibnd) )**2 + ELSE + nh_np = nh_list(i) + aux = fac * deff(ih,ih,na) * ABS(becpk(ikb,ibnd) )**2 + & + SUM( deff(ih,ih+1:nh_np,na) * & + fac * 2._DP*DBLE( CONJG(becpk(ikb,ibnd)) & + * becpk(ishift+ih+1:ishift+nh_np,ibnd) ) ) + ENDIF + ! + evps = evps + aux + ! + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ! + ENDIF + ! + ENDDO + ! DO l = 1, 3 sigmanlc(l,l) = sigmanlc(l,l) - evps - END DO + ENDDO ! 100 CONTINUE ! ! ... non diagonal contribution - derivative of the bessel function ! - ALLOCATE( dvkb( npwx, nkb ) ) - ! - CALL gen_us_dj( ik, dvkb ) - ! - CALL using_evc(0); CALL using_et(0) ! this is redundant - ! DO ibnd = 1, nbnd - IF (noncolin) THEN - work2_nc = (0.D0,0.D0) - CALL compute_deff_nc(deff_nc,et(ibnd,ik)) - ELSE - work2 = (0.D0,0.D0) - CALL compute_deff(deff,et(ibnd,ik)) - ENDIF - !$omp parallel private(ijkb0, ikb, ijs, ps_nc, ps, jkb) - ijkb0 = 0 - DO np = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == np ) THEN - DO ih = 1, nh(np) - ikb = ijkb0 + ih - IF ( .NOT. ( upf(np)%tvanp .OR. upf(np)%is_multiproj ) ) THEN - IF (noncolin) THEN - if (lspinorb) call errore('stres_us','wrong case',1) - ijs=0 - ps_nc=(0.D0, 0.D0) - DO is=1,npol - DO js=1,npol - ijs=ijs+1 - ps_nc(is)=ps_nc(is)+becp%nc(ikb,js,ibnd)* & - deff_nc(ih,ih,na,ijs) - END DO - END DO - ELSE - ps = becp%k(ikb, ibnd) * deeq(ih,ih,na,current_spin) - ENDIF - ELSE - ! - ! ... in the US case there is a contribution - ! ... also for jh<>ih - ! - ps = (0.D0,0.D0) - ps_nc = (0.D0,0.D0) - DO jh = 1, nh(np) - jkb = ijkb0 + jh - IF (noncolin) THEN - ijs=0 - DO is=1,npol - DO js=1,npol - ijs=ijs+1 - ps_nc(is)=ps_nc(is)+becp%nc(jkb,js,ibnd)* & - deff_nc(ih,jh,na,ijs) - END DO - END DO - ELSE - ps = ps + becp%k(jkb,ibnd) * deff(ih,jh,na) - END IF - END DO - END IF - IF (noncolin) THEN - DO is=1,npol - !$omp do - DO ipw = 1, npw - work2_nc(ipw,is) = ps_nc(is) * dvkb(ipw, ikb) + work2_nc(ipw,is) - END DO - !$omp end do nowait - END DO - ELSE - !$omp do - DO ipw = 1, npw - work2(ipw) = ps * dvkb(ipw, ikb) + work2(ipw) - END DO - !$omp end do nowait - END IF - END DO - ijkb0 = ijkb0 + nh(np) - END IF - END DO - END DO - !$omp end parallel - DO ipol = 1, 3 - DO jpol = 1, ipol - IF (noncolin) THEN - DO i = 1, npw - work1(i) = evc( i ,ibnd)*gk(i,ipol)* & - gk(i,jpol)*qm1(i) - work2(i) = evc(i+npwx,ibnd)*gk(i,ipol)* & - gk(i,jpol)*qm1(i) - END DO - sigmanlc(ipol,jpol) = sigmanlc(ipol,jpol) - & - 2.D0 * wg(ibnd,ik) * & - ( ddot(2*npw,work1,1,work2_nc(1,1), 1) + & - ddot(2*npw,work2,1,work2_nc(1,2), 1) ) - ELSE - DO i = 1, npw - work1(i) = evc(i,ibnd)*gk(i, ipol)*gk(i, jpol)*qm1(i) - END DO - sigmanlc(ipol,jpol) = sigmanlc(ipol,jpol) - & - 2.D0 * wg(ibnd,ik) * & - ddot( 2*npw, work1, 1, work2, 1 ) - END IF - END DO - END DO - END DO - ! - ! ... non diagonal contribution - derivative of the spherical harmonics - ! ... (no contribution from l=0) - ! - IF ( lmaxkb == 0 ) GO TO 10 - ! - DO ipol = 1, 3 - CALL gen_us_dy( ik, xyz(1,ipol), dvkb ) - DO ibnd = 1, nbnd - IF (noncolin) THEN - work2_nc = (0.D0,0.D0) - CALL compute_deff_nc( deff_nc, et(ibnd,ik) ) - ELSE - work2 = (0.D0,0.D0) - CALL compute_deff( deff, et(ibnd,ik) ) - ENDIF - !$omp parallel private(ijkb0, ikb, ijs, ps_nc, ps, jkb) - ijkb0 = 0 - DO np = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == np ) THEN - DO ih = 1, nh(np) - ikb = ijkb0 + ih - IF ( .NOT. ( upf(np)%tvanp .OR. upf(np)%is_multiproj ) ) THEN - IF (noncolin) THEN - ijs=0 - ps_nc = (0.D0,0.D0) - DO is=1,npol - DO js=1,npol - ijs=ijs+1 - ps_nc(is) = ps_nc(is) + becp%nc( ikb,js,ibnd )* & - deff_nc( ih,ih,na,ijs ) - END DO - END DO - ELSE - ps = becp%k(ikb,ibnd) * deeq(ih,ih,na,current_spin) - END IF - ELSE - ! - ! ... in the US case there is a contribution - ! ... also for jh<>ih - ! - ps = (0.D0,0.D0) - ps_nc = (0.D0,0.D0) - DO jh = 1, nh(np) - jkb = ijkb0 + jh - IF (noncolin) THEN - ijs=0 - DO is=1,npol - DO js=1,npol - ijs=ijs+1 - ps_nc(is)=ps_nc(is)+ & - becp%nc(jkb,js,ibnd)* & - deff_nc(ih,jh,na,ijs) - END DO - END DO - ELSE - ps = ps + becp%k(jkb,ibnd) * deff(ih,jh,na) - END IF - END DO - END IF - IF (noncolin) THEN - DO is=1,npol - !$omp do - DO ipw = 1, npw - work2_nc(ipw,is) = ps_nc(is) * dvkb(ipw, ikb) + work2_nc(ipw,is) - END DO - !$omp end do nowait - END DO - ELSE - !$omp do - DO ipw = 1, npw - work2(ipw) = ps * dvkb(ipw, ikb) + work2(ipw) - END DO - !$omp end do nowait - END IF - END DO - ijkb0 = ijkb0 + nh(np) - END IF - END DO - END DO - !$omp end parallel - DO jpol = 1, ipol - IF (noncolin) THEN - DO i = 1, npw - work1(i) = evc(i ,ibnd) * gk(i, jpol) - work2(i) = evc(i+npwx,ibnd) * gk(i, jpol) - END DO - sigmanlc(ipol,jpol) = sigmanlc(ipol,jpol) - & - 2.D0 * wg(ibnd,ik) * & - ( ddot( 2 * npw, work1, 1, work2_nc(1,1), 1 ) + & - ddot( 2 * npw, work2, 1, work2_nc(1,2), 1 ) ) - ELSE - DO i = 1, npw - work1(i) = evc(i,ibnd) * gk(i, jpol) - END DO - sigmanlc(ipol,jpol) = sigmanlc(ipol,jpol) - & - 2.D0 * wg(ibnd,ik) * & - ddot( 2 * npw, work1, 1, work2, 1 ) - END IF - END DO - END DO - END DO + ! + !$acc kernels + evcv(:) = evc(:,ibnd) + !$acc end kernels + ! + IF ( noncolin ) THEN + ! + CALL compute_deff_nc( deff_nc, et(ibnd,ik) ) + ! +#if defined(_OPENACC) + !$acc parallel loop +#else + !$omp parallel do private(ih,na,ishift,ikb,is,ijs,nh_np) +#endif + DO i = 1, itot + ! + ih = ih_list(i) ; na = na_list(i) + ishift = ishift_list(i) ; ikb = ishift + ih + ! + IF (.NOT. is_multinp(i)) THEN + ! + !$acc loop seq + DO is = 1, npol + ijs = (is-1)*npol + ps_nc(ikb,is) = SUM( becpnc(ikb,1:npol,ibnd) * & + deff_nc(ih,ih,na,ijs+1:ijs+npol) ) + ENDDO + ! + ELSE + ! + nh_np = nh_list(i) + ! + !$acc loop seq + DO is = 1, npol + ijs = (is-1)*npol + ps_nc(ikb,is) = SUM( becpnc(ishift+1:ishift+nh_np,1:npol,ibnd) * & + deff_nc(ih,1:nh_np,na,ijs+1:ijs+npol) ) + ENDDO + ! + ENDIF + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ! + ELSE + ! + CALL compute_deff( deff, et(ibnd,ik) ) + ! +#if defined(_OPENACC) + !$acc parallel loop +#else + !$omp parallel do private(ih,na,ishift,ikb,nh_np) +#endif + DO i = 1, itot + ! + ih = ih_list(i) ; na = na_list(i) + ishift = ishift_list(i) ; ikb = ishift + ih + ! + IF (.NOT. is_multinp(i)) THEN + ps(ikb) = CMPLX(deeq(ih,ih,na,current_spin)) * & + becpk(ikb,ibnd) + ELSE + nh_np = nh_list(i) + ! + ps(ikb) = SUM( becpk(ishift+1:ishift+nh_np,ibnd) * & + deff(ih,1:nh_np,na) ) + ENDIF + ! + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ! + ENDIF + ! + dot11=0._DP ; dot21=0._DP ; dot31=0._DP + dot22=0._DP ; dot32=0._DP ; dot33=0._DP + ! + IF (noncolin) THEN +#if defined(_OPENACC) + !$acc parallel loop collapse(2) reduction(+:dot11,dot21,dot31,& + !$acc& dot22,dot32,dot33) +#else + !$omp parallel do collapse(2) reduction(+:dot11,dot21,dot31,dot22,& + !$omp& dot32,dot33) shared(evcv,qm1,gk,ps_nc,dvkb) +#endif + DO ikb = 1, nkb + DO i = 1, npw + evc1i = evcv(i) + evc2i = evcv(i+npwx) + qm1i = CMPLX(qm1(i)) + gk1 = CMPLX(gk(i,1)) + gk2 = CMPLX(gk(i,2)) + gk3 = CMPLX(gk(i,3)) + worksum1 = ps_nc(ikb,1) * dvkb(i,ikb,4) + worksum2 = ps_nc(ikb,2) * dvkb(i,ikb,4) + Re_worksum1 = DBLE(worksum1) ; Im_worksum1 = DIMAG(worksum1) + Re_worksum2 = DBLE(worksum2) ; Im_worksum2 = DIMAG(worksum2) + ! + cv1 = evc1i * gk1 * gk1 * qm1i + cv2 = evc2i * gk1 * gk1 * qm1i + dot11 = dot11 + Re_worksum1*DBLE(cv1) + Im_worksum1*DIMAG(cv1) + & + Re_worksum2*DBLE(cv2) + Im_worksum2*DIMAG(cv2) + ! + cv1 = evc1i * gk2 * gk1 * qm1i + cv2 = evc2i * gk2 * gk1 * qm1i + dot21 = dot21 + Re_worksum1*DBLE(cv1) + Im_worksum1*DIMAG(cv1) + & + Re_worksum2*DBLE(cv2) + Im_worksum2*DIMAG(cv2) + ! + cv1 = evc1i * gk3 * gk1 * qm1i + cv2 = evc2i * gk3 * gk1 * qm1i + dot31 = dot31 + Re_worksum1*DBLE(cv1) + Im_worksum1*DIMAG(cv1) + & + Re_worksum2*DBLE(cv2) + Im_worksum2*DIMAG(cv2) + ! + cv1 = evc1i * gk2 * gk2 * qm1i + cv2 = evc2i * gk2 * gk2 * qm1i + dot22 = dot22 + Re_worksum1*DBLE(cv1) + Im_worksum1*DIMAG(cv1) + & + Re_worksum2*DBLE(cv2) + Im_worksum2*DIMAG(cv2) + ! + cv1 = evc1i * gk3 * gk2 * qm1i + cv2 = evc2i * gk3 * gk2 * qm1i + dot32 = dot32 + Re_worksum1*DBLE(cv1) + Im_worksum1*DIMAG(cv1) + & + Re_worksum2*DBLE(cv2) + Im_worksum2*DIMAG(cv2) + ! + cv1 = evc1i * gk3 * gk3 * qm1i + cv2 = evc2i * gk3 * gk3 * qm1i + dot33 = dot33 + Re_worksum1*DBLE(cv1) + Im_worksum1*DIMAG(cv1) + & + Re_worksum2*DBLE(cv2) + Im_worksum2*DIMAG(cv2) + ENDDO + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ! + ELSE + ! +#if defined(_OPENACC) + !$acc parallel loop collapse(2) reduction(+:dot11,dot21,dot31,& + !$acc& dot22,dot32,dot33) +#else + !$omp parallel do collapse(2) reduction(+:dot11,dot21,dot31,dot22,& + !$omp& dot32,dot33) shared(evcv,qm1,gk,ps,dvkb) +#endif + DO ikb = 1, nkb + DO i = 1, npw + ! + worksum = ps(ikb) *dvkb(i,ikb,4) + Re_worksum = DBLE(worksum) ; Im_worksum = DIMAG(worksum) + ! + evci = evcv(i) + qm1i = CMPLX(qm1(i)) + gk1 = CMPLX(gk(i,1)) + gk2 = CMPLX(gk(i,2)) + gk3 = CMPLX(gk(i,3)) + ! + cv = evci * gk1 * gk1 * qm1i + dot11 = dot11 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * gk2 * gk1 * qm1i + dot21 = dot21 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * gk3 * gk1 * qm1i + dot31 = dot31 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * gk2 * gk2 * qm1i + dot22 = dot22 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * gk3 * gk2 * qm1i + dot32 = dot32 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + cv = evci * gk3 * gk3 * qm1i + dot33 = dot33 + Re_worksum*DBLE(cv) + Im_worksum*DIMAG(cv) + ! + ENDDO + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ENDIF + ! + sigmanlc(:,1) = sigmanlc(:,1) - 2._DP * wg(ibnd,ik) * [dot11, dot21, dot31] + sigmanlc(:,2) = sigmanlc(:,2) - 2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32] + sigmanlc(:,3) = sigmanlc(:,3) - 2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33] + ! + ! ... non diagonal contribution - derivative of the spherical harmonics + ! ... (no contribution from l=0) + ! + IF ( lmaxkb == 0 ) CYCLE + ! + dot11 = 0._DP ; dot21 = 0._DP + dot31 = 0._DP ; dot22 = 0._DP + dot32 = 0._DP ; dot33 = 0._DP + ! + IF (noncolin) THEN + ! +#if defined(_OPENACC) + !$acc parallel loop collapse(2) reduction(+:dot11,dot21,dot31,& + !$acc& dot22,dot32,dot33) +#else + !$omp parallel do collapse(2) reduction(+:dot11,dot21,dot31,dot22,& + !$omp& dot32,dot33) shared(evcv,gk,ps_nc,dvkb) +#endif + DO ikb =1, nkb + DO i = 1, npw + ! + gk1 = CMPLX(gk(i,1)) + gk2 = CMPLX(gk(i,2)) + gk3 = CMPLX(gk(i,3)) + ! + ps1 = ps_nc(ikb,1) + ps2 = ps_nc(ikb,2) + ! + ps1d1 = ps1 * dvkb(i,ikb,1) + ps1d2 = ps1 * dvkb(i,ikb,2) + ps1d3 = ps1 * dvkb(i,ikb,3) + ! + ps2d1 = ps2 * dvkb(i,ikb,1) + ps2d2 = ps2 * dvkb(i,ikb,2) + ps2d3 = ps2 * dvkb(i,ikb,3) + ! + evc1i = evcv(i) + evc2i = evcv(i+npwx) + ! + cv1 = evc1i * gk1 + cv2 = evc2i * gk1 + dot11 = dot11 + DBLE(ps1d1)*DBLE(cv1) + DIMAG(ps1d1)*DIMAG(cv1) + & + DBLE(ps2d1)*DBLE(cv2) + DIMAG(ps2d1)*DIMAG(cv2) + ! + dot21 = dot21 + DBLE(ps1d2)*DBLE(cv1) + DIMAG(ps1d2)*DIMAG(cv1) + & + DBLE(ps2d2)*DBLE(cv2) + DIMAG(ps2d2)*DIMAG(cv2) + ! + dot31 = dot31 + DBLE(ps1d3)*DBLE(cv1) + DIMAG(ps1d3)*DIMAG(cv1) + & + DBLE(ps2d3)*DBLE(cv2) + DIMAG(ps2d3)*DIMAG(cv2) + ! + cv1 = evc1i * gk2 + cv2 = evc2i * gk2 + dot22 = dot22 + DBLE(ps1d2)*DBLE(cv1) + DIMAG(ps1d2)*DIMAG(cv1) + & + DBLE(ps2d2)*DBLE(cv2) + DIMAG(ps2d2)*DIMAG(cv2) + ! + dot32 = dot32 + DBLE(ps1d3)*DBLE(cv1) + DIMAG(ps1d3)*DIMAG(cv1) + & + DBLE(ps2d3)*DBLE(cv2) + DIMAG(ps2d3)*DIMAG(cv2) + ! + cv1 = evc1i * gk3 + cv2 = evc2i * gk3 + dot33 = dot33 + DBLE(ps1d3)*DBLE(cv1) + DIMAG(ps1d3)*DIMAG(cv1) + & + DBLE(ps2d3)*DBLE(cv2) + DIMAG(ps2d3)*DIMAG(cv2) + ! + ENDDO + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ! + ELSE + ! +#if defined(_OPENACC) + !$acc parallel loop collapse(2) reduction(+:dot11,dot21,dot31,& + !$acc& dot22,dot32,dot33) +#else + !$omp parallel do collapse(2) reduction(+:dot11,dot21,dot31,dot22,& + !$omp& dot32,dot33) shared(evcv,gk,ps,dvkb) +#endif + DO ikb = 1, nkb + DO i = 1, npw + pss = ps(ikb) + psd1 = pss*dvkb(i,ikb,1) + psd2 = pss*dvkb(i,ikb,2) + psd3 = pss*dvkb(i,ikb,3) + evci = evcv(i) + gk1 = CMPLX(gk(i,1)) + gk2 = CMPLX(gk(i,2)) + gk3 = CMPLX(gk(i,3)) + ! + cv = evci * gk1 + dot11 = dot11 + DBLE(psd1)*DBLE(cv) + DIMAG(psd1)*DIMAG(cv) + dot21 = dot21 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv) + dot31 = dot31 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv) + ! + cv = evci * gk2 + dot22 = dot22 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv) + dot32 = dot32 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv) + ! + cv = evci * gk3 + dot33 = dot33 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv) + ENDDO + ENDDO +#if !defined(_OPENACC) + !$omp end parallel do +#endif + ! + ENDIF + ! + sigmanlc(:,1) = sigmanlc(:,1) -2._DP * wg(ibnd,ik) * [dot11, dot21, dot31] + sigmanlc(:,2) = sigmanlc(:,2) -2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32] + sigmanlc(:,3) = sigmanlc(:,3) -2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33] + ! + ENDDO ! 10 CONTINUE ! + !$acc end data IF (noncolin) THEN - DEALLOCATE( work2_nc ) - DEALLOCATE( deff_nc ) + DEALLOCATE( ps_nc ) + DEALLOCATE( deff_nc ) ELSE - DEALLOCATE( work2 ) - DEALLOCATE( deff ) + DEALLOCATE( ps ) + DEALLOCATE( deff ) ENDIF + ! +#if !defined(__CUDA) || !defined(_OPENACC) + IF ( noncolin ) THEN + DEALLOCATE( becpnc ) + ELSE + DEALLOCATE( becpk ) + ENDIF +#endif + !$acc end data DEALLOCATE( dvkb ) - DEALLOCATE( work1 ) ! RETURN ! END SUBROUTINE stres_us_k ! + ! END SUBROUTINE stres_us diff --git a/PW/src/stres_us_gpu.f90 b/PW/src/stres_us_gpu.f90 deleted file mode 100644 index 2ab3ba3b4..000000000 --- a/PW/src/stres_us_gpu.f90 +++ /dev/null @@ -1,865 +0,0 @@ -! -! Copyright (C) 2001-2021 Quantum ESPRESSO group -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!---------------------------------------------------------------------------- -SUBROUTINE stres_us_gpu( ik, gk_d, sigmanlc ) - !---------------------------------------------------------------------------- - !! nonlocal (separable pseudopotential) contribution to the stress - !! NOTICE: sum of partial results over procs is performed in calling routine - ! - USE kinds, ONLY : DP - USE ions_base, ONLY : nat, ntyp => nsp, ityp - USE constants, ONLY : eps8 - USE klist, ONLY : nks, xk, ngk, igk_k - USE lsda_mod, ONLY : current_spin, lsda, isk - USE wvfct, ONLY : npwx, nbnd, wg, et - USE control_flags, ONLY : gamma_only - USE uspp_param, ONLY : upf, lmaxkb, nh, nhm - USE uspp, ONLY : nkb, vkb, deeq_d - USE lsda_mod, ONLY : nspin - USE noncollin_module, ONLY : noncolin, npol, lspinorb - USE mp_pools, ONLY : me_pool, root_pool - USE mp_bands, ONLY : intra_bgrp_comm, me_bgrp, root_bgrp - USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, & - bec_type, becp, calbec - USE mp, ONLY : mp_sum, mp_get_comm_null, & - mp_circular_shift_left - USE wavefunctions_gpum, ONLY : using_evc, using_evc_d, evc_d - USE wvfct_gpum, ONLY : using_et - USE becmod_gpum, ONLY : becp_d, bec_type_d - USE becmod_subs_gpum, ONLY : using_becp_auto, using_becp_d_auto, & - calbec_gpu - USE uspp_init, ONLY : init_us_2, gen_us_dj_gpu, gen_us_dy_gpu -#if defined(__CUDA) - USE device_fbuff_m, ONLY : dev_buf -#endif - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: ik - !! k-point index - REAL(DP), INTENT(IN) :: gk_d(npwx,3) - !! wave function components for fixed k-point - REAL(DP), INTENT(INOUT) :: sigmanlc(3,3) - !! stress tensor, non-local contribution - ! - ! ... local variables - ! - REAL(DP), POINTER :: qm1_d(:) - REAL(DP) :: q - INTEGER :: npw , iu, np, ierr - ! - INTEGER :: na1, np1, nh_np1, ijkb01, itot - LOGICAL :: ismulti_np - INTEGER, ALLOCATABLE :: shift(:) - INTEGER, ALLOCATABLE :: ix_d(:,:), ityp_d(:), nh_d(:), shift_d(:) - LOGICAL, ALLOCATABLE :: is_multinp_d(:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: gk_d, qm1_d, is_multinp_d, ix_d, shift_d, & - ityp_d, nh_d - ! - CALL using_evc_d(0) - CALL using_evc(0) - ! - IF ( nkb == 0 ) RETURN - ! - IF ( lsda ) current_spin = isk(ik) - npw = ngk(ik) - IF ( nks > 1 ) CALL init_us_2( npw, igk_k(1,ik), xk(1,ik), vkb, .true. ) - ! - CALL allocate_bec_type( nkb, nbnd, becp, intra_bgrp_comm ) - CALL using_becp_auto(2) - ! - CALL using_evc_d(0) - CALL using_becp_d_auto(2) - ! -!$acc data present(vkb(:,:)) -!$acc host_data use_device(vkb) - CALL calbec_gpu( npw, vkb, evc_d, becp_d ) -!$acc end host_data -!$acc end data - ! - CALL dev_buf%lock_buffer( qm1_d, npwx, ierr ) - !$cuf kernel do (1) <<<*,*>>> - DO iu = 1, npw - q = SQRT( gk_d(iu,1)**2 + gk_d(iu,2)**2 + gk_d(iu,3)**2 ) - IF ( q > eps8 ) THEN - qm1_d(iu) = 1._DP / q - ELSE - qm1_d(iu) = 0._DP - ENDIF - ENDDO - ! - !----------define index arrays (type, atom, etc.) for cuf kernel loops------ - ALLOCATE( is_multinp_d(nat*nhm) ) - ALLOCATE( ix_d(nat*nhm,4), ityp_d(nat), nh_d(ntyp) ) - ityp_d = ityp ; nh_d = nh - ! - ALLOCATE( shift(nat) ) - ijkb01 = 0 - DO iu = 1, ntyp - DO na1 = 1, nat - IF (ityp(na1) == iu ) THEN - shift(na1) = ijkb01 - ijkb01 = ijkb01 + nh(iu) - ENDIF - ENDDO - ENDDO - ! - ALLOCATE( shift_d(nat) ) - shift_d = shift - ! - ijkb01 = 0 - itot=0 - DO np = 1, ntyp - DO na1 = 1, nat - np1 = ityp(na1) - IF (np /= np1) CYCLE - ijkb01 = shift(na1) - nh_np1 = nh(np1) - ismulti_np = upf(np1)%tvanp .OR. upf(np1)%is_multiproj - IF ( .NOT. ismulti_np .AND. noncolin .AND. lspinorb ) & - CALL errore('stres_us','wrong case',1) - !$cuf kernel do (1) <<<*,*>>> - DO iu = itot+1, itot+nh_np1 - ix_d(iu,1) = na1 !na - ix_d(iu,2) = iu-itot !ih - ix_d(iu,3) = nh_np1 !nh(np) - ix_d(iu,4) = ijkb01 !ishift - is_multinp_d(iu) = ismulti_np - ENDDO - itot = itot + nh_np1 - ENDDO - ENDDO - !-------------- - ! - ! - IF ( gamma_only ) THEN - ! - CALL stres_us_gamma_gpu() - ! - ELSE - ! - CALL stres_us_k_gpu() - ! - ENDIF - ! - CALL dev_buf%release_buffer( qm1_d, ierr ) - ! - DEALLOCATE( is_multinp_d ) - DEALLOCATE( ix_d, ityp_d, nh_d, shift_d ) - DEALLOCATE( shift ) - ! - CALL deallocate_bec_type( becp ) - CALL using_becp_auto(2) - ! - RETURN - ! - CONTAINS - ! - !----------------------------------------------------------------------- - SUBROUTINE stres_us_gamma_gpu() - !----------------------------------------------------------------------- - !! nonlocal contribution to the stress - gamma version - ! - IMPLICIT NONE - ! - ! ... local variables - ! - INTEGER :: na, np, nt, ibnd, ipol, jpol, l, i, ikb, & - jkb, ih, jh, ibnd_loc,ijkb0,nh_np, nproc, & - nbnd_loc, nbnd_begin, icyc, ishift, nhmx - REAL(DP) :: dot11, dot21, dot31, dot22, dot32, dot33, & - qm1i, gk1, gk2, gk3, wg_nk, fac, evps, aux - COMPLEX(DP) :: worksum, cv, wsum1, wsum2, wsum3, ps, evci - ! - COMPLEX(DP), POINTER :: ps_d(:), deff_d(:,:,:), dvkb_d(:,:,:) - REAL(DP), POINTER :: becpr_d(:,:) - ! - REAL(DP) :: xyz(3,3) - INTEGER :: ierrs(3) - ! - ! xyz are the three unit vectors in the x,y,z directions - DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, 0._DP, & - 1._DP / -#if defined(__CUDA) - attributes(DEVICE) :: deff_d, becpr_d, ps_d, dvkb_d -#endif - ! - CALL using_becp_auto(0) - ! - IF( becp%comm /= mp_get_comm_null() ) THEN - nproc = becp%nproc - nbnd_loc = becp%nbnd_loc - nbnd_begin = becp%ibnd_begin - IF( ( nbnd_begin + nbnd_loc - 1 ) > nbnd ) nbnd_loc = nbnd - nbnd_begin + 1 - ELSE - nproc = 1 - nbnd_loc = nbnd - nbnd_begin = 1 - ENDIF - ! - CALL dev_buf%lock_buffer( deff_d, (/ nhm,nhm,nat /), ierrs(1) ) - ! - ! ... diagonal contribution - if the result from "calbec" are not - ! ... distributed, must be calculated on a single processor - ! - ! ... for the moment when using_gpu is true becp is always fully present in all processors - ! - CALL using_et(0) ! compute_deff : intent(in) - ! - CALL dev_buf%lock_buffer( ps_d, nkb, ierrs(2) ) - IF (ANY(ierrs(1:2) /= 0)) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs)) ) - ! - becpr_d => becp_d%r_d - ! - evps = 0._DP - ! - compute_evps: IF ( .NOT. (nproc == 1 .AND. me_pool /= root_pool) ) THEN - ! - DO ibnd_loc = 1, nbnd_loc - ibnd = ibnd_loc + becp%ibnd_begin - 1 - CALL compute_deff_gpu( deff_d, et(ibnd,ik) ) - wg_nk = wg(ibnd,ik) - ! - !$cuf kernel do (1) <<<*,*>>> - DO i = 1, itot - ! - ih = ix_d(i,2) ; na = ix_d(i,1) - ishift = ix_d(i,4) ; ikb = ishift + ih - ! - IF (.NOT. is_multinp_d(i)) THEN - aux = wg_nk * DBLE(deff_d(ih,ih,na)) * & - ABS(becpr_d(ikb,ibnd_loc))**2 - ELSE - nh_np = ix_d(i,3) - ! - aux = wg_nk * DBLE(deff_d(ih,ih,na)) & - * ABS(becpr_d(ikb,ibnd_loc))**2 & - + becpr_d(ikb,ibnd_loc)* wg_nk * 2._DP & - * SUM( DBLE(deff_d(ih,ih+1:nh_np,na)) & - * becpr_d(ishift+ih+1:ishift+nh_np,ibnd_loc)) - ENDIF - evps = evps + aux - ! - ENDDO - ! - ENDDO - ! - ENDIF compute_evps - ! - ! - ! ... non diagonal contribution - derivative of the bessel function - !------------------------------------ - CALL dev_buf%lock_buffer( dvkb_d, (/ npwx,nkb,4 /), ierrs(3) ) - IF (ierrs(3) /= 0) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(ierrs(3))) - ! - CALL gen_us_dj_gpu( ik, dvkb_d(:,:,4) ) - IF ( lmaxkb > 0 ) THEN - DO ipol = 1, 3 - CALL gen_us_dy_gpu( ik, xyz(1,ipol), dvkb_d(:,:,ipol)) - ENDDO - ENDIF - ! - ! - DO icyc = 0, nproc -1 - ! - DO ibnd_loc = 1, nbnd_loc - ! - ibnd = ibnd_loc + becp%ibnd_begin - 1 - CALL compute_deff_gpu( deff_d, et(ibnd,ik) ) - ! - !$cuf kernel do (1) <<<*,*>>> - DO i = 1, itot - ! - ih = ix_d(i,2) ; na = ix_d(i,1) - ishift = ix_d(i,4) ; ikb = ishift + ih - ! - IF (.NOT. is_multinp_d(i)) THEN - ps_d(ikb) = deff_d(ih,ih,na) * CMPLX(becpr_d(ikb,ibnd_loc)) - ELSE - nh_np = ix_d(i,3) - ! - ps_d(ikb) = CMPLX( SUM( becpr_d(ishift+1:ishift+nh_np,ibnd_loc) & - * DBLE(deff_d(ih,1:nh_np,na)))) - ENDIF - ! - ENDDO - ! - dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP - dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP - ! - !$cuf kernel do(2) <<<*,*>>> - DO na =1, nat - DO i = 1, npw - worksum = (0._DP,0._DP) - np = ityp_d(na) - ijkb0 = shift_d(na) - nh_np = nh_d(np) - DO ih = 1, nh_np - ikb = ijkb0 + ih - worksum = worksum + ps_d(ikb) * dvkb_d(i,ikb,4) - ENDDO - evci = evc_d(i,ibnd) - gk1 = gk_d(i,1) - gk2 = gk_d(i,2) - gk3 = gk_d(i,3) - qm1i = qm1_d(i) - ! - cv = evci * CMPLX(gk1 * gk1 * qm1i) - dot11 = dot11 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * CMPLX(gk2 * gk1 * qm1i ) - dot21 = dot21 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * CMPLX(gk3 * gk1 * qm1i) - dot31 = dot31 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * CMPLX(gk2 * gk2 * qm1i) - dot22 = dot22 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * CMPLX(gk3 * gk2 * qm1i) - dot32 = dot32 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * CMPLX(gk3 * gk3 * qm1i) - dot33 = dot33 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ENDDO - ENDDO - ! ... a factor 2 accounts for the other half of the G-vector sphere - sigmanlc(:,1) = sigmanlc(:,1) - 4._DP * wg(ibnd,ik) * [dot11, dot21, dot31] - sigmanlc(:,2) = sigmanlc(:,2) - 4._DP * wg(ibnd,ik) * [0._DP, dot22, dot32] - sigmanlc(:,3) = sigmanlc(:,3) - 4._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33] - - ! - ! ... non diagonal contribution - derivative of the spherical harmonics - ! ... (no contribution from l=0) - ! - IF ( lmaxkb == 0 ) CYCLE - ! - !------------------------------------ - ! - dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP - dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP - ! - !$cuf kernel do(2) <<<*,*>>> - DO ikb = 1, nkb - DO i = 1, npw - ! - wsum1 = ps_d(ikb)*dvkb_d(i,ikb,1) - wsum2 = ps_d(ikb)*dvkb_d(i,ikb,2) - wsum3 = ps_d(ikb)*dvkb_d(i,ikb,3) - ! - evci = evc_d(i,ibnd) - gk1 = gk_d(i,1) - gk2 = gk_d(i,2) - gk3 = gk_d(i,3) - ! - cv = evci * CMPLX(gk1 ) - dot11 = dot11 + DBLE( wsum1)* DBLE(cv) + DIMAG(wsum1)*DIMAG(cv) - dot21 = dot21 + DBLE( wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv) - dot31 = dot31 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv) - ! - cv = evci * CMPLX( gk2) - dot22 = dot22 + DBLE( wsum2)* DBLE(cv) + DIMAG(wsum2)*DIMAG(cv) - dot32 = dot32 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv) - ! - cv = evci * CMPLX( gk3 ) - dot33 = dot33 + DBLE( wsum3)* DBLE(cv) + DIMAG(wsum3)*DIMAG(cv) - ENDDO - ENDDO - ! - ! ... a factor 2 accounts for the other half of the G-vector sphere - ! - sigmanlc(:,1) = sigmanlc(:,1) -4._DP * wg(ibnd, ik) * [dot11, dot21, dot31] - sigmanlc(:,2) = sigmanlc(:,2) -4._DP * wg(ibnd, ik) * [0._DP, dot22, dot32] - sigmanlc(:,3) = sigmanlc(:,3) -4._DP * wg(ibnd, ik) * [0._DP, 0._DP, dot33] - IF ( nproc > 1 ) THEN - CALL errore ('stres_us_gamma_gpu line 303', & - 'unexpected error nproc be 1 with GPU acceleration', 100) - !CALL mp_circular_shift_left(becp%r, icyc, becp%comm) - !CALL mp_circular_shift_left(becp%ibnd_begin, icyc, becp%comm) - !CALL mp_circular_shift_left(nbnd_loc, icyc, becp%comm) - ENDIF - ENDDO - ENDDO - ! -10 CONTINUE - ! - DO l = 1, 3 - sigmanlc(l,l) = sigmanlc(l,l) - evps - ENDDO - ! - ! - CALL dev_buf%release_buffer( deff_d, ierrs(1) ) - CALL dev_buf%release_buffer( ps_d, ierrs(2) ) - CALL dev_buf%release_buffer( dvkb_d, ierrs(3) ) - ! - ! - RETURN - ! - END SUBROUTINE stres_us_gamma_gpu - ! - ! - !---------------------------------------------------------------------- - SUBROUTINE stres_us_k_gpu() - !---------------------------------------------------------------------- - !! nonlocal contribution to the stress - k-points version - ! - IMPLICIT NONE - ! - ! ... local variables - ! - INTEGER :: na, np, ibnd, ipol, jpol, l, i, nt, & - ikb, jkb, ih, jh, is, js, ijs, ishift, nh_np - REAL(DP) :: fac, evps, dot11, dot21, dot31, dot22, dot32, dot33, aux - COMPLEX(DP) :: qm1i, gk1, gk2, gk3, ps, ps_nc(2) - COMPLEX(DP) :: cv, cv1, cv2, worksum, worksum1, worksum2, evci, evc1i, & - evc2i, ps1, ps2, ps1d1, ps1d2, ps1d3, ps2d1, ps2d2, & - ps2d3, psd1, psd2, psd3 - ! - COMPLEX(DP), POINTER :: dvkb_d(:,:,:) - COMPLEX(DP), POINTER :: deff_d(:,:,:), deff_nc_d(:,:,:,:) - COMPLEX(DP), POINTER :: ps_d(:), ps_nc_d(:,:) - COMPLEX(DP), POINTER :: becpk_d(:,:), becpnc_d(:,:,:) - INTEGER :: nhmx, ierrs(3) - ! - REAL(DP) :: xyz(3,3) - ! xyz are the three unit vectors in the x,y,z directions - DATA xyz / 1._DP, 0._DP, 0._DP, 0._DP, 1._DP, 0._DP, 0._DP, & - 0._DP, 1._DP / - ! -#if defined(__CUDA) - ATTRIBUTES(DEVICE) :: ps_d, ps_nc_d, becpnc_d, becpk_d, dvkb_d, & - deff_d, deff_nc_d -#endif - ! - ! - ! - evps = 0._DP - ! ... diagonal contribution - ! - CALL dev_buf%lock_buffer( dvkb_d, (/ npwx,nkb,4 /), ierrs(1) ) - ! - CALL gen_us_dj_gpu( ik, dvkb_d(:,:,4) ) - IF ( lmaxkb > 0 ) THEN - DO ipol = 1, 3 - CALL gen_us_dy_gpu( ik, xyz(1,ipol), dvkb_d(:,:,ipol) ) - ENDDO - ENDIF - ! - IF (noncolin) THEN - CALL dev_buf%lock_buffer( ps_nc_d, (/ nkb,npol /), ierrs(2) ) - CALL dev_buf%lock_buffer( deff_nc_d, (/ nhm,nhm,nat,nspin /), ierrs(3) ) - becpnc_d => becp_d%nc_d - ELSE - CALL dev_buf%lock_buffer ( ps_d, nkb, ierrs(2) ) - CALL dev_buf%lock_buffer( deff_d, (/ nhm,nhm,nat /), ierrs(3) ) - becpk_d => becp_d%k_d - ENDIF - IF (ANY(ierrs /= 0)) CALL errore( 'stres_us_gpu', 'cannot allocate buffers', ABS(MAXVAL(ierrs)) ) - ! - CALL using_et(0) - ! - ! ... the contribution is calculated only on one processor because - ! ... partial results are later summed over all processors - ! - IF ( me_bgrp /= root_bgrp ) GO TO 100 - ! - ! - DO ibnd = 1, nbnd - fac = wg(ibnd,ik) - IF (ABS(fac) < 1.d-9) CYCLE - IF (noncolin) THEN - CALL compute_deff_nc_gpu( deff_nc_d , et(ibnd,ik) ) - ELSE - CALL compute_deff_gpu( deff_d, et(ibnd,ik) ) - ENDIF - ! - ! - IF (noncolin) THEN - ! - !$cuf kernel do (1) <<<*,*>>> - DO i = 1, itot - ! - ih = ix_d(i,2) ; na = ix_d(i,1) - ishift = ix_d(i,4) ; ikb = ishift + ih - aux = 0.d0 - ! - IF (.NOT. is_multinp_d(i)) THEN - ! - ijs = 0 - DO is = 1, npol - DO js = 1, npol - ijs = ijs + 1 - aux = aux + fac * DBLE(deff_nc_d(ih,ih,na,ijs) & - * CONJG(becpnc_d(ikb,is,ibnd)) * & - becpnc_d(ikb,js,ibnd)) - ENDDO - ENDDO - ! - ELSE - ! - nh_np = ix_d(i,3) - ! - ijs = 0 - DO is = 1, npol - DO js = 1, npol - ijs = ijs + 1 - aux = aux + fac * DBLE(deff_nc_d(ih,ih,na,ijs) & - * CONJG(becpnc_d(ikb,is,ibnd))* & - becpnc_d(ikb,js,ibnd)) - ENDDO - ENDDO - ! - DO jh = ih+1, nh_np - jkb = ishift + jh - ijs = 0 - DO is = 1, npol - DO js = 1, npol - ijs = ijs + 1 - aux = aux + 2._DP*fac * & - DBLE( deff_nc_d(ih,jh,na,ijs) * & - (CONJG(becpnc_d(ikb,is,ibnd)) * & - becpnc_d(jkb,js,ibnd)) ) - ENDDO - ENDDO - ENDDO - ! - ENDIF - evps = evps + aux - ENDDO - ! - ELSE - ! - aux = 0.d0 - !$cuf kernel do (1) <<<*,*>>> - DO i = 1, itot - ! - ih = ix_d(i,2) ; na = ix_d(i,1) - ishift = ix_d(i,4) ; ikb = ishift + ih - ! - IF (.NOT. is_multinp_d(i)) THEN - ! - aux = fac * deff_d(ih,ih,na) * & - ABS(becpk_d(ikb,ibnd) )**2 - ! - ELSE - ! - nh_np = ix_d(i,3) - ! - aux = fac * DBLE(deff_d(ih,ih,na) * & - ABS(becpk_d(ikb,ibnd) )**2) + & - DBLE(SUM( deff_d(ih,ih+1:nh_np,na) * & - fac * 2._DP*DBLE( CONJG(becpk_d(ikb,ibnd)) & - * becpk_d(ishift+ih+1:ishift+nh_np,ibnd) ) )) - ! - ENDIF - evps = evps + aux - ENDDO - ! - ENDIF - ! - ENDDO - ! - DO l = 1, 3 - sigmanlc(l,l) = sigmanlc(l,l) - evps - ENDDO - ! -100 CONTINUE - ! - ! ... non diagonal contribution - derivative of the bessel function - ! - DO ibnd = 1, nbnd - ! - IF ( noncolin ) THEN - ! - CALL compute_deff_nc_gpu( deff_nc_d, et(ibnd,ik) ) - ! - !$cuf kernel do (1) <<<*,*>>> - DO i = 1, itot - ! - ih = ix_d(i,2) ; na = ix_d(i,1) - ishift = ix_d(i,4) ; ikb = ishift + ih - ! - IF (.NOT. is_multinp_d(i)) THEN - ! - DO is = 1, npol - ijs = (is-1)*npol - ps_nc_d(ikb,is) = SUM( becpnc_d(ikb,1:npol,ibnd) * & - deff_nc_d(ih,ih,na,ijs+1:ijs+npol) ) - ENDDO - ! - ELSE - ! - nh_np = ix_d(i,3) - ! - DO is = 1, npol - ijs = (is-1)*npol - ps_nc_d(ikb,is) = SUM( becpnc_d(ishift+1:ishift+nh_np,1:npol,ibnd) * & - deff_nc_d(ih,1:nh_np,na,ijs+1:ijs+npol) ) - ENDDO - ! - ENDIF - ENDDO - ! - ELSE - ! - CALL compute_deff_gpu( deff_d, et(ibnd,ik) ) - ! - !$cuf kernel do (1) <<<*,*>>> - DO i = 1, itot - ! - ih = ix_d(i,2) ; na = ix_d(i,1) - ishift = ix_d(i,4) ; ikb = ishift + ih - ! - IF (.NOT. is_multinp_d(i)) THEN - ps_d(ikb) = CMPLX(deeq_d(ih,ih,na,current_spin)) * & - becpk_d(ikb,ibnd) - ELSE - nh_np = ix_d(i,3) - ! - ps_d(ikb) = SUM( becpk_d(ishift+1:ishift+nh_np,ibnd) * & - deff_d(ih,1:nh_np,na) ) - ENDIF - ! - ENDDO - ! - ENDIF - ! - dot11 = 0._DP ; dot21 = 0._DP ; dot31 = 0._DP - dot22 = 0._DP ; dot32 = 0._DP ; dot33 = 0._DP - ! - ! - IF (noncolin) THEN - !$cuf kernel do(2) <<<*,*>>> - DO ikb =1, nkb - DO i = 1, npw - evc1i = evc_d(i, ibnd) - evc2i = evc_d(i+npwx,ibnd) - qm1i = CMPLX(qm1_d(i)) - gk1 = CMPLX(gk_d(i,1)) - gk2 = CMPLX(gk_d(i,2)) - gk3 = CMPLX(gk_d(i,3)) - worksum1 = ps_nc_d(ikb,1) * dvkb_d(i,ikb,4) - worksum2 = ps_nc_d(ikb,2) * dvkb_d(i,ikb,4) - ! - cv1 = evc1i * gk1 * gk1 * qm1i - cv2 = evc2i * gk1 * gk1 * qm1i - dot11 = dot11 + DBLE(worksum1)*DBLE(cv1) + & - DIMAG(worksum1)*DIMAG(cv1) + & - DBLE(worksum2)*DBLE(cv2) + & - DIMAG(worksum2)*DIMAG(cv2) - ! - cv1 = evc1i * gk2 * gk1 * qm1i - cv2 = evc2i * gk2 * gk1 * qm1i - dot21 = dot21 + DBLE(worksum1)*DBLE(cv1) + & - DIMAG(worksum1)*DIMAG(cv1) + & - DBLE(worksum2)*DBLE(cv2) + & - DIMAG(worksum2)*DIMAG(cv2) - ! - cv1 = evc1i * gk3 * gk1 * qm1i - cv2 = evc2i * gk3 * gk1 * qm1i - dot31 = dot31 + DBLE(worksum1)*DBLE(cv1) + & - DIMAG(worksum1)*DIMAG(cv1) + & - DBLE(worksum2)*DBLE(cv2) + & - DIMAG(worksum2)*DIMAG(cv2) - ! - cv1 = evc1i * gk2 * gk2 * qm1i - cv2 = evc2i * gk2 * gk2 * qm1i - dot22 = dot22 + DBLE(worksum1)*DBLE(cv1) + & - DIMAG(worksum1)*DIMAG(cv1) + & - DBLE(worksum2)*DBLE(cv2) + & - DIMAG(worksum2)*DIMAG(cv2) - ! - cv1 = evc1i * gk3 * gk2 * qm1i - cv2 = evc2i * gk3 * gk2 * qm1i - dot32 = dot32 + DBLE(worksum1)*DBLE(cv1) + & - DIMAG(worksum1)*DIMAG(cv1) + & - DBLE(worksum2)*DBLE(cv2) + & - DIMAG(worksum2)*DIMAG(cv2) - ! - cv1 = evc1i * gk3 * gk3 * qm1i - cv2 = evc2i * gk3 * gk3 * qm1i - dot33 = dot33 + DBLE(worksum1)*DBLE(cv1) + & - DIMAG(worksum1)*DIMAG(cv1) + & - DBLE(worksum2)*DBLE(cv2) + & - DIMAG(worksum2)*DIMAG(cv2) - ENDDO - ENDDO - ! - ELSE - ! - ! - !$cuf kernel do(2) <<<*,*>>> - DO ikb = 1, nkb - DO i = 1, npw - ! - worksum = ps_d(ikb) *dvkb_d(i,ikb,4) - ! - evci = evc_d(i,ibnd) - qm1i = CMPLX(qm1_d(i)) - gk1 = CMPLX(gk_d(i,1)) - gk2 = CMPLX(gk_d(i,2)) - gk3 = CMPLX(gk_d(i,3)) - ! - cv = evci * gk1 * gk1 * qm1i - dot11 = dot11 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * gk2 * gk1 * qm1i - dot21 = dot21 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * gk3 * gk1 * qm1i - dot31 = dot31 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * gk2 * gk2 * qm1i - dot22 = dot22 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * gk3 * gk2 * qm1i - dot32 = dot32 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - cv = evci * gk3 * gk3 * qm1i - dot33 = dot33 + DBLE(worksum)*DBLE(cv) + DIMAG(worksum)*DIMAG(cv) - ! - ENDDO - ENDDO - ! - !IF ( me_bgrp /= root_bgrp ) stop - ! - ENDIF - ! - sigmanlc(:,1) = sigmanlc(:,1) - 2._DP * wg(ibnd,ik) * [dot11, dot21, dot31] - sigmanlc(:,2) = sigmanlc(:,2) - 2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32] - sigmanlc(:,3) = sigmanlc(:,3) - 2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33] - ! - ! ... non diagonal contribution - derivative of the spherical harmonics - ! ... (no contribution from l=0) - ! - IF ( lmaxkb == 0 ) CYCLE - ! - dot11 = 0._DP ; dot21 = 0._DP - dot31 = 0._DP ; dot22 = 0._DP - dot32 = 0._DP ; dot33 = 0._DP - ! - IF (noncolin) THEN - ! - !$cuf kernel do(2) <<<*,*>>> - DO ikb =1, nkb - DO i = 1, npw - ! - gk1 = CMPLX(gk_d(i,1)) - gk2 = CMPLX(gk_d(i,2)) - gk3 = CMPLX(gk_d(i,3)) - ! - ps1 = ps_nc_d(ikb,1) - ps2 = ps_nc_d(ikb,2) - ! - ps1d1 = ps1 * dvkb_d(i,ikb,1) - ps1d2 = ps1 * dvkb_d(i,ikb,2) - ps1d3 = ps1 * dvkb_d(i,ikb,3) - ! - ps2d1 = ps2 * dvkb_d(i,ikb,1) - ps2d2 = ps2 * dvkb_d(i,ikb,2) - ps2d3 = ps2 * dvkb_d(i,ikb,3) - ! - evc1i = evc_d(i,ibnd) - evc2i = evc_d(i+npwx,ibnd) - ! - cv1 = evc1i * gk1 - cv2 = evc2i * gk1 - dot11 = dot11 + DBLE(ps1d1)*DBLE(cv1) + & - DIMAG(ps1d1)*DIMAG(cv1) + & - DBLE(ps2d1)*DBLE(cv2) + & - DIMAG(ps2d1)*DIMAG(cv2) - ! - dot21 = dot21 + DBLE(ps1d2)*DBLE(cv1) + & - DIMAG(ps1d2)*DIMAG(cv1) + & - DBLE(ps2d2)*DBLE(cv2) + & - DIMAG(ps2d2)*DIMAG(cv2) - ! - dot31 = dot31 + DBLE(ps1d3)*DBLE(cv1) + & - DIMAG(ps1d3)*DIMAG(cv1) + & - DBLE(ps2d3)*DBLE(cv2) + & - DIMAG(ps2d3)*DIMAG(cv2) - ! - cv1 = evc1i * gk2 - cv2 = evc2i * gk2 - dot22 = dot22 + DBLE(ps1d2)*DBLE(cv1) + & - DIMAG(ps1d2)*DIMAG(cv1) + & - DBLE(ps2d2)*DBLE(cv2) + & - DIMAG(ps2d2)*DIMAG(cv2) - ! - dot32 = dot32 + DBLE(ps1d3)*DBLE(cv1) + & - DIMAG(ps1d3)*DIMAG(cv1) + & - DBLE(ps2d3)*DBLE(cv2) + & - DIMAG(ps2d3)*DIMAG(cv2) - ! - cv1 = evc1i * gk3 - cv2 = evc2i * gk3 - dot33 = dot33 + DBLE(ps1d3)*DBLE(cv1) + & - DIMAG(ps1d3)*DIMAG(cv1) + & - DBLE(ps2d3)*DBLE(cv2) + & - DIMAG(ps2d3)*DIMAG(cv2) - ! - ENDDO - ENDDO - ! - ELSE - ! - !$cuf kernel do(2) <<<*,*>>> - DO ikb = 1, nkb - DO i = 1, npw - ps = ps_d(ikb) - psd1 = ps*dvkb_d(i,ikb,1) - psd2 = ps*dvkb_d(i,ikb,2) - psd3 = ps*dvkb_d(i,ikb,3) - evci = evc_d(i,ibnd) - gk1 = CMPLX(gk_d(i,1)) - gk2 = CMPLX(gk_d(i,2)) - gk3 = CMPLX(gk_d(i,3)) - ! - cv = evci * gk1 - dot11 = dot11 + DBLE(psd1)*DBLE(cv) + DIMAG(psd1)*DIMAG(cv) - dot21 = dot21 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv) - dot31 = dot31 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv) - ! - cv = evci * gk2 - dot22 = dot22 + DBLE(psd2)*DBLE(cv) + DIMAG(psd2)*DIMAG(cv) - dot32 = dot32 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv) - ! - cv = evci * gk3 - dot33 = dot33 + DBLE(psd3)*DBLE(cv) + DIMAG(psd3)*DIMAG(cv) - ENDDO - ENDDO - ! - ENDIF - ! - sigmanlc(:,1) = sigmanlc(:,1) -2._DP * wg(ibnd,ik) * [dot11, dot21, dot31] - sigmanlc(:,2) = sigmanlc(:,2) -2._DP * wg(ibnd,ik) * [0._DP, dot22, dot32] - sigmanlc(:,3) = sigmanlc(:,3) -2._DP * wg(ibnd,ik) * [0._DP, 0._DP, dot33] - ! - ENDDO - ! -10 CONTINUE - ! - CALL dev_buf%release_buffer( dvkb_d, ierrs(1) ) - ! - IF (noncolin) THEN - CALL dev_buf%release_buffer( ps_nc_d, ierrs(2) ) - CALL dev_buf%release_buffer( deff_nc_d, ierrs(3) ) - ELSE - CALL dev_buf%release_buffer( ps_d, ierrs(2) ) - CALL dev_buf%release_buffer( deff_d, ierrs(3) ) - ENDIF - ! - RETURN - ! - END SUBROUTINE stres_us_k_gpu -#endif - ! -END SUBROUTINE stres_us_gpu diff --git a/PW/src/stress.f90 b/PW/src/stress.f90 index 40d6dd526..1fe7f983e 100644 --- a/PW/src/stress.f90 +++ b/PW/src/stress.f90 @@ -17,12 +17,13 @@ SUBROUTINE stress( sigma ) USE ions_base, ONLY : nat, ntyp => nsp, ityp, tau, zv, atm USE constants, ONLY : ry_kbar USE ener, ONLY : etxc, vtxc - USE gvect, ONLY : ngm, gstart, g, gg, gcutm + USE gvect, ONLY : ngm, gstart, g, gg, gcutm, gl, gl_d USE fft_base, ONLY : dfftp USE ldaU, ONLY : lda_plus_u, Hubbard_projectors USE lsda_mod, ONLY : nspin USE scf, ONLY : rho, rho_core, rhog_core - USE control_flags, ONLY : iverbosity, gamma_only, llondon, ldftd3, lxdm, ts_vdw, mbd_vdw, use_gpu + USE control_flags, ONLY : iverbosity, gamma_only, llondon, ldftd3, lxdm, & + ts_vdw, mbd_vdw USE xc_lib, ONLY : xclib_dft_is USE symme, ONLY : symmatrix USE bp, ONLY : lelfield @@ -36,8 +37,7 @@ SUBROUTINE stress( sigma ) USE libmbd_interface, ONLY : HmbdvdW USE rism_module, ONLY : lrism, stres_rism USE esm, ONLY : do_comp_esm, esm_bc ! for ESM stress - USE esm, ONLY : esm_stres_har, esm_stres_ewa, esm_stres_loclong ! for ESM stress - USE gvect, ONLY : g_d, gg_d + USE esm, ONLY : esm_stres_har, esm_stres_ewa, esm_stres_loclong ! IMPLICIT NONE ! @@ -55,7 +55,7 @@ SUBROUTINE stress( sigma ) REAL(DP) :: sigmasol(3,3) ! for RISM stress INTEGER :: l, m ! - ! Auxiliary variables for Grimme-D3 + ! ... Auxiliary variables for Grimme-D3 ! INTEGER :: atnum(1:nat) REAL(DP) :: latvecs(3,3) @@ -70,10 +70,14 @@ SUBROUTINE stress( sigma ) ! CALL start_clock( 'stress' ) ! - ! contribution from local potential + !$acc update device( g, gg ) +#if defined(__CUDA) + gl_d = gl +#endif ! - IF (.NOT. use_gpu) CALL stres_loc( sigmaloc ) - IF ( use_gpu) CALL stres_loc_gpu( sigmaloc ) + ! ... contribution from local potential + ! + CALL stres_loc( sigmaloc ) ! IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! In ESM, sigmaloc has only short-range term: add long-range term @@ -81,51 +85,50 @@ SUBROUTINE stress( sigma ) sigmaloc(:,:) = sigmaloc(:,:) + sigmaloclong(:,:) END IF ! - ! hartree contribution + ! ... Hartree contribution ! IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress CALL esm_stres_har( sigmahar, rho%of_g(:,1) ) ELSE - IF (.NOT. use_gpu) CALL stres_har( sigmahar ) - IF ( use_gpu) CALL stres_har_gpu( sigmahar ) - END IF + CALL stres_har( sigmahar ) + ENDIF ! - ! xc contribution (diagonal) + ! ... XC contribution (diagonal) ! sigmaxc(:,:) = 0.d0 DO l = 1, 3 - sigmaxc (l, l) = - (etxc - vtxc) / omega + sigmaxc(l,l) = - (etxc - vtxc) / omega ENDDO ! - ! xc contribution: add gradient corrections (non diagonal) + ! ... XC contribution: add gradient corrections (non diagonal) ! - CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, rho%kin_r, & - nspin, dfftp, g, alat, omega, sigmaxc ) + IF (.NOT.xclib_dft_is('meta')) THEN + CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, & + nspin, dfftp, g, alat, omega, sigmaxc ) + ELSE + CALL stres_gradcorr( rho%of_r, rho%of_g, rho_core, rhog_core, & + nspin, dfftp, g, alat, omega, sigmaxc, rho%kin_r ) + ENDIF ! - ! meta-GGA contribution + ! ... meta-GGA contribution ! - IF (.NOT. use_gpu) CALL stres_mgga( sigmaxc ) - IF ( use_gpu) CALL stres_mgga_gpu( sigmaxc ) + CALL stres_mgga( sigmaxc ) ! - ! core correction contribution + ! ... core correction contribution ! - IF (.NOT. use_gpu) CALL stres_cc( sigmaxcc ) - IF ( use_gpu) CALL stres_cc_gpu( sigmaxcc ) + CALL stres_cc( sigmaxcc ) ! - ! ewald contribution + ! ... Ewald contribution ! IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress CALL esm_stres_ewa( sigmaewa ) ELSE - IF (.NOT. use_gpu) CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, & - bg, tau, omega, g, gg, ngm, gstart, & - gamma_only, gcutm, sigmaewa ) - IF ( use_gpu) CALL stres_ewa_gpu( alat, nat, ntyp, ityp, zv, at, bg,& - tau, omega, g_d,gg_d, ngm, gstart,& - gamma_only, gcutm, sigmaewa ) - END IF + CALL stres_ewa( alat, nat, ntyp, ityp, zv, at, bg, & + tau, omega, g, gg, ngm, gstart, & + gamma_only, gcutm, sigmaewa ) + ENDIF ! - ! semi-empirical dispersion contribution: Grimme-D2 and D3 + ! ... semi-empirical dispersion contribution: Grimme-D2 and D3 ! sigmad23( : , : ) = 0.d0 IF ( llondon ) THEN @@ -145,10 +148,9 @@ SUBROUTINE stress( sigma ) CALL stop_clock('stres_dftd3') END IF ! - ! kinetic + nonlocal contribuition + ! ... kinetic + nonlocal contribuition ! - IF (.NOT. use_gpu) CALL stres_knl( sigmanlc, sigmakin ) - IF ( use_gpu) CALL stres_knl_gpu( sigmanlc, sigmakin ) + CALL stres_knl( sigmanlc, sigmakin ) ! DO l = 1, 3 DO m = 1, 3 @@ -156,13 +158,13 @@ SUBROUTINE stress( sigma ) ENDDO ENDDO ! - ! Hubbard contribution - ! (included by stres_knl if using beta as local projectors) + ! ... Hubbard contribution + ! (included by stres_knl if using beta as local projectors) ! sigmah(:,:) = 0.d0 IF ( lda_plus_u .AND. Hubbard_projectors /= 'pseudo' ) CALL stres_hub( sigmah ) ! - ! Electric field contribution + ! ... Electric field contribution ! sigmael(:,:)=0.d0 sigmaion(:,:)=0.d0 @@ -170,15 +172,15 @@ SUBROUTINE stress( sigma ) ! call stress_bp_efield (sigmael ) ! call stress_ion_efield (sigmaion ) ! - ! vdW dispersion contribution: xdm + ! ... vdW dispersion contribution: xdm ! sigmaxdm = 0._dp IF (lxdm) sigmaxdm = stress_xdm() ! - ! vdW dispersion contribution: Tkatchenko-Scheffler + ! ... vdW dispersion contribution: Tkatchenko-Scheffler ! sigma_ts = 0.0_DP - ! vdW dispersion contribution: Many-Body Dispersion + ! ... vdW dispersion contribution: Many-Body Dispersion ! sigma_mbd = 0.0_DP IF ( mbd_vdw ) THEN @@ -187,17 +189,17 @@ SUBROUTINE stress( sigma ) sigma_ts = -2.0_DP*alat*MATMUL( HtsvdW, TRANSPOSE(at) )/omega ENDIF ! - ! DFT-non_local contribution + ! ... DFT-non_local contribution ! sigma_nonloc_dft(:,:) = 0.d0 CALL stres_nonloc_dft( rho%of_r, rho_core, nspin, sigma_nonloc_dft ) ! - ! The solvation contribution (3D-RISM) + ! ... The solvation contribution (3D-RISM) ! sigmasol(:,:) = 0.d0 IF (lrism) CALL stres_rism(sigmasol) ! - ! SUM + ! ... Sum all terms ! sigma(:,:) = sigmakin(:,:) + sigmaloc(:,:) + sigmahar(:,:) + & sigmaxc(:,:) + sigmaxcc(:,:) + sigmaewa(:,:) + & @@ -213,12 +215,12 @@ SUBROUTINE stress( sigma ) ELSE sigmaexx = 0.d0 ENDIF - ! Resymmetrize the total stress. This should not be strictly necessary, - ! but prevents loss of symmetry in long vc-bfgs runs + ! ... Resymmetrize the total stress. This should not be strictly necessary, + ! but prevents loss of symmetry in long vc-bfgs runs CALL symmatrix( sigma ) ! - ! write results in Ry/(a.u.)^3 and in kbar + ! ... write results in Ry/(a.u.)^3 and in kbar ! IF ( do_comp_esm .AND. ( esm_bc /= 'pbc' ) ) THEN ! for ESM stress WRITE( stdout, 9000) (sigma(1,1) + sigma(2,2)) * ry_kbar/3d0, & diff --git a/PW/src/usnldiag_gpu.f90 b/PW/src/usnldiag_gpu.f90 index 32b076566..6303f5c24 100644 --- a/PW/src/usnldiag_gpu.f90 +++ b/PW/src/usnldiag_gpu.f90 @@ -7,19 +7,18 @@ ! ! !----------------------------------------------------------------------- -SUBROUTINE usnldiag_gpu (npw, h_diag_d, s_diag_d) +SUBROUTINE usnldiag_gpu( npw, h_diag_d, s_diag_d ) !----------------------------------------------------------------------- + !! Add nonlocal pseudopotential term to diagonal part of Hamiltonian. + !! Compute the diagonal part of the S matrix. ! - ! add nonlocal pseudopotential term to diagonal part of Hamiltonian - ! compute the diagonal part of the S matrix. - ! - ! Routine splitted for improving performance + ! Routine splitted to improve performance ! USE kinds, ONLY: DP USE ions_base, ONLY: nat, ityp, ntyp => nsp USE wvfct, ONLY: npwx - USE uspp, ONLY: ofsbeta, deeq_d, qq_at_d, qq_so_d, & - deeq_nc_d + USE uspp, ONLY: ofsbeta, deeq, qq_at, qq_so, & + deeq_nc USE uspp_param, ONLY: upf, nh USE noncollin_module, ONLY: noncolin, npol, lspinorb USE device_memcpy_m, ONLY: dev_memset @@ -54,7 +53,7 @@ CONTAINS SUBROUTINE usnldiag_collinear() USE lsda_mod, ONLY: current_spin - USE uspp, ONLY: deeq_d, qq_at_d, vkb + USE uspp, ONLY: deeq, qq_at, vkb IMPLICIT NONE ! @@ -72,7 +71,7 @@ CONTAINS IF (ityp (na) == nt) THEN ijkb_start = ofsbeta(na) nh_ = nh(nt) - !$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) + !$acc data present(vkb(:,:),deeq) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) !$acc parallel vector_length(32) !$acc loop gang reduction(+:sum_h,sum_s) DO ig = 1, npw @@ -82,11 +81,11 @@ CONTAINS DO ih = 1, nh_ DO jh = 1, nh_ ikb = ijkb_start + ih - cv = vkb (ig, ikb) + cv = vkb(ig,ikb) jkb = ijkb_start + jh - ar = cv*conjg(vkb (ig, jkb)) - sum_h = sum_h + dble(deeq_d (ih, jh, na, current_spin) * ar) - sum_s = sum_s + dble(qq_at_d (ih, jh, na) * ar) + ar = cv*conjg(vkb(ig,jkb)) + sum_h = sum_h + dble(deeq(ih,jh,na,current_spin) * ar) + sum_s = sum_s + dble(qq_at(ih,jh,na) * ar) END DO END DO !$acc atomic update @@ -105,7 +104,7 @@ CONTAINS IF (ityp (na) == nt) THEN ijkb_start = ofsbeta(na) nh_ = nh(nt) - !$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) + !$acc data present(vkb(:,:),deeq) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) !$acc parallel vector_length(32) !$acc loop gang reduction(+:sum_h,sum_s) DO ig = 1, npw @@ -114,9 +113,9 @@ CONTAINS !$acc loop vector private(ikb,ar) reduction(+:sum_h,sum_s) DO ih = 1, nh_ ikb = ijkb_start + ih - ar = vkb (ig, ikb)*conjg(vkb (ig, ikb)) - sum_h = sum_h + dble(deeq_d (ih, ih, na, current_spin) * ar) - sum_s = sum_s + dble(qq_at_d (ih, ih, na) * ar) + ar = vkb(ig,ikb)*conjg(vkb(ig,ikb)) + sum_h = sum_h + dble(deeq(ih,ih,na,current_spin) * ar) + sum_s = sum_s + dble(qq_at(ih,ih,na) * ar) END DO !$acc atomic update h_diag_d (ig,1) = h_diag_d (ig,1) + sum_h @@ -135,7 +134,7 @@ CONTAINS ! SUBROUTINE usnldiag_noncollinear() USE lsda_mod, ONLY: current_spin - USE uspp, ONLY: vkb, qq_at_d, qq_so_d, deeq_nc_d + USE uspp, ONLY: vkb, qq_at, qq_so, deeq_nc IMPLICIT NONE ! @@ -153,7 +152,7 @@ CONTAINS IF (ityp (na) == nt) THEN ijkb_start = ofsbeta(na) nh_ = nh(nt) - !$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) + !$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) !$acc parallel vector_length(32) !$acc loop gang reduction(+:sum_h1,sum_h4,sum_s) DO ig = 1, npw ! change this to 2*npw ? @@ -166,10 +165,10 @@ CONTAINS ikb = ijkb_start + ih cv = vkb (ig, ikb) jkb = ijkb_start + jh - ar = cv*conjg(vkb (ig, jkb)) - sum_h1 = sum_h1 + dble(deeq_nc_d (ih, jh, na, 1) * ar) - sum_h4 = sum_h4 + dble(deeq_nc_d (ih, jh, na, 4) * ar) - sum_s = sum_s + dble(qq_at_d (ih, jh, na) * ar) + ar = cv*conjg(vkb(ig,jkb)) + sum_h1 = sum_h1 + dble(deeq_nc(ih,jh,na,1) * ar) + sum_h4 = sum_h4 + dble(deeq_nc(ih,jh,na,4) * ar) + sum_s = sum_s + dble(qq_at(ih,jh,na) * ar) END DO END DO ! @@ -197,7 +196,7 @@ CONTAINS IF (ityp (na) == nt) THEN ijkb_start = ofsbeta(na) nh_ = nh(nt) - !$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) + !$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) !$acc parallel vector_length(32) !$acc loop gang reduction(+:sum_h1,sum_h4,sum_s) DO ig = 1, npw @@ -207,10 +206,10 @@ CONTAINS !$acc loop vector private(ikb,ar) reduction(+:sum_h1,sum_h4,sum_s) DO ih = 1, nh_ ikb = ijkb_start + ih - ar = vkb (ig, ikb)*conjg(vkb (ig, ikb)) - sum_h1 = sum_h1 + dble(deeq_nc_d (ih, ih, na, 1) * ar) - sum_h4 = sum_h4 + dble(deeq_nc_d (ih, ih, na, 4) * ar) - sum_s = sum_s + dble(qq_at_d (ih, ih, na) * ar) + ar = vkb (ig, ikb)*conjg(vkb(ig,ikb)) + sum_h1 = sum_h1 + dble(deeq_nc(ih,ih,na,1) * ar) + sum_h4 = sum_h4 + dble(deeq_nc(ih,ih,na,4) * ar) + sum_s = sum_s + dble(qq_at(ih,ih,na) * ar) END DO ! ! OPTIMIZE HERE : this scattered assign is bad! @@ -238,7 +237,7 @@ CONTAINS ! SUBROUTINE usnldiag_spinorb() USE lsda_mod, ONLY: current_spin - USE uspp, ONLY: vkb, qq_at_d, qq_so_d, deeq_nc_d + USE uspp, ONLY: vkb, qq_at, qq_so, deeq_nc IMPLICIT NONE ! @@ -256,7 +255,7 @@ CONTAINS IF (ityp (na) == nt) THEN ijkb_start = ofsbeta(na) nh_ = nh(nt) - !$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) + !$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) !$acc parallel vector_length(32) !$acc loop gang reduction(+:sum_h1,sum_h4,sum_s1,sum_s4) DO ig = 1, npw ! change this to 2*npw ? @@ -268,13 +267,13 @@ CONTAINS DO ih = 1, nh_ DO jh = 1, nh_ ikb = ijkb_start + ih - cv = vkb (ig, ikb) + cv = vkb(ig,ikb) jkb = ijkb_start + jh - ar = cv*conjg(vkb (ig, jkb)) - sum_h1 = sum_h1 + dble(deeq_nc_d (ih, jh, na, 1) * ar) - sum_h4 = sum_h4 + dble(deeq_nc_d (ih, jh, na, 4) * ar) - sum_s1 = sum_s1 + dble(qq_so_d (ih, jh, 1, nt) * ar) - sum_s4 = sum_s4 + dble(qq_so_d (ih, jh, 4, nt) * ar) + ar = cv*conjg(vkb(ig,jkb)) + sum_h1 = sum_h1 + dble(deeq_nc(ih,jh,na,1) * ar) + sum_h4 = sum_h4 + dble(deeq_nc(ih,jh,na,4) * ar) + sum_s1 = sum_s1 + dble(qq_so(ih,jh,1,nt) * ar) + sum_s4 = sum_s4 + dble(qq_so(ih,jh,4,nt) * ar) END DO END DO ! @@ -302,7 +301,7 @@ CONTAINS IF (ityp (na) == nt) THEN ijkb_start = ofsbeta(na) nh_ = nh(nt) - !$acc data present(vkb(:,:)) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) + !$acc data present(vkb(:,:),deeq_nc) deviceptr(h_diag_d(:,:), s_diag_d(:,:)) !$acc parallel vector_length(32) !$acc loop gang reduction(+:sum_h1,sum_h4,sum_s1,sum_s4) DO ig = 1, npw @@ -314,10 +313,10 @@ CONTAINS DO ih = 1, nh_ ikb = ijkb_start + ih ar = vkb (ig, ikb)*conjg(vkb (ig, ikb)) - sum_h1 = sum_h1 + dble(deeq_nc_d (ih, ih, na, 1) * ar) - sum_h4 = sum_h4 + dble(deeq_nc_d (ih, ih, na, 4) * ar) - sum_s1 = sum_s1 + dble(qq_so_d (ih, ih, 1, nt) * ar) - sum_s4 = sum_s4 + dble(qq_so_d (ih, ih, 4, nt) * ar) + sum_h1 = sum_h1 + dble(deeq_nc(ih,ih,na,1) * ar) + sum_h4 = sum_h4 + dble(deeq_nc(ih,ih,na,4) * ar) + sum_s1 = sum_s1 + dble(qq_so(ih,ih,1,nt) * ar) + sum_s4 = sum_s4 + dble(qq_so(ih,ih,4,nt) * ar) END DO ! ! OPTIMIZE HERE : this scattered assign is bad! diff --git a/upflib/CMakeLists.txt b/upflib/CMakeLists.txt index 431d5c620..21152bc6f 100644 --- a/upflib/CMakeLists.txt +++ b/upflib/CMakeLists.txt @@ -12,8 +12,6 @@ set(src_upflib init_tab_qrad.f90 interp_atwfc.f90 init_us_2_base.f90 - gen_us_dj_gpu.f90 - gen_us_dy_gpu.f90 gth.f90 paw_variables.f90 pseudo_types.f90 @@ -56,7 +54,8 @@ set(src_upflib sph_bes_gpu.f90 ylmr2_gpu.f90 dylmr2_gpu.f90 - simpsn_gpu.f90) + simpsn_gpu.f90 + upf_acc_interfaces.f90) qe_enable_cuda_fortran("${src_upflib}") qe_add_library(qe_xml xmltools.f90 dom.f90 wxml.f90) diff --git a/upflib/Makefile b/upflib/Makefile index 39f44b3b0..0a82bdd22 100644 --- a/upflib/Makefile +++ b/upflib/Makefile @@ -65,14 +65,13 @@ ylmr2.o # GPU versions of routines OBJS_GPU= \ dylmr2_gpu.o \ - gen_us_dj_gpu.o \ - gen_us_dy_gpu.o \ init_us_2_base_gpu.o \ interp_atwfc_gpu.o \ qvan2_gpu.o \ simpsn_gpu.o \ sph_bes_gpu.o \ - ylmr2_gpu.o + ylmr2_gpu.o \ + upf_acc_interfaces.o OBJS_NODEP+= dom.o wxml.o diff --git a/upflib/dqvan2.f90 b/upflib/dqvan2.f90 index edaa8f5c7..59c2c0da5 100644 --- a/upflib/dqvan2.f90 +++ b/upflib/dqvan2.f90 @@ -9,12 +9,13 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg ) !----------------------------------------------------------------------- !! This routine computes the derivatives of the Fourier transform of - !! the Q function needed in stress assuming that the radial fourier - !! transform is already computed and stored in table qrad. - !! The formula implemented here is: - - !! dq(g,i,j) = sum_lm (-i)^l ap(lm,i,j) * - !! ( yr_lm(g^) dqrad(g,l,i,j) + dyr_lm(g^) qrad(g,l,i,j)) + !! the Q function needed in stress assuming that the radial Fourier + !! transform is already computed and stored in table \(\text{qrad}\). + !! The implemented formula: + ! + !! \[ \text{dq}(g,i,j) = \sum_lm (-i)^l \text{ap}(lm,i,j) * + !! ( \text{yr}_{lm}(g^) \text{dqrad}(g,l,i,j) + + !! \text{dyr}_{lm}(g') \text{qrad}(g,l,i,j)) \] ! USE upf_kinds, ONLY: DP USE uspp_data, ONLY: dq, qrad @@ -48,7 +49,7 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg ) ! ! ... local variables ! - COMPLEX(DP) :: sig + COMPLEX(DP) :: sig, dqg_bgr ! (-i)^L INTEGER :: nb, mb, ijv, ivl, jvl, ig, lp, l, lm, i0, i1, i2, i3 ! the atomic index corresponding to ih @@ -71,7 +72,9 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg ) ! auxiliary variable ! auxiliary variable ! - ! compute the indices which correspond to ih,jh + ! ... compute the indices which correspond to ih,jh + ! + !$acc data present_or_copyin(g,qmod,ylmk0,dylmk0) present_or_copyout(dqg) ! sixth = 1.d0 / 6.d0 dqi = 1 / dq @@ -91,14 +94,16 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg ) IF (ivl > nlx .OR. jvl > nlx) & CALL upf_error (' dqvan2 ', ' wrong dimensions (2)', MAX(ivl,jvl)) ! + !$acc kernels dqg(:) = (0.d0,0.d0) + !$acc end kernels ! - ! and make the sum over the non zero LM + ! ... and make the sum over the non zero LM ! - DO lm = 1, lpx(ivl, jvl) - lp = lpl(ivl, jvl, lm) + DO lm = 1, lpx(ivl,jvl) + lp = lpl(ivl,jvl,lm) ! - ! extraction of angular momentum l from lp: + ! ... extraction of angular momentum l from lp: ! IF (lp==1) THEN l = 1 @@ -119,16 +124,20 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg ) ENDIF ! sig = (0.d0, -1.d0)**(l - 1) - sig = sig * ap(lp, ivl, jvl) + sig = sig * ap(lp,ivl,jvl) ! qm1 = -1.0_dp ! any number smaller than qmod(1) ! +#if !defined(_OPENACC) !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(qm,px,ux,vx,wx,i0,i1,i2,i3,uvx,pwx,work,work1) +#else +!$acc parallel loop +#endif DO ig = 1, ngy ! - ! calculate quantites depending on the module of G only when needed + ! ... calculate quantites depending on the module of G only when needed ! -#if !defined(_OPENMP) +#if !defined(_OPENMP) && !defined(_OPENACC) IF ( ABS( qmod(ig) - qm1 ) > 1.0D-6 ) THEN #endif qm = qmod (ig) * dqi @@ -141,33 +150,40 @@ SUBROUTINE dqvan2( ih, jh, np, ipol, ngy, g, tpiba, qmod, ylmk0, dylmk0, dqg ) i2 = qm + 3 i3 = qm + 4 uvx = ux * vx * sixth - + ! pwx = px * wx * 0.5d0 - + ! work = qrad(i0, ijv, l, np) * uvx * wx + & qrad(i1, ijv, l, np) * pwx * vx - & qrad(i2, ijv, l, np) * pwx * ux + & qrad(i3, ijv, l, np) * px * uvx - work1 = - qrad(i0, ijv, l, np) * (ux*vx + vx*wx + ux*wx) * sixth & - + qrad(i1, ijv, l, np) * (wx*vx - px*wx - px*vx) * 0.5d0 & - - qrad(i2, ijv, l, np) * (wx*ux - px*wx - px*ux) * 0.5d0 & - + qrad(i3, ijv, l, np) * (ux*vx - px*ux - px*vx) * sixth - - work1 = work1 * dqi - -#if !defined(_OPENMP) + work1 = (- qrad(i0, ijv, l, np) * (ux*vx + vx*wx + ux*wx) * sixth & + + qrad(i1, ijv, l, np) * (wx*vx - px*wx - px*vx) * 0.5d0 & + - qrad(i2, ijv, l, np) * (wx*ux - px*wx - px*ux) * 0.5d0 & + + qrad(i3, ijv, l, np) * (ux*vx - px*ux - px*vx) * sixth) * dqi + ! +#if !defined(_OPENMP) && !defined(_OPENACC) qm1 = qmod(ig) ENDIF #endif - - dqg(ig) = dqg(ig) + sig * dylmk0(ig, lp) * work / tpiba - IF (qmod(ig) > 1.d-9) dqg(ig) = dqg(ig) + & - sig * ylmk0(ig, lp) * work1 * tpiba * g(ipol, ig) / qmod(ig) + ! + IF (qmod(ig) > 1.d-9) THEN + dqg_bgr = sig * ylmk0(ig,lp) * work1 * tpiba * g(ipol,ig) / qmod(ig) + ELSE + dqg_bgr = (0.d0,0.d0) + ENDIF + ! + dqg(ig) = dqg(ig) + sig * dylmk0(ig,lp) * work / tpiba + dqg_bgr + ! ENDDO +#if !defined(_OPENACC) !$OMP END PARALLEL DO - ! +#endif + ! ENDDO ! + !$acc end data + ! RETURN ! END SUBROUTINE dqvan2 diff --git a/upflib/gen_us_dj.f90 b/upflib/gen_us_dj.f90 index ff4de8d57..d1aaaf879 100644 --- a/upflib/gen_us_dj.f90 +++ b/upflib/gen_us_dj.f90 @@ -7,9 +7,9 @@ ! ! !---------------------------------------------------------------------- -SUBROUTINE gen_us_dj_base & - ( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, & - omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, mill, g, dvkb ) +SUBROUTINE gen_us_dj_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, & + omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, & + mill, g, dvkb ) !---------------------------------------------------------------------- !! Calculates the beta function pseudopotentials with !! the derivative of the Bessel functions. @@ -18,7 +18,7 @@ SUBROUTINE gen_us_dj_base & USE upf_const, ONLY: tpi USE uspp, ONLY: nkb, indv, nhtol, nhtolm USE uspp_data, ONLY: nqx, tab, dq - USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh + USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm ! IMPLICIT NONE ! @@ -42,7 +42,7 @@ SUBROUTINE gen_us_dj_base & !! rec.lattice units 2pi/a REAL(DP), INTENT(IN) :: omega !! cell volume - INTEGER, INTENT(IN) :: nr1,nr2,nr3 + INTEGER, INTENT(IN) :: nr1, nr2, nr3 !! fft dims (dense grid) COMPLEX(DP), INTENT(IN) :: eigts1(-nr1:nr1,nat) !! structure factor 1 @@ -54,7 +54,7 @@ SUBROUTINE gen_us_dj_base & !! miller index map REAL(DP), INTENT(IN) :: g(3,*) !! g vectors (2pi/a units) - COMPLEX(DP), INTENT(OUT) :: dvkb(npwx, nkb) + COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb) !! the beta function pseudopotential ! ! ... local variables @@ -67,46 +67,54 @@ SUBROUTINE gen_us_dj_base & ! index of the first nonzero point in the r ! counter on atomic type ! - REAL(DP) :: arg, px, ux, vx, wx + INTEGER :: ina, na, l, iig, lm, ikb_t, nht + REAL(DP) :: arg, px, ux, vx, wx, qt ! argument of the atomic phase factor - ! - COMPLEX(DP) :: phase, pref - ! atomic phase factor + COMPLEX(DP) :: pref ! prefactor ! - INTEGER :: na, l, iig, lm, iq - REAL(DP), ALLOCATABLE :: djl(:,:,:), ylm(:,:), q(:), gk(:,:) - REAL(DP) :: qt - COMPLEX(DP), ALLOCATABLE :: sk(:) + INTEGER, ALLOCATABLE :: nas(:), ihv(:), nav(:) + REAL(DP), ALLOCATABLE :: djl(:,:,:), ylm(:,:), q(:), gk(:,:) + COMPLEX(DP), ALLOCATABLE :: sk(:,:), phase(:) + ! phase: atomic phase factor ! IF (nkb == 0) RETURN ! - CALL start_clock( 'stres_us31' ) + !$acc data present_or_copyin(igk,eigts1,eigts2,eigts3,mill,g) & + !$acc present_or_copyout(dvkb) copyin(ityp,tau,xk) ! - ALLOCATE( djl(npw,nbetam,ntyp) ) - ALLOCATE( ylm(npw,(lmaxkb+1)**2) ) - ALLOCATE( gk(3,npw) ) - ALLOCATE( q(npw) ) + !CALL start_clock( 'stres_us31' ) ! + ALLOCATE( djl(npw,nbetam,ntyp), ylm(npw,(lmaxkb+1)**2) ) + ALLOCATE( gk(3,npw), q(npw) ) + !$acc data create( djl, ylm ) + !$acc data create( gk, q ) + ! + !$acc parallel loop DO ig = 1, npw iig = igk(ig) - gk(1, ig) = xk(1) + g(1, iig) - gk(2, ig) = xk(2) + g(2, iig) - gk(3, ig) = xk(3) + g(3, iig) - q(ig) = gk(1, ig)**2 + gk(2, ig)**2 + gk(3, ig)**2 + gk(1,ig) = xk(1) + g(1,iig) + gk(2,ig) = xk(2) + g(2,iig) + gk(3,ig) = xk(3) + g(3,iig) + q(ig) = gk(1,ig)**2 + gk(2,ig)**2 + gk(3,ig)**2 ENDDO ! - CALL stop_clock( 'stres_us31' ) - CALL start_clock( 'stres_us32' ) +#if defined(__CUDA) + !$acc host_data use_device(gk,q,ylm) + CALL ylmr2_gpu( (lmaxkb+1)**2, npw, gk, q, ylm ) + !$acc end host_data +#else + !$acc update self(gk,q) CALL ylmr2( (lmaxkb+1)**2, npw, gk, q, ylm ) - CALL stop_clock( 'stres_us32' ) - CALL start_clock( 'stres_us33' ) + !$acc update device(ylm) +#endif ! + !$acc data copyin( tab ) DO nt = 1, ntyp + !$acc parallel loop collapse(2) DO nb = 1, upf(nt)%nbeta - ! DO ig = 1, npw - qt = SQRT(q (ig)) * tpiba + qt = SQRT(q(ig)) * tpiba px = qt / dq - INT(qt/dq) ux = 1.d0 - px vx = 2.d0 - px @@ -115,62 +123,99 @@ SUBROUTINE gen_us_dj_base & i1 = i0 + 1 i2 = i0 + 2 i3 = i0 + 3 - djl(ig,nb,nt) = ( tab(i0, nb, nt) * (-vx*wx-ux*wx-ux*vx)/6.d0 + & - tab(i1, nb, nt) * (+vx*wx-px*wx-px*vx)/2.d0 - & - tab(i2, nb, nt) * (+ux*wx-px*wx-px*ux)/2.d0 + & - tab(i3, nb, nt) * (+ux*vx-px*vx-px*ux)/6.d0 )/dq + djl(ig,nb,nt) = ( tab(i0,nb,nt) * (-vx*wx-ux*wx-ux*vx)/6.d0 + & + tab(i1,nb,nt) * (+vx*wx-px*wx-px*vx)/2.d0 - & + tab(i2,nb,nt) * (+ux*wx-px*wx-px*ux)/2.d0 + & + tab(i3,nb,nt) * (+ux*vx-px*vx-px*ux)/6.d0 ) / dq ENDDO - ! ENDDO ENDDO + !$acc end data ! - CALL stop_clock( 'stres_us33' ) - CALL start_clock( 'stres_us34' ) + !CALL stop_clock( 'stres_us33' ) + !CALL start_clock( 'stres_us34' ) ! - DEALLOCATE ( q ) - DEALLOCATE ( gk ) + !$acc end data + DEALLOCATE( q, gk ) ! - ALLOCATE ( sk(npw) ) - ikb = 0 + ! + ALLOCATE( phase(nat), sk(npw,nat) ) + ALLOCATE( nas(nat), ihv(nat*nhm), nav(nat*nhm) ) + ! + ina = 0 DO nt = 1, ntyp - DO na = 1, nat - ! - IF (ityp(na) == nt) THEN - arg = ( xk(1) * tau(1,na) + & - xk(2) * tau(2,na) + & - xk(3) * tau(3,na) ) * tpi - phase = CMPLX( COS(arg), -SIN(arg) ,KIND=DP ) - DO ig = 1, npw - iig = igk(ig) - sk (ig) = eigts1(mill (1,iig), na) * & - eigts2(mill (2,iig), na) * & - eigts3(mill (3,iig), na) * phase - ENDDO - DO ih = 1, nh(nt) - nb = indv(ih, nt) - l = nhtol(ih, nt) - lm= nhtolm(ih, nt) - ikb = ikb + 1 - pref = (0.d0, -1.d0) **l - ! - DO ig = 1, npw - dvkb(ig, ikb) = djl(ig, nb, nt) * sk(ig) * ylm(ig, lm) & - * pref - ENDDO - ENDDO - ENDIF - ! - ENDDO + DO na = 1, nat + IF ( ityp(na) == nt ) THEN + ina = ina + 1 + nas(ina) = na + ENDIF + ENDDO ENDDO ! - CALL stop_clock('stres_us34') + !$acc data create( phase, sk, ihv, nav ) copyin( nas ) ! - IF (ikb /= nkb) CALL upf_error('gen_us_dj', 'unexpected error', 1) - DEALLOCATE( sk ) - DEALLOCATE( ylm ) - DEALLOCATE( djl ) + !$acc parallel loop + DO ina = 1, nat + na = nas(ina) + arg = (xk(1) * tau(1,na) + xk(2) * tau(2,na) & + + xk(3) * tau(3,na) ) * tpi + phase(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP ) + ENDDO + ! + !$acc parallel loop collapse(2) + DO ina = 1, nat + DO ig = 1, npw + ! + na = nas(ina) + iig = igk(ig) + sk(ig,na) = eigts1(mill(1,iig),na) * & + eigts2(mill(2,iig),na) * & + eigts3(mill(3,iig),na) * phase(na) + ENDDO + ENDDO + ! + ikb_t = 0 + DO ina = 1, nat + na = nas(ina) + nht = nh(ityp(na)) + !$acc kernels + DO ih = 1, nht + ihv(ikb_t+ih) = ih + nav(ikb_t+ih) = na + ENDDO + !$acc end kernels + ikb_t = ikb_t + nht + ENDDO + ! + ! + !$acc parallel loop collapse(2) copyin(indv,nhtol,nhtolm) + DO ikb = 1, ikb_t + DO ig = 1, npw + ih = ihv(ikb) + na = nav(ikb) + nt = ityp(na) + nb = indv(ih,nt) + l = nhtol(ih,nt) + lm = nhtolm(ih,nt) + pref = (0._DP,-1._DP)**l + ! + dvkb(ig,ikb) = CMPLX(djl(ig,nb,nt)) * sk(ig,na) * & + CMPLX(ylm(ig,lm)) * pref + ENDDO + ENDDO + ! + !ALL stop_clock('stres_us34') + ! + !$acc end data + !$acc end data + DEALLOCATE( phase, sk ) + DEALLOCATE( nas, ihv, nav ) + ! + IF (ikb_t /= nkb) CALL upf_error('gen_us_dj', 'unexpected error', 1) + ! + !$acc end data + DEALLOCATE( djl, ylm ) ! RETURN ! END SUBROUTINE gen_us_dj_base - diff --git a/upflib/gen_us_dj_gpu.f90 b/upflib/gen_us_dj_gpu.f90 deleted file mode 100644 index 5bf5c16cc..000000000 --- a/upflib/gen_us_dj_gpu.f90 +++ /dev/null @@ -1,219 +0,0 @@ -! -! Copyright (C) 2021 Quantum ESPRESSO Foundation -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!---------------------------------------------------------------------- -SUBROUTINE gen_us_dj_gpu_ & - ( npw, npwx, igk_d, xk, nat, tau, ityp, ntyp, tpiba, & - omega, nr1, nr2, nr3, eigts1_d, eigts2_d, eigts3_d, mill_d, g_d, dvkb_d ) - !---------------------------------------------------------------------- - !! Calculates the beta function pseudopotentials with - !! the derivative of the Bessel functions - GFPU version - ! - ! AF: more gpu-resident variables can be used, avoiding local GPU-alloc - ! and host2dev transfers - ! - USE upf_kinds, ONLY: dp - USE upf_const, ONLY: tpi - USE uspp, ONLY: nkb, indv_d, nhtol_d, nhtolm_d - USE uspp_data, ONLY: nqx, tab, tab_d, dq - USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm - USE device_fbuff_m, ONLY: dev_buf - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: npw - !! number ok plane waves - INTEGER, INTENT(IN) :: npwx - !! max number ok plane waves across k-points - INTEGER, INTENT(IN) :: igk_d(npw) - !! indices of plane waves k+G - REAL(dp), INTENT(IN) :: xk(3) - !! k-point - INTEGER, INTENT(IN) :: nat - !! number of atoms - INTEGER, INTENT(IN) :: ityp(nat) - !! index of type per atom - INTEGER, INTENT(IN) :: ntyp - !! number of atomic types - REAL(DP), INTENT(IN) :: tau(3,nat) - !! atomic positions (cc alat units) - REAL(DP), INTENT(IN) :: tpiba - !! rec.lattice units 2pi/a - REAL(DP), INTENT(IN) :: omega - !! cell volume - INTEGER, INTENT(IN) :: nr1,nr2,nr3 - !! fft dims (dense grid) - COMPLEX(DP), INTENT(IN) :: eigts1_d(-nr1:nr1,nat) - !! structure factor 1 - COMPLEX(DP), INTENT(IN) :: eigts2_d(-nr2:nr2,nat) - !! structure factor 2 - COMPLEX(DP), INTENT(IN) :: eigts3_d(-nr3:nr3,nat) - !! structure factor 3 - INTEGER, INTENT(IN) :: mill_d(3,*) - !! miller index map - REAL(DP), INTENT(IN) :: g_d(3,*) - !! g vectors (2pi/a units) - COMPLEX(DP), INTENT(OUT) :: dvkb_d(npwx, nkb) - !! the beta function pseudopotential - ! - ! ... local variables - ! - INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, i0, i1, i2, & - i3, ig, nbm, iq, mil1, mil2, mil3, & - ikb_t, nht, ina, nas(nat), ierr(3) - REAL(DP) :: px, ux, vx, wx, arg, u_ipol, xk1, xk2, xk3, qt - COMPLEX(DP) :: pref - INTEGER, ALLOCATABLE :: ityp_d(:), ih_d(:), na_d(:), nas_d(:) - ! - REAL(DP), POINTER :: gk_d(:,:), djl_d(:,:,:), ylm_d(:,:) - REAL(DP), ALLOCATABLE :: q_d(:), tau_d(:,:) - COMPLEX(DP), ALLOCATABLE :: phase_d(:), sk_d(:,:) - ! -#if defined(__CUDA) - attributes(DEVICE) :: igk_d, mill_d, eigts1_d, eigts2_d, eigts3_d, g_d - attributes(DEVICE) :: gk_d, q_d, sk_d, djl_d, ylm_d, & - ityp_d, phase_d, ih_d, na_d, tau_d, nas_d - attributes(DEVICE) :: dvkb_d - ! - IF (nkb == 0) RETURN - ! - CALL dev_buf%lock_buffer( ylm_d, (/ npw,(lmaxkb+1)**2 /), ierr(1) ) - CALL dev_buf%lock_buffer( djl_d, (/ npw,nbetam,ntyp /), ierr(2) ) - CALL dev_buf%lock_buffer( gk_d, (/ 3,npw /), ierr(3) ) - ALLOCATE( q_d(npw) ) - ! - xk1 = xk(1) - xk2 = xk(2) - xk3 = xk(3) - ! - !$cuf kernel do (1) <<<*,*>>> - DO ig = 1, npw - iig = igk_d(ig) - gk_d(1,ig) = xk1 + g_d(1,iig) - gk_d(2,ig) = xk2 + g_d(2,iig) - gk_d(3,ig) = xk3 + g_d(3,iig) - q_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2 - ENDDO - ! - CALL ylmr2_gpu( (lmaxkb+1)**2, npw, gk_d, q_d, ylm_d ) - ! - DO nt = 1, ntyp - nbm = upf(nt)%nbeta - !$cuf kernel do (2) <<<*,*>>> - DO nb = 1, nbm - DO ig = 1, npw - qt = SQRT(q_d(ig)) * tpiba - px = qt/dq - DBLE(INT(qt/dq)) - ux = 1._DP - px - vx = 2._DP - px - wx = 3._DP - px - i0 = INT(qt/dq) + 1 - i1 = i0 + 1 - i2 = i0 + 2 - i3 = i0 + 3 - djl_d(ig,nb,nt) = (tab_d(i0,nb,nt) * (-vx*wx-ux*wx-ux*vx)/6._DP + & - tab_d(i1,nb,nt) * (+vx*wx-px*wx-px*vx)/2._DP - & - tab_d(i2,nb,nt) * (+ux*wx-px*wx-px*ux)/2._DP + & - tab_d(i3,nb,nt) * (+ux*vx-px*vx-px*ux)/6._DP)/dq - ENDDO - ENDDO - ENDDO - ! - DEALLOCATE( q_d ) - ! - ALLOCATE( ityp_d(nat), nas_d(nat) ) - ityp_d = ityp - ALLOCATE( tau_d(3,nat) ) - tau_d = tau - ALLOCATE( phase_d(nat) ) - ! - ina = 0 - DO nt = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == nt ) THEN - ina = ina + 1 - nas(ina) = na - ENDIF - ENDDO - ENDDO - nas_d = nas - ! - !$cuf kernel do (1) <<<*,*>>> - DO ina = 1, nat - na = nas_d(ina) - arg = (xk1 * tau_d(1,na) + xk2 * tau_d(2,na) & - + xk3 * tau_d(3,na) ) * tpi - phase_d(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP ) - ENDDO - ! - DEALLOCATE( tau_d ) - ! - ALLOCATE( sk_d(npw,nat) ) - ! - !$cuf kernel do (2) <<<*,*>>> - DO ina = 1, nat - DO ig = 1, npw - ! - na = nas_d(ina) - iig = igk_d(ig) - mil1 = mill_d(1,iig) - mil2 = mill_d(2,iig) - mil3 = mill_d(3,iig) - sk_d(ig,na) = eigts1_d(mil1,na) * & - eigts2_d(mil2,na) * & - eigts3_d(mil3,na) * phase_d(na) - ENDDO - ENDDO - ! - ! - DEALLOCATE( phase_d ) - ! - ALLOCATE( ih_d(nat*nhm), na_d(nat*nhm) ) - ! - ikb_t = 0 - DO ina = 1, nat - na = nas(ina) - nht = nh(ityp(na)) - !$cuf kernel do (1) <<<*,*>>> - DO ih = 1, nht - ih_d(ikb_t+ih) = ih - na_d(ikb_t+ih) = na - ENDDO - ikb_t = ikb_t + nht - ENDDO - ! - !$cuf kernel do (2) <<<*,*>>> - DO ikb = 1, ikb_t - DO ig = 1, npw - ih = ih_d(ikb) - na = na_d(ikb) - nt = ityp_d(na) - nb = indv_d(ih,nt) - l = nhtol_d(ih,nt) - lm = nhtolm_d(ih,nt) - pref = (0._DP,-1._DP)**l - ! - dvkb_d(ig,ikb) = CMPLX(djl_d(ig,nb,nt)) * sk_d(ig,na) * & - CMPLX(ylm_d(ig,lm)) * pref - ENDDO - ENDDO - ! - DEALLOCATE( sk_d ) - ! - IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dj', 'unexpected error', 1 ) - ! - CALL dev_buf%release_buffer( ylm_d, ierr(1) ) - CALL dev_buf%release_buffer( djl_d, ierr(2) ) - CALL dev_buf%release_buffer( gk_d, ierr(3) ) - ! - DEALLOCATE( ih_d, na_d, nas_d ) -#endif - ! - RETURN - ! -END SUBROUTINE gen_us_dj_gpu_ diff --git a/upflib/gen_us_dy.f90 b/upflib/gen_us_dy.f90 index 05f5636ac..1730f42e2 100644 --- a/upflib/gen_us_dy.f90 +++ b/upflib/gen_us_dy.f90 @@ -1,5 +1,5 @@ ! -! Copyright (C) 2001-2021 Quantum ESPRESSO Foundation +! Copyright (C) 2021 Quantum ESPRESSSO Foundation ! This file is distributed under the terms of the ! GNU General Public License. See the file `License' ! in the root directory of the present distribution, @@ -7,18 +7,20 @@ ! ! !---------------------------------------------------------------------- -SUBROUTINE gen_us_dy_base & - ( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, & - omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, mill, g, u, dvkb ) +SUBROUTINE gen_us_dy_base( npw, npwx, igk, xk, nat, tau, ityp, ntyp, tpiba, & + omega, nr1, nr2, nr3, eigts1, eigts2, eigts3, & + mill, g, u, dvkb ) !---------------------------------------------------------------------- - !! Calculates the beta functions of the pseudopotential with the - !! derivative of the spherical harmonics projected on vector u + !! Calculates the Kleinman-Bylander pseudopotentials with the + !! derivative of the spherical harmonics projected on vector u. + ! + ! AF: more extensive use of GPU-resident vars possible ! USE upf_kinds, ONLY: dp USE upf_const, ONLY: tpi USE uspp, ONLY: nkb, indv, nhtol, nhtolm USE uspp_data, ONLY: nqx, tab, dq - USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh + USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm ! IMPLICIT NONE ! @@ -55,112 +57,205 @@ SUBROUTINE gen_us_dy_base & REAL(DP), INTENT(IN) :: g(3,*) !! g vectors (2pi/a units) REAL(DP), INTENT(IN) :: u(3) - !! input: projection vector - COMPLEX(DP), INTENT(OUT) :: dvkb(npwx,nkb) - !! output: kleinman-bylander pseudopotential + !! projection vector + COMPLEX(DP), INTENT(OUT) :: dvkb(npwx, nkb) + !! the beta function pseudopotential ! ! ... local variables ! INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, ipol, i0, i1, i2, & - i3, ig - REAL(DP), ALLOCATABLE :: gk(:,:), q(:) - REAL(DP) :: px, ux, vx, wx, arg + i3, ig, nbm, iq, mil1, mil2, mil3, ikb_t, & + nht, ina, lmx2 ! - REAL(DP), ALLOCATABLE :: vkb0(:,:,:), dylm(:,:), dylm_u(:,:) + INTEGER, ALLOCATABLE :: nas(:), ihv(:), nav(:) + ! + REAL(DP), ALLOCATABLE :: dylm(:,:,:), dylm_u(:,:) + REAL(DP), ALLOCATABLE :: q(:), gk(:,:), vkb0(:,:,:) ! dylm = d Y_lm/dr_i in cartesian axes ! dylm_u as above projected on u + COMPLEX(DP), ALLOCATABLE :: phase(:), sk(:,:) ! - COMPLEX(DP), ALLOCATABLE :: sk(:) - COMPLEX(DP) :: phase, pref + REAL(DP) :: px, ux, vx, wx, arg, u_ipol1, u_ipol2, u_ipol3, xk1, xk2, xk3 + COMPLEX(DP) :: pref ! - INTEGER :: iq + !$acc kernels present_or_copyout(dvkb) + dvkb = (0._DP,0._DP) + !$acc end kernels ! - dvkb(:,:) = (0.d0, 0.d0) IF (lmaxkb <= 0) RETURN ! - ALLOCATE( vkb0(npw,nbetam,ntyp), dylm_u(npw,(lmaxkb+1)**2), gk(3,npw) ) - ALLOCATE( q(npw) ) + !$acc data present_or_copyin(igk,eigts1,eigts2,eigts3,mill,g) present(dvkb) ! + lmx2 = (lmaxkb+1)**2 + ! + ALLOCATE( gk(3,npw) ) + ALLOCATE( dylm_u(npw,lmx2) ) + ALLOCATE( vkb0(npw,nbetam,ntyp) ) + ALLOCATE( q(npw) ) + !$acc data create( dylm_u, vkb0 ) + !$acc data create( q, gk ) + ! + xk1 = xk(1) + xk2 = xk(2) + xk3 = xk(3) + ! + !$acc parallel loop DO ig = 1, npw iig = igk(ig) - gk(1, ig) = xk(1) + g(1, iig) - gk(2, ig) = xk(2) + g(2, iig) - gk(3, ig) = xk(3) + g(3, iig) - q(ig) = gk(1, ig)**2 + gk(2, ig)**2 + gk(3, ig)**2 + gk(1,ig) = xk1 + g(1,iig) + gk(2,ig) = xk2 + g(2,iig) + gk(3,ig) = xk3 + g(3,iig) + q(ig) = gk(1,ig)**2 + gk(2,ig)**2 + gk(3,ig)**2 ENDDO ! - ALLOCATE( dylm(npw,(lmaxkb+1)**2) ) - dylm_u(:,:) = 0.d0 + ALLOCATE( dylm(npw,(lmaxkb+1)**2,3) ) + !$acc data create( dylm ) + ! +#if defined(__CUDA) + !$acc host_data use_device( gk, q, dylm ) DO ipol = 1, 3 - CALL dylmr2( (lmaxkb+1)**2, npw, gk, q, dylm, ipol ) - CALL daxpy( npw * (lmaxkb + 1) **2, u(ipol), dylm, 1, dylm_u, 1 ) + CALL dylmr2_gpu( lmx2, npw, gk, q, dylm(:,:,ipol), ipol ) ENDDO + !$acc end host_data +#else + !$acc update self( gk, q ) + DO ipol = 1, 3 + CALL dylmr2( lmx2, npw, gk, q, dylm(:,:,ipol), ipol ) + ENDDO + !$acc update device( dylm ) +#endif + ! + u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3) + ! + !$acc parallel loop collapse(2) + DO lm = 1, lmx2 + DO ig = 1, npw + dylm_u(ig,lm) = u_ipol1*dylm(ig,lm,1) + & + u_ipol2*dylm(ig,lm,2) + & + u_ipol3*dylm(ig,lm,3) + ENDDO + ENDDO + !$acc end data DEALLOCATE( dylm ) ! - DO ig = 1, npw - q(ig) = SQRT(q(ig)) * tpiba - ENDDO + !$acc kernels + q(:) = SQRT(q(:)) * tpiba + !$acc end kernels ! + !$acc data copyin( tab ) DO nt = 1, ntyp - ! calculate beta in G-space using an interpolation table - DO nb = 1, upf(nt)%nbeta + nbm = upf(nt)%nbeta + !$acc parallel loop collapse(2) + DO nb = 1, nbm DO ig = 1, npw - px = q(ig)/dq - INT(q(ig)/dq) - ux = 1.d0 - px - vx = 2.d0 - px - wx = 3.d0 - px - i0 = q(ig)/dq + 1 + px = q(ig)/dq - DBLE(INT(q(ig)/dq)) + ux = 1._DP - px + vx = 2._DP - px + wx = 3._DP - px + i0 = INT(q(ig)/dq) + 1 i1 = i0 + 1 i2 = i0 + 2 i3 = i0 + 3 - vkb0(ig, nb, nt) = tab(i0, nb, nt) * ux * vx * wx / 6.d0 + & - tab(i1, nb, nt) * px * vx * wx / 2.d0 - & - tab(i2, nb, nt) * px * ux * wx / 2.d0 + & - tab(i3, nb, nt) * px * ux * vx / 6.d0 - ENDDO - ENDDO + vkb0(ig,nb,nt) = tab(i0,nb,nt) * ux * vx * wx / 6._DP + & + tab(i1,nb,nt) * px * vx * wx / 2._DP - & + tab(i2,nb,nt) * px * ux * wx / 2._DP + & + tab(i3,nb,nt) * px * ux * vx / 6._DP + ENDDO + ENDDO ENDDO + !$acc end data ! - DEALLOCATE( q ) + !$acc end data + DEALLOCATE( gk, q ) ! - ALLOCATE( sk(npw) ) + ALLOCATE( nas(nat), phase(nat) ) ! - ikb = 0 + ina = 0 DO nt = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == nt ) THEN - arg = (xk(1) * tau(1, na) + xk(2) * tau(2, na) & - + xk(3) * tau(3, na) ) * tpi - phase = CMPLX( COS(arg), -SIN(arg), KIND=DP ) - DO ig = 1, npw - iig = igk(ig) - sk(ig) = eigts1(mill (1,iig), na) * & - eigts2(mill (2,iig), na) * & - eigts3(mill (3,iig), na) * phase - ENDDO - ! - DO ih = 1, nh(nt) - nb = indv(ih, nt) - l = nhtol(ih, nt) - lm = nhtolm(ih, nt) - ikb = ikb + 1 - pref = (0.d0, -1.d0)**l - ! - DO ig = 1, npw - dvkb(ig, ikb) = vkb0(ig, nb, nt) * sk(ig) * dylm_u(ig, lm) & - * pref / tpiba - ENDDO - ENDDO - ENDIF - ENDDO + DO na = 1, nat + IF ( ityp(na) == nt ) THEN + ina = ina + 1 + nas(ina) = na + ENDIF + ENDDO ENDDO ! - IF (ikb /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 ) + + ALLOCATE( sk(npw,nat) ) + !$acc data create( sk ) ! + !$acc data create( phase ) copyin( nas ) + ! + !$acc parallel loop copyin( tau ) + DO ina = 1, nat + na = nas(ina) + arg = ( xk1 * tau(1,na) + xk2 * tau(2,na) & + + xk3 * tau(3,na) ) * tpi + phase(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP ) + ENDDO + ! + !$acc parallel loop collapse(2) + DO ina = 1, nat + DO ig = 1, npw + ! + na = nas(ina) + iig = igk(ig) + mil1 = mill(1,iig) + mil2 = mill(2,iig) + mil3 = mill(3,iig) + sk(ig,na) = eigts1(mil1,na) * & + eigts2(mil2,na) * & + eigts3(mil3,na) * phase(na) + ENDDO + ENDDO + ! + !$acc end data + ! + ALLOCATE( ihv(nat*nhm), nav(nat*nhm) ) + !$acc data create( ihv, nav ) + ! + ikb_t = 0 + DO ina = 1, nat + na = nas(ina) + nht = nh(ityp(na)) + !$acc kernels + DO ih = 1, nht + ihv(ikb_t+ih) = ih + nav(ikb_t+ih) = na + ENDDO + !$acc end kernels + ikb_t = ikb_t + nht + ENDDO + ! + !$acc parallel loop collapse(2) copyin(ityp,indv,nhtol,nhtolm) + DO ikb = 1, ikb_t + DO ig = 1, npw + ih = ihv(ikb) + na = nav(ikb) + nt = ityp(na) + nb = indv(ih,nt) + l = nhtol(ih,nt) + lm = nhtolm(ih,nt) + pref = (0._DP,-1._DP)**l + ! + dvkb(ig,ikb) = CMPLX(vkb0(ig,nb,nt)) * sk(ig,na) * & + CMPLX(dylm_u(ig,lm)) * pref / CMPLX(tpiba) + ENDDO + ENDDO + ! + !$acc end data + !$acc end data + DEALLOCATE( ihv, nav ) + DEALLOCATE( phase, nas ) DEALLOCATE( sk ) - DEALLOCATE( vkb0, dylm_u, gk ) + ! + IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 ) + ! + !$acc end data + DEALLOCATE( dylm_u, vkb0 ) + ! + !$acc end data ! RETURN ! END SUBROUTINE gen_us_dy_base -! diff --git a/upflib/gen_us_dy_gpu.f90 b/upflib/gen_us_dy_gpu.f90 deleted file mode 100644 index 59a61bcb5..000000000 --- a/upflib/gen_us_dy_gpu.f90 +++ /dev/null @@ -1,253 +0,0 @@ -! -! Copyright (C) 2021 Quantum ESPRESSSO Foundation -! This file is distributed under the terms of the -! GNU General Public License. See the file `License' -! in the root directory of the present distribution, -! or http://www.gnu.org/copyleft/gpl.txt . -! -! -!---------------------------------------------------------------------- -SUBROUTINE gen_us_dy_gpu_ ( npw, npwx, igk_d, xk, nat, tau, ityp, ntyp, & - tpiba, omega, nr1, nr2, nr3, eigts1_d, eigts2_d, eigts3_d, & - mill_d, g_d, u, dvkb_d ) - !---------------------------------------------------------------------- - !! Calculates the kleinman-bylander pseudopotentials with the - !! derivative of the spherical harmonics projected on vector u - ! - ! AF: more extensive use of GPU-resident vars possible - ! - USE upf_kinds, ONLY: dp - USE upf_const, ONLY: tpi - USE uspp, ONLY: nkb, indv_d, nhtol_d, nhtolm_d - USE uspp_data, ONLY: nqx, tab, tab_d, dq - USE uspp_param, ONLY: upf, lmaxkb, nbetam, nh, nhm - USE device_fbuff_m, ONLY: dev_buf - ! - IMPLICIT NONE - ! - INTEGER, INTENT(IN) :: npw - !! number ok plane waves - INTEGER, INTENT(IN) :: npwx - !! max number ok plane waves across k-points - INTEGER, INTENT(IN) :: igk_d(npw) - !! indices of plane waves k+G - REAL(dp), INTENT(IN) :: xk(3) - !! k-point - INTEGER, INTENT(IN) :: nat - !! number of atoms - INTEGER, INTENT(IN) :: ityp(nat) - !! index of type per atom - INTEGER, INTENT(IN) :: ntyp - !! number of atomic types - REAL(DP), INTENT(IN) :: tau(3,nat) - !! atomic positions (cc alat units) - REAL(DP), INTENT(IN) :: tpiba - !! rec.lattice units 2pi/a - REAL(DP), INTENT(IN) :: omega - !! cell volume - INTEGER, INTENT(IN) :: nr1,nr2,nr3 - !! fft dims (dense grid) - COMPLEX(DP), INTENT(IN) :: eigts1_d(-nr1:nr1,nat) - !! structure factor 1 - COMPLEX(DP), INTENT(IN) :: eigts2_d(-nr2:nr2,nat) - !! structure factor 2 - COMPLEX(DP), INTENT(IN) :: eigts3_d(-nr3:nr3,nat) - !! structure factor 3 - INTEGER, INTENT(IN) :: mill_d(3,*) - !! miller index map - REAL(DP), INTENT(IN) :: g_d(3,*) - !! g vectors (2pi/a units) - REAL(DP), INTENT(IN) :: u(3) - !! input: projection vector - COMPLEX(DP), INTENT(OUT) :: dvkb_d(npwx, nkb) - !! the beta function pseudopotential - ! - ! ... local variables - ! - INTEGER :: na, nt, nb, ih, l, lm, ikb, iig, ipol, i0, i1, i2, & - i3, ig, nbm, iq, mil1, mil2, mil3, ikb_t, & - nht, ina, lmx2 - INTEGER :: nas(nat), ierr(4) - ! - INTEGER, ALLOCATABLE :: ityp_d(:), ih_d(:), na_d(:), nas_d(:) - ! - REAL(DP), ALLOCATABLE :: q(:), dylm(:,:) - ! - REAL(DP), POINTER :: gk_d(:,:) - REAL(DP), POINTER :: vkb0_d(:,:,:), dylm_u_d(:,:), dylm_d(:,:,:) - REAL(DP), ALLOCATABLE :: q_d(:), tau_d(:,:) - ! dylm = d Y_lm/dr_i in cartesian axes - ! dylm_u as above projected on u - COMPLEX(DP), ALLOCATABLE :: phase_d(:), sk_d(:,:) - ! - REAL(DP) :: px, ux, vx, wx, arg, u_ipol1, u_ipol2, u_ipol3, xk1, xk2, xk3 - COMPLEX(DP) :: pref - ! -#if defined(__CUDA) - attributes(DEVICE) :: igk_d, mill_d, eigts1_d, eigts2_d, eigts3_d, g_d - attributes(DEVICE) :: gk_d, q_d, sk_d, vkb0_d, dylm_u_d, dylm_d, & - ityp_d, phase_d, ih_d, na_d, tau_d, nas_d - attributes(DEVICE) :: dvkb_d - ! - dvkb_d = (0._DP,0._DP) - ! - IF (lmaxkb <= 0) RETURN - ! - lmx2 = (lmaxkb+1)**2 - ! - CALL dev_buf%lock_buffer( dylm_u_d, (/ npw,lmx2 /), ierr(1) ) - CALL dev_buf%lock_buffer( vkb0_d, (/ npw,nbetam,ntyp /), ierr(2) ) - CALL dev_buf%lock_buffer( gk_d, (/ 3,npw /), ierr(3) ) - IF (ANY(ierr /= 0)) CALL upf_error( 'gen_us_dy_gpu', 'cannot allocate buffers', ABS(ierr) ) - ALLOCATE( q_d(npw) ) - ! - xk1 = xk(1) - xk2 = xk(2) - xk3 = xk(3) - ! - !$cuf kernel do <<<*,*>>> - DO ig = 1, npw - iig = igk_d(ig) - gk_d(1,ig) = xk1 + g_d(1,iig) - gk_d(2,ig) = xk2 + g_d(2,iig) - gk_d(3,ig) = xk3 + g_d(3,iig) - q_d(ig) = gk_d(1,ig)**2 + gk_d(2,ig)**2 + gk_d(3,ig)**2 - ENDDO - ! - CALL dev_buf%lock_buffer( dylm_d, (/npw,lmx2,3/), ierr(4) ) - DO ipol = 1, 3 - CALL dylmr2_gpu( lmx2, npw, gk_d, q_d, dylm_d(:,:,ipol), ipol ) - ENDDO - ! - u_ipol1 = u(1) ; u_ipol2 = u(2) ; u_ipol3 = u(3) - ! - !$cuf kernel do (2) <<<*,*>>> - DO lm = 1, lmx2 - DO ig = 1, npw - dylm_u_d(ig,lm) = u_ipol1*dylm_d(ig,lm,1) + & - u_ipol2*dylm_d(ig,lm,2) + & - u_ipol3*dylm_d(ig,lm,3) - ENDDO - ENDDO - CALL dev_buf%release_buffer( dylm_d, ierr(4) ) - ! - ! - !$cuf kernel do (1) <<<*,*>>> - DO ig = 1, npw - q_d(ig) = SQRT(q_d(ig)) * tpiba - ENDDO - ! - ! - DO nt = 1, ntyp - nbm = upf(nt)%nbeta - !$cuf kernel do (2) <<<*,*>>> - DO nb = 1, nbm - DO ig = 1, npw - px = q_d(ig)/dq - DBLE(INT(q_d(ig)/dq)) - ux = 1._DP - px - vx = 2._DP - px - wx = 3._DP - px - i0 = INT(q_d(ig)/dq) + 1 - i1 = i0 + 1 - i2 = i0 + 2 - i3 = i0 + 3 - vkb0_d(ig,nb,nt) = tab_d(i0,nb,nt) * ux * vx * wx / 6._DP + & - tab_d(i1,nb,nt) * px * vx * wx / 2._DP - & - tab_d(i2,nb,nt) * px * ux * wx / 2._DP + & - tab_d(i3,nb,nt) * px * ux * vx / 6._DP - ENDDO - ENDDO - ENDDO - ! - DEALLOCATE( q_d ) - ! - ALLOCATE( ityp_d(nat), nas_d(nat) ) - ityp_d = ityp - ALLOCATE( tau_d(3,nat) ) - tau_d = tau - ALLOCATE( phase_d(nat) ) - ! - ina = 0 - DO nt = 1, ntyp - DO na = 1, nat - IF ( ityp(na) == nt ) THEN - ina = ina + 1 - nas(ina) = na - ENDIF - ENDDO - ENDDO - nas_d = nas - ! - !$cuf kernel do (1) <<<*,*>>> - DO ina = 1, nat - na = nas_d(ina) - arg = (xk1 * tau_d(1,na) + xk2 * tau_d(2,na) & - + xk3 * tau_d(3,na) ) * tpi - phase_d(na) = CMPLX( COS(arg), -SIN(arg), KIND=DP ) - ENDDO - ! - DEALLOCATE( tau_d ) - ! - ALLOCATE( sk_d(npw,nat) ) - ! - !$cuf kernel do (2) <<<*,*>>> - DO ina = 1, nat - DO ig = 1, npw - ! - na = nas_d(ina) - iig = igk_d(ig) - mil1 = mill_d(1,iig) - mil2 = mill_d(2,iig) - mil3 = mill_d(3,iig) - sk_d(ig,na) = eigts1_d(mil1,na) * & - eigts2_d(mil2,na) * & - eigts3_d(mil3,na) * phase_d(na) - ENDDO - ENDDO - ! - DEALLOCATE( phase_d ) - ! - ALLOCATE( ih_d(nat*nhm), na_d(nat*nhm) ) - ! - ikb_t = 0 - DO ina = 1, nat - na = nas(ina) - nht = nh(ityp(na)) - !$cuf kernel do (1) <<<*,*>>> - DO ih = 1, nht - ih_d(ikb_t+ih) = ih - na_d(ikb_t+ih) = na - ENDDO - ikb_t = ikb_t + nht - ENDDO - ! - !$cuf kernel do (2) <<<*,*>>> - DO ikb = 1, ikb_t - DO ig = 1, npw - ih = ih_d(ikb) - na = na_d(ikb) - nt = ityp_d(na) - nb = indv_d(ih,nt) - l = nhtol_d(ih,nt) - lm = nhtolm_d(ih,nt) - pref = (0._DP, -1._DP)**l - ! - dvkb_d(ig,ikb) = CMPLX(vkb0_d(ig,nb,nt)) * sk_d(ig,na) * & - CMPLX(dylm_u_d(ig,lm)) * pref / CMPLX(tpiba) - ENDDO - ENDDO - ! - DEALLOCATE( sk_d ) - ! - IF (ikb_t /= nkb) CALL upf_error( 'gen_us_dy', 'unexpected error', 1 ) - ! - CALL dev_buf%release_buffer( dylm_u_d, ierr(1) ) - CALL dev_buf%release_buffer( vkb0_d, ierr(2) ) - CALL dev_buf%release_buffer( gk_d, ierr(3) ) - ! - DEALLOCATE( ih_d, na_d, nas_d ) -#endif - ! - RETURN - ! -END SUBROUTINE gen_us_dy_gpu_ diff --git a/upflib/gth.f90 b/upflib/gth.f90 index 4d4989e25..65b7bc6a7 100644 --- a/upflib/gth.f90 +++ b/upflib/gth.f90 @@ -14,7 +14,7 @@ module m_gth ! private public :: gth_parameters, readgth, vloc_gth, dvloc_gth, & - dvloc_gth_gpu, setlocq_gth, mk_ffnl_gth, mk_dffnl_gth, deallocate_gth + setlocq_gth, mk_ffnl_gth, mk_dffnl_gth, deallocate_gth ! type gth_parameters integer :: itype, lloc, lmax @@ -340,94 +340,9 @@ subroutine vloc_gth(itype, zion, tpiba2, ngl, gl, omega, vloc) vloc (:) = vloc(:) * fact ! end subroutine vloc_gth +! !----------------------------------------------------------------------- -subroutine dvloc_gth(itype, zion, tpiba2, ngl, gl, omega, dvloc) - !----------------------------------------------------------------------- - ! - ! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g - ! - USE upf_kinds, ONLY: dp - USE upf_const, ONLY: pi, tpi, e2, eps8 - - implicit none - ! - ! I/O - integer, intent(in) :: itype, ngl - real(dp), intent(in) :: zion, tpiba2, omega, gl (ngl) - real(dp), intent(out) :: dvloc (ngl) - ! - ! Local variables - integer :: ii, my_gth, igl, igl0 - real(dp) :: cc1, cc2, cc3, cc4, rloc, & - gx, gx2, gx3, rl2, rl3, rq2, r2q, r4g3, r6g5, e_rq2h, fact - ! -! IF ( do_comp_esm ) call upf_error('vloc_gth', 'ESM not implemented', itype) - ! - ! Find gtp param. set for type itype - my_gth=0 - do ii=1,size(gth_p) - if (gth_p(ii)%itype==itype) then - my_gth=ii - exit - endif - enddo - if (my_gth==0) call upf_error('dvloc_gth', 'cannot map itype in some gtp param. set', itype) - rloc=gth_p(my_gth)%rloc - cc1=gth_p(my_gth)%cc(1) - cc2=gth_p(my_gth)%cc(2) - cc3=gth_p(my_gth)%cc(3) - cc4=gth_p(my_gth)%cc(4) - - ! Compute vloc(q) - if (gl (1) < eps8) then - ! - ! first the G=0 term - ! - dvloc (1) = 0._dp - igl0 = 2 - else - igl0 = 1 - endif - ! - ! here the G<>0 terms, we first compute the part of the integrand - ! function independent of |G| in real space - ! - do igl = igl0, ngl - gx = sqrt (gl (igl) * tpiba2) - gx2 = gx**2 - gx3 = gx*gx2 - rl2 = rloc**2 - rl3 = rloc*rl2 - rq2 = gx2*rl2 - r2q = gx*rl2 - r4g3 = rl2*rl2*gx3 - r6g5 = r4g3*rl2*gx2 - e_rq2h = exp(-0.5_dp*rq2) - dvloc (igl) = & - e_rq2h*(zion*(rq2+2._dp)/gx3 + sqrt(pi/2._dp)*rl3* & - ( & - ( & - - 2._dp*r2q* (cc2+10._dp*cc3+105._dp*cc4) & - + 4._dp*r4g3*(cc3+21._dp*cc4) & - - 6._dp*r6g5* cc4 & - ) - r2q*( & - cc1 + & - cc2*(3._dp-rq2) + & - cc3*(15._dp-10._dp*rq2+rq2**2) + & - cc4*(105._dp-rq2*(105._dp-rq2*(21._dp-rq2))) & - ) & - ) & - )/gx - enddo - ! - fact = tpi * e2 / omega - dvloc (:) = dvloc(:) * fact - ! -end subroutine dvloc_gth -! -! -!------------------------------------------------------------------------------- -SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d ) +subroutine dvloc_gth( itype, zion, tpiba2, ngl, gl, omega, dvloc ) !------------------------------------------------------------------------------ !! GPU version of 'dvloc_gth' from 'Modules/gth.f90' !! dvloc = D Vloc (g^2) / D g^2 = (1/2g) * D Vloc(g) / D g @@ -435,34 +350,31 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d ) USE upf_kinds, ONLY : DP USE upf_const, ONLY : pi, tpi, e2, eps8 ! - IMPLICIT NONE + implicit none ! ! I/O - INTEGER, INTENT(IN) :: itype, ngl - REAL(DP), INTENT(IN) :: zion, tpiba2, omega - ! - REAL(DP), INTENT(IN) :: gl_d(ngl) - REAL(DP), INTENT(OUT) :: dvloc_d(ngl) + integer, intent(in) :: itype, ngl + real(dp), intent(in) :: zion, tpiba2, omega + real(dp), intent(in) :: gl(ngl) + real(dp), intent(out) :: dvloc(ngl) ! ! Local variables - INTEGER :: ii, my_gth, igl, igl0 - REAL(DP) :: cc1, cc2, cc3, cc4, rloc, gl1, & + integer :: ii, my_gth, igl, igl0 + real(dp) :: cc1, cc2, cc3, cc4, rloc, gl1, & gx, gx2, gx3, rl2, rl3, rq2, r2q, r4g3, r6g5, e_rq2h, fact ! -#if defined(__CUDA) - attributes(DEVICE) :: dvloc_d, gl_d -#endif + !$acc data present( dvloc, gl ) ! ! IF ( do_comp_esm ) call upf_error('vloc_gth', 'ESM not implemented', itype) ! ! Find gtp param. set for type itype my_gth = 0 - DO ii = 1, SIZE(gth_p) - IF (gth_p(ii)%itype==itype) THEN + do ii = 1, SIZE(gth_p) + if (gth_p(ii)%itype==itype) then my_gth = ii - EXIT - ENDIF - ENDDO + exit + endif + enddo ! IF ( my_gth==0 ) CALL upf_error( 'dvloc_gth', 'cannot map itype in some gtp param. set', itype ) rloc = gth_p(my_gth)%rloc @@ -472,25 +384,27 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d ) cc4 = gth_p(my_gth)%cc(4) ! ! Compute vloc(q) - gl1 = gl_d(1) - IF (gl1 < eps8) THEN + gl1 = gl(1) + if (gl1 < eps8) then ! ! first the G=0 term ! - dvloc_d(1) = 0.0_DP + !$acc kernels + dvloc(1) = 0.0_DP + !$acc end kernels igl0 = 2 - ELSE + else igl0 = 1 - ENDIF + endif ! ! here the G<>0 terms, we first compute the part of the integrand ! function independent of |G| in real space ! fact = tpi * e2 / omega ! - !$cuf kernel do (1) <<<*,*>>> - DO igl = igl0, ngl - gx = SQRT(gl_d(igl) * tpiba2) + !$acc parallel loop + do igl = igl0, ngl + gx = SQRT(gl(igl) * tpiba2) gx2 = gx**2 gx3 = gx*gx2 rl2 = rloc**2 @@ -500,7 +414,7 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d ) r4g3 = rl2*rl2*gx3 r6g5 = r4g3*rl2*gx2 e_rq2h = EXP(-0.5_DP*rq2) - dvloc_d(igl) = fact * & + dvloc(igl) = fact * & e_rq2h*(zion*(rq2+2._DP)/gx3 + SQRT(pi/2._DP)*rl3* & ( & ( & @@ -515,11 +429,11 @@ SUBROUTINE dvloc_gth_gpu( itype, zion, tpiba2, ngl, gl_d, omega, dvloc_d ) ) & ) & )/gx - ENDDO + enddo ! + !$acc end data ! -END SUBROUTINE dvloc_gth_gpu -! +end subroutine dvloc_gth ! !----------------------------------------------------------------------- subroutine setlocq_gth(itype, xq, zion, tpiba2, ngm, g, omega, vloc) diff --git a/upflib/init_us_1.f90 b/upflib/init_us_1.f90 index 40a174ba6..596e9b0f0 100644 --- a/upflib/init_us_1.f90 +++ b/upflib/init_us_1.f90 @@ -30,8 +30,8 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm ) USE atom, ONLY : rgrid USE uspp, ONLY : nhtol, nhtoj, nhtolm, ijtoh, dvan, qq_at, qq_nt, indv, & ap, aainit, qq_so, dvan_so, okvan, ofsbeta, & - nhtol_d, nhtoj_d, nhtolm_d, ijtoh_d, dvan_d, qq_at_d, & - qq_nt_d, indv_d, qq_so_d, dvan_so_d, ofsbeta_d + nhtol_d, nhtoj_d, nhtolm_d, ijtoh_d, dvan_d, & + qq_nt_d, indv_d, dvan_so_d, ofsbeta_d USE uspp_param, ONLY : upf, lmaxq, nh, nhm, lmaxkb, nsp USE upf_spinorb, ONLY : is_spinorbit, rot_ylm, fcoef, fcoef_d, lmaxx USE paw_variables,ONLY : okpaw @@ -320,10 +320,8 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm ) nhtolm_d=nhtolm nhtoj_d=nhtoj ijtoh_d=ijtoh - qq_at_d=qq_at qq_nt_d=qq_nt if (is_spinorbit) then - qq_so_d=qq_so dvan_so_d=dvan_so fcoef_d=fcoef else @@ -333,6 +331,13 @@ subroutine init_us_1( nat, ityp, omega, ngm, g, gg, intra_bgrp_comm ) ofsbeta_d=ofsbeta ! #endif + ! + if (nhm>0) then + !$acc update device(qq_at) + if (is_spinorbit) then + !$acc update device(qq_so) + endif + endif ! call stop_clock ('init_us_1') return diff --git a/upflib/simpsn.f90 b/upflib/simpsn.f90 index 0e815fa47..a9627142a 100644 --- a/upflib/simpsn.f90 +++ b/upflib/simpsn.f90 @@ -7,6 +7,7 @@ ! !----------------------------------------------------------------------- SUBROUTINE simpson(mesh, func, rab, asum) + !$acc routine vector !----------------------------------------------------------------------- ! ! simpson's rule integration. On input: @@ -22,27 +23,30 @@ SUBROUTINE simpson(mesh, func, rab, asum) ! USE upf_kinds, ONLY: DP IMPLICIT NONE - INTEGER, INTENT(in) :: mesh - real(DP), INTENT(in) :: rab (mesh), func (mesh) + INTEGER, INTENT(in) :: mesh + real(DP), INTENT(in) :: rab(mesh), func(mesh) real(DP), INTENT(out):: asum ! - real(DP) :: f1, f2, f3, r12 + real(DP) :: f1, f2, f3, r12, fct INTEGER :: i ! asum = 0.0d0 r12 = 1.0d0 / 3.0d0 - f3 = func (1) * rab (1) * r12 - - DO i = 2, mesh - 1, 2 - f1 = f3 - f2 = func (i) * rab (i) * r12 - f3 = func (i + 1) * rab (i + 1) * r12 - asum = asum + f1 + 4.0d0 * f2 + f3 + ! + !$acc loop vector reduction(+:asum) + DO i = 2, mesh-1 + fct = DBLE(ABS(MOD(i,2)-2)*2) + asum = asum + fct * func(i) * rab(i) ENDDO + IF (MOD(mesh,2)==1) THEN + asum = (asum + func(1)*rab(1) + func(mesh)*rab(mesh)) * r12 + ELSE + asum = (asum + func(1)*rab(1) - func(mesh-1)*rab(mesh-1)) * r12 + ENDIF ! ! if mesh is not odd, use open formula instead: ! ... 2/3*f(n-5) + 4/3*f(n-4) + 13/12*f(n-3) + 0*f(n-2) + 27/12*f(n-1) - !!! Under testing + !** Under testing ! !IF ( MOD(mesh,2) == 0 ) THEN ! print *, 'mesh even: correction:', f1*5.d0/4.d0-4.d0*f2+23.d0*f3/4.d0, & diff --git a/upflib/sph_bes.f90 b/upflib/sph_bes.f90 index 4b5113245..ca8d30471 100644 --- a/upflib/sph_bes.f90 +++ b/upflib/sph_bes.f90 @@ -8,11 +8,12 @@ ! !-------------------------------------------------------------------- subroutine sph_bes (msh, r, q, l, jl) + !$acc routine vector !-------------------------------------------------------------------- !! Spherical Bessel function. ! USE upf_kinds, only: DP - USE upf_const, ONLY : eps14 + USE upf_const, only: eps14 ! implicit none ! @@ -29,8 +30,8 @@ subroutine sph_bes (msh, r, q, l, jl) ! ! xseries = convergence radius of the series for small x of j_l(x) real(DP) :: x, xl, xseries = 0.05_dp - integer :: ir, ir0 - integer, external:: semifact + integer :: i, ir, ir0 + integer :: semifact ! #if defined (__MASS) real(DP) :: qr(msh), sin_qr(msh), cos_qr(msh) @@ -40,11 +41,17 @@ subroutine sph_bes (msh, r, q, l, jl) if (abs (q) < eps14) then if (l == -1) then - call upf_error ('sph_bes', 'j_{-1}(0) ?!?', 1) + stop !call upf_error ('sph_bes', 'j_{-1}(0) ?!?', 1) elseif (l == 0) then - jl(:) = 1.d0 + !$acc loop vector + do ir = 1, msh + jl(ir) = 1.d0 + enddo else - jl(:) = 0.d0 + !$acc loop vector + do ir = 1, msh + jl(ir) = 0.d0 + enddo endif return end if @@ -52,7 +59,7 @@ subroutine sph_bes (msh, r, q, l, jl) ! case l=-1 if (l == - 1) then - if (abs (q * r (1) ) < eps14) call upf_error ('sph_bes', 'j_{-1}(0) ?!?',1) + if (abs (q * r (1) ) < eps14) stop !call upf_error ('sph_bes', 'j_{-1}(0) ?!?',1) #if defined (__MASS) @@ -61,9 +68,10 @@ subroutine sph_bes (msh, r, q, l, jl) jl = cos_qr / qr #else - - jl (:) = cos (q * r (:) ) / (q * r (:) ) - + !$acc loop vector + do ir = 1, msh + jl (ir) = cos (q * r (ir) ) / (q * r (ir) ) + enddo #endif return @@ -75,6 +83,7 @@ subroutine sph_bes (msh, r, q, l, jl) ! notice that for small q it may happen that q*r(msh) < xseries ! ir0 = msh+1 + !$acc loop vector do ir = 1, msh if ( abs (q * r (ir) ) > xseries ) then ir0 = ir @@ -82,6 +91,7 @@ subroutine sph_bes (msh, r, q, l, jl) end if end do + !$acc loop vector do ir = 1, ir0 - 1 x = q * r (ir) if ( l == 0 ) then @@ -89,11 +99,18 @@ subroutine sph_bes (msh, r, q, l, jl) else xl = x**l end if - jl (ir) = xl/semifact(2*l+1) * & - ( 1.0_dp - x**2/1.0_dp/2.0_dp/(2.0_dp*l+3) * & - ( 1.0_dp - x**2/2.0_dp/2.0_dp/(2.0_dp*l+5) * & - ( 1.0_dp - x**2/3.0_dp/2.0_dp/(2.0_dp*l+7) * & - ( 1.0_dp - x**2/4.0_dp/2.0_dp/(2.0_dp*l+9) ) ) ) ) + !-- + semifact = 1 + !$acc loop seq reduction(*:semifact) + do i = 2*l+1, 1, -2 + semifact = i*semifact + enddo + !--- + jl (ir) = xl/DBLE(semifact) * & + ( 1.0_dp - x**2/1.0_dp/2.0_dp/DBLE(2*l+3) * & + ( 1.0_dp - x**2/2.0_dp/2.0_dp/DBLE(2*l+5) * & + ( 1.0_dp - x**2/3.0_dp/2.0_dp/DBLE(2*l+7) * & + ( 1.0_dp - x**2/4.0_dp/2.0_dp/DBLE(2*l+9) ) ) ) ) end do ! the following shouldn't be needed but do you trust compilers @@ -110,9 +127,10 @@ subroutine sph_bes (msh, r, q, l, jl) jl (ir0:) = sin_qr(ir0:) / (q * r (ir0:) ) #else - - jl (ir0:) = sin (q * r (ir0:) ) / (q * r (ir0:) ) - + !$acc loop vector + do ir = ir0, msh + jl (ir) = sin (q * r (ir) ) / (q * r (ir) ) + enddo #endif elseif (l == 1) then @@ -126,10 +144,11 @@ subroutine sph_bes (msh, r, q, l, jl) cos_qr(ir0:) ) / (q * r (ir0:) ) #else - - jl (ir0:) = (sin (q * r (ir0:) ) / (q * r (ir0:) ) - & - cos (q * r (ir0:) ) ) / (q * r (ir0:) ) - + !$acc loop vector + do ir = ir0, msh + jl (ir) = (sin (q * r (ir) ) / (q * r (ir) ) - & + cos (q * r (ir) ) ) / (q * r (ir) ) + enddo #endif elseif (l == 2) then @@ -143,10 +162,11 @@ subroutine sph_bes (msh, r, q, l, jl) 3.d0 * cos_qr(ir0:) ) / (q*r(ir0:))**2 #else - - jl (ir0:) = ( (3.d0 / (q*r(ir0:)) - (q*r(ir0:)) ) * sin (q*r(ir0:)) - & - 3.d0 * cos (q*r(ir0:)) ) / (q*r(ir0:))**2 - + !$acc loop vector + do ir = ir0, msh + jl (ir) = ( (3.d0 / (q*r(ir)) - (q*r(ir)) ) * sin (q*r(ir)) - & + 3.d0 * cos (q*r(ir)) ) / (q*r(ir))**2 + enddo #endif elseif (l == 3) then @@ -162,12 +182,13 @@ subroutine sph_bes (msh, r, q, l, jl) (q*r(ir0:))**3 #else - - jl (ir0:) = (sin (q*r(ir0:)) * & - (15.d0 / (q*r(ir0:)) - 6.d0 * (q*r(ir0:)) ) + & - cos (q*r(ir0:)) * ( (q*r(ir0:))**2 - 15.d0) ) / & - (q*r(ir0:)) **3 - + !$acc loop vector + do ir = ir0, msh + jl (ir) = (sin (q*r(ir)) * & + (15.d0 / (q*r(ir)) - 6.d0 * (q*r(ir)) ) + & + cos (q*r(ir)) * ( (q*r(ir))**2 - 15.d0) ) / & + (q*r(ir)) **3 + enddo #endif elseif (l == 4) then @@ -184,12 +205,14 @@ subroutine sph_bes (msh, r, q, l, jl) (q*r(ir0:))**5 #else - - jl (ir0:) = (sin (q*r(ir0:)) * & - (105.d0 - 45.d0 * (q*r(ir0:))**2 + (q*r(ir0:))**4) + & - cos (q*r(ir0:)) * & - (10.d0 * (q*r(ir0:))**3 - 105.d0 * (q*r(ir0:))) ) / & - (q*r(ir0:))**5 + !$acc loop vector + do ir = ir0, msh + jl (ir) = (sin (q*r(ir)) * & + (105.d0 - 45.d0 * (q*r(ir))**2 + (q*r(ir))**4) + & + cos (q*r(ir)) * & + (10.d0 * (q*r(ir))**3 - 105.d0 * (q*r(ir))) ) / & + (q*r(ir))**5 + enddo #endif elseif (l == 5) then @@ -205,12 +228,15 @@ subroutine sph_bes (msh, r, q, l, jl) (420.d0*sin_qr(ir0:)) / (q*r(ir0:)) ** 3 + & ( 15.d0*sin_qr(ir0:)) / (q*r(ir0:)) ) / (q*r(ir0:)) #else - jl (ir0:) = (-cos(q*r(ir0:)) - & - (945.d0*cos(q*r(ir0:))) / (q*r(ir0:)) ** 4 + & - (105.d0*cos(q*r(ir0:))) / (q*r(ir0:)) ** 2 + & - (945.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ** 5 - & - (420.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ** 3 + & - ( 15.d0*sin(q*r(ir0:))) / (q*r(ir0:)) ) / (q*r(ir0:)) + !$acc loop vector + do ir = ir0, msh + jl (ir) = (-cos(q*r(ir)) - & + (945.d0*cos(q*r(ir))) / (q*r(ir)) ** 4 + & + (105.d0*cos(q*r(ir))) / (q*r(ir)) ** 2 + & + (945.d0*sin(q*r(ir))) / (q*r(ir)) ** 5 - & + (420.d0*sin(q*r(ir))) / (q*r(ir)) ** 3 + & + ( 15.d0*sin(q*r(ir))) / (q*r(ir)) ) / (q*r(ir)) + enddo #endif elseif (l == 6) then @@ -228,19 +254,21 @@ subroutine sph_bes (msh, r, q, l, jl) ( 4725.d0*sin_qr(ir0:)) / (q*r(ir0:))**4 + & ( 210.d0*sin_qr(ir0:)) / (q*r(ir0:))**2 ) / (q*r(ir0:)) #else - - jl (ir0:) = ((-10395.d0*cos(q*r(ir0:))) / (q*r(ir0:))**5 + & - ( 1260.d0*cos(q*r(ir0:))) / (q*r(ir0:))**3 - & - ( 21.d0*cos(q*r(ir0:))) / (q*r(ir0:)) - & - sin(q*r(ir0:)) + & - ( 10395.d0*sin(q*r(ir0:))) / (q*r(ir0:))**6 - & - ( 4725.d0*sin(q*r(ir0:))) / (q*r(ir0:))**4 + & - ( 210.d0*sin(q*r(ir0:))) / (q*r(ir0:))**2 ) / (q*r(ir0:)) + !$acc loop vector + do ir = ir0, msh + jl (ir) = ((-10395.d0*cos(q*r(ir))) / (q*r(ir))**5 + & + ( 1260.d0*cos(q*r(ir))) / (q*r(ir))**3 - & + ( 21.d0*cos(q*r(ir))) / (q*r(ir)) - & + sin(q*r(ir)) + & + ( 10395.d0*sin(q*r(ir))) / (q*r(ir))**6 - & + ( 4725.d0*sin(q*r(ir))) / (q*r(ir))**4 + & + ( 210.d0*sin(q*r(ir))) / (q*r(ir))**2 ) / (q*r(ir)) + enddo #endif else - call upf_error ('sph_bes', 'not implemented', abs(l)) + stop !call upf_error ('sph_bes', 'not implemented', abs(l)) endif ! diff --git a/upflib/upf_acc_interfaces.f90 b/upflib/upf_acc_interfaces.f90 new file mode 100644 index 000000000..bc51be9a6 --- /dev/null +++ b/upflib/upf_acc_interfaces.f90 @@ -0,0 +1,30 @@ +! +! Copyright (C) 2001-2022 Quantum ESPRESSO group +! This file is distributed under the terms of the +! GNU General Public License. See the file `License' +! in the root directory of the present distribution, +! or http://www.gnu.org/copyleft/gpl.txt . +! + +MODULE upf_acc_interfaces + ! + INTERFACE sph_bes_acc + SUBROUTINE sph_bes( msh, r, q, l, jl ) + !$acc routine vector + IMPLICIT NONE + INTEGER :: msh, l + REAL(8) :: r(msh), q, jl(msh) + END SUBROUTINE + END INTERFACE + ! + INTERFACE + SUBROUTINE simpson( mesh, func, rab, asum ) + !$acc routine vector + IMPLICIT NONE + INTEGER, INTENT(IN) :: mesh + REAL(8), INTENT(IN) :: rab(mesh), func(mesh) + REAL(8), INTENT(OUT):: asum + END SUBROUTINE + END INTERFACE + ! +END MODULE diff --git a/upflib/uspp.f90 b/upflib/uspp.f90 index 811f77692..86a10a814 100644 --- a/upflib/uspp.f90 +++ b/upflib/uspp.f90 @@ -30,11 +30,10 @@ MODULE uspp nkb, nkbus, vkb, dvan, deeq, qq_at, qq_nt, nhtoj, ijtoh, beta, & becsum, ebecsum PUBLIC :: lpx_d, lpl_d, ap_d, indv_d, nhtol_d, nhtolm_d, ofsbeta_d, & - dvan_d, deeq_d, qq_at_d, qq_nt_d, nhtoj_d, ijtoh_d, & - becsum_d, ebecsum_d + dvan_d, qq_nt_d, nhtoj_d, ijtoh_d, becsum_d, ebecsum_d PUBLIC :: okvan, nlcc_any PUBLIC :: qq_so, dvan_so, deeq_nc, fcoef - PUBLIC :: qq_so_d, dvan_so_d, deeq_nc_d, fcoef_d + PUBLIC :: dvan_so_d, fcoef_d PUBLIC :: dbeta ! PUBLIC :: allocate_uspp, deallocate_uspp @@ -110,16 +109,12 @@ MODULE uspp REAL(DP), ALLOCATABLE :: becsum_d(:,:,:) REAL(DP), ALLOCATABLE :: ebecsum_d(:,:,:) REAL(DP), ALLOCATABLE :: dvan_d(:,:,:) - REAL(DP), ALLOCATABLE :: deeq_d(:,:,:,:) REAL(DP), ALLOCATABLE :: qq_nt_d(:,:,:) - REAL(DP), ALLOCATABLE :: qq_at_d(:,:,:) REAL(DP), ALLOCATABLE :: nhtoj_d(:,:) - COMPLEX(DP), ALLOCATABLE :: qq_so_d(:,:,:,:) COMPLEX(DP), ALLOCATABLE :: dvan_so_d(:,:,:,:) - COMPLEX(DP), ALLOCATABLE :: deeq_nc_d(:,:,:,:) #if defined(__CUDA) - attributes (DEVICE) :: becsum_d, ebecsum_d, dvan_d, deeq_d, qq_nt_d, & - qq_at_d, nhtoj_d, qq_so_d, dvan_so_d, deeq_nc_d + attributes (DEVICE) :: becsum_d, ebecsum_d, dvan_d, qq_nt_d, & + nhtoj_d, dvan_so_d #endif ! @@ -356,15 +351,19 @@ CONTAINS allocate( nhtoj(nhm,nsp) ) allocate( ijtoh(nhm,nhm,nsp) ) allocate( deeq(nhm,nhm,nat,nspin) ) + !$acc enter data create(deeq) if ( noncolin ) then allocate( deeq_nc(nhm,nhm,nat,nspin) ) + !$acc enter data create(deeq_nc) endif allocate( qq_at(nhm,nhm,nat) ) + !$acc enter data create(qq_at) allocate( qq_nt(nhm,nhm,nsp) ) ! set the internal spin-orbit flag is_spinorbit = lspinorb if ( lspinorb ) then allocate( qq_so(nhm,nhm,4,nsp) ) + !$acc enter data create(qq_so) allocate( dvan_so(nhm,nhm,nspin,nsp) ) allocate( fcoef(nhm,nhm,2,2,nsp) ) else @@ -386,14 +385,8 @@ CONTAINS allocate( nhtolm_d(nhm,nsp) ) allocate( nhtoj_d(nhm,nsp) ) allocate( ijtoh_d(nhm,nhm,nsp) ) - allocate( deeq_d(nhm,nhm,nat,nspin) ) - if ( noncolin ) then - allocate( deeq_nc_d(nhm,nhm,nat,nspin) ) - endif - allocate( qq_at_d(nhm,nhm,nat) ) allocate( qq_nt_d(nhm,nhm,nsp) ) if ( lspinorb ) then - allocate( qq_so_d(nhm,nhm,4,nsp) ) allocate( dvan_so_d(nhm,nhm,nspin,nsp) ) allocate( fcoef_d(nhm,nhm,2,2,nsp) ) else @@ -419,22 +412,34 @@ CONTAINS IF( ALLOCATED( indv ) ) DEALLOCATE( indv ) IF( ALLOCATED( nhtolm ) ) DEALLOCATE( nhtolm ) IF( ALLOCATED( nhtoj ) ) DEALLOCATE( nhtoj ) - IF( ALLOCATED( ofsbeta ) ) DEALLOCATE( ofsbeta ) + IF( ALLOCATED( ofsbeta ) ) DEALLOCATE( ofsbeta ) IF( ALLOCATED( ijtoh ) ) DEALLOCATE( ijtoh ) !FIXME in order to be created and deleted automatically by using !$acc declare create(vkb) in IF( ALLOCATED( vkb ) ) THEN -!$acc exit data delete(vkb ) + !$acc exit data delete(vkb ) DEALLOCATE( vkb ) END IF IF( ALLOCATED( becsum ) ) DEALLOCATE( becsum ) IF( ALLOCATED( ebecsum ) ) DEALLOCATE( ebecsum ) - IF( ALLOCATED( qq_at ) ) DEALLOCATE( qq_at ) + IF( ALLOCATED( qq_at ) ) THEN + !$acc exit data delete( qq_at ) + DEALLOCATE( qq_at ) + ENDIF IF( ALLOCATED( qq_nt ) ) DEALLOCATE( qq_nt ) IF( ALLOCATED( dvan ) ) DEALLOCATE( dvan ) - IF( ALLOCATED( deeq ) ) DEALLOCATE( deeq ) - IF( ALLOCATED( qq_so ) ) DEALLOCATE( qq_so ) + IF( ALLOCATED( deeq ) ) THEN + !$acc exit data delete( deeq ) + DEALLOCATE( deeq ) + ENDIF + IF( ALLOCATED( qq_so ) ) THEN + !$acc exit data delete( qq_so ) + DEALLOCATE( qq_so ) + ENDIF IF( ALLOCATED( dvan_so ) ) DEALLOCATE( dvan_so ) - IF( ALLOCATED( deeq_nc ) ) DEALLOCATE( deeq_nc ) + IF( ALLOCATED( deeq_nc ) ) THEN + !$acc exit data delete( deeq_nc ) + DEALLOCATE( deeq_nc ) + ENDIF IF( ALLOCATED( fcoef ) ) DEALLOCATE( fcoef ) IF( ALLOCATED( beta ) ) DEALLOCATE( beta ) IF( ALLOCATED( dbeta ) ) DEALLOCATE( dbeta ) @@ -451,13 +456,9 @@ CONTAINS IF( ALLOCATED( becsum_d ) ) DEALLOCATE( becsum_d ) IF( ALLOCATED( ebecsum_d ) ) DEALLOCATE( ebecsum_d ) IF( ALLOCATED( dvan_d ) ) DEALLOCATE( dvan_d ) - IF( ALLOCATED( deeq_d ) ) DEALLOCATE( deeq_d ) IF( ALLOCATED( qq_nt_d ) ) DEALLOCATE( qq_nt_d ) - IF( ALLOCATED( qq_at_d ) ) DEALLOCATE( qq_at_d ) IF( ALLOCATED( nhtoj_d ) ) DEALLOCATE( nhtoj_d ) - IF( ALLOCATED( qq_so_d ) ) DEALLOCATE( qq_so_d ) IF( ALLOCATED( dvan_so_d ) ) DEALLOCATE( dvan_so_d ) - IF( ALLOCATED( deeq_nc_d ) ) DEALLOCATE( deeq_nc_d ) IF( ALLOCATED( fcoef_d ) ) DEALLOCATE( fcoef_d ) ! END SUBROUTINE deallocate_uspp