Phonon-gpu openACC acceleration of apply_dpot routine.

- The position of the G vec in the FFT grid (nl) are copied to the device from host fft descriptor (dffts) in Fourier transform of wavefunction (cft_wave routines)
- index of G corresponding to a given index of k+G (igk_k) is copied
from host to device with openACC in cft_wave routine
This commit is contained in:
Sergio Orlandini 2022-01-10 16:01:34 +01:00 committed by Oscar Baseggio
parent eb84c05045
commit 6ef714d9a8
8 changed files with 71 additions and 9 deletions

View File

@ -49,6 +49,7 @@ SUBROUTINE apply_dpot(nrxxs, aux1, dv, current_spin)
! Noncollinear part without task groups
!
IF (domag) then
!$acc kernels present(aux1, dv)
DO ir = 1, nrxxs
sup=aux1(ir,1)*(dv(ir,1)+dv(ir,4))+ &
aux1(ir,2)*(dv(ir,2)-(0.d0,1.d0)*dv(ir,3))
@ -57,10 +58,13 @@ SUBROUTINE apply_dpot(nrxxs, aux1, dv, current_spin)
aux1(ir,1)=sup
aux1(ir,2)=sdwn
ENDDO
!$acc end kernels
ELSE
!$acc kernels present(aux1, dv)
DO ir = 1, nrxxs
aux1(ir,:)=aux1(ir,:)*dv(ir,1)
ENDDO
!$acc end kernels
ENDIF
!
ENDIF
@ -79,6 +83,7 @@ SUBROUTINE apply_dpot(nrxxs, aux1, dv, current_spin)
!
! Collinear part without task groups
!
!$acc parallel loop present(aux1, dv)
DO ir = 1, nrxxs
aux1(ir,1)=aux1(ir,1)*dv(ir,current_spin)
ENDDO

View File

@ -41,6 +41,8 @@ MODULE apply_dpot_mod
ALLOCATE(psi_r(dffts%nnr, npol), STAT=ierr)
IF (ierr /= 0) CALL errore('apply_dpot_allocate', 'Error allocating psi_r', 1)
!
!$acc enter data create(psi_r(1:dffts%nnr, 1:npol))
!
IF (dffts%has_task_groups) THEN
ALLOCATE(tg_dv(dffts%nnr_tg, nspin_mag), STAT=ierr)
IF (ierr /= 0) CALL errore('apply_dpot_allocate', 'Error allocating tg_dv', 1)
@ -66,6 +68,8 @@ MODULE apply_dpot_mod
IF (.NOT. is_allocated) RETURN
is_allocated = .FALSE.
!
!$acc exit data delete(psi_r)
!
DEALLOCATE(psi_r, STAT=ierr)
IF (ierr /= 0) CALL errore('apply_dpot_deallocate', 'Error deallocating psi_r', 1)
!
@ -119,6 +123,9 @@ MODULE apply_dpot_mod
!
CALL start_clock("apply_dpot_b")
!
!$acc enter data copyin(psi)
!$acc update device(dv(1:dffts%nnr, 1:nspin_mag))
!
IF (.NOT. is_allocated) CALL apply_dpot_allocate()
!
incr = 1
@ -141,7 +148,9 @@ MODULE apply_dpot_mod
ENDIF ! noncolin
ENDIF ! has_task_groups
!
!$acc kernels present(dvpsi)
dvpsi = (0.0_DP, 0.0_DP)
!$acc end kernels
!
DO ibnd = 1, nbnd, incr
IF (dffts%has_task_groups) THEN
@ -155,6 +164,9 @@ MODULE apply_dpot_mod
ENDIF ! has_task_groups
ENDDO ! ibnd
!
!$acc update self(dvpsi(1:npwx*npol, 1:nbnd))
!$acc exit data delete(psi)
!
CALL stop_clock("apply_dpot_b")
!
END SUBROUTINE apply_dpot_bands

View File

@ -46,6 +46,8 @@ SUBROUTINE cft_wave (ik, evc_g, evc_r, isw)
CALL start_clock ('cft_wave')
!$acc data copyin(igk_k) copyin(dffts) copyin(dffts%nl)
IF (isw == 1) THEN
ikk = ikks(ik) ! points to k+G indices
npw = ngk(ikk)
@ -58,6 +60,8 @@ SUBROUTINE cft_wave (ik, evc_g, evc_r, isw)
CALL errore ('cft_wave',' Wrong value for isw',1)
ENDIF
!$acc end data
CALL stop_clock ('cft_wave')
RETURN
@ -75,16 +79,24 @@ SUBROUTINE fwfft_wave (npwq, igkq, evc_g, evc_r )
INTEGER, INTENT(IN) :: npwq, igkq(npwq)
COMPLEX(DP), INTENT(INOUT) :: evc_g (npwx*npol), evc_r (dffts%nnr,npol)
!
INTEGER :: ig
INTEGER :: ig, ik
!$acc host_data use_device(evc_r)
CALL fwfft ('Wave', evc_r(:,1), dffts)
!$acc end host_data
!$acc parallel loop present(evc_g, evc_r, igkq, dffts, dffts%nl) private(ik)
DO ig = 1, npwq
evc_g (ig) = evc_g (ig) + evc_r (dffts%nl (igkq(ig) ), 1 )
ik = dffts%nl(igkq(ig))
evc_g (ig) = evc_g (ig) + evc_r (ik,1)
ENDDO
IF (noncolin) THEN
!$acc host_data use_device(evc_r)
CALL fwfft ('Wave', evc_r(:,2), dffts)
!$acc end host_data
!$acc parallel loop present(evc_g, evc_r, igkq, dffts, dffts%nl) private(ik)
DO ig = 1, npwq
evc_g (ig+npwx) = evc_g (ig+npwx) + evc_r (dffts%nl(igkq(ig)),2)
ik = dffts%nl(igkq(ig))
evc_g (ig+npwx) = evc_g (ig+npwx) + evc_r (ik,2)
ENDDO
ENDIF
END SUBROUTINE fwfft_wave
@ -101,18 +113,28 @@ SUBROUTINE invfft_wave (npw, igk, evc_g, evc_r )
COMPLEX(DP), INTENT(IN) :: evc_g (npwx*npol)
COMPLEX(DP), INTENT(OUT):: evc_r (dffts%nnr,npol)
!
INTEGER :: ig
INTEGER :: ig, ik
!$acc kernels present(evc_r)
evc_r = (0.0_dp, 0.0_dp)
!$acc end kernels
!$acc parallel loop present(evc_g, evc_r, igk, dffts, dffts%nl) private(ik)
DO ig = 1, npw
evc_r (dffts%nl (igk(ig) ),1 ) = evc_g (ig)
ik = dffts%nl(igk(ig))
evc_r (ik, 1) = evc_g (ig)
ENDDO
!$acc host_data use_device(evc_r)
CALL invfft ('Wave', evc_r(:,1), dffts)
!$acc end host_data
IF (noncolin) THEN
!$acc parallel loop present(evc_g, evc_r, igk, dffts, dffts%nl) private(ik)
DO ig = 1, npw
evc_r (dffts%nl(igk(ig)),2) = evc_g (ig+npwx)
ik = dffts%nl(igk(ig))
evc_r (ik, 2) = evc_g (ig+npwx)
ENDDO
!$acc host_data use_device(evc_r)
CALL invfft ('Wave', evc_r(:,2), dffts)
!$acc end host_data
ENDIF
END SUBROUTINE invfft_wave

View File

@ -139,6 +139,8 @@ SUBROUTINE sternheimer_kernel(first_iter, time_reversed, npert, lrdvpsi, iudvpsi
ALLOCATE(h_diag(npwx*npol, nbnd))
ALLOCATE(aux2(npwx*npol, nbnd))
!
!$acc enter data create(aux2(1:npwx*npol, 1:nbnd))
!
all_conv = .TRUE.
tot_num_iter = 0
tot_cg_calls = 0
@ -275,6 +277,8 @@ SUBROUTINE sternheimer_kernel(first_iter, time_reversed, npert, lrdvpsi, iudvpsi
CALL mp_sum(tot_cg_calls, inter_pool_comm)
avg_iter = REAL(tot_num_iter, DP) / REAL(tot_cg_calls, DP)
!
!$acc exit data delete(aux2)
!
DEALLOCATE(aux2)
DEALLOCATE(h_diag)
!

View File

@ -394,6 +394,9 @@ SUBROUTINE elphel (irr, npe, imode0, dvscfins)
ALLOCATE (aux2(npwx*npol, nbnd))
el_ph_mat_rec=(0.0_DP,0.0_DP)
aux2(:, :) = (0.0_DP, 0.0_DP)
!
!$acc enter data create(dvscfins(1:dffts%nnr, 1:nspin_mag, 1:npe), aux2(1:npwx*npol, 1:nbnd))
!
CALL apply_dpot_allocate()
!
! DFPT+U case
@ -604,6 +607,9 @@ SUBROUTINE elphel (irr, npe, imode0, dvscfins)
DEALLOCATE(el_ph_mat_rec)
!
CALL apply_dpot_deallocate()
!
!$acc exit data delete(dvscfins, aux2)
!
DEALLOCATE (elphmat)
DEALLOCATE (aux2)
!

View File

@ -592,7 +592,9 @@ SUBROUTINE elphel_refolded (npe, imode0, dvscfins)
ALLOCATE (elphmat ( nbnd , nbnd , 3*nat))
allocate (aux_psi(npol*npwx,nbnd))
CALL apply_dpot_allocate()
!
!$acc enter data create(dvscfins(1:dffts%nnr, 1:nspin_mag, 1:npe), dvpsi(1:npwx*npol, 1:nbnd))
!
! iunwfcwann=find_free_unit()
! CALL diropn (iunwfcwann, 'wfc', lrwfc, exst, dvscf_dir)
@ -693,7 +695,7 @@ SUBROUTINE elphel_refolded (npe, imode0, dvscfins)
ENDDO
ENDDO
ENDDO
!$acc exit data delete(dvscfins, dvpsi)
! CLOSE( UNIT = iunwfcwann, STATUS = 'KEEP' )
!

View File

@ -94,18 +94,22 @@ subroutine solve_e
!! the becsum with dpsi
COMPLEX(DP), ALLOCATABLE :: mixin(:), mixout(:)
!! auxiliary for paw mixing
INTEGER :: nnr
!
call start_clock ('solve_e')
!
! This routine is task group aware
!
allocate (dvscfin( dfftp%nnr, nspin_mag, 3))
nnr = dfftp%nnr
dvscfin=(0.0_DP,0.0_DP)
if (doublegrid) then
allocate (dvscfins(dffts%nnr, nspin_mag, 3))
nnr = dffts%nnr
else
dvscfins => dvscfin
endif
!$acc enter data create(dvscfins(1:nnr, 1:nspin_mag, 1:3))
allocate (dvscfout(dfftp%nnr, nspin_mag, 3))
IF (okpaw) THEN
ALLOCATE (mixin(dfftp%nnr*nspin_mag*3+(nhm*(nhm+1)*nat*nspin_mag*3)/2) )
@ -319,6 +323,7 @@ subroutine solve_e
DEALLOCATE(mixin)
DEALLOCATE(mixout)
ENDIF
!$acc exit data delete(dvscfins)
if (doublegrid) deallocate (dvscfins)
deallocate (dvscfin)
if (noncolin) deallocate(dbecsum_nc)

View File

@ -150,6 +150,8 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
integer :: iq_dummy
real(DP) :: tcpu, get_clock ! timing variables
character(len=256) :: filename
integer :: nnr
!
IF (rec_code_read > 20 ) RETURN
@ -161,12 +163,15 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
IF (noncolin.AND.domag) nsolv=2
allocate (dvscfin ( dfftp%nnr , nspin_mag , npe))
nnr = dfftp%nnr
dvscfin=(0.0_DP,0.0_DP)
if (doublegrid) then
allocate (dvscfins (dffts%nnr , nspin_mag , npe))
nnr = dffts%nnr
else
dvscfins => dvscfin
endif
!$acc enter data create(dvscfins(1:nnr, 1:nspin_mag, 1:npe))
allocate (drhoscfh ( dfftp%nnr, nspin_mag , npe))
allocate (dvscfout ( dfftp%nnr, nspin_mag , npe))
allocate (dbecsum ( (nhm * (nhm + 1))/2 , nat , nspin_mag , npe))
@ -593,6 +598,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
IF (noncolin) deallocate (dbecsum_nc)
deallocate (dvscfout)
deallocate (drhoscfh)
!$acc exit data delete(dvscfins)
if (doublegrid) deallocate (dvscfins)
deallocate (dvscfin)
deallocate(aux2)