mirror of https://gitlab.com/QEF/q-e.git
Phonon-gpu openACC acceleration of apply_dpot routine.
- The position of the G vec in the FFT grid (nl) are copied to the device from host fft descriptor (dffts) in Fourier transform of wavefunction (cft_wave routines) - index of G corresponding to a given index of k+G (igk_k) is copied from host to device with openACC in cft_wave routine
This commit is contained in:
parent
eb84c05045
commit
6ef714d9a8
|
@ -49,6 +49,7 @@ SUBROUTINE apply_dpot(nrxxs, aux1, dv, current_spin)
|
|||
! Noncollinear part without task groups
|
||||
!
|
||||
IF (domag) then
|
||||
!$acc kernels present(aux1, dv)
|
||||
DO ir = 1, nrxxs
|
||||
sup=aux1(ir,1)*(dv(ir,1)+dv(ir,4))+ &
|
||||
aux1(ir,2)*(dv(ir,2)-(0.d0,1.d0)*dv(ir,3))
|
||||
|
@ -57,10 +58,13 @@ SUBROUTINE apply_dpot(nrxxs, aux1, dv, current_spin)
|
|||
aux1(ir,1)=sup
|
||||
aux1(ir,2)=sdwn
|
||||
ENDDO
|
||||
!$acc end kernels
|
||||
ELSE
|
||||
!$acc kernels present(aux1, dv)
|
||||
DO ir = 1, nrxxs
|
||||
aux1(ir,:)=aux1(ir,:)*dv(ir,1)
|
||||
ENDDO
|
||||
!$acc end kernels
|
||||
ENDIF
|
||||
!
|
||||
ENDIF
|
||||
|
@ -79,6 +83,7 @@ SUBROUTINE apply_dpot(nrxxs, aux1, dv, current_spin)
|
|||
!
|
||||
! Collinear part without task groups
|
||||
!
|
||||
!$acc parallel loop present(aux1, dv)
|
||||
DO ir = 1, nrxxs
|
||||
aux1(ir,1)=aux1(ir,1)*dv(ir,current_spin)
|
||||
ENDDO
|
||||
|
|
|
@ -41,6 +41,8 @@ MODULE apply_dpot_mod
|
|||
ALLOCATE(psi_r(dffts%nnr, npol), STAT=ierr)
|
||||
IF (ierr /= 0) CALL errore('apply_dpot_allocate', 'Error allocating psi_r', 1)
|
||||
!
|
||||
!$acc enter data create(psi_r(1:dffts%nnr, 1:npol))
|
||||
!
|
||||
IF (dffts%has_task_groups) THEN
|
||||
ALLOCATE(tg_dv(dffts%nnr_tg, nspin_mag), STAT=ierr)
|
||||
IF (ierr /= 0) CALL errore('apply_dpot_allocate', 'Error allocating tg_dv', 1)
|
||||
|
@ -66,6 +68,8 @@ MODULE apply_dpot_mod
|
|||
IF (.NOT. is_allocated) RETURN
|
||||
is_allocated = .FALSE.
|
||||
!
|
||||
!$acc exit data delete(psi_r)
|
||||
!
|
||||
DEALLOCATE(psi_r, STAT=ierr)
|
||||
IF (ierr /= 0) CALL errore('apply_dpot_deallocate', 'Error deallocating psi_r', 1)
|
||||
!
|
||||
|
@ -119,6 +123,9 @@ MODULE apply_dpot_mod
|
|||
!
|
||||
CALL start_clock("apply_dpot_b")
|
||||
!
|
||||
!$acc enter data copyin(psi)
|
||||
!$acc update device(dv(1:dffts%nnr, 1:nspin_mag))
|
||||
!
|
||||
IF (.NOT. is_allocated) CALL apply_dpot_allocate()
|
||||
!
|
||||
incr = 1
|
||||
|
@ -141,7 +148,9 @@ MODULE apply_dpot_mod
|
|||
ENDIF ! noncolin
|
||||
ENDIF ! has_task_groups
|
||||
!
|
||||
!$acc kernels present(dvpsi)
|
||||
dvpsi = (0.0_DP, 0.0_DP)
|
||||
!$acc end kernels
|
||||
!
|
||||
DO ibnd = 1, nbnd, incr
|
||||
IF (dffts%has_task_groups) THEN
|
||||
|
@ -155,6 +164,9 @@ MODULE apply_dpot_mod
|
|||
ENDIF ! has_task_groups
|
||||
ENDDO ! ibnd
|
||||
!
|
||||
!$acc update self(dvpsi(1:npwx*npol, 1:nbnd))
|
||||
!$acc exit data delete(psi)
|
||||
!
|
||||
CALL stop_clock("apply_dpot_b")
|
||||
!
|
||||
END SUBROUTINE apply_dpot_bands
|
||||
|
|
|
@ -46,6 +46,8 @@ SUBROUTINE cft_wave (ik, evc_g, evc_r, isw)
|
|||
|
||||
CALL start_clock ('cft_wave')
|
||||
|
||||
!$acc data copyin(igk_k) copyin(dffts) copyin(dffts%nl)
|
||||
|
||||
IF (isw == 1) THEN
|
||||
ikk = ikks(ik) ! points to k+G indices
|
||||
npw = ngk(ikk)
|
||||
|
@ -57,6 +59,8 @@ SUBROUTINE cft_wave (ik, evc_g, evc_r, isw)
|
|||
ELSE
|
||||
CALL errore ('cft_wave',' Wrong value for isw',1)
|
||||
ENDIF
|
||||
|
||||
!$acc end data
|
||||
|
||||
CALL stop_clock ('cft_wave')
|
||||
|
||||
|
@ -75,16 +79,24 @@ SUBROUTINE fwfft_wave (npwq, igkq, evc_g, evc_r )
|
|||
INTEGER, INTENT(IN) :: npwq, igkq(npwq)
|
||||
COMPLEX(DP), INTENT(INOUT) :: evc_g (npwx*npol), evc_r (dffts%nnr,npol)
|
||||
!
|
||||
INTEGER :: ig
|
||||
INTEGER :: ig, ik
|
||||
|
||||
!$acc host_data use_device(evc_r)
|
||||
CALL fwfft ('Wave', evc_r(:,1), dffts)
|
||||
!$acc end host_data
|
||||
!$acc parallel loop present(evc_g, evc_r, igkq, dffts, dffts%nl) private(ik)
|
||||
DO ig = 1, npwq
|
||||
evc_g (ig) = evc_g (ig) + evc_r (dffts%nl (igkq(ig) ), 1 )
|
||||
ik = dffts%nl(igkq(ig))
|
||||
evc_g (ig) = evc_g (ig) + evc_r (ik,1)
|
||||
ENDDO
|
||||
IF (noncolin) THEN
|
||||
!$acc host_data use_device(evc_r)
|
||||
CALL fwfft ('Wave', evc_r(:,2), dffts)
|
||||
!$acc end host_data
|
||||
!$acc parallel loop present(evc_g, evc_r, igkq, dffts, dffts%nl) private(ik)
|
||||
DO ig = 1, npwq
|
||||
evc_g (ig+npwx) = evc_g (ig+npwx) + evc_r (dffts%nl(igkq(ig)),2)
|
||||
ik = dffts%nl(igkq(ig))
|
||||
evc_g (ig+npwx) = evc_g (ig+npwx) + evc_r (ik,2)
|
||||
ENDDO
|
||||
ENDIF
|
||||
END SUBROUTINE fwfft_wave
|
||||
|
@ -101,18 +113,28 @@ SUBROUTINE invfft_wave (npw, igk, evc_g, evc_r )
|
|||
COMPLEX(DP), INTENT(IN) :: evc_g (npwx*npol)
|
||||
COMPLEX(DP), INTENT(OUT):: evc_r (dffts%nnr,npol)
|
||||
!
|
||||
INTEGER :: ig
|
||||
|
||||
INTEGER :: ig, ik
|
||||
|
||||
!$acc kernels present(evc_r)
|
||||
evc_r = (0.0_dp, 0.0_dp)
|
||||
!$acc end kernels
|
||||
!$acc parallel loop present(evc_g, evc_r, igk, dffts, dffts%nl) private(ik)
|
||||
DO ig = 1, npw
|
||||
evc_r (dffts%nl (igk(ig) ),1 ) = evc_g (ig)
|
||||
ik = dffts%nl(igk(ig))
|
||||
evc_r (ik, 1) = evc_g (ig)
|
||||
ENDDO
|
||||
!$acc host_data use_device(evc_r)
|
||||
CALL invfft ('Wave', evc_r(:,1), dffts)
|
||||
!$acc end host_data
|
||||
IF (noncolin) THEN
|
||||
!$acc parallel loop present(evc_g, evc_r, igk, dffts, dffts%nl) private(ik)
|
||||
DO ig = 1, npw
|
||||
evc_r (dffts%nl(igk(ig)),2) = evc_g (ig+npwx)
|
||||
ik = dffts%nl(igk(ig))
|
||||
evc_r (ik, 2) = evc_g (ig+npwx)
|
||||
ENDDO
|
||||
!$acc host_data use_device(evc_r)
|
||||
CALL invfft ('Wave', evc_r(:,2), dffts)
|
||||
!$acc end host_data
|
||||
ENDIF
|
||||
|
||||
END SUBROUTINE invfft_wave
|
||||
|
|
|
@ -139,6 +139,8 @@ SUBROUTINE sternheimer_kernel(first_iter, time_reversed, npert, lrdvpsi, iudvpsi
|
|||
ALLOCATE(h_diag(npwx*npol, nbnd))
|
||||
ALLOCATE(aux2(npwx*npol, nbnd))
|
||||
!
|
||||
!$acc enter data create(aux2(1:npwx*npol, 1:nbnd))
|
||||
!
|
||||
all_conv = .TRUE.
|
||||
tot_num_iter = 0
|
||||
tot_cg_calls = 0
|
||||
|
@ -275,6 +277,8 @@ SUBROUTINE sternheimer_kernel(first_iter, time_reversed, npert, lrdvpsi, iudvpsi
|
|||
CALL mp_sum(tot_cg_calls, inter_pool_comm)
|
||||
avg_iter = REAL(tot_num_iter, DP) / REAL(tot_cg_calls, DP)
|
||||
!
|
||||
!$acc exit data delete(aux2)
|
||||
!
|
||||
DEALLOCATE(aux2)
|
||||
DEALLOCATE(h_diag)
|
||||
!
|
||||
|
|
|
@ -394,6 +394,9 @@ SUBROUTINE elphel (irr, npe, imode0, dvscfins)
|
|||
ALLOCATE (aux2(npwx*npol, nbnd))
|
||||
el_ph_mat_rec=(0.0_DP,0.0_DP)
|
||||
aux2(:, :) = (0.0_DP, 0.0_DP)
|
||||
!
|
||||
!$acc enter data create(dvscfins(1:dffts%nnr, 1:nspin_mag, 1:npe), aux2(1:npwx*npol, 1:nbnd))
|
||||
!
|
||||
CALL apply_dpot_allocate()
|
||||
!
|
||||
! DFPT+U case
|
||||
|
@ -604,6 +607,9 @@ SUBROUTINE elphel (irr, npe, imode0, dvscfins)
|
|||
DEALLOCATE(el_ph_mat_rec)
|
||||
!
|
||||
CALL apply_dpot_deallocate()
|
||||
!
|
||||
!$acc exit data delete(dvscfins, aux2)
|
||||
!
|
||||
DEALLOCATE (elphmat)
|
||||
DEALLOCATE (aux2)
|
||||
!
|
||||
|
|
|
@ -592,7 +592,9 @@ SUBROUTINE elphel_refolded (npe, imode0, dvscfins)
|
|||
ALLOCATE (elphmat ( nbnd , nbnd , 3*nat))
|
||||
allocate (aux_psi(npol*npwx,nbnd))
|
||||
CALL apply_dpot_allocate()
|
||||
|
||||
!
|
||||
!$acc enter data create(dvscfins(1:dffts%nnr, 1:nspin_mag, 1:npe), dvpsi(1:npwx*npol, 1:nbnd))
|
||||
!
|
||||
|
||||
! iunwfcwann=find_free_unit()
|
||||
! CALL diropn (iunwfcwann, 'wfc', lrwfc, exst, dvscf_dir)
|
||||
|
@ -693,7 +695,7 @@ SUBROUTINE elphel_refolded (npe, imode0, dvscfins)
|
|||
ENDDO
|
||||
ENDDO
|
||||
ENDDO
|
||||
|
||||
!$acc exit data delete(dvscfins, dvpsi)
|
||||
|
||||
! CLOSE( UNIT = iunwfcwann, STATUS = 'KEEP' )
|
||||
!
|
||||
|
|
|
@ -94,18 +94,22 @@ subroutine solve_e
|
|||
!! the becsum with dpsi
|
||||
COMPLEX(DP), ALLOCATABLE :: mixin(:), mixout(:)
|
||||
!! auxiliary for paw mixing
|
||||
INTEGER :: nnr
|
||||
!
|
||||
call start_clock ('solve_e')
|
||||
!
|
||||
! This routine is task group aware
|
||||
!
|
||||
allocate (dvscfin( dfftp%nnr, nspin_mag, 3))
|
||||
nnr = dfftp%nnr
|
||||
dvscfin=(0.0_DP,0.0_DP)
|
||||
if (doublegrid) then
|
||||
allocate (dvscfins(dffts%nnr, nspin_mag, 3))
|
||||
nnr = dffts%nnr
|
||||
else
|
||||
dvscfins => dvscfin
|
||||
endif
|
||||
!$acc enter data create(dvscfins(1:nnr, 1:nspin_mag, 1:3))
|
||||
allocate (dvscfout(dfftp%nnr, nspin_mag, 3))
|
||||
IF (okpaw) THEN
|
||||
ALLOCATE (mixin(dfftp%nnr*nspin_mag*3+(nhm*(nhm+1)*nat*nspin_mag*3)/2) )
|
||||
|
@ -319,6 +323,7 @@ subroutine solve_e
|
|||
DEALLOCATE(mixin)
|
||||
DEALLOCATE(mixout)
|
||||
ENDIF
|
||||
!$acc exit data delete(dvscfins)
|
||||
if (doublegrid) deallocate (dvscfins)
|
||||
deallocate (dvscfin)
|
||||
if (noncolin) deallocate(dbecsum_nc)
|
||||
|
|
|
@ -150,6 +150,8 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
integer :: iq_dummy
|
||||
real(DP) :: tcpu, get_clock ! timing variables
|
||||
character(len=256) :: filename
|
||||
|
||||
integer :: nnr
|
||||
!
|
||||
IF (rec_code_read > 20 ) RETURN
|
||||
|
||||
|
@ -161,12 +163,15 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
IF (noncolin.AND.domag) nsolv=2
|
||||
|
||||
allocate (dvscfin ( dfftp%nnr , nspin_mag , npe))
|
||||
nnr = dfftp%nnr
|
||||
dvscfin=(0.0_DP,0.0_DP)
|
||||
if (doublegrid) then
|
||||
allocate (dvscfins (dffts%nnr , nspin_mag , npe))
|
||||
nnr = dffts%nnr
|
||||
else
|
||||
dvscfins => dvscfin
|
||||
endif
|
||||
!$acc enter data create(dvscfins(1:nnr, 1:nspin_mag, 1:npe))
|
||||
allocate (drhoscfh ( dfftp%nnr, nspin_mag , npe))
|
||||
allocate (dvscfout ( dfftp%nnr, nspin_mag , npe))
|
||||
allocate (dbecsum ( (nhm * (nhm + 1))/2 , nat , nspin_mag , npe))
|
||||
|
@ -593,6 +598,7 @@ SUBROUTINE solve_linter (irr, imode0, npe, drhoscf)
|
|||
IF (noncolin) deallocate (dbecsum_nc)
|
||||
deallocate (dvscfout)
|
||||
deallocate (drhoscfh)
|
||||
!$acc exit data delete(dvscfins)
|
||||
if (doublegrid) deallocate (dvscfins)
|
||||
deallocate (dvscfin)
|
||||
deallocate(aux2)
|
||||
|
|
Loading…
Reference in New Issue