mirror of https://gitlab.com/QEF/q-e.git
some truncation errors fixed
This commit is contained in:
parent
d7dc8b1541
commit
41a2fef372
|
@ -112,7 +112,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
#if defined(__CUDA)
|
#if defined(__CUDA)
|
||||||
attributes(device) :: e_d, psi_d, precondition_d
|
attributes(device) :: e_d, psi_d, precondition_d
|
||||||
attributes(device) :: hpsi_d, spsi_d, w_d, hw_d, sw_d, p_d, hp_d, sp_d
|
attributes(device) :: hpsi_d, spsi_d, w_d, hw_d, sw_d, p_d, hp_d, sp_d
|
||||||
attributes(device) :: G_d, act_idx_d, buffer_d, buffer1_d, col_idx_d, K_d, M_d, coord_psi_d, coord_w_d, coord_p_d, Gl_d
|
attributes(device) :: G_d, act_idx_d, buffer_d, buffer1_d, col_idx_d, K_d, M_d, coord_psi_d, coord_w_d, coord_p_d
|
||||||
|
attributes(device) :: Gl_d
|
||||||
#endif
|
#endif
|
||||||
!
|
!
|
||||||
!
|
!
|
||||||
|
@ -243,10 +244,12 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
CALL divide(inter_bgrp_comm,nbnd,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nbnd,n_start,n_end
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, C_ZERO, G_d(n_start,1), nbnd )
|
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE, spsi_d(1,n_start), kdimx, buffer_d, kdimx, &
|
||||||
|
C_ZERO, G_d(n_start,1), nbnd )
|
||||||
else
|
else
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, C_ZERO, G_d(n_start,1), nbnd )
|
CALL gpu_ZGEMM( 'C','N', my_n, nact, kdim, C_ONE,psi_d(1,n_start), kdimx, buffer_d, kdimx, &
|
||||||
|
C_ZERO, G_d(n_start,1), nbnd )
|
||||||
end if
|
end if
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum( G(1:nbnd,1:nact), inter_bgrp_comm )
|
CALL mp_sum( G(1:nbnd,1:nact), inter_bgrp_comm )
|
||||||
|
@ -259,7 +262,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer_d, w_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, w_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, buffer_d, kdimx)
|
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, psi_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, &
|
||||||
|
buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel DO(2)
|
!$cuf kernel DO(2)
|
||||||
DO i = 1, kdimx
|
DO i = 1, kdimx
|
||||||
|
@ -326,7 +330,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, kdimx, C_ZERO, G_d(n_start,1), nbnd)
|
CALL gpu_ZGEMM('C','N', my_n, nact, kdim, C_ONE, buffer_d(1,n_start), kdimx, buffer1_d, &
|
||||||
|
kdimx, C_ZERO, G_d(n_start,1), nbnd)
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
|
CALL mp_sum( G(1:nact,1:nact), inter_bgrp_comm )
|
||||||
!
|
!
|
||||||
|
@ -339,7 +344,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, p_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, p_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, psi_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) & ! could be done differently
|
if (n_start .le. n_end) & ! could be done differently
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, buffer_d, kdimx)
|
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO i = 1, kdimx
|
DO i = 1, kdimx
|
||||||
|
@ -353,7 +359,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, hp_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, hp_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hpsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, buffer_d, kdimx)
|
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO i = 1, kdimx
|
DO i = 1, kdimx
|
||||||
|
@ -368,7 +375,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
call gpu_threaded_assign( buffer_d, sp_d, kdimx, nact, .true., act_idx_d, .true. )
|
call gpu_threaded_assign( buffer_d, sp_d, kdimx, nact, .true., act_idx_d, .true. )
|
||||||
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, spsi_d, kdimx, nact, .true., act_idx_d, .false. )
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, buffer_d, kdimx)
|
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO i = 1, kdimx
|
DO i = 1, kdimx
|
||||||
|
@ -462,19 +470,22 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(2*l + 1, 2*l+1), sbsize3)
|
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
|
K_d(2*l + 1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, buffer_d, kdimx, l, .false., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(2*l + 1, 2*l+1), sbsize3)
|
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
|
M_d(2*l + 1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
! ---
|
! ---
|
||||||
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, psi_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(1, 2*l+1), sbsize3)
|
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
|
K_d(1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
|
@ -489,14 +500,16 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer_d, w_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, hp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, K_d(l+1, 2*l+1), sbsize3)
|
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
|
K_d(l+1, 2*l+1), sbsize3)
|
||||||
!
|
!
|
||||||
if (overlap) then
|
if (overlap) then
|
||||||
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, sp_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
else
|
else
|
||||||
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
call gpu_threaded_assign( buffer1_d, p_d, kdimx, l, .true., col_idx_d, .false. )
|
||||||
end if
|
end if
|
||||||
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, M_d(l+1, 2*l+1), sbsize3)
|
CALL gpu_ZGEMM('C','N', l, l, kdim, C_ONE, buffer_d, kdimx, buffer1_d, kdimx, C_ZERO, &
|
||||||
|
M_d(l+1, 2*l+1), sbsize3)
|
||||||
call stop_clock('ppcg:zgemm')
|
call stop_clock('ppcg:zgemm')
|
||||||
!
|
!
|
||||||
END IF
|
END IF
|
||||||
|
@ -898,7 +911,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
END DO
|
END DO
|
||||||
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
CALL divide(inter_bgrp_comm,nact,n_start,n_end); my_n = n_end - n_start + 1; !write (*,*) nact,n_start,n_end
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, C_ZERO, G_d(1,n_start), nbnd)
|
CALL gpu_ZGEMM('C','N', nact, my_n, kdim, C_ONE, buffer_d, kdimx, buffer1_d(1,n_start), kdimx, &
|
||||||
|
C_ZERO, G_d(1,n_start), nbnd)
|
||||||
G = G_d
|
G = G_d
|
||||||
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
|
CALL mp_sum(G(1:nact,1:nact), inter_bgrp_comm)
|
||||||
!
|
!
|
||||||
|
@ -915,7 +929,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
end if
|
end if
|
||||||
call start_clock('ppcg:zgemm')
|
call start_clock('ppcg:zgemm')
|
||||||
if (n_start .le. n_end) &
|
if (n_start .le. n_end) &
|
||||||
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), nbnd, C_ONE, buffer_d, kdimx)
|
CALL gpu_ZGEMM('N','N', kdim, nact, my_n, -C_ONE, buffer1_d(1,n_start), kdimx, G_d(n_start,1), &
|
||||||
|
nbnd, C_ONE, buffer_d, kdimx)
|
||||||
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
CALL mp_sum( buffer_d(:,1:nact), inter_bgrp_comm )
|
||||||
!$cuf kernel do(2)
|
!$cuf kernel do(2)
|
||||||
DO ii = 1, kdimx
|
DO ii = 1, kdimx
|
||||||
|
@ -937,7 +952,8 @@ SUBROUTINE ppcg_k_gpu( h_psi_gpu, s_psi_gpu, overlap, precondition_d, &
|
||||||
!
|
!
|
||||||
! Print iteration info ...
|
! Print iteration info ...
|
||||||
IF (print_info >= 1) THEN
|
IF (print_info >= 1) THEN
|
||||||
WRITE(stdout, '("iter: ", I5, " nact = ", I5, ", trdif = ", 1pD9.2, ", trtol = ", 1pD9.2 )') iter, nact, trdif, trtol
|
WRITE(stdout, '("iter: ", I5, " nact = ", I5, ", trdif = ", 1pD9.2, ", trtol = ", 1pD9.2 )') &
|
||||||
|
iter, nact, trdif, trtol
|
||||||
IF (print_info == 3) THEN
|
IF (print_info == 3) THEN
|
||||||
CALL print_rnrm
|
CALL print_rnrm
|
||||||
WRITE(stdout,'("Res. norm: ", 1pD9.2)') res_array(iter)
|
WRITE(stdout,'("Res. norm: ", 1pD9.2)') res_array(iter)
|
||||||
|
@ -1054,7 +1070,8 @@ CONTAINS
|
||||||
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate w ', ABS(ierr) )
|
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate w ', ABS(ierr) )
|
||||||
ALLOCATE ( K(sbsize3, sbsize3), M(sbsize3,sbsize3), stat = ierr )
|
ALLOCATE ( K(sbsize3, sbsize3), M(sbsize3,sbsize3), stat = ierr )
|
||||||
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate K and M ', ABS(ierr) )
|
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate K and M ', ABS(ierr) )
|
||||||
ALLOCATE ( cwork( 1 + 18*sbsize + 18*sbsize**2 ), rwork( 1 + 18*sbsize + 18*sbsize**2 ), iwork(3 + 15*sbsize), stat = ierr )
|
ALLOCATE ( cwork( 1 + 18*sbsize + 18*sbsize**2 ), rwork( 1 + 18*sbsize + 18*sbsize**2 ), &
|
||||||
|
iwork(3 + 15*sbsize), stat = ierr )
|
||||||
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate lapack work arrays ', ABS(ierr) )
|
IF (ierr /= 0) CALL errore( 'ppcg ',' cannot allocate lapack work arrays ', ABS(ierr) )
|
||||||
!
|
!
|
||||||
CALL desc_init( nbnd, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
|
CALL desc_init( nbnd, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
|
||||||
|
@ -1963,13 +1980,15 @@ nguard = 0 ! 24 ! 50
|
||||||
IF (present(act_idx) ) THEN
|
IF (present(act_idx) ) THEN
|
||||||
!$omp parallel do collapse(2)
|
!$omp parallel do collapse(2)
|
||||||
DO i=1, nact ; DO j=1,nblock
|
DO i=1, nact ; DO j=1,nblock
|
||||||
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), act_idx( i ) )
|
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = &
|
||||||
|
array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), act_idx( i ) )
|
||||||
ENDDO ; ENDDO
|
ENDDO ; ENDDO
|
||||||
!$omp end parallel do
|
!$omp end parallel do
|
||||||
ELSE
|
ELSE
|
||||||
!$omp parallel do collapse(2)
|
!$omp parallel do collapse(2)
|
||||||
DO i=1, nact ; DO j=1,nblock
|
DO i=1, nact ; DO j=1,nblock
|
||||||
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
|
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) = &
|
||||||
|
array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
|
||||||
ENDDO ; ENDDO
|
ENDDO ; ENDDO
|
||||||
!$omp end parallel do
|
!$omp end parallel do
|
||||||
END IF
|
END IF
|
||||||
|
|
Loading…
Reference in New Issue