more threaded_backassignement (including optionally summing another vector)

This commit is contained in:
Stefano de Gironcoli 2018-08-07 14:15:39 +02:00
parent 819ab53cc5
commit 64cca07a92
1 changed files with 27 additions and 10 deletions

View File

@ -531,20 +531,23 @@ SUBROUTINE ppcg_gamma( h_psi, s_psi, overlap, precondition, &
call start_clock('ppcg:dgemm') call start_clock('ppcg:dgemm')
call threaded_assign( buffer1, psi, npwx, l, col_idx ) call threaded_assign( buffer1, psi, npwx, l, col_idx )
CALL DGEMM('N','N',npw2, l, l, ONE, buffer1, npwx2, coord_psi, sbsize, ZERO, buffer, npwx2) CALL DGEMM('N','N',npw2, l, l, ONE, buffer1, npwx2, coord_psi, sbsize, ZERO, buffer, npwx2)
psi(:,col_idx(1:l)) = buffer(:,1:l) + p(:,col_idx(1:l)) ! psi(:,col_idx(1:l)) = buffer(:,1:l) + p(:,col_idx(1:l))
call threaded_backassign( psi, act_idx, buffer, npwx, l, p )
call stop_clock('ppcg:dgemm') call stop_clock('ppcg:dgemm')
! !
call start_clock('ppcg:dgemm') call start_clock('ppcg:dgemm')
call threaded_assign( buffer1, hpsi, npwx, l, col_idx ) call threaded_assign( buffer1, hpsi, npwx, l, col_idx )
CALL DGEMM('N','N',npw2, l, l, ONE, buffer1, npwx2, coord_psi, sbsize, ZERO, buffer, npwx2) CALL DGEMM('N','N',npw2, l, l, ONE, buffer1, npwx2, coord_psi, sbsize, ZERO, buffer, npwx2)
hpsi(:,col_idx(1:l)) = buffer(:,1:l) + hp(:,col_idx(1:l)) ! hpsi(:,col_idx(1:l)) = buffer(:,1:l) + hp(:,col_idx(1:l))
call threaded_backassign( hpsi, act_idx, buffer, npwx, l, hp )
call stop_clock('ppcg:dgemm') call stop_clock('ppcg:dgemm')
! !
if (overlap) then if (overlap) then
call start_clock('ppcg:dgemm') call start_clock('ppcg:dgemm')
call threaded_assign( buffer1, spsi, npwx, l, col_idx ) call threaded_assign( buffer1, spsi, npwx, l, col_idx )
CALL DGEMM('N','N',npw2, l, l, ONE, buffer1, npwx2, coord_psi, sbsize, ZERO, buffer, npwx2) CALL DGEMM('N','N',npw2, l, l, ONE, buffer1, npwx2, coord_psi, sbsize, ZERO, buffer, npwx2)
spsi(:,col_idx(1:l)) = buffer(:,1:l) + sp(:,col_idx(1:l)) ! spsi(:,col_idx(1:l)) = buffer(:,1:l) + sp(:,col_idx(1:l))
call threaded_backassign( spsi, act_idx, buffer, npwx, l, sp )
call stop_clock('ppcg:dgemm') call stop_clock('ppcg:dgemm')
end if end if
! !
@ -1689,11 +1692,14 @@ nguard = 0 ! 24 ! 50
! !
END SUBROUTINE threaded_assign END SUBROUTINE threaded_assign
SUBROUTINE threaded_backassign(array_out, idx, array_in, kdimx, nact ) SUBROUTINE threaded_backassign(array_out, idx, array_in, kdimx, nact, a2_in )
! !
! assign (copy) a complex array in a threaded way ! assign (copy) a complex array in a threaded way
! !
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) ! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) or
!
! array_out( 1:kdimx, idx(1:nact) ) = array_in( 1:kdimx, 1:nact ) + a2_in( 1:kdimx, idx(1:nact) (
! if a2_in is present
! !
! the index array idx is mandatory otherwise one could use previous routine) ! the index array idx is mandatory otherwise one could use previous routine)
! !
@ -1703,6 +1709,7 @@ nguard = 0 ! 24 ! 50
! !
COMPLEX(DP), INTENT(INOUT) :: array_out( kdimx, * ) ! we don't want to mess with un referenced columns COMPLEX(DP), INTENT(INOUT) :: array_out( kdimx, * ) ! we don't want to mess with un referenced columns
COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, nact ) COMPLEX(DP), INTENT(IN) :: array_in ( kdimx, nact )
COMPLEX(DP), INTENT(IN), OPTIONAL :: a2_in ( kdimx, * )
INTEGER, INTENT(IN) :: kdimx, nact INTEGER, INTENT(IN) :: kdimx, nact
INTEGER, INTENT(IN) :: idx( * ) INTEGER, INTENT(IN) :: idx( * )
! !
@ -1716,11 +1723,21 @@ nguard = 0 ! 24 ! 50
nblock = (kdimx - 1)/blocksz + 1 nblock = (kdimx - 1)/blocksz + 1
!$omp parallel do collapse(2) IF ( present(a2_in) ) THEN
DO i=1, nact ; DO j=1,nblock !$omp parallel do collapse(2)
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) DO i=1, nact ; DO j=1,nblock
ENDDO ; ENDDO array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = &
!$omp end parallel do array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i ) + &
a2_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) )
ENDDO ; ENDDO
!$omp end parallel do
ELSE
!$omp parallel do collapse(2)
DO i=1, nact ; DO j=1,nblock
array_out(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), idx( i ) ) = array_in(1+(j-1)*blocksz:MIN(j*blocksz,kdimx), i )
ENDDO ; ENDDO
!$omp end parallel do
END IF
! !
END SUBROUTINE threaded_backassign END SUBROUTINE threaded_backassign