Support for CUDA EIGENSOLVER removed

Paolo Giannozzi 2022-02-04 15:39:13 +01:00
parent e821a717dc
commit d262aca3cb
5 changed files with 4 additions and 305 deletions

View File

@@ -223,11 +223,7 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
#if defined(__CUDA)
USE cudafor
!
#if defined(__USE_CUSOLVER)
USE cusolverdn
#else
USE zhegvdx_gpu
#endif
#endif
!
USE laxlib_parallel_include
@@ -296,7 +292,6 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
ATTRIBUTES( DEVICE ) :: work_d, rwork_d, h_diag_d, s_diag_d
#endif
INTEGER :: i, j
#if defined( __USE_CUSOLVER )
INTEGER :: devInfo_d, h_meig
ATTRIBUTES( DEVICE ) :: devInfo_d
TYPE(cusolverDnHandle), SAVE :: cuSolverHandle
@@ -304,7 +299,6 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
!
COMPLEX(DP), VARTYPE :: h_bkp_d(:,:), s_bkp_d(:,:)
ATTRIBUTES( DEVICE ) :: h_bkp_d, s_bkp_d
#endif
#undef VARTYPE
!
!
@@ -318,9 +312,7 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
!
! Keeping compatibility for both CUSolver and CustomEigensolver, CUSolver below
!
#if defined(__USE_CUSOLVER) && defined(__CUDA)
!
! vvv __USE_CUSOLVER
#if defined(__CUDA)
#if ! defined(__USE_GLOBAL_BUFFER)
ALLOCATE(h_bkp_d(n,n), s_bkp_d(n,n), STAT = info)
@@ -387,96 +379,9 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
CALL dev%release_buffer( h_bkp_d, info )
CALL dev%release_buffer( s_bkp_d, info )
#endif
! ^^^ __USE_CUSOLVER
!
! Keeping compatibility for both CUSolver and CustomEigensolver, CustomEigensolver below
!
#elif defined(__CUDA)
! vvv not __USE_CUSOLVER
#if ! defined(__USE_GLOBAL_BUFFER)
! NB: dimension is different!
ALLOCATE(v_h(ldh,n), e_h(n))
ALLOCATE(h_diag_d(n) , s_diag_d(n))
#else
CALL pin%lock_buffer( v_h, (/ldh,n/), info )
CALL pin%lock_buffer( e_h, n, info )
!
CALL dev%lock_buffer( h_diag_d, n, info )
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate h_diag_d ', ABS( info ) )
CALL dev%lock_buffer( s_diag_d, n, info )
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate s_diag_d ', ABS( info ) )
#endif
!
lwork = n
lrwork = 1+5*n+2*n*n
liwork = 3+5*n
!
lwork_d = 2*64*64 + 65 * n
lrwork_d = n
!
#if ! defined(__USE_GLOBAL_BUFFER)
ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))
!
ALLOCATE(work_d(1*lwork_d), STAT = info)
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' allocate work_d ', ABS( info ) )
!
ALLOCATE(rwork_d(1*lrwork_d), STAT = info)
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' allocate rwork_d ', ABS( info ) )
#else
CALL pin%lock_buffer(work, lwork, info)
CALL pin%lock_buffer(rwork, lrwork, info)
CALL pin%lock_buffer(iwork, liwork, info)
CALL dev%lock_buffer( work_d, lwork_d, info)
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate work_d ', ABS( info ) )
CALL dev%lock_buffer( rwork_d, lrwork_d, info)
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate rwork_d ', ABS( info ) )
#endif
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, n
h_diag_d(i) = DBLE( h_d(i,i) )
s_diag_d(i) = DBLE( s_d(i,i) )
END DO
CALL zhegvdx_gpu(n, h_d, ldh, s_d, ldh, v_d, ldh, 1, m, e_d, work_d,&
lwork_d, rwork_d, lrwork_d, &
work, lwork, rwork, lrwork, &
iwork, liwork, v_h, SIZE(v_h, 1), e_h, info, .TRUE.)
!
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' zhegvdx_gpu failed ', ABS( info ) )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, n
h_d(i,i) = DCMPLX( h_diag_d(i), 0.0_DP)
s_d(i,i) = DCMPLX( s_diag_d(i), 0.0_DP)
DO j = i + 1, n
h_d(i,j) = DCONJG( h_d(j,i) )
s_d(i,j) = DCONJG( s_d(j,i) )
END DO
DO j = n + 1, ldh
h_d(j,i) = ( 0.0_DP, 0.0_DP )
s_d(j,i) = ( 0.0_DP, 0.0_DP )
END DO
END DO
#if ! defined(__USE_GLOBAL_BUFFER)
DEALLOCATE(h_diag_d, s_diag_d)
!
DEALLOCATE(work, rwork, iwork)
DEALLOCATE(work_d, rwork_d)
DEALLOCATE(v_h, e_h)
#else
CALL dev%release_buffer( h_diag_d, info )
CALL dev%release_buffer( s_diag_d, info)
!
CALL pin%release_buffer(work, info)
CALL pin%release_buffer(rwork, info)
CALL pin%release_buffer(iwork, info)
CALL dev%release_buffer( work_d, info)
CALL dev%release_buffer( rwork_d, info)
CALL pin%release_buffer(v_h, info)
CALL pin%release_buffer(e_h, info)
#endif
! ^^^ not __USE_CUSOLVER
#else
CALL lax_error__( 'cdiaghg', 'Called GPU eigensolver without GPU support', 1 )
#endif
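Only the cuSOLVER branch of laxlib_cdiaghg_gpu survives this change: the handle carries the SAVE attribute so it can be created once and reused across calls, and h_d and s_d are copied into h_bkp_d/s_bkp_d beforehand because the generalized eigensolver overwrites its input matrices. A minimal sketch of that setup, assuming the NVHPC cusolverDn Fortran module provides cusolverDnCreate and CUSOLVER_STATUS_SUCCESS as used here; an illustration, not the verbatim LAXlib code:

! Sketch only: create the cuSOLVER handle on first use, then reuse it
! (cuSolverHandle and the initialization flag are SAVE'd variables).
IF ( .NOT. cuSolverInitialized ) THEN
   info = cusolverDnCreate( cuSolverHandle )
   IF ( info /= CUSOLVER_STATUS_SUCCESS ) &
      CALL lax_error__( ' cdiaghg_gpu ', ' cusolverDnCreate failed ', info )
   cuSolverInitialized = .TRUE.
END IF
!
! Back up the inputs: the solver destroys h_d and s_d, and the backups are
! copied back after diagonalization (device-to-device array assignment).
h_bkp_d(1:n,1:n) = h_d(1:n,1:n)
s_bkp_d(1:n,1:n) = s_d(1:n,1:n)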

View File

@@ -574,13 +574,7 @@ END SUBROUTINE laxlib_multi_init_desc_x
SUBROUTINE diagonalize_serial_gpu( m, rhos, rhod, s, info )
#if defined(__CUDA)
use cudafor
#if defined ( __USE_CUSOLVER )
USE cusolverDn
#else
use eigsolve_vars
use nvtx_inters
use dsyevd_gpu
#endif
IMPLICIT NONE
include 'laxlib_kinds.fh'
INTEGER, INTENT(IN) :: m
@@ -592,28 +586,13 @@ END SUBROUTINE laxlib_multi_init_desc_x
INTEGER :: lwork_d
INTEGER :: i, j, lda
!
#if defined (__USE_CUSOLVER)
!
INTEGER, DEVICE :: devInfo
TYPE(cusolverDnHandle) :: cuSolverHandle
REAL(DP), ALLOCATABLE, DEVICE :: work_d(:)
!
#else
!
REAL(DP), ALLOCATABLE :: work_d(:), a(:,:)
ATTRIBUTES( DEVICE ) :: work_d, a
REAL(DP), ALLOCATABLE :: b(:,:)
REAL(DP), ALLOCATABLE :: work_h(:), w_h(:), z_h(:,:)
ATTRIBUTES( PINNED ) :: work_h, w_h, z_h
INTEGER, ALLOCATABLE :: iwork_h(:)
ATTRIBUTES( PINNED ) :: iwork_h
!
INTEGER :: lwork_h, liwork_h
!
#endif
! .... Subroutine Body
!
#if defined (__USE_CUSOLVER)
!
s = rhos
lda = SIZE( rhos, 1 )
@@ -641,49 +620,6 @@ END SUBROUTINE laxlib_multi_init_desc_x
DEALLOCATE( work_d )
!
#else
!
info = 0
lwork_d = 2*64*64 + 66*SIZE(rhos,1)
lwork_h = 1 + 6*SIZE(rhos,1) + 2*SIZE(rhos,1)*SIZE(rhos,1)
liwork_h = 3 + 5*SIZE(rhos,1)
ALLOCATE(work_d(lwork_d),STAT = info)
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate work_d ', ABS( info ) )
ALLOCATE(a(SIZE(rhos,1),SIZE(rhos,2)),STAT = info)
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate a ', ABS( info ) )
ALLOCATE(work_h(lwork_h),STAT = info)
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate work_h ', ABS( info ) )
ALLOCATE(iwork_h(liwork_h),STAT = info)
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate iwork_h ', ABS( info ) )
!
ALLOCATE(w_h(SIZE(rhod)),STAT = info)
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate w_h ', ABS( info ) )
ALLOCATE(z_h(SIZE(s,1),SIZE(s,2)),STAT = info)
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate z_h ', ABS( info ) )
if(initialized == 0) call init_eigsolve_gpu
info = cudaMemcpy(a, rhos, SIZE(rhos,1)*SIZE(rhos,2), cudaMemcpyDeviceToDevice)
lda = SIZE(rhos,1)
!$cuf kernel do(2) <<<*,*>>>
do j = 1,m
do i = 1,m
if (i > j) then
s(i,j) = a(i,j)
endif
end do
end do
call dsyevd_gpu('V', 'U', 1, m, m, a, lda, s, lda, rhod, work_d, lwork_d, &
work_h, lwork_h, iwork_h, liwork_h, z_h, lda, w_h, info)
DEALLOCATE(z_h)
DEALLOCATE(w_h)
DEALLOCATE(iwork_h)
DEALLOCATE(work_h)
DEALLOCATE(a)
DEALLOCATE(work_d)
#endif
#else
IMPLICIT NONE
include 'laxlib_kinds.fh'
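For context, the removed branch of diagonalize_serial_gpu wrapped dsyevd_gpu from the custom Eigensolver_gpu library, which computes all eigenvalues and eigenvectors of a real symmetric matrix; the host workspace sizes in the removed code (lwork_h = 1 + 6*n + 2*n*n, liwork_h = 3 + 5*n) are the LAPACK DSYEVD minima for JOBZ = 'V'. A host-only reference of the same operation using standard LAPACK; the routine name is illustrative and not part of LAXlib:

! Illustration only: CPU analogue of the removed dsyevd_gpu call.
! On exit a is overwritten with the eigenvectors and w holds the
! eigenvalues in ascending order.
SUBROUTINE diag_host_reference( n, a, lda, w, info )
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = KIND(1.0d0)
   INTEGER,  INTENT(IN)    :: n, lda
   REAL(DP), INTENT(INOUT) :: a(lda,n)
   REAL(DP), INTENT(OUT)   :: w(n)
   INTEGER,  INTENT(OUT)   :: info
   REAL(DP), ALLOCATABLE   :: work(:)
   INTEGER,  ALLOCATABLE   :: iwork(:)
   INTEGER :: lwork, liwork
   ! same workspace formulas as in the removed code (DSYEVD minima for JOBZ = 'V')
   lwork  = 1 + 6*n + 2*n*n
   liwork = 3 + 5*n
   ALLOCATE( work(lwork), iwork(liwork) )
   CALL DSYEVD( 'V', 'U', n, a, lda, w, work, lwork, iwork, liwork, info )
   DEALLOCATE( work, iwork )
END SUBROUTINE diag_host_reference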

View File

@@ -203,11 +203,7 @@ SUBROUTINE laxlib_rdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
USE laxlib_parallel_include
#if defined(__CUDA)
USE cudafor
#if defined(__USE_CUSOLVER)
USE cusolverdn
#else
USE dsygvdx_gpu
#endif
#endif
!
! NB: the flag below can be used to decouple LAXlib from devXlib.
@@ -276,14 +272,12 @@ SUBROUTINE laxlib_rdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
ATTRIBUTES( DEVICE ) :: h_diag_d, s_diag_d
#endif
!
#if defined(__USE_CUSOLVER)
INTEGER :: devInfo_d, h_meig
ATTRIBUTES( DEVICE ) :: devInfo_d
TYPE(cusolverDnHandle), SAVE :: cuSolverHandle
LOGICAL, SAVE :: cuSolverInitialized = .FALSE.
REAL(DP), VARTYPE :: h_bkp_d(:,:), s_bkp_d(:,:)
ATTRIBUTES( DEVICE ) :: h_bkp_d, s_bkp_d
#endif
#undef VARTYPE
!
CALL start_clock_gpu( 'rdiaghg' )
@@ -292,61 +286,8 @@ SUBROUTINE laxlib_rdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
!
IF ( me_bgrp == root_bgrp ) THEN
!
#if (!defined(__USE_CUSOLVER)) && defined(__CUDA)
ALLOCATE(e_h(n), v_h(ldh,n))
#if defined(__CUDA)
!
ALLOCATE(h_diag_d(n), s_diag_d(n))
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, n
h_diag_d(i) = DBLE( h_d(i,i) )
s_diag_d(i) = DBLE( s_d(i,i) )
END DO
!
lwork = 1 + 6*n + 2*n*n
liwork = 3 + 5*n
ALLOCATE(work(lwork), iwork(liwork))
!
lwork_d = 2*64*64 + 66*n
#if ! defined(__USE_GLOBAL_BUFFER)
ALLOCATE(work_d(1*lwork_d), STAT = info)
#else
CALL dev%lock_buffer( work_d, lwork_d, info )
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' cannot allocate work_d ', ABS( info ) )
#endif
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' allocate work_d ', ABS( info ) )
!
CALL dsygvdx_gpu(n, h_d, ldh, s_d, ldh, v_d, ldh, 1, m, e_d, work_d, &
lwork_d, work, lwork, iwork, liwork, v_h, size(v_h, 1), &
e_h, info, .TRUE.)
!
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' dsygvdx_gpu failed ', ABS( info ) )
!
!$cuf kernel do(1) <<<*,*>>>
DO i = 1, n
h_d(i,i) = h_diag_d(i)
s_d(i,i) = s_diag_d(i)
DO j = i + 1, n
h_d(i,j) = h_d(j,i)
s_d(i,j) = s_d(j,i)
END DO
! This could be avoided; the dsygvdx_gpu implementation would need checking
DO j = n + 1, ldh
h_d(j,i) = 0.0_DP
s_d(j,i) = 0.0_DP
END DO
END DO
DEALLOCATE(h_diag_d,s_diag_d)
!
DEALLOCATE(work, iwork)
#if ! defined(__USE_GLOBAL_BUFFER)
DEALLOCATE(work_d)
#else
CALL dev%release_buffer( work_d, info )
#endif
!
DEALLOCATE(v_h, e_h)
#elif defined(__USE_CUSOLVER) && defined(__CUDA)
! vvv __USE_CUSOLVER
#if ! defined(__USE_GLOBAL_BUFFER)
ALLOCATE(h_bkp_d(n,n), s_bkp_d(n,n), STAT = info)
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' cannot allocate h_bkp_d or s_bkp_d ', ABS( info ) )
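Likewise, the removed branch of laxlib_rdiaghg_gpu called dsygvdx_gpu, which solves the real generalized symmetric-definite problem H*v = e*S*v (itype 1) for eigenpairs 1..m and overwrites h_d and s_d, hence the loop above that restores the diagonals and upper triangles afterwards. A host-side reference of the same factorization using standard LAPACK DSYGVD (full spectrum, same workspace minima as in the removed code); the routine name is illustrative and not part of LAXlib:

! Illustration only: CPU analogue of the removed dsygvdx_gpu call.
! itype = 1 selects H*v = e*S*v; on exit h holds the S-orthonormal
! eigenvectors, e the eigenvalues, and s its Cholesky factor.
SUBROUTINE gen_diag_host_reference( n, h, s, ldh, e, info )
   IMPLICIT NONE
   INTEGER, PARAMETER :: DP = KIND(1.0d0)
   INTEGER,  INTENT(IN)    :: n, ldh
   REAL(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)   ! both are overwritten
   REAL(DP), INTENT(OUT)   :: e(n)
   INTEGER,  INTENT(OUT)   :: info
   REAL(DP), ALLOCATABLE   :: work(:)
   INTEGER,  ALLOCATABLE   :: iwork(:)
   INTEGER :: lwork, liwork
   ! same sizes as in the removed code: DSYGVD minima for JOBZ = 'V'
   lwork  = 1 + 6*n + 2*n*n
   liwork = 3 + 5*n
   ALLOCATE( work(lwork), iwork(liwork) )
   CALL DSYGVD( 1, 'V', 'U', n, h, ldh, s, ldh, e, work, lwork, iwork, liwork, info )
   DEALLOCATE( work, iwork )
END SUBROUTINE gen_diag_host_reference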

View File

@@ -1,65 +0,0 @@
include ../../make.inc
# F90FLAGS with the GPU-specific options stripped out; they are re-added explicitly below.
F90FLAGSS := $(filter-out $(CUDA_F90FLAGS),$(F90FLAGS))
FLAGS = -O3 -pgf90libs -Mcuda=cc$(GPU_ARCH),cuda$(CUDA_RUNTIME),ptxinfo $(F90FLAGSS)
FLAGS2 = -O3 -pgf90libs -Mcuda=cc$(GPU_ARCH),cuda$(CUDA_RUNTIME),ptxinfo,maxregcount:64 $(F90FLAGSS)
# For performance reasons, cc of FLAGS3 must be <= 60
define MIN
$(firstword $(sort ${1} ${2}))
endef
FLAGS3 = -O3 -pgf90libs -Mcuda=cc$(call MIN,${GPU_ARCH},60),cuda$(CUDA_RUNTIME),ptxinfo,nordc,maxregcount:255 $(F90FLAGSS)
# Uncomment to enable NVTX markers
#OPTFLAGS = -DUSE_NVTX
all: lib_eigsolve.a
OBJS = cusolverDn_m.o eigsolve_vars.o toolbox.o zhegst_gpu.o zhemv_gpu.o zhetd2_gpu.o zhetrd_gpu.o zheevd_gpu.o zhegvdx_gpu.o \
dsygst_gpu.o dsymv_gpu.o dsytd2_gpu.o dsytrd_gpu.o dsyevd_gpu.o dsygvdx_gpu.o
zhetd2_gpu.o : zhetd2_gpu.F90
pgf90 -c ${FLAGS2} ${OPTFLAGS} $*.F90 -o $*.o
zhemv_gpu.o : zhemv_gpu.F90
pgf90 -c ${FLAGS3} ${OPTFLAGS} $*.F90 -o $*.o
dsytd2_gpu.o : dsytd2_gpu.F90
pgf90 -c ${FLAGS2} ${OPTFLAGS} $*.F90 -o $*.o
dsymv_gpu.o : dsymv_gpu.F90
pgf90 -c ${FLAGS3} ${OPTFLAGS} $*.F90 -o $*.o
%.o: %.cuf
pgf90 -c ${FLAGS} ${OPTFLAGS} $*.cuf -o $*.o
%.o: %.F90
pgf90 -c ${FLAGS} ${OPTFLAGS} $*.F90 -o $*.o
lib_eigsolve.a: $(OBJS)
ar -cr lib_eigsolve.a $(OBJS)
clean:
rm -f lib_eigsolve.a *.mod *.o
# Dependencies
dsyevd_gpu.o : dsytrd_gpu.o
dsyevd_gpu.o : eigsolve_vars.o
dsyevd_gpu.o : toolbox.o
dsygst_gpu.o : eigsolve_vars.o
dsygvdx_gpu.o : dsyevd_gpu.o
dsygvdx_gpu.o : dsygst_gpu.o
dsygvdx_gpu.o : eigsolve_vars.o
dsygvdx_gpu.o : toolbox.o
dsytrd_gpu.o : dsymv_gpu.o
dsytrd_gpu.o : dsytd2_gpu.o
dsytrd_gpu.o : eigsolve_vars.o
eigsolve_vars.o : cusolverDn_m.o
zheevd_gpu.o : eigsolve_vars.o
zheevd_gpu.o : toolbox.o
zheevd_gpu.o : zhetrd_gpu.o
zhegst_gpu.o : eigsolve_vars.o
zhegvdx_gpu.o : eigsolve_vars.o
zhegvdx_gpu.o : toolbox.o
zhegvdx_gpu.o : zheevd_gpu.o
zhegvdx_gpu.o : zhegst_gpu.o
zhetrd_gpu.o : eigsolve_vars.o
zhetrd_gpu.o : zhemv_gpu.o
zhetrd_gpu.o : zhetd2_gpu.o

View File

@@ -11,12 +11,6 @@
include ../make.inc
include install_utils
# For NVIDIA Eigensolver
EIGENSOLVER_VERSION=0.3.1
EIGENSOLVER_GPU=v${EIGENSOLVER_VERSION}.tar.gz
EIGENSOLVER_GPU_NAME=Eigensolver_gpu-${EIGENSOLVER_VERSION}
EIGENSOLVER_GPU_URL=https://github.com/NVIDIA/Eigensolver_gpu/archive/${EIGENSOLVER_GPU}
# MAIN target
all: libcuda
@@ -72,18 +66,6 @@ fox_clean:
libcuda : $(addprefix libcuda_,$(CUDA_EXTLIBS))
libcuda_eigensolver :
$(call download_and_unpack,$(EIGENSOLVER_GPU_NAME),$(EIGENSOLVER_GPU_URL),EIGENSOLVER_GPU,EIGENSOLVER_GPU)
if test ! -e ../EIGENSOLVER_GPU/lib_eigsolve/lib_eigsolve.a ; then \
(cp Makefile.lib_eigsolve ../EIGENSOLVER_GPU/lib_eigsolve/Makefile; \
cd ../EIGENSOLVER_GPU/lib_eigsolve/; $(MAKE) ); else \
(echo "No configuration file found for GPU custom eigensolver"; exit); fi
libcuda_eigensolver_clean:
if test -d ../EIGENSOLVER_GPU; then (cd ../EIGENSOLVER_GPU/lib_eigsolve ; $(MAKE) clean); fi
libcuda_eigensolver_veryclean:
if test -d ../EIGENSOLVER_GPU; then (rm -R -f ../EIGENSOLVER_GPU ../${EIGENSOLVER_GPU_NAME}); fi
CUDA_PATH := $(if $(GPU_ARCH),$(CUDA_PATH),no)
libcuda_devxlib :
$(call update_submodule,external,devxlib)
@@ -136,6 +118,6 @@ libmbd_distclean:
# cleaning
###################################
clean: lapack_clean fox_clean libcuda_eigensolver_clean libcuda_devxlib_clean libmbd_clean
clean: lapack_clean fox_clean libcuda_devxlib_clean libmbd_clean
veryclean: fox_clean libcuda_eigensolver_veryclean libcuda_devxlib_veryclean libmbd_distclean
veryclean: fox_clean libcuda_devxlib_veryclean libmbd_distclean