mirror of https://gitlab.com/QEF/q-e.git
Support for CUDA EIGENSOLVER removed
This commit is contained in:
parent
e821a717dc
commit
d262aca3cb
|
@ -223,11 +223,7 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
#if defined(__CUDA)
|
#if defined(__CUDA)
|
||||||
USE cudafor
|
USE cudafor
|
||||||
!
|
!
|
||||||
#if defined(__USE_CUSOLVER)
|
|
||||||
USE cusolverdn
|
USE cusolverdn
|
||||||
#else
|
|
||||||
USE zhegvdx_gpu
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
!
|
!
|
||||||
USE laxlib_parallel_include
|
USE laxlib_parallel_include
|
||||||
|
@ -296,7 +292,6 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
ATTRIBUTES( DEVICE ) :: work_d, rwork_d, h_diag_d, s_diag_d
|
ATTRIBUTES( DEVICE ) :: work_d, rwork_d, h_diag_d, s_diag_d
|
||||||
#endif
|
#endif
|
||||||
INTEGER :: i, j
|
INTEGER :: i, j
|
||||||
#if defined( __USE_CUSOLVER )
|
|
||||||
INTEGER :: devInfo_d, h_meig
|
INTEGER :: devInfo_d, h_meig
|
||||||
ATTRIBUTES( DEVICE ) :: devInfo_d
|
ATTRIBUTES( DEVICE ) :: devInfo_d
|
||||||
TYPE(cusolverDnHandle), SAVE :: cuSolverHandle
|
TYPE(cusolverDnHandle), SAVE :: cuSolverHandle
|
||||||
|
@ -304,7 +299,6 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
!
|
!
|
||||||
COMPLEX(DP), VARTYPE :: h_bkp_d(:,:), s_bkp_d(:,:)
|
COMPLEX(DP), VARTYPE :: h_bkp_d(:,:), s_bkp_d(:,:)
|
||||||
ATTRIBUTES( DEVICE ) :: h_bkp_d, s_bkp_d
|
ATTRIBUTES( DEVICE ) :: h_bkp_d, s_bkp_d
|
||||||
#endif
|
|
||||||
#undef VARTYPE
|
#undef VARTYPE
|
||||||
!
|
!
|
||||||
!
|
!
|
||||||
|
@ -318,9 +312,7 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
!
|
!
|
||||||
! Keeping compatibility for both CUSolver and CustomEigensolver, CUSolver below
|
! Keeping compatibility for both CUSolver and CustomEigensolver, CUSolver below
|
||||||
!
|
!
|
||||||
#if defined(__USE_CUSOLVER) && defined(__CUDA)
|
#if defined(__CUDA)
|
||||||
!
|
|
||||||
! vvv __USE_CUSOLVER
|
|
||||||
|
|
||||||
#if ! defined(__USE_GLOBAL_BUFFER)
|
#if ! defined(__USE_GLOBAL_BUFFER)
|
||||||
ALLOCATE(h_bkp_d(n,n), s_bkp_d(n,n), STAT = info)
|
ALLOCATE(h_bkp_d(n,n), s_bkp_d(n,n), STAT = info)
|
||||||
|
@ -387,96 +379,9 @@ SUBROUTINE laxlib_cdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
CALL dev%release_buffer( h_bkp_d, info )
|
CALL dev%release_buffer( h_bkp_d, info )
|
||||||
CALL dev%release_buffer( s_bkp_d, info )
|
CALL dev%release_buffer( s_bkp_d, info )
|
||||||
#endif
|
#endif
|
||||||
! ^^^ __USE_CUSOLVER
|
|
||||||
!
|
!
|
||||||
! Keeping compatibility for both CUSolver and CustomEigensolver, CustomEigensolver below
|
! Keeping compatibility for both CUSolver and CustomEigensolver, CustomEigensolver below
|
||||||
!
|
!
|
||||||
#elif defined(__CUDA)
|
|
||||||
! vvv not __USE_CUSOLVER
|
|
||||||
#if ! defined(__USE_GLOBAL_BUFFER)
|
|
||||||
! NB: dimension is different!
|
|
||||||
ALLOCATE(v_h(ldh,n), e_h(n))
|
|
||||||
ALLOCATE(h_diag_d(n) , s_diag_d(n))
|
|
||||||
#else
|
|
||||||
CALL pin%lock_buffer( v_h, (/ldh,n/), info )
|
|
||||||
CALL pin%lock_buffer( e_h, n, info )
|
|
||||||
!
|
|
||||||
CALL dev%lock_buffer( h_diag_d, n, info )
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate h_bkp_d ', ABS( info ) )
|
|
||||||
CALL dev%lock_buffer( s_diag_d, n, info )
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate s_bkp_d ', ABS( info ) )
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
lwork = n
|
|
||||||
lrwork = 1+5*n+2*n*n
|
|
||||||
liwork = 3+5*n
|
|
||||||
!
|
|
||||||
lwork_d = 2*64*64 + 65 * n
|
|
||||||
lrwork_d = n
|
|
||||||
!
|
|
||||||
#if ! defined(__USE_GLOBAL_BUFFER)
|
|
||||||
ALLOCATE(work(lwork), rwork(lrwork), iwork(liwork))
|
|
||||||
!
|
|
||||||
ALLOCATE(work_d(1*lwork_d), STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' allocate work_d ', ABS( info ) )
|
|
||||||
!
|
|
||||||
ALLOCATE(rwork_d(1*lrwork_d), STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' allocate rwork_d ', ABS( info ) )
|
|
||||||
#else
|
|
||||||
CALL pin%lock_buffer(work, lwork, info)
|
|
||||||
CALL pin%lock_buffer(rwork, lrwork, info)
|
|
||||||
CALL pin%lock_buffer(iwork, liwork, info)
|
|
||||||
CALL dev%lock_buffer( work_d, lwork_d, info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate work_d ', ABS( info ) )
|
|
||||||
CALL dev%lock_buffer( rwork_d, lrwork_d, info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' cannot allocate rwork_d ', ABS( info ) )
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
!$cuf kernel do(1) <<<*,*>>>
|
|
||||||
DO i = 1, n
|
|
||||||
h_diag_d(i) = DBLE( h_d(i,i) )
|
|
||||||
s_diag_d(i) = DBLE( s_d(i,i) )
|
|
||||||
END DO
|
|
||||||
CALL zhegvdx_gpu(n, h_d, ldh, s_d, ldh, v_d, ldh, 1, m, e_d, work_d,&
|
|
||||||
lwork_d, rwork_d, lrwork_d, &
|
|
||||||
work, lwork, rwork, lrwork, &
|
|
||||||
iwork, liwork, v_h, SIZE(v_h, 1), e_h, info, .TRUE.)
|
|
||||||
!
|
|
||||||
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' cdiaghg_gpu ', ' zhegvdx_gpu failed ', ABS( info ) )
|
|
||||||
!
|
|
||||||
!$cuf kernel do(1) <<<*,*>>>
|
|
||||||
DO i = 1, n
|
|
||||||
h_d(i,i) = DCMPLX( h_diag_d(i), 0.0_DP)
|
|
||||||
s_d(i,i) = DCMPLX( s_diag_d(i), 0.0_DP)
|
|
||||||
DO j = i + 1, n
|
|
||||||
h_d(i,j) = DCONJG( h_d(j,i) )
|
|
||||||
s_d(i,j) = DCONJG( s_d(j,i) )
|
|
||||||
END DO
|
|
||||||
DO j = n + 1, ldh
|
|
||||||
h_d(j,i) = ( 0.0_DP, 0.0_DP )
|
|
||||||
s_d(j,i) = ( 0.0_DP, 0.0_DP )
|
|
||||||
END DO
|
|
||||||
END DO
|
|
||||||
#if ! defined(__USE_GLOBAL_BUFFER)
|
|
||||||
DEALLOCATE(h_diag_d, s_diag_d)
|
|
||||||
!
|
|
||||||
DEALLOCATE(work, rwork, iwork)
|
|
||||||
DEALLOCATE(work_d, rwork_d)
|
|
||||||
DEALLOCATE(v_h, e_h)
|
|
||||||
#else
|
|
||||||
CALL dev%release_buffer( h_diag_d, info )
|
|
||||||
CALL dev%release_buffer( s_diag_d, info)
|
|
||||||
!
|
|
||||||
CALL pin%release_buffer(work, info)
|
|
||||||
CALL pin%release_buffer(rwork, info)
|
|
||||||
CALL pin%release_buffer(iwork, info)
|
|
||||||
CALL dev%release_buffer( work_d, info)
|
|
||||||
CALL dev%release_buffer( rwork_d, info)
|
|
||||||
CALL pin%release_buffer(v_h, info)
|
|
||||||
CALL pin%release_buffer(e_h, info)
|
|
||||||
#endif
|
|
||||||
! ^^^ not __USE_CUSOLVER
|
|
||||||
#else
|
#else
|
||||||
CALL lax_error__( 'cdiaghg', 'Called GPU eigensolver without GPU support', 1 )
|
CALL lax_error__( 'cdiaghg', 'Called GPU eigensolver without GPU support', 1 )
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -574,13 +574,7 @@ END SUBROUTINE laxlib_multi_init_desc_x
|
||||||
SUBROUTINE diagonalize_serial_gpu( m, rhos, rhod, s, info )
|
SUBROUTINE diagonalize_serial_gpu( m, rhos, rhod, s, info )
|
||||||
#if defined(__CUDA)
|
#if defined(__CUDA)
|
||||||
use cudafor
|
use cudafor
|
||||||
#if defined ( __USE_CUSOLVER )
|
|
||||||
USE cusolverDn
|
USE cusolverDn
|
||||||
#else
|
|
||||||
use eigsolve_vars
|
|
||||||
use nvtx_inters
|
|
||||||
use dsyevd_gpu
|
|
||||||
#endif
|
|
||||||
IMPLICIT NONE
|
IMPLICIT NONE
|
||||||
include 'laxlib_kinds.fh'
|
include 'laxlib_kinds.fh'
|
||||||
INTEGER, INTENT(IN) :: m
|
INTEGER, INTENT(IN) :: m
|
||||||
|
@ -592,28 +586,13 @@ END SUBROUTINE laxlib_multi_init_desc_x
|
||||||
INTEGER :: lwork_d
|
INTEGER :: lwork_d
|
||||||
INTEGER :: i, j, lda
|
INTEGER :: i, j, lda
|
||||||
!
|
!
|
||||||
#if defined (__USE_CUSOLVER)
|
|
||||||
!
|
!
|
||||||
INTEGER, DEVICE :: devInfo
|
INTEGER, DEVICE :: devInfo
|
||||||
TYPE(cusolverDnHandle) :: cuSolverHandle
|
TYPE(cusolverDnHandle) :: cuSolverHandle
|
||||||
REAL(DP), ALLOCATABLE, DEVICE :: work_d(:)
|
REAL(DP), ALLOCATABLE, DEVICE :: work_d(:)
|
||||||
!
|
!
|
||||||
#else
|
|
||||||
!
|
|
||||||
REAL(DP), ALLOCATABLE :: work_d(:), a(:,:)
|
|
||||||
ATTRIBUTES( DEVICE ) :: work_d, a
|
|
||||||
REAL(DP), ALLOCATABLE :: b(:,:)
|
|
||||||
REAL(DP), ALLOCATABLE :: work_h(:), w_h(:), z_h(:,:)
|
|
||||||
ATTRIBUTES( PINNED ) :: work_h, w_h, z_h
|
|
||||||
INTEGER, ALLOCATABLE :: iwork_h(:)
|
|
||||||
ATTRIBUTES( PINNED ) :: iwork_h
|
|
||||||
!
|
|
||||||
INTEGER :: lwork_h, liwork_h
|
|
||||||
!
|
|
||||||
#endif
|
|
||||||
! .... Subroutine Body
|
! .... Subroutine Body
|
||||||
!
|
!
|
||||||
#if defined (__USE_CUSOLVER)
|
|
||||||
!
|
!
|
||||||
s = rhos
|
s = rhos
|
||||||
lda = SIZE( rhos, 1 )
|
lda = SIZE( rhos, 1 )
|
||||||
|
@ -641,49 +620,6 @@ END SUBROUTINE laxlib_multi_init_desc_x
|
||||||
DEALLOCATE( work_d )
|
DEALLOCATE( work_d )
|
||||||
|
|
||||||
!
|
!
|
||||||
#else
|
|
||||||
!
|
|
||||||
info = 0
|
|
||||||
lwork_d = 2*64*64 + 66*SIZE(rhos,1)
|
|
||||||
lwork_h = 1 + 6*SIZE(rhos,1) + 2*SIZE(rhos,1)*SIZE(rhos,1)
|
|
||||||
liwork_h = 3 + 5*SIZE(rhos,1)
|
|
||||||
ALLOCATE(work_d(lwork_d),STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate work_d ', ABS( info ) )
|
|
||||||
ALLOCATE(a(SIZE(rhos,1),SIZE(rhos,2)),STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate a ', ABS( info ) )
|
|
||||||
ALLOCATE(work_h(lwork_h),STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate work_h ', ABS( info ) )
|
|
||||||
ALLOCATE(iwork_h(liwork_h),STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate iwork_h ', ABS( info ) )
|
|
||||||
!
|
|
||||||
ALLOCATE(w_h(SIZE(rhod)),STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate w_h ', ABS( info ) )
|
|
||||||
ALLOCATE(z_h(SIZE(s,1),SIZE(s,2)),STAT = info)
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' laxlib diagonalize_serial_gpu ', ' allocate z_h ', ABS( info ) )
|
|
||||||
|
|
||||||
if(initialized == 0) call init_eigsolve_gpu
|
|
||||||
|
|
||||||
info = cudaMemcpy(a, rhos, SIZE(rhos,1)*SIZE(rhos,2), cudaMemcpyDeviceToDevice)
|
|
||||||
lda = SIZE(rhos,1)
|
|
||||||
!$cuf kernel do(2) <<<*,*>>>
|
|
||||||
do j = 1,m
|
|
||||||
do i = 1,m
|
|
||||||
if (i > j) then
|
|
||||||
s(i,j) = a(i,j)
|
|
||||||
endif
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
|
|
||||||
call dsyevd_gpu('V', 'U', 1, m, m, a, lda, s, lda, rhod, work_d, lwork_d, &
|
|
||||||
work_h, lwork_h, iwork_h, liwork_h, z_h, lda, w_h, info)
|
|
||||||
|
|
||||||
DEALLOCATE(z_h)
|
|
||||||
DEALLOCATE(w_h)
|
|
||||||
DEALLOCATE(iwork_h)
|
|
||||||
DEALLOCATE(work_h)
|
|
||||||
DEALLOCATE(a)
|
|
||||||
DEALLOCATE(work_d)
|
|
||||||
#endif
|
|
||||||
#else
|
#else
|
||||||
IMPLICIT NONE
|
IMPLICIT NONE
|
||||||
include 'laxlib_kinds.fh'
|
include 'laxlib_kinds.fh'
|
||||||
|
|
|
@ -203,11 +203,7 @@ SUBROUTINE laxlib_rdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
USE laxlib_parallel_include
|
USE laxlib_parallel_include
|
||||||
#if defined(__CUDA)
|
#if defined(__CUDA)
|
||||||
USE cudafor
|
USE cudafor
|
||||||
#if defined(__USE_CUSOLVER)
|
|
||||||
USE cusolverdn
|
USE cusolverdn
|
||||||
#else
|
|
||||||
USE dsygvdx_gpu
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
!
|
!
|
||||||
! NB: the flag below can be used to decouple LAXlib from devXlib.
|
! NB: the flag below can be used to decouple LAXlib from devXlib.
|
||||||
|
@ -276,14 +272,12 @@ SUBROUTINE laxlib_rdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
ATTRIBUTES( DEVICE ) :: h_diag_d, s_diag_d
|
ATTRIBUTES( DEVICE ) :: h_diag_d, s_diag_d
|
||||||
#endif
|
#endif
|
||||||
!
|
!
|
||||||
#if defined(__USE_CUSOLVER)
|
|
||||||
INTEGER :: devInfo_d, h_meig
|
INTEGER :: devInfo_d, h_meig
|
||||||
ATTRIBUTES( DEVICE ) :: devInfo_d
|
ATTRIBUTES( DEVICE ) :: devInfo_d
|
||||||
TYPE(cusolverDnHandle), SAVE :: cuSolverHandle
|
TYPE(cusolverDnHandle), SAVE :: cuSolverHandle
|
||||||
LOGICAL, SAVE :: cuSolverInitialized = .FALSE.
|
LOGICAL, SAVE :: cuSolverInitialized = .FALSE.
|
||||||
REAL(DP), VARTYPE :: h_bkp_d(:,:), s_bkp_d(:,:)
|
REAL(DP), VARTYPE :: h_bkp_d(:,:), s_bkp_d(:,:)
|
||||||
ATTRIBUTES( DEVICE ) :: h_bkp_d, s_bkp_d
|
ATTRIBUTES( DEVICE ) :: h_bkp_d, s_bkp_d
|
||||||
#endif
|
|
||||||
#undef VARTYPE
|
#undef VARTYPE
|
||||||
!
|
!
|
||||||
CALL start_clock_gpu( 'rdiaghg' )
|
CALL start_clock_gpu( 'rdiaghg' )
|
||||||
|
@ -292,61 +286,8 @@ SUBROUTINE laxlib_rdiaghg_gpu( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp
|
||||||
!
|
!
|
||||||
IF ( me_bgrp == root_bgrp ) THEN
|
IF ( me_bgrp == root_bgrp ) THEN
|
||||||
!
|
!
|
||||||
#if (!defined(__USE_CUSOLVER)) && defined(__CUDA)
|
#if defined(__CUDA)
|
||||||
ALLOCATE(e_h(n), v_h(ldh,n))
|
|
||||||
!
|
!
|
||||||
ALLOCATE(h_diag_d(n), s_diag_d(n))
|
|
||||||
!$cuf kernel do(1) <<<*,*>>>
|
|
||||||
DO i = 1, n
|
|
||||||
h_diag_d(i) = DBLE( h_d(i,i) )
|
|
||||||
s_diag_d(i) = DBLE( s_d(i,i) )
|
|
||||||
END DO
|
|
||||||
!
|
|
||||||
lwork = 1 + 6*n + 2*n*n
|
|
||||||
liwork = 3 + 5*n
|
|
||||||
ALLOCATE(work(lwork), iwork(liwork))
|
|
||||||
!
|
|
||||||
lwork_d = 2*64*64 + 66*n
|
|
||||||
#if ! defined(__USE_GLOBAL_BUFFER)
|
|
||||||
ALLOCATE(work_d(1*lwork_d), STAT = info)
|
|
||||||
#else
|
|
||||||
CALL dev%lock_buffer( work_d, lwork_d, info )
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' cannot allocate work_d ', ABS( info ) )
|
|
||||||
#endif
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' allocate work_d ', ABS( info ) )
|
|
||||||
!
|
|
||||||
CALL dsygvdx_gpu(n, h_d, ldh, s_d, ldh, v_d, ldh, 1, m, e_d, work_d, &
|
|
||||||
lwork_d, work, lwork, iwork, liwork, v_h, size(v_h, 1), &
|
|
||||||
e_h, info, .TRUE.)
|
|
||||||
!
|
|
||||||
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' dsygvdx_gpu failed ', ABS( info ) )
|
|
||||||
!
|
|
||||||
!$cuf kernel do(1) <<<*,*>>>
|
|
||||||
DO i = 1, n
|
|
||||||
h_d(i,i) = h_diag_d(i)
|
|
||||||
s_d(i,i) = s_diag_d(i)
|
|
||||||
DO j = i + 1, n
|
|
||||||
h_d(i,j) = h_d(j,i)
|
|
||||||
s_d(i,j) = s_d(j,i)
|
|
||||||
END DO
|
|
||||||
! This could be avoided, need to check dsygvdx_gpu implementation
|
|
||||||
DO j = n + 1, ldh
|
|
||||||
h_d(j,i) = 0.0_DP
|
|
||||||
s_d(j,i) = 0.0_DP
|
|
||||||
END DO
|
|
||||||
END DO
|
|
||||||
DEALLOCATE(h_diag_d,s_diag_d)
|
|
||||||
!
|
|
||||||
DEALLOCATE(work, iwork)
|
|
||||||
#if ! defined(__USE_GLOBAL_BUFFER)
|
|
||||||
DEALLOCATE(work_d)
|
|
||||||
#else
|
|
||||||
CALL dev%release_buffer( work_d, info )
|
|
||||||
#endif
|
|
||||||
!
|
|
||||||
DEALLOCATE(v_h, e_h)
|
|
||||||
#elif defined(__USE_CUSOLVER) && defined(__CUDA)
|
|
||||||
! vvv __USE_CUSOLVER
|
|
||||||
#if ! defined(__USE_GLOBAL_BUFFER)
|
#if ! defined(__USE_GLOBAL_BUFFER)
|
||||||
ALLOCATE(h_bkp_d(n,n), s_bkp_d(n,n), STAT = info)
|
ALLOCATE(h_bkp_d(n,n), s_bkp_d(n,n), STAT = info)
|
||||||
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' cannot allocate h_bkp_d or s_bkp_d ', ABS( info ) )
|
IF( info /= 0 ) CALL lax_error__( ' rdiaghg_gpu ', ' cannot allocate h_bkp_d or s_bkp_d ', ABS( info ) )
|
||||||
|
|
|
@ -1,65 +0,0 @@
|
||||||
include ../../make.inc
|
|
||||||
|
|
||||||
# Stripped version of F90FLAGS to remove GPU details added explicitly below.
|
|
||||||
F90FLAGSS := $(filter-out $(CUDA_F90FLAGS),$(F90FLAGS))
|
|
||||||
|
|
||||||
FLAGS = -O3 -pgf90libs -Mcuda=cc$(GPU_ARCH),cuda$(CUDA_RUNTIME),ptxinfo $(F90FLAGSS)
|
|
||||||
FLAGS2 = -O3 -pgf90libs -Mcuda=cc$(GPU_ARCH),cuda$(CUDA_RUNTIME),ptxinfo,maxregcount:64 $(F90FLAGSS)
|
|
||||||
|
|
||||||
# For performance reasons, cc of FLAGS3 must be <= 60
|
|
||||||
define MIN
|
|
||||||
$(firstword $(sort ${1} ${2}))
|
|
||||||
endef
|
|
||||||
FLAGS3 = -O3 -pgf90libs -Mcuda=cc$(call MIN,${GPU_ARCH},60),cuda$(CUDA_RUNTIME),ptxinfo,nordc,maxregcount:255 $(F90FLAGSS)
|
|
||||||
|
|
||||||
# Uncomment to enable NVTX markers
|
|
||||||
#OPTFLAGS = -DUSE_NVTX
|
|
||||||
|
|
||||||
all: lib_eigsolve.a
|
|
||||||
|
|
||||||
OBJS = cusolverDn_m.o eigsolve_vars.o toolbox.o zhegst_gpu.o zhemv_gpu.o zhetd2_gpu.o zhetrd_gpu.o zheevd_gpu.o zhegvdx_gpu.o \
|
|
||||||
dsygst_gpu.o dsymv_gpu.o dsytd2_gpu.o dsytrd_gpu.o dsyevd_gpu.o dsygvdx_gpu.o
|
|
||||||
|
|
||||||
zhetd2_gpu.o : zhetd2_gpu.F90
|
|
||||||
pgf90 -c ${FLAGS2} ${OPTFLAGS} $*.F90 -o $*.o
|
|
||||||
zhemv_gpu.o : zhemv_gpu.F90
|
|
||||||
pgf90 -c ${FLAGS3} ${OPTFLAGS} $*.F90 -o $*.o
|
|
||||||
dsytd2_gpu.o : dsytd2_gpu.F90
|
|
||||||
pgf90 -c ${FLAGS2} ${OPTFLAGS} $*.F90 -o $*.o
|
|
||||||
dsymv_gpu.o : dsymv_gpu.F90
|
|
||||||
pgf90 -c ${FLAGS3} ${OPTFLAGS} $*.F90 -o $*.o
|
|
||||||
%.o: %.cuf
|
|
||||||
pgf90 -c ${FLAGS} ${OPTFLAGS} $*.cuf -o $*.o
|
|
||||||
%.o: %.F90
|
|
||||||
pgf90 -c ${FLAGS} ${OPTFLAGS} $*.F90 -o $*.o
|
|
||||||
|
|
||||||
lib_eigsolve.a: $(OBJS)
|
|
||||||
ar -cr lib_eigsolve.a $(OBJS)
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm -f lib_eigsolve.a *.mod *.o
|
|
||||||
|
|
||||||
# Dependencies
|
|
||||||
dsyevd_gpu.o : dsytrd_gpu.o
|
|
||||||
dsyevd_gpu.o : eigsolve_vars.o
|
|
||||||
dsyevd_gpu.o : toolbox.o
|
|
||||||
dsygst_gpu.o : eigsolve_vars.o
|
|
||||||
dsygvdx_gpu.o : dsyevd_gpu.o
|
|
||||||
dsygvdx_gpu.o : dsygst_gpu.o
|
|
||||||
dsygvdx_gpu.o : eigsolve_vars.o
|
|
||||||
dsygvdx_gpu.o : toolbox.o
|
|
||||||
dsytrd_gpu.o : dsymv_gpu.o
|
|
||||||
dsytrd_gpu.o : dsytd2_gpu.o
|
|
||||||
dsytrd_gpu.o : eigsolve_vars.o
|
|
||||||
eigsolve_vars.o : cusolverDn_m.o
|
|
||||||
zheevd_gpu.o : eigsolve_vars.o
|
|
||||||
zheevd_gpu.o : toolbox.o
|
|
||||||
zheevd_gpu.o : zhetrd_gpu.o
|
|
||||||
zhegst_gpu.o : eigsolve_vars.o
|
|
||||||
zhegvdx_gpu.o : eigsolve_vars.o
|
|
||||||
zhegvdx_gpu.o : toolbox.o
|
|
||||||
zhegvdx_gpu.o : zheevd_gpu.o
|
|
||||||
zhegvdx_gpu.o : zhegst_gpu.o
|
|
||||||
zhetrd_gpu.o : eigsolve_vars.o
|
|
||||||
zhetrd_gpu.o : zhemv_gpu.o
|
|
||||||
zhetrd_gpu.o : zhetd2_gpu.o
|
|
|
@ -11,12 +11,6 @@
|
||||||
include ../make.inc
|
include ../make.inc
|
||||||
include install_utils
|
include install_utils
|
||||||
|
|
||||||
# For NVIDIA Eigensolver
|
|
||||||
EIGENSOLVER_VERSION=0.3.1
|
|
||||||
EIGENSOLVER_GPU=v${EIGENSOLVER_VERSION}.tar.gz
|
|
||||||
EIGENSOLVER_GPU_NAME=Eigensolver_gpu-${EIGENSOLVER_VERSION}
|
|
||||||
EIGENSOLVER_GPU_URL=https://github.com/NVIDIA/Eigensolver_gpu/archive/${EIGENSOLVER_GPU}
|
|
||||||
|
|
||||||
# MAIN target
|
# MAIN target
|
||||||
|
|
||||||
all: libcuda
|
all: libcuda
|
||||||
|
@ -72,18 +66,6 @@ fox_clean:
|
||||||
|
|
||||||
libcuda : $(addprefix libcuda_,$(CUDA_EXTLIBS))
|
libcuda : $(addprefix libcuda_,$(CUDA_EXTLIBS))
|
||||||
|
|
||||||
libcuda_eigensolver :
|
|
||||||
$(call download_and_unpack,$(EIGENSOLVER_GPU_NAME),$(EIGENSOLVER_GPU_URL),EIGENSOLVER_GPU,EIGENSOLVER_GPU)
|
|
||||||
if test ! -e ../EIGENSOLVER_GPU/lib_eigsolve/lib_eigsolve.a ; then \
|
|
||||||
(cp Makefile.lib_eigsolve ../EIGENSOLVER_GPU/lib_eigsolve/Makefile; \
|
|
||||||
cd ../EIGENSOLVER_GPU/lib_eigsolve/; $(MAKE) ); else \
|
|
||||||
(echo "No configuration file found for GPU custom eigensolver"; exit); fi
|
|
||||||
|
|
||||||
libcuda_eigensolver_clean:
|
|
||||||
if test -d ../EIGENSOLVER_GPU; then (cd ../EIGENSOLVER_GPU/lib_eigsolve ; $(MAKE) clean); fi
|
|
||||||
libcuda_eigensolver_veryclean:
|
|
||||||
if test -d ../EIGENSOLVER_GPU; then (rm -R -f ../EIGENSOLVER_GPU ../${EIGENSOLVER_GPU_NAME}); fi
|
|
||||||
|
|
||||||
CUDA_PATH := $(if $(GPU_ARCH),$(CUDA_PATH),no)
|
CUDA_PATH := $(if $(GPU_ARCH),$(CUDA_PATH),no)
|
||||||
libcuda_devxlib :
|
libcuda_devxlib :
|
||||||
$(call update_submodule,external,devxlib)
|
$(call update_submodule,external,devxlib)
|
||||||
|
@ -136,6 +118,6 @@ libmbd_distclean:
|
||||||
# cleaning
|
# cleaning
|
||||||
###################################
|
###################################
|
||||||
|
|
||||||
clean: lapack_clean fox_clean libcuda_eigensolver_clean libcuda_devxlib_clean libmbd_clean
|
clean: lapack_clean fox_clean libcuda_devxlib_clean libmbd_clean
|
||||||
|
|
||||||
veryclean: fox_clean libcuda_eigensolver_veryclean libcuda_devxlib_veryclean libmbd_distclean
|
veryclean: fox_clean libcuda_devxlib_veryclean libmbd_distclean
|
||||||
|
|
Loading…
Reference in New Issue