quantum-espresso/LAXlib/tests/test_diaghg_gpu_3.f90

253 lines
7.1 KiB
Fortran

#ifdef __CUDA
program test_diaghg_gpu_3
USE laxlib_parallel_include
USE mp, ONLY : mp_bcast
USE mp_world, ONLY : mp_world_start, mp_world_end, mpime, &
root, nproc, world_comm
USE mp_bands_util, ONLY : me_bgrp, root_bgrp, intra_bgrp_comm
USE tester
IMPLICIT NONE
include 'laxlib_kinds.fh'
!
TYPE(tester_t) :: test
INTEGER :: world_group = 0
!
CALL test%init()
test%tolerance64=1.d-8
!
#if defined(__MPI)
world_group = MPI_COMM_WORLD
#endif
CALL mp_world_start(world_group)
!
me_bgrp = mpime; root_bgrp=root; intra_bgrp_comm=world_comm
!
CALL real_1(test)
!
CALL complex_1(test)
!
CALL collect_results(test)
!
CALL mp_world_end()
!
IF (mpime .eq. 0) CALL test%print()
!
CONTAINS
!
SUBROUTINE real_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
real(DP), allocatable :: h(:,:)
real(DP), allocatable :: h_save(:,:)
real(DP), allocatable :: s(:,:)
real(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
real(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
real(DP), allocatable :: v_save(:,:)
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
real(DP), allocatable :: v_dc(:,:) !
! device copies
real(DP), allocatable, device :: h_d(:,:)
real(DP), allocatable, device :: s_d(:,:)
real(DP), allocatable, device :: e_d(:)
real(DP), allocatable, device :: v_d(:,:)
!
!
character(len=20) :: inputs(2)
integer :: i, j, info
!
inputs = ["ZnOG1.bin", "ZnOG2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
!
h_save = h
s_save = s
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, .false. )
!
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, .true. )
!
ALLOCATE(v_dc, SOURCE=h_save)
ALLOCATE(e_dc(n))
s = s_save
CALL solve_with_dsygvd(n, v_dc, s, ldh, e_dc)
s = s_save
!
CALL test%assert_close( e(1:m), e_save(1:m))
!
! GPU data & subroutines
!
v = (0.d0, 0.d0)
e = 0.d0
s = s_save
h = h_save
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
!
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, .false. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v_d = (0.d0, 0.d0)
e_d = 0.d0
s_d = s_save
h_d = h_save
!
! Start from data on the GPU and diagonalize on the CPU
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm, .true. )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
!
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h_d, s_d, e_d, v_d)
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save, v_dc, e_dc)
END DO
!
END SUBROUTINE real_1
!
SUBROUTINE complex_1(test)
USE mp_world, ONLY : mpime
USE LAXlib
USE test_io
implicit none
!
TYPE(tester_t) :: test
!
integer :: ldh, n, m
complex(DP), allocatable :: h(:,:)
complex(DP), allocatable :: h_save(:,:)
complex(DP), allocatable :: s(:,:)
complex(DP), allocatable :: s_save(:,:)
real(DP), allocatable :: e(:)
complex(DP), allocatable :: v(:,:)
real(DP), allocatable :: e_save(:)
complex(DP), allocatable :: v_save(:,:)
real(DP), allocatable :: e_dc(:) ! stores results from divide and conquer
complex(DP), allocatable :: v_dc(:,:) !
! device copies
complex(DP), allocatable, device :: h_d(:,:)
complex(DP), allocatable, device :: s_d(:,:)
real(DP), allocatable, device :: e_d(:)
complex(DP), allocatable, device :: v_d(:,:)
!
!
character(len=20) :: inputs(4)
integer :: i, j, info
!
inputs = ["ZnOK1.bin ", &
"ZnOK2.bin ", &
"SiGeK1.bin", &
"SiGeK2.bin"]
!
DO i=1, SIZE(inputs)
!
CALL read_problem(inputs(i), ldh, n, m, h, s, e, v, info)
!
IF (info /= 0) THEN
IF (mpime == 0) print *, "Test with ", inputs(i), " skipped. Input not found."
CYCLE
END IF
!
ALLOCATE(h_save, SOURCE=h)
ALLOCATE(s_save, SOURCE=s)
ALLOCATE(e_save, SOURCE=e)
ALLOCATE(v_save, SOURCE=v)
ALLOCATE(e_dc(n))
ALLOCATE(v_dc, SOURCE=h)
!
h_save = h
s_save = s
!
! == Check CPU interface without and with offloading ==
!
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, .false. )
!
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
h = h_save; s = s_save;
v = (0.d0, 0.d0)
e = 0.d0
CALL diaghg( n, m, h, s, ldh, e, v, me_bgrp, root_bgrp, intra_bgrp_comm, .true. )
!
! N.B.: GPU eigensolver uses a different algorithm: zhegvd
s = s_save; e_dc = 0.d0
CALL solve_with_zhegvd(n, v_dc, s, ldh, e_dc)
!
CALL test%assert_close( e(1:m), e_save(1:m))
!
! GPU data & subroutines
!
v = (0.d0, 0.d0)
e = 0.d0
s = s_save
h = h_save
ALLOCATE(e_d, SOURCE=e); ALLOCATE(v_d, SOURCE=v)
ALLOCATE(h_d, SOURCE=h); ALLOCATE(s_d, SOURCE=s)
!
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
CALL test%assert_close( e(1:m), e_save(1:m) )
!
!
v_d = (0.d0, 0.d0)
e_d = 0.d0
s_d = s_save
h_d = h_save
CALL diaghg( n, m, h_d, s_d, ldh, e_d, v_d, me_bgrp, root_bgrp, intra_bgrp_comm )
!
v(1:n, 1:m) = v_d(1:n, 1:m)
e(1:m) = e_d(1:m)
!
CALL test%assert_close( e(1:m), e_save(1:m))
!
DEALLOCATE(h_d, s_d, e_d, v_d)
DEALLOCATE(h,s,e,v,h_save,s_save,e_save,v_save)
DEALLOCATE(e_dc, v_dc)
END DO
!
END SUBROUTINE complex_1
!
end program test_diaghg_gpu_3
#else
program test_diaghg_gpu_3
end program test_diaghg_gpu_3
#endif