PROGRAM lax_test
  use laxlib_descriptor
  USE laxlib_parallel_include
  use dspev_module
  IMPLICIT NONE
  include 'laxlib_kinds.fh'
  include 'laxlib_param.fh'
  include 'laxlib_hi.fh'
  include 'laxlib_low.fh'
#if defined(__MPI)
  INTEGER STATUS(MPI_STATUS_SIZE)
#else
  INTEGER, PARAMETER :: MPI_MAX_PROCESSOR_NAME = 64
#endif
  INTEGER :: mype, npes, comm, ntgs, root
  LOGICAL :: iope
  INTEGER :: ierr
  INTEGER :: stdout
  !
  INTEGER :: np_ortho(2) = 1   ! size of the processor grid used in ortho
  INTEGER :: me_ortho(2) = 0   ! coordinates of the processors
  INTEGER :: me_ortho1 = 0     ! task id for the ortho group
  INTEGER :: nproc_ortho = 1   ! size of the ortho group
  INTEGER :: ortho_comm = 0        ! communicator for the ortho group
  INTEGER :: ortho_row_comm = 0    ! communicator for the ortho row group
  INTEGER :: ortho_col_comm = 0    ! communicator for the ortho col group
  INTEGER :: ortho_comm_id = 0     ! id of the ortho_comm
  INTEGER :: ortho_parent_comm = 0 ! parent communicator from which the ortho group has been created
  !
#if defined __SCALAPACK
  INTEGER :: me_blacs = 0   ! BLACS processor index starting from 0
  INTEGER :: np_blacs = 1   ! BLACS number of processors
#endif
  !
  INTEGER :: world_cntx = -1  ! BLACS context of all processors
  INTEGER :: ortho_cntx = -1  ! BLACS context for ortho_comm
  LOGICAL :: do_distr_diag_inside_bgrp = .FALSE.
  LOGICAL :: la_proc
  INTEGER, ALLOCATABLE :: rank_ip( :, : )
  INTEGER, ALLOCATABLE :: irc_ip( : )
  INTEGER, ALLOCATABLE :: nrc_ip( : )
  INTEGER, ALLOCATABLE :: idesc_ip(:,:,:)
  !
#if defined(__INTEL_COMPILER)
#if __INTEL_COMPILER >= 1300
  ! the following is a workaround for Intel 12.1 bug
#if __INTEL_COMPILER < 9999
!dir$ attributes align: 4096 :: a, s, c, d
#endif
#endif
#endif
  REAL(DP), ALLOCATABLE :: a(:,:)
  REAL(DP), ALLOCATABLE :: s(:,:)
  REAL(DP), ALLOCATABLE :: c(:,:)
  REAL(DP), ALLOCATABLE :: d(:)
  !
  REAL(DP) :: time1, time2
  REAL*8 :: tempo(100)
  REAL*8, allocatable :: tempo_tutti(:)
  REAL*8, allocatable :: perf_matrix(:,:)
  REAL*8, allocatable :: latency_matrix(:,:)
  integer, allocatable :: perf_count(:,:)
  REAL*8 :: tempo_mio(100)
  REAL*8 :: tempo_min(100)
  REAL*8 :: tempo_max(100)
  REAL*8 :: tempo_avg(100)
  TYPE(la_descriptor) :: desc
  INTEGER :: idesc(LAX_DESC_SIZE)
  INTEGER :: i, ir, ic, nx, n, nr, nc  ! size of the matrix
  INTEGER :: n_in, nlen, dest, sour, tag, ii
  INTEGER :: nnodes
  !
  integer :: nargs
  CHARACTER(LEN=80) :: arg
  CHARACTER(LEN=MPI_MAX_PROCESSOR_NAME), allocatable :: proc_name(:)
  CHARACTER(LEN=MPI_MAX_PROCESSOR_NAME), allocatable :: node_name(:)
  INTEGER, allocatable :: proc2node(:)
#if defined(_OPENMP)
  INTEGER, EXTERNAL :: omp_get_max_threads
#endif
  !
#if defined(_OPENMP)
  INTEGER :: PROVIDED
#endif
  !
  ! ........
  !
  ! default parameter
  !
  n_in = 1024
  !
  ! parse the optional '-n <size>' command-line argument
  !
  nargs = command_argument_count()
  do i = 1, nargs - 1
     CALL get_command_argument(i, arg)
     IF( TRIM( arg ) == '-n' ) THEN
        CALL get_command_argument(i+1, arg)
        READ( arg, * ) n_in
     END IF
  end do

#if defined(__MPI)
#if defined(_OPENMP)
  CALL MPI_Init_thread(MPI_THREAD_FUNNELED, PROVIDED, ierr)
#else
  CALL MPI_Init(ierr)
#endif
  CALL mpi_comm_rank(MPI_COMM_WORLD, mype, ierr)
  CALL mpi_comm_size(MPI_COMM_WORLD, npes, ierr)
  comm = MPI_COMM_WORLD
  ntgs = 1
  root = 0
  IF( mype == root ) THEN
     iope = .true.
  ELSE
     iope = .false.
  ENDIF
#else
  mype = 0
  npes = 1
  comm = 0
  ntgs = 1
  root = 0
  iope = .true.
#endif

  OPEN( unit = 6, file = 'test.out', status = 'unknown' )
  !
  !write(6,*) 'mype = ', mype, ' npes = ', npes
  !
  !
  ! Broadcast input parameter first
  !
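  ! (rank 0's command-line value of n_in is taken as authoritative, so that
  !  every task builds the same n x n test matrix)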
#if defined(__MPI)
  CALL MPI_BCAST( n_in, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr )
#endif

  n = n_in

  if( mype == 0 ) then
     write(6,*) '+-----------------------------------+'
     write(6,*) '|         QE Linear Algebra         |'
     write(6,*) '|          testing & timing         |'
     write(6,*) '|        by Carlo Cavazzoni         |'
     write(6,*) '+-----------------------------------+'
     write(6,*)
     write(6,*) 'matrix size = ', n, ' x ', n
     write(6,*) 'num. procs  = ', npes
#if defined(_OPENMP)
     write(6,*) 'thr x proc  = ', omp_get_max_threads()
#endif
     write(6,*)
  endif

  allocate( proc_name( npes ) )
  allocate( node_name( npes ) )
  allocate( proc2node( npes ) )
  nnodes = 0

  ! collect the host name of every task and build the list of distinct nodes
  do i = 1, npes
#if defined(__MPI)
     if( mype == i-1 ) then
        call MPI_Get_processor_name( proc_name(i), nlen, ierr )
     end if
     CALL MPI_BCAST( nlen, 1, MPI_INTEGER, i-1, MPI_COMM_WORLD, ierr )
     CALL MPI_BCAST( proc_name(i), MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER, i-1, MPI_COMM_WORLD, ierr )
#else
     proc_name(i) = 'localhost'
#endif
     if( mype == 0 ) then
!        write(6,310) i, proc_name(i)
     end if
310  FORMAT('pe = ',I5,' name = ', A20)
     do ii = 1, nnodes
        if( proc_name(i) == node_name(ii) ) then
           exit
        end if
     end do
     if( ii > nnodes ) then
        nnodes = nnodes + 1
        node_name( nnodes ) = proc_name( i )
     end if
     proc2node( i ) = ii
  end do
  !
  if( mype == 0 ) then
     write(6,*) '+-----------------------------------+'
     write(6,*) '|             node list             |'
     write(6,*) '+-----------------------------------+'
     write(6,*)
     do ii = 1, nnodes
        write(6,310) ii, node_name(ii)
     end do
  end if

  allocate( perf_matrix( nnodes, nnodes ) )
  allocate( latency_matrix( nnodes, nnodes ) )
  allocate( perf_count( nnodes, nnodes ) )
  perf_matrix = 0.0d0
  latency_matrix = 0.0d0
  perf_count = 0

  !
  ! Check core speed
  !
#if defined(__MPI)
  CALL MPI_BARRIER( MPI_COMM_WORLD, ierr )
#endif

  nx = 1024
  ALLOCATE( s( nx, nx ) )
  ALLOCATE( a( nx, nx ) )
  ALLOCATE( c( nx, nx ) )
  ALLOCATE( tempo_tutti( npes ) )
  tempo_tutti = 0.0d0
  a = 1.0d0
  s = 1.0d0
  c = 1.0d0

  ! warm-up call, not timed
  CALL dgemm('n', 'n', nx, nx, nx, 1.0d0, A, nx, s, nx, 1.0d0, C, nx)
#if defined(__MPI)
  tempo(1) = MPI_WTIME()
#endif
  ! time five DGEMM calls of size nx to estimate the per-task GFlops rate
  CALL dgemm('n', 'n', nx, nx, nx, 1.0d0, A, nx, s, nx, 1.0d0, C, nx)
  CALL dgemm('n', 'n', nx, nx, nx, 1.0d0, A, nx, s, nx, 1.0d0, C, nx)
  CALL dgemm('n', 'n', nx, nx, nx, 1.0d0, A, nx, s, nx, 1.0d0, C, nx)
  CALL dgemm('n', 'n', nx, nx, nx, 1.0d0, A, nx, s, nx, 1.0d0, C, nx)
  CALL dgemm('n', 'n', nx, nx, nx, 1.0d0, A, nx, s, nx, 1.0d0, C, nx)
#if defined(__MPI)
  tempo(2) = MPI_WTIME()
#endif
  DEALLOCATE( s )
  DEALLOCATE( a )
  DEALLOCATE( c )

  tempo_tutti(mype+1) = tempo(2) - tempo(1)

#if defined(__MPI)
  CALL MPI_ALLREDUCE( MPI_IN_PLACE, tempo_tutti, npes, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr )
#endif

  if( mype == 0 ) then
     write(6,*)
     write(6,*)
     write(6,*)
     write(6,*) '+-----------------------------------+'
     write(6,*) '|    measured task performances     |'
     write(6,*) '+-----------------------------------+'
     do i = 1, npes
        write(6,300) i, 5.0d0*DBLE(nx*nx*nx)*2.0d0/tempo_tutti(i)/1.0D+9, proc_name(i)
     end do
  end if
300 FORMAT('pe = ',I5,',', F8.3, ' GFlops', ', node: ', A20)

  !
  ! Check network speed
  !
  nx = 2048
  ALLOCATE( s( nx, nx ) )
  tempo_tutti = 0.0d0

  ! ping-pong an nx*nx matrix between every pair of tasks
  do ii = 0, npes-1
     do i = 0, ii-1
        sour = ii
        dest = i
        tag  = i + ii * npes
#if defined(__MPI)
        CALL MPI_BARRIER( MPI_COMM_WORLD, ierr )
        if( ( mype == sour ) .or. &
            ( mype == dest ) ) THEN
           tempo(1) = MPI_WTIME()
           if( mype == dest ) then
              CALL MPI_SEND( s, nx*nx, MPI_DOUBLE_PRECISION, sour, tag, MPI_COMM_WORLD, ierr )
              CALL MPI_RECV( s, nx*nx, MPI_DOUBLE_PRECISION, sour, tag+npes*npes, MPI_COMM_WORLD, status, ierr )
           else if( mype == sour ) then
              CALL MPI_RECV( s, nx*nx, MPI_DOUBLE_PRECISION, dest, tag, MPI_COMM_WORLD, status, ierr )
              CALL MPI_SEND( s, nx*nx, MPI_DOUBLE_PRECISION, dest, tag+npes*npes, MPI_COMM_WORLD, ierr )
           endif
           tempo(2) = MPI_WTIME()
           ! accumulate the node-to-node bandwidth (2 messages of nx*nx doubles)
           perf_matrix( proc2node( ii+1 ), proc2node( i+1 ) ) = perf_matrix( proc2node( ii+1 ), proc2node( i+1 ) ) + &
                2.0d0*DBLE(nx*nx)*8.0d0/(tempo(2)-tempo(1))/1.0D+9
           perf_count( proc2node( ii+1 ), proc2node( i+1 ) ) = perf_count( proc2node( ii+1 ), proc2node( i+1 ) ) + 1
        END IF
        CALL MPI_BARRIER( MPI_COMM_WORLD, ierr )
        if( ( mype == sour ) .or. ( mype == dest ) ) THEN
           tempo(1) = MPI_WTIME()
           ! single-byte ping-pong to estimate the latency
           if( mype == dest ) then
              CALL MPI_SEND( ii, 1, MPI_BYTE, sour, tag, MPI_COMM_WORLD, ierr )
              CALL MPI_RECV( ii, 1, MPI_BYTE, sour, tag+npes, MPI_COMM_WORLD, status, ierr )
           else if( mype == sour ) then
              CALL MPI_RECV( ii, 1, MPI_BYTE, dest, tag, MPI_COMM_WORLD, status, ierr )
              CALL MPI_SEND( ii, 1, MPI_BYTE, dest, tag+npes, MPI_COMM_WORLD, ierr )
           endif
           tempo(2) = MPI_WTIME()
           latency_matrix( proc2node( ii+1 ), proc2node( i+1 ) ) = latency_matrix( proc2node( ii+1 ), proc2node( i+1 ) ) + &
                ( tempo(2) - tempo(1) )
        END IF
#endif
     end do
  end do

#if defined(__MPI)
  CALL MPI_ALLREDUCE( MPI_IN_PLACE, perf_matrix, SIZE(perf_matrix), MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr )
  CALL MPI_ALLREDUCE( MPI_IN_PLACE, latency_matrix, SIZE(latency_matrix), MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr )
  CALL MPI_ALLREDUCE( MPI_IN_PLACE, perf_count, SIZE(perf_count), MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr )
#endif

  if( mype == 0 ) then
     write(6,*)
     write(6,*)
     write(6,*)
     write(6,*) '+-----------------------------------+'
     write(6,*) '|    ping-pong network bandwidth    |'
     write(6,*) '+-----------------------------------+'
     write(6,*)
     do ii = 1, nnodes
        do i = 1, nnodes
           if( perf_count(i,ii) > 0 ) then
              perf_matrix(i,ii) = perf_matrix(i,ii) / perf_count(i,ii)
              write( 6, 314 ) node_name(i), node_name(ii), perf_count(i,ii), perf_matrix(i,ii)
           end if
        end do
     end do
314  FORMAT( A20, A20, I5, ':', F8.3, ' GBytes/s')
     write(6,*)
     write(6,*) '+-----------------------------------+'
     write(6,*) '|     ping-pong network latency     |'
     write(6,*) '+-----------------------------------+'
     write(6,*)
     do ii = 1, nnodes
        do i = 1, nnodes
           if( perf_count(i,ii) > 0 ) then
              latency_matrix(i,ii) = latency_matrix(i,ii) / perf_count(i,ii)
              write( 6, 315 ) node_name(i), node_name(ii), perf_count(i,ii), latency_matrix(i,ii)*1000000.0d0
           end if
        end do
     end do
315  FORMAT( A20, A20, I5, ':', F10.3, ' usec')
  end if

  DEALLOCATE( s )
  DEALLOCATE( tempo_tutti )
  !
  !
  CALL laxlib_start( n, mpi_comm_world, mpi_comm_world, do_distr_diag_inside_bgrp )
  CALL laxlib_getval( np_ortho = np_ortho, ortho_comm = ortho_comm, &
                      do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp )
  !
  ALLOCATE( rank_ip( np_ortho(1), np_ortho(2) ) )
  ALLOCATE( irc_ip( np_ortho(1) ), nrc_ip( np_ortho(1) ) )
  CALL desc_init( n, nx, la_proc, idesc, rank_ip, irc_ip, nrc_ip )
  !
  CALL laxlib_intarray_to_desc( desc, idesc )
  !
  nx = 1
  IF( desc%active_node > 0 ) nx = desc%nrcx
  !
  ALLOCATE( d( n ) )
  ALLOCATE( s( nx, nx ) )
  ALLOCATE( a( nx, nx ) )
  ALLOCATE( c( nx, nx ) )

  nr = desc%nr
  nc = desc%nc
  ir = desc%ir
  ic = desc%ic
  !
  ! do not take the time of the first execution, it may be biased from MPI
  ! initialization stuff
  !
  CALL set_a()
  !
  CALL diagonalize_parallel_x( n, a, d, s, idesc )
  !
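  !
  ! Timed section: rebuild the test matrix, then time one distributed
  ! diagonalization and one Cannon matrix-matrix multiply, with barriers
  ! around each call so the min/max/avg timings gathered below are
  ! comparable across MPI tasks.
  !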
  tempo     = 0.0d0
  tempo_mio = 0.0d0
  tempo_min = 0.0d0
  tempo_max = 0.0d0
  tempo_avg = 0.0d0

  CALL set_a()
  !
#if defined(__MPI)
  CALL MPI_BARRIER( MPI_COMM_WORLD, ierr )
  tempo(1) = MPI_WTIME()
#endif
  !
  CALL diagonalize_parallel_x( n, a, d, s, idesc )
  !
#if defined(__MPI)
  CALL MPI_BARRIER( MPI_COMM_WORLD, ierr )
  tempo(2) = MPI_WTIME()
#endif
  !
  CALL sqr_mm_cannon( 'N', 'N', n, 1.0d0, a, nx, s, nx, 0.0d0, c, nr, idesc )
  !
#if defined(__MPI)
  CALL MPI_BARRIER( MPI_COMM_WORLD, ierr )
  tempo(3) = MPI_WTIME()
  !
  do i = 2, 10
     tempo_mio(i) = tempo(i) - tempo(i-1)
  end do
  CALL MPI_ALLREDUCE( tempo_mio, tempo_min, 100, MPI_DOUBLE_PRECISION, MPI_MIN, MPI_COMM_WORLD, ierr )
  CALL MPI_ALLREDUCE( tempo_mio, tempo_max, 100, MPI_DOUBLE_PRECISION, MPI_MAX, MPI_COMM_WORLD, ierr )
  CALL MPI_ALLREDUCE( tempo_mio, tempo_avg, 100, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr )
#else
  tempo_min = tempo
  tempo_max = tempo
#endif

  tempo_avg = tempo_avg / npes
  !
  IF( mype == 0 ) THEN
     write(6,*)
     write(6,*) ' Matrix eigenvalues '
     write(6,*)
     IF ( n <= 16 ) THEN
        DO i = 1, n
           write(6,*) ' D(',i,')=',d(i)
        END DO
     ELSE
        DO i = 1, 8
           write(6,*) ' D(',i,')=',d(i)
        END DO
        write(6,*) ' ... '
        DO i = n-8, n
           write(6,*) ' D(',i,')=',d(i)
        END DO
     END IF
     write(6,*)
  ENDIF

  if( mype == 0 ) then
     write(6,*) '**** LA Timing ****'
     write(6,*)
     write(6,200) 2.0d0*n*n*n / 1.D9 / tempo_avg(3)
     write(6,*)
     write(6,100)
     write(6,1)
     write(6,100)
     write(6,2) tempo_min(2), tempo_max(2), tempo_avg(2)
     write(6,100)
     write(6,3) tempo_min(3), tempo_max(3), tempo_avg(3)
     write(6,100)

200  FORMAT(' GFlops = ', F14.2 )
100  FORMAT(' +--------------------+----------------+-----------------+----------------+' )
1    FORMAT(' |LAX subroutine      |    sec. min    |    sec. max     |    sec. avg    |' )
2    FORMAT(' |diagonalize_parallel| ', D14.3, ' | ', D14.3, '  | ', D14.3, ' |' )
3    FORMAT(' |sqr_mm_cannon       | ', D14.3, ' | ', D14.3, '  | ', D14.3, ' |' )
  end if

  deallocate( proc_name )
  deallocate( node_name )
  deallocate( proc2node )
  deallocate( perf_matrix )
  deallocate( latency_matrix )
  deallocate( perf_count )

#if defined(__MPI)
  CALL mpi_finalize(ierr)
#endif

contains

  ! fill the local block of the distributed test matrix: a well-conditioned
  ! symmetric matrix with a dominant diagonal
  SUBROUTINE set_a()
     INTEGER :: i, j, ii, jj
     IF( desc%active_node < 0 ) RETURN
     DO j = 1, nc
        DO i = 1, nr
           ii = i + ir - 1
           jj = j + ic - 1
           IF( ii == jj ) THEN
              a(i,j) = ( DBLE( n-ii+1 ) ) / DBLE( n ) + 1.0d0 / ( DBLE( ii+jj ) - 1.0d0 )
           ELSE
              a(i,j) = 1.0d0 / ( DBLE( ii+jj ) - 1.0d0 )
           END IF
        END DO
     END DO
     RETURN
  END SUBROUTINE set_a

#if !defined(__MPI)
  ! serial stub: timings are not meaningful without MPI_WTIME
  real*8 function MPI_WTIME()
     MPI_WTIME = 0.0d0
  end function MPI_WTIME
#endif

end program lax_test