! quantum-espresso/Modules/ptoolkit.f90
!
! Copyright (C) 2001-2005 Quantum-ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#include "f_defs.h"
!
!==----------------------------------------------==!
MODULE parallel_toolkit
!==----------------------------------------------==!
USE kinds, ONLY : DP
USE io_global, ONLY : stdout
USE parallel_include
IMPLICIT NONE
SAVE
PRIVATE
PUBLIC :: pdspev_drv, dspev_drv, &
diagonalize, pzhpev_drv, zhpev_drv, cdiagonalize
PUBLIC :: rep_matmul_drv
!==----------------------------------------------==!
CONTAINS
!==----------------------------------------------==!
SUBROUTINE ptredv( a, lda, d, e, v, ldv, nrl, n, nproc, me, comm )
!
! Parallel version of the Householder tridiagonalization
! algorithm for symmetric matrices.
!
! AUTHOR : Carlo Cavazzoni - SISSA 1997
! comments and suggestions to : carlo.cavazzoni@cineca.it
!
! REFERENCES :
!
! NUMERICAL RECIPES, THE ART OF SCIENTIFIC COMPUTING.
! W.H. PRESS, B.P. FLANNERY, S.A. TEUKOLSKY, AND W.T. VETTERLING,
! CAMBRIDGE UNIVERSITY PRESS, CAMBRIDGE.
!
! PARALLEL NUMERICAL ALGORITHMS,
! T.L. FREEMAN AND C.PHILLIPS,
! PRENTICE HALL INTERNATIONAL (1992).
!
!
!
! INPUTS :
!
! A(NRL,N) Local part of the global matrix A(N,N) to be reduced,
! only the upper triangle is needed.
! The rows of the matrix are distributed among processors
! with blocking factor 1.
! Example for NPROC = 4 :
! ROW | PE
! 1 | 0
! 2 | 1
! 3 | 2
! 4 | 3
! 5 | 0
! 6 | 1
! .. | ..
!
! LDA LEADING DIMENSION OF MATRIX A.
!
! LDV LEADING DIMENSION OF MATRIX V.
!
! NRL NUMBER OF ROWS BELONGING TO THE LOCAL PROCESSOR.
!
! N DIMENSION OF THE GLOBAL MATRIX.
!
! NPROC NUMBER OF PROCESSORS.
!
! ME INDEX OF THE LOCAL PROCESSOR (Starting from 0).
!
!
! OUTPUTS :
!
! V(NRL,N) Orthogonal transformation that tridiagonalizes A;
! this matrix is distributed among processors
! in the same way as A.
!
! D(N) Diagonal elements of the tridiagonal matrix;
! this vector is equal on all processors.
!
! E(N) Subdiagonal elements of the tridiagonal matrix;
! this vector is equal on all processors.
!
!
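!
! Worked example of the index mapping implied by the layout above
! (a note derived from the code below):
!
!   owner of global row J        :  MOD( J-1, NPROC )
!   local index of global row J  :  (J-1)/NPROC + 1
!   global row of local row JL   :  ME + (JL-1)*NPROC + 1
!   NRL on processor ME          :  N/NPROC, plus 1 if ME < MOD(N,NPROC)
!
! E.g. for N = 6 and NPROC = 4, processor 0 owns global rows 1 and 5 as
! local rows 1 and 2 (NRL = 2); processors 2 and 3 have NRL = 1.
!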
USE kinds
IMPLICIT NONE
INTEGER, PARAMETER :: PTRED_WORK_SIZE = 2333
#if defined __PARA
# if defined __SHMEM
include "mpp/shmem.fh"
INTEGER PSYNC_BC(SHMEM_BCAST_SYNC_SIZE)
INTEGER PSYNC_B(SHMEM_BARRIER_SYNC_SIZE)
INTEGER PSYNC_STA(SHMEM_REDUCE_SYNC_SIZE)
REAL(DP) pWrk(MAX(PTRED_WORK_SIZE,SHMEM_REDUCE_MIN_WRKDATA_SIZE))
DATA PSYNC_BC /SHMEM_BCAST_SYNC_SIZE*SHMEM_SYNC_VALUE/
DATA PSYNC_B /SHMEM_BARRIER_SYNC_SIZE*SHMEM_SYNC_VALUE/
DATA PSYNC_STA /SHMEM_REDUCE_SYNC_SIZE*SHMEM_SYNC_VALUE/
SAVE PSYNC_B, PSYNC_STA, PSYNC_BC,pWrk
# endif
#endif
INTEGER :: N, NRL, LDA, LDV
INTEGER :: NPROC, ME, comm
REAL(DP) :: A(LDA,N), D(N), E(N), V(LDV,N)
!
REAL(DP) :: DDOT
!
REAL(DP) :: g, scale, sigma, kappa, f, h, tmp
REAL(DP) :: u(PTRED_WORK_SIZE)
REAL(DP) :: p(PTRED_WORK_SIZE)
REAL(DP) :: vtmp(PTRED_WORK_SIZE)
REAL(DP) :: tu, tp, one_over_h
REAL(DP) :: one_over_scale
REAL(DP) :: ul(PTRED_WORK_SIZE)
REAL(DP) :: pl(PTRED_WORK_SIZE)
#if (defined __SHMEM && defined __ALTIX) || (defined __SHMEM && defined __ORIGIN)
REAL(DP), SAVE :: d_tmp(PTRED_WORK_SIZE)
#endif
integer :: l, i, j, k, t, tl, ierr
integer :: kl, jl, ks, lloc
integer :: is(PTRED_WORK_SIZE)
integer :: ri(PTRED_WORK_SIZE)
save :: g,h,scale,sigma,kappa,f,u,p,tmp,vtmp
save :: tu,tp,one_over_h,one_over_scale,ul,pl
save :: l,i,j,k,t,tl,ierr,kl,jl,is,ri,ks,lloc
! .......... FOR I=N STEP -1 UNTIL 1 DO -- ..........
! ttot = 0
! mem1 = 0
IF( N == 0 ) THEN
RETURN
ELSE IF( (N > PTRED_WORK_SIZE) .OR. (N < 0) ) THEN
WRITE( stdout,*) ' *** ERROR IN PTREDV'
WRITE( stdout,*) ' N OUT OF RANGE : ',N
STOP
END IF
DO I = N, 1, -1
IS(I) = (I-1)/NPROC
RI(I) = MOD((I-1),NPROC)
IF(ME .le. RI(I) ) then
IS(I) = IS(I) + 1
END IF
! WRITE( stdout,100) 'ME,I,RI,IS',ME,I,RI(I),IS(I)
100 FORMAT(A,2X,5I4)
END DO
DO I = N, 2, -1
L = I - 1 ! first element
H = 0.0d0
IF ( L .GT. 1 ) THEN
SCALE = 0.0D0
DO K = 1, is(l)
SCALE = SCALE + DABS( A(K,I) )
END DO
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(SCALE, SCALE, 1, 0, 0, nproc, pWrk, pSync_sta)
# elif defined __MPI
CALL MPI_ALLREDUCE(SCALE, TMP, 1, MPI_DOUBLE_PRECISION, MPI_SUM, comm, IERR)
SCALE = TMP
# endif
#endif
IF ( SCALE .EQ. 0.0D0 ) THEN
IF (RI(L).EQ.ME) THEN
E(I) = A(is(L),I)
END IF
ELSE
! ...... computation of SIGMA and H
ONE_OVER_SCALE = 1.0d0/SCALE
SIGMA = 0.0D0
DO k = 1,is(L)
A(k,I) = A(k,I) * ONE_OVER_SCALE
SIGMA = SIGMA + A(k,I)**2
END DO
IF(RI(L) .eq. ME ) THEN
F = A(is(L),I)
END IF
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(SIGMA, SIGMA, 1, 0, 0, nproc, pWrk, pSync_sta)
call shmem_barrier_all
# if defined __ALTIX || defined __ORIGIN
CALL SHMEM_BROADCAST8(F,F,1,RI(L),0,0,nproc,pSync_bc)
# else
CALL SHMEM_BROADCAST(F,F,1,RI(L),0,0,nproc,pSync_bc)
# endif
# elif defined __MPI
CALL MPI_ALLREDUCE(SIGMA, TMP, 1, MPI_DOUBLE_PRECISION, MPI_SUM, comm, IERR)
SIGMA = TMP
CALL MPI_BCAST(F, 1, MPI_DOUBLE_PRECISION, RI(L), comm, IERR)
# endif
#endif
G = -SIGN(SQRT(SIGMA),F)
H = SIGMA - F*G
ONE_OVER_H = 1.0d0/H
E(I) = SCALE*G
! ...... construction of the vector U
#if defined __T3E
!DIR$ UNROLL 8
#endif
DO k = 1,L
vtmp(k) = 0.0d0
END DO
k = ME + 1
#if defined __T3E
!DIR$ UNROLL 4
#endif
DO kl = 1,is(l)
vtmp(k) = A(kl,I)
UL(kl) = A(kl,I)
k = k + NPROC
END DO
IF(RI(L) .eq. ME ) THEN
vtmp(L) = F - G
UL(is(l)) = F - G
A(is(l),I) = F - G
END IF
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(VTMP, U, L, 0, 0, nproc, pWrk, pSync_sta)
# elif defined __MPI
CALL MPI_ALLREDUCE(VTMP,U,L,MPI_DOUBLE_PRECISION,MPI_SUM,comm,IERR)
# endif
#else
u(1:l) = vtmp(1:l)
#endif
! ...... construction of the vector P
KAPPA = 0.0d0
DO J = 1,L
vtmp(j) = 0.0d0
#if defined __T3E
!DIR$ UNROLL 8
#endif
DO KL = 1, IS(J)
vtmp(J) = vtmp(J) + A(KL,J) * UL(KL)
END DO
IF(L.GT.J .AND. ME.EQ.RI(J)) then
#if defined __T3E
!DIR$ UNROLL 8
#endif
DO K = J+1,L
vtmp(J) = vtmp(J) + A(IS(J),K) * U(K)
END DO
END IF
vtmp(J) = vtmp(J) * ONE_OVER_H
KAPPA = KAPPA + vtmp(J) * U(J) * 0.5d0 * ONE_OVER_H
END DO
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(KAPPA, KAPPA, 1, 0, 0, nproc, pWrk, pSync_sta)
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(vtmp, P, L, 0, 0, nproc, pWrk, pSync_sta)
call shmem_barrier_all
# elif defined __MPI
CALL MPI_ALLREDUCE(KAPPA,TMP,1,MPI_DOUBLE_PRECISION,MPI_SUM, comm,IERR)
KAPPA = TMP
CALL MPI_ALLREDUCE(vtmp,p,L,MPI_DOUBLE_PRECISION,MPI_SUM,comm,IERR)
# endif
#else
p(1:l) = vtmp(1:l)
#endif
CALL DAXPY(l, -kappa, u, 1, p, 1)
k = me + 1
#if defined __T3E
!DIR$ UNROLL 8
#endif
DO kl = 1,is(l)
PL(kl) = P(k)
k = k + NPROC
END DO
DO J = 1,L
tu= U(J)
tp= P(J)
#if defined __T3E
!DIR$ UNROLL 12
#endif
DO KL = 1,is(l)
A(KL,j) = A(KL,j) - UL(KL) * tp
A(KL,j) = A(KL,j) - PL(KL) * tu
END DO
END DO
END IF
ELSE
IF(RI(L).EQ.ME) THEN
G = A(is(l),I)
END IF
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
# if defined __ALTIX || defined __ORIGIN
CALL SHMEM_BROADCAST8(G,G,1,RI(L),0,0,nproc,pSync_bc)
# else
CALL SHMEM_BROADCAST(G,G,1,RI(L),0,0,nproc,pSync_bc)
# endif
# elif defined __MPI
CALL MPI_BCAST(G,1,MPI_DOUBLE_PRECISION,RI(L),comm,IERR)
# endif
#endif
E(I) = G
END IF
D(I) = H
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
# endif
#endif
END DO
E(1) = 0.0d0
D(1) = 0.0d0
! t1 = mclock()
DO J = 1,N
#if defined __T3E
!DIR$ UNROLL 12
#endif
DO I = 1,NRL
V(I,J) = 0.0d0
END DO
IF(RI(J).EQ.ME) THEN
V(IS(J),J) = 1.0d0
END IF
END DO
DO I = 2,N
L = I - 1
LLOC = IS(L)
IF(D(I).NE.0.0d0) THEN
ONE_OVER_H = 1.0d0/D(I)
DO J = 1, L
P(J) = DDOT(LLOC,V(1,J),1,A(1,I),1)
END DO
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(P, VTMP, L, 0, 0, nproc, pWrk, pSync_sta)
# elif defined __MPI
CALL MPI_ALLREDUCE(P,VTMP,L,MPI_DOUBLE_PRECISION,MPI_SUM,comm,IERR)
# endif
#else
vtmp(1:l) = p(1:l)
#endif
DO J = 1, L
call DAXPY(LLOC,-VTMP(J)*ONE_OVER_H,A(1,I),1,V(1,J),1)
END DO
END IF
#if defined __PARA
# if defined __SHMEM
call shmem_barrier_all
# endif
#endif
END DO
DO I = 1,N
U(I) = 0.0d0
IF(RI(I).eq.ME) then
U(I) = A(IS(I),I)
END IF
END DO
#if defined __PARA
# if defined __SHMEM
#if defined __ALTIX || defined __ORIGIN
d_tmp(1:n) = d(1:n)
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(U, d_tmp, N, 0, 0, nproc, pWrk, &
& pSync_sta)
#else
call shmem_barrier_all
CALL SHMEM_REAL8_SUM_TO_ALL(U, D, N, 0, 0, nproc, pWrk, pSync_sta)
#endif
call shmem_barrier_all
# elif defined __MPI
CALL MPI_ALLREDUCE(U,D,N,MPI_DOUBLE_PRECISION,MPI_SUM,comm,IERR)
# endif
#else
D(1:N) = U(1:N)
#endif
RETURN
END SUBROUTINE ptredv
!==----------------------------------------------==!
SUBROUTINE ptqliv( d, e, n, z, ldz, nrl )
!
! Modified QL algorithm for the CRAY T3E parallel machine:
! calculates the eigenvalues and eigenvectors of a matrix reduced to
! tridiagonal form by PTREDV.
!
! AUTHOR : Carlo Cavazzoni - SISSA 1997
! comments and suggestions to : cava@sissa.it
!
! REFERENCES :
!
! NUMERICAL RECIPES, THE ART OF SCIENTIFIC COMPUTING.
! W.H. PRESS, B.P. FLANNERY, S.A. TEUKOLSKY, AND W.T. VETTERLING,
! CAMBRIDGE UNIVERSITY PRESS, CAMBRIDGE.
!
! PARALLEL NUMERICAL ALGORITHMS,
! T.L. FREEMAN AND C.PHILLIPS,
! PRENTICE HALL INTERNATIONAL (1992).
!
! NOTE : the algorithm that finds the eigenvalues is not parallelized
! ( it scales as O(N^2) ); only the updating of the eigenvectors is
! parallelized, because it is the most costly part of the algorithm
! ( it scales as O(N^3) ).
! For large matrices, in practice, all the time is spent in the updating,
! which in this routine scales linearly with the number of processors;
! in fact it requires no communication at all.
!
!
! INPUTS :
!
! D(N) Diagonal elements of the tridiagonal matrix
! this vector is equal on all processors.
!
! E(N) Subdiagonal elements of the tridiagonal matrix
! this vector is equal on all processors.
!
! N DIMENSION OF THE GLOBAL MATRIX.
!
! NRL NUMBER OF ROWS OF Z BELONGING TO THE LOCAL PROCESSOR.
!
! LDZ LEADING DIMENSION OF MATRIX Z.
!
! Z(LDZ,N) Orthogonal transformation that tridiagonalizes the original
! matrix A.
! The rows of the matrix are distributed among processors
! with blocking factor 1.
! Example for NPROC = 4 :
! ROW | PE
! 1 | 0
! 2 | 1
! 3 | 2
! 4 | 3
! 5 | 0
! 6 | 1
! .. | ..
!
!
!
! OUTPUTS :
!
! Z(LDZ,N) EIGENVECTORS OF THE ORIGINAL MATRIX.
! The Jth column of Z contains the eigenvector associated
! with the Jth eigenvalue.
! The eigenvectors are scattered among processors (example for 4 PEs):
! eigenvector | PE
! elements |
! V(1) | 0
! V(2) | 1
! V(3) | 2
! V(4) | 3
! V(5) | 0
! V(6) | 1
! .... ..
!
! D(N) Eigenvalues of the original matrix,
! this vector is equal on all processors.
!
!
!
!
USE kinds
IMPLICIT NONE
INTEGER n,nrl,ldz
REAL(DP) d(n),e(n)
REAL(DP) z(ldz,n)
INTEGER FV_WORK_SIZE
INTEGER CSV_WORK_SIZE
PARAMETER ( FV_WORK_SIZE = 2000 )
PARAMETER ( CSV_WORK_SIZE = 2000 )
INTEGER i,iter,mk,k,l,m
REAL(DP) b,dd,f,g,p,r,c,s
REAL(DP) cv(CSV_WORK_SIZE)
REAL(DP) sv(CSV_WORK_SIZE)
REAL(DP) fv1(FV_WORK_SIZE)
REAL(DP) fv2(FV_WORK_SIZE)
save cv,sv,fv1,fv2,b,dd,f,g,p,r,c,s
save i,iter,mk,k,l,m
if(n.gt.CSV_WORK_SIZE) then
print *,' ptqli CSV_WORK_SIZE too small '
stop
end if
if(nrl.gt.FV_WORK_SIZE) then
print *,' ptqli FV_WORK_SIZE too small '
stop
end if
do l = 2,n
e(l-1) = e(l)
end do
do 15 l=1,n
iter=0
1 do m=l,n-1
dd = abs(d(m))+abs(d(m+1))
if ( abs(e(m))+dd .eq. dd ) goto 2
end do
m=n
2 if(m.ne.l)then
if(iter.eq.200) then
call errore(' tqli ',' too many iterations ', iter)
end if
iter=iter+1
g=(d(l+1)-d(l))/(2.0d0*e(l))
r=pythag(g,1.0d0)
g=d(m)-d(l)+e(l)/(g+sign(r,g))
s=1.0d0
c=1.0d0
p=0.0d0
do i=m-1,l,-1
f=s*e(i)
b=c*e(i)
r=pythag(f,g)
e(i+1)=r
if(r.eq.0.0d0)then
d(i+1)=d(i+1)-p
e(m)=0.0d0
goto 1
endif
c=g/r
g=d(i+1)-p
s=f/r
r=(d(i)-g)*s+2.0d0*c*b
p=s*r
d(i+1)=g+p
g=c*r-b
! do k=1,nrl
! f=z(k,i+1)
! z(k,i+1)=s*z(k,i)+c*f
! z(k,i)=c*z(k,i)-s*f
! end do
cv(i) = c
sv(i) = s
end do
do i=m-1,l,-1
#if defined __T3E
!DIR$ UNROLL 12
#endif
do k=1,nrl
fv2(k) =z(k,i+1)
end do
#if defined __T3E
!DIR$ UNROLL 12
#endif
do k=1,nrl
fv1(k) =z(k,i)
end do
#if defined __T3E
!DIR$ UNROLL 12
#endif
do k=1,nrl
z(k,i+1) =sv(i)*fv1(k) + cv(i)*fv2(k)
z(k,i) =cv(i)*fv1(k) - sv(i)*fv2(k)
end do
end do
d(l)=d(l)-p
e(l)=g
e(m)=0.0d0
goto 1
endif
15 continue
return
END SUBROUTINE ptqliv
!==----------------------------------------------==!
SUBROUTINE peigsrtv(d,v,ldv,n,nrl)
!
! This routine sorts the eigenvalues and eigenvectors
! generated by PTREDV and PTQLIV.
!
! AUTHOR : Carlo Cavazzoni - SISSA 1997
! comments and suggestions to : cava@sissa.it
!
USE kinds
IMPLICIT NONE
INTEGER n,ldv,nrl
REAL(DP) d(n),v(ldv,n)
INTEGER i,j,k
REAL(DP) p
save i,j,k
save p
! REAL(DP) fv(nrl)
do 13 i=1,n-1
k=i
p=d(i)
do j=i+1,n
if(d(j).le.p)then
k=j
p=d(j)
endif
end do
if(k.ne.i)then
d(k)=d(i)
d(i)=p
!
! Exchange local elements of eigenvectors.
!
#if defined __T3E
!DIR$ UNROLL 12
#endif
do j=1,nrl
p=v(j,i)
v(j,i)=v(j,k)
v(j,k)=p
END DO
endif
13 continue
return
END SUBROUTINE peigsrtv
FUNCTION pythag(a,b)
USE kinds
IMPLICIT NONE
REAL(DP) a,b,pythag
REAL(DP) absa,absb
absa=abs(a)
absb=abs(b)
if(absa.gt.absb)then
pythag=absa*sqrt(1.0d0+(absb/absa)**2)
else
if(absb.eq.0.0d0)then
pythag=0.0d0
else
pythag=absb*sqrt(1.0d0+(absa/absb)**2)
endif
endif
return
END FUNCTION pythag
!
!-------------------------------------------------------------------------
SUBROUTINE diagonalize( iopt, a, d, ev, n, nproc, mpime, comm_in )
!-------------------------------------------------------------------------
!
! This subroutine calculates the eigenvalues and, optionally, the
! eigenvectors of a real symmetric matrix "a", distributed or
! replicated across processors
!
! iopt input, chooses the distribution of "a"
! iopt = 0 matrix "a" and "ev" are distributed across procs
! iopt = 1 matrix "a" and "ev" are replicated across procs
! a(:,:) input, matrix to be diagonalized, overwritten on output
! d(:) output, array of the eigenvalues
! ev(:,:) output, matrix of the eigenvectors
! n input, dimension of matrix a
! nproc input, number of processors
! mpime input, index of this processor (starting from 0!)
!
! When iopt = 0 the matrix "a" should have the following layout:
!
! A(NRL,N) Local part of the global matrix A(N,N) to be reduced,
! The rows of the matrix are cyclically distributed among
! processors with blocking factor 1.
! Example for NPROC = 4 :
! ROW | PE
! 1 | 0
! 2 | 1
! 3 | 2
! 4 | 3
! 5 | 0
! 6 | 1
! .. | ..
!
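! A minimal usage sketch (the names h, eig and vec are only
! illustrative):
!
!    REAL(DP) :: h(n,n), eig(n), vec(n,n)
!    ! h replicated on every processor of the communicator:
!    CALL diagonalize( 1, h, eig, vec, n, nproc, mpime, comm )
!    ! eig and vec are then replicated as well; with iopt = 0 both
!    ! "a" and "ev" would instead hold only the locally owned rows.
!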
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: iopt, n, nproc, mpime
REAL(DP), INTENT(OUT) :: d(n)
REAL(DP), INTENT(INOUT) :: a(n,n)
REAL(DP), INTENT(OUT) :: ev(n,n)
INTEGER, OPTIONAL, INTENT(IN) :: comm_in
!
REAL(DP), ALLOCATABLE :: aloc(:,:)
REAL(DP), ALLOCATABLE :: evloc(:,:)
REAL(DP), ALLOCATABLE :: e(:)
INTEGER :: nrl, i, j, jl, ierr, comm
!
!
IF ( PRESENT( comm_in ) ) THEN
!
comm = comm_in
!
#ifdef __MPI
ELSE
!
comm = MPI_COMM_WORLD
!
#endif
END IF
!
nrl = n / nproc
!
IF ( mpime < MOD( n, nproc ) ) nrl = nrl + 1
!
ALLOCATE( evloc( nrl, n ) )
ALLOCATE( e( n ) )
!
IF ( iopt == 1 ) THEN
!
ALLOCATE( aloc( nrl, n ) )
!
DO i = 1, n
DO jl = 1, nrl
aloc( jl, i ) = a( (jl-1)*nproc + mpime + 1, i )
END DO
END DO
!
CALL ptredv( aloc, nrl, d, e, evloc, nrl, nrl, n, nproc, mpime, comm )
!
DEALLOCATE( aloc )
!
ELSE
!
CALL ptredv( a, nrl, d, e, evloc, nrl, nrl, n, nproc, mpime, comm )
!
END IF
!
CALL ptqliv( d, e, n, evloc, nrl, nrl )
CALL peigsrtv( d, evloc, nrl, n, nrl )
!
IF ( iopt == 1 ) THEN
!
DO i = 1,n
DO j = 1,n
a(j,i) = 0.D0
END DO
DO jl = 1,nrl
a((jl-1)*nproc + mpime + 1,i) = evloc(jl,i)
END DO
END DO
!
#if defined (__PARA)
# if defined (__MPI)
CALL MPI_ALLREDUCE( a, ev, n*n, &
MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr )
# endif
#else
ev = a
#endif
!
ELSE
!
DO i = 1, n
DO jl = 1, nrl
ev(jl,i) = evloc(jl,i)
END DO
END DO
!
END IF
!
DEALLOCATE( evloc )
DEALLOCATE( e )
!
RETURN
!
END SUBROUTINE diagonalize
!==----------------------------------------------==!
SUBROUTINE pdspev_drv( jobz, ap, lda, w, z, ldz, &
nrl, n, nproc, mpime, comm_in )
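!
!  Parallel driver: reduces the cyclically row-distributed symmetric
!  matrix AP to tridiagonal form (ptredv), diagonalizes the tridiagonal
!  matrix (ptqliv) and sorts the spectrum (peigsrtv).  On exit W holds
!  the eigenvalues, replicated on all processors, while the columns of
!  Z hold the eigenvectors, distributed by rows in the same way as AP.
!  When compiled with MPI and comm_in is absent, MPI_COMM_WORLD is used.
!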
USE kinds
IMPLICIT NONE
CHARACTER :: JOBZ
INTEGER :: lda, ldz, nrl, n, nproc, mpime
INTEGER, OPTIONAL, INTENT(IN) :: comm_in
REAL(DP) :: ap( lda, * ), w( * ), z( ldz, * )
REAL(DP) :: sd( n )
INTEGER :: comm
!
IF ( PRESENT( comm_in ) ) THEN
comm = comm_in
#ifdef __MPI
ELSE
comm = MPI_COMM_WORLD
#endif
END IF
CALL ptredv(ap, lda, w, sd, z, ldz, nrl, n, nproc, mpime, comm)
CALL ptqliv(w, sd, n, z, ldz, nrl)
CALL peigsrtv(w, z, ldz, n, nrl)
RETURN
END SUBROUTINE pdspev_drv
!==----------------------------------------------==!
SUBROUTINE dspev_drv( JOBZ, UPLO, N, AP, W, Z, LDZ )
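!
!  Serial driver: wrapper around the symmetric packed-storage
!  eigensolver.  On AIX the ESSL DSPEV is called, with an integer IOPT
!  encoding JOBZ/UPLO; otherwise the LAPACK DSPEV with the standard
!  character arguments is used.
!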
USE kinds
IMPLICIT NONE
CHARACTER :: JOBZ, UPLO
INTEGER :: IOPT, INFO, LDZ, N
REAL(DP) :: AP( * ), W( * ), Z( LDZ, * )
REAL(DP), ALLOCATABLE :: WORK(:)
ALLOCATE( work( 3*n ) )
#if defined __AIX
IOPT = 0
IF((JOBZ .EQ. 'V') .OR. (JOBZ .EQ. 'v') ) iopt = iopt + 1
IF((UPLO .EQ. 'U') .OR. (UPLO .EQ. 'u') ) iopt = iopt + 20
CALL DSPEV(IOPT, ap, w, z, ldz, n, work, 3*n)
#else
CALL DSPEV(jobz, uplo, n, ap(1), w(1), z(1,1), ldz, work, INFO)
IF( info .NE. 0 ) THEN
CALL errore( ' dspev_drv ', ' diagonalization failed ',info )
END IF
#endif
DEALLOCATE( work )
RETURN
END SUBROUTINE dspev_drv
!-------------------------------------------------------------------------
SUBROUTINE cdiagonalize( iopt, a, d, ev, n, nproc, mpime, comm_in )
!-------------------------------------------------------------------------
!
! ... this routine calls the appropriate Lapack routine for diagonalizing
! ... a complex Hermitian matrix
!
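! ... iopt = 1 : "a" and "ev" replicated on all processors
! ... iopt = 0 : "a" and "ev" hold only the locally owned rows,
! ...            cyclically distributed as in "diagonalize"
! ... For nproc == 1 or n < nproc the serial packed-storage solver
! ... ZHPEV is used, otherwise the parallel path pzhptrd -> pzupgtr ->
! ... pzsteqr.
!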
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: iopt, n, nproc, mpime
REAL(DP), INTENT(OUT) :: d(n)
COMPLEX(DP), INTENT(INOUT) :: a(n,n)
COMPLEX(DP), INTENT(OUT) :: ev(n,n)
INTEGER, OPTIONAL, INTENT(IN) :: comm_in
!
INTEGER :: i, j, k, nrl, comm, ierr
INTEGER :: info = 0
!
COMPLEX(DP), ALLOCATABLE :: aloc(:)
COMPLEX(DP), ALLOCATABLE :: ap(:,:)
COMPLEX(DP), ALLOCATABLE :: vp(:,:)
REAL(DP), ALLOCATABLE :: rwork(:)
COMPLEX(DP), ALLOCATABLE :: cwork(:)
!
!
IF ( PRESENT( comm_in ) ) THEN
!
comm = comm_in
!
#ifdef __MPI
ELSE
!
comm = MPI_COMM_WORLD
!
#endif
END IF
!
IF ( ( nproc == 1 ) .OR. ( n < nproc ) ) THEN
!
! ... here use the standard scalar lapack drivers
!
ALLOCATE( aloc( n*(n+1)/2 ) )
ALLOCATE( rwork( 4*n ) )
ALLOCATE( cwork( 4*n ) )
!
! ... copy the lower-diagonal part of the matrix according to the
! ... Lapack packed storage scheme for Hermitian matrices
!
k = 0
DO j = 1, n
DO i = j, n
k = k + 1
aloc(k) = a(i,j)
END DO
END DO
!
! ... call the Lapack routine
!
#if defined (__AIX)
!
! ... ESSL convention: IOPT = 1 computes eigenvalues and eigenvectors
! ... (do not overwrite the INTENT(IN) dummy argument iopt)
!
CALL ZHPEV( 1, aloc, d, ev, n, n, cwork, 4*n )
!
#else
CALL ZHPEV( 'V', 'L', n, aloc, d, ev, n, cwork, rwork, info )
#endif
!
DEALLOCATE( cwork )
DEALLOCATE( rwork )
DEALLOCATE( aloc )
!
IF ( info /= 0 ) &
CALL errore( 'cdiagonalize', 'info', ABS( info ) )
!
ELSE
!
nrl = n / nproc
!
IF ( mpime < MOD( n, nproc ) ) nrl = nrl + 1
!
ALLOCATE( vp( nrl, n ) )
ALLOCATE( rwork( n ) )
ALLOCATE( cwork( n ) )
!
IF ( iopt == 1 ) THEN
!
ALLOCATE( ap( nrl, n ) )
!
DO j = 1, n
DO i = 1, nrl
ap(i,j) = a(mpime+(i-1)*nproc+1,j)
END DO
END DO
!
CALL pzhptrd( n, nrl, ap, nrl, d, rwork, cwork, nproc, mpime, comm )
CALL pzupgtr( n, nrl, ap, nrl, cwork, vp, nrl, nproc, mpime, comm)
!
DEALLOCATE( ap )
!
ELSE
!
CALL pzhptrd( n, nrl, a, nrl, d, rwork, cwork, nproc, mpime, comm )
CALL pzupgtr( n, nrl, a, nrl, cwork, vp, nrl, nproc, mpime, comm )
!
END IF
!
CALL pzsteqr( 'V', n, nrl, d, rwork, vp, nrl, nproc, mpime, comm )
!
IF ( iopt == 1 ) THEN
!
DO j = 1,n
DO i = 1,n
a(i,j) = 0.D0
END DO
DO i = 1, nrl
a((i-1)*nproc+mpime+1,j) = vp(i,j)
END DO
END DO
!
#if defined (__PARA)
# if defined (__MPI)
CALL MPI_ALLREDUCE( a, ev, 2*n*n, &
MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr )
# endif
#else
ev(:,:) = a(:,:)
#endif
ELSE
!
DO j = 1,n
DO i = 1, nrl
ev(i,j) = vp(i,j)
END DO
END DO
!
END IF
!
DEALLOCATE( vp )
DEALLOCATE( rwork )
DEALLOCATE( cwork )
!
END IF
!
RETURN
!
END SUBROUTINE cdiagonalize
!
!-------------------------------------------------------------------------
SUBROUTINE pzhptrd( n, nrl, ap, lda, d, e, tau, nproc, me, comm )
!-------------------------------------------------------------------------
!
! Parallel MPI version of the LAPACK routine ZHPTRD
!
! Carlo Cavazzoni (carlo.cavazzoni@cineca.it) -- CINECA
! December 12, 1999
!
! REFERENCES :
!
! NUMERICAL RECIPES, THE ART OF SCIENTIFIC COMPUTING.
! W.H. PRESS, B.P. FLANNERY, S.A. TEUKOLSKY, AND W.T. VETTERLING,
! CAMBRIDGE UNIVERSITY PRESS, CAMBRIDGE.
!
! PARALLEL NUMERICAL ALGORITHMS,
! T.L. FREEMAN AND C.PHILLIPS,
! PRENTICE HALL INTERNATIONAL (1992).
!
! LAPACK routine (version 2.0) --
! Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
! Courant Institute, Argonne National Lab, and Rice University
!
IMPLICIT NONE
! .. Scalar Arguments ..
INTEGER LDA, N, NRL, NPROC, ME, comm
! ..
! .. Array Arguments ..
REAL(DP) D( * ), E( * )
COMPLEX(DP) AP(LDA, * ), TAU( * )
! ..
!
! Purpose
! =======
!
! PZHPTRD reduces a complex Hermitian distributed matrix AP to
! real symmetric tridiagonal form T by a unitary similarity
! transformation: Q**H * A * Q = T.
!
! Arguments
! =========
!
! N (input) INTEGER
! The order of the global matrix AP. N >= 0.
!
! NRL (input) INTEGER
! The number of local rows of the matrix AP. NRL >= 0.
!
! AP (input/output) COMPLEX(DP) array, dimension (LDA,N)
! On entry, the Hermitian matrix AP.
! The rows of the matrix are distributed among processors
! with blocking factor 1.
! Example for NPROC = 4 :
! ROW | PE
! 1 | 0
! 2 | 1
! 3 | 2
! 4 | 3
! 5 | 0
! 6 | 1
! .. | ..
! On exit, the diagonal and first subdiagonal
! of A are overwritten by the corresponding elements of the
! tridiagonal matrix T, and the elements below the first
! subdiagonal, with the array TAU, represent the unitary
! matrix Q as a product of elementary reflectors;
!
! LDA (input) INTEGER
! Leading dimension of the local matrix AP, LDA > NRL
!
! D (output) DOUBLE PRECISION array, dimension (N)
! The diagonal elements of the tridiagonal matrix T:
! D(i) = AP(i,i).
!
! E (output) DOUBLE PRECISION array, dimension (N-1)
! The off-diagonal elements of the tridiagonal matrix T:
! E(i) = A(i+1,i)
!
! TAU (output) COMPLEX(DP) array, dimension (N-1)
! The scalar factors of the elementary reflectors (see Further
! Details).
!
! NPROC (input) INTEGER
! Number of processors
!
! ME (input) INTEGER
! Index of the local processor ( 0, 1, 2, ..., NPROC-1 )
!
! Further Details
! ===============
!
! the matrix Q is represented as a product of elementary
! reflectors
!
! Q = H(1) H(2) . . . H(n-1).
!
! Each H(i) has the form
!
! H(i) = I - tau * v * v'
!
! where tau is a complex scalar, and v is a complex vector with
! v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in AP,
! overwriting A(i+2:n,i), and tau is stored in TAU(i).
!
! =====================================================================
!
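!  Note on the index arrays set up below: OW(I) = MOD(I-1,NPROC) is the
!  processor owning global row I; IL(I) is the local index of row I when
!  OW(I) == ME, while for non-local rows IL(I) is the number of locally
!  owned rows with global index smaller than I, so that IL(I)+1 is the
!  local index of the first locally owned row past I (this is how
!  IL(I+1)+1 is used below).
!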
! .. Parameters ..
COMPLEX(DP) ONE, ZERO, HALF
PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ),ZERO = ( 0.0D+0, 0.0D+0 ), &
& HALF = ( 0.5D+0, 0.0D+0 ) )
REAL(DP) RONE, RZERO
PARAMETER ( RONE = 1.0D+0, RZERO = 0.0D+0 )
INTEGER QI
INTEGER IL(N+1)
INTEGER OW(N+1)
COMPLEX(DP) CTMP
COMPLEX(DP) CTMPV(N+1)
COMPLEX(DP) TAUL(N+1)
COMPLEX(DP) APKI(N+1)
REAL(DP) TMP
REAL(DP) TMPV(N+1)
! ..
! .. Local Scalars ..
INTEGER J, I, I1, K, I2, NI1, JL
INTEGER KL, J1
COMPLEX(DP) ALPHA, TAUI
INTEGER KNT, IERR
REAL(DP) ALPHI, ALPHR, BETA, RSAFMN, SAFMIN, XNORM
! ..
! .. External Subroutines ..
EXTERNAL ZAXPY
EXTERNAL ZDSCAL, ZSCAL
! ..
! .. External Functions ..
COMPLEX(DP) ZDOTC
EXTERNAL ZDOTC
REAL(DP) DLAMCH, DLAPY3, DZNRM2
COMPLEX(DP) ZLADIV
EXTERNAL DLAMCH, DLAPY3, DZNRM2, ZLADIV
! ..
! .. Intrinsic Functions ..
INTRINSIC DABS, DBLE, AIMAG, SIGN
! cmplx removed because preprocessed
!
! .. Executable Statements ..
!
! Quick return if possible
!
IF(N.LE.0) THEN
RETURN
END IF
DO I = 1,N+1
QI = (I-1)/NPROC
OW(I) = MOD((I-1),NPROC)
IF(ME .le. OW(I) ) then
IL(I) = QI + 1
ELSE
IL(I) = QI
END IF
END DO
!
! Reduce the lower triangle of A.
!
IF (OW(1).EQ.ME) THEN
AP( IL(1), 1 ) = DBLE( AP( IL(1), 1 ) )
END IF
DO I = 1, N - 1
!
! Generate elementary reflector H(i) = I - tau * v * v'
! to annihilate A(i+2:n,i)
!
IF (OW(I+1).EQ.ME) THEN
ALPHA = AP( IL(I+1), I )
END IF
#if defined __PARA
# if defined __MPI
CALL MPI_BCAST(ALPHA,1,MPI_DOUBLE_COMPLEX,OW(I+1),comm,IERR)
# endif
#endif
IF( (N-I).LE.0 ) THEN
TAUI = RZERO
ELSE
IF(OW(I+2).EQ.ME) THEN
I2 = IL(I+2)
ELSE
I2 = IL(I+2) + 1 ! I+2
ENDIF
NI1 = NRL - I2 + 1 ! N-I-1
IF((N-I-1).GT.0) THEN
XNORM = DZNRM2( NI1, AP( I2, I ), 1 )
#if defined __PARA
XNORM = XNORM ** 2
# if defined __MPI
CALL MPI_ALLREDUCE(XNORM,TMP,1,MPI_DOUBLE_PRECISION, &
& MPI_SUM, comm,IERR)
# endif
XNORM = SQRT(TMP)
#endif
ELSE
XNORM = 0.0D0
ENDIF
ALPHR = DBLE( ALPHA )
ALPHI = AIMAG( ALPHA )
IF( XNORM.EQ.RZERO .AND. ALPHI.EQ.RZERO ) THEN
TAUI = RZERO
ELSE
BETA = -SIGN( DLAPY3( ALPHR, ALPHI, XNORM ), ALPHR )
SAFMIN = DLAMCH( 'S' ) / DLAMCH( 'E' )
RSAFMN = RONE / SAFMIN
IF( DABS( BETA ).LT.SAFMIN ) THEN
KNT = 0
10 CONTINUE
KNT = KNT + 1
IF(NI1.GT.0) THEN
CALL ZDSCAL( NI1, RSAFMN, AP( I2, I ), 1 )
ENDIF
BETA = BETA*RSAFMN
ALPHI = ALPHI*RSAFMN
ALPHR = ALPHR*RSAFMN
IF( DABS( BETA ).LT.SAFMIN ) GO TO 10
IF((N-I-1).GT.0) THEN
XNORM = DZNRM2( NI1, AP( I2, I ), 1 )
#if defined __PARA
XNORM = XNORM ** 2
# if defined __MPI
CALL MPI_ALLREDUCE(XNORM,TMP,1,MPI_DOUBLE_PRECISION, &
& MPI_SUM, comm,IERR)
# endif
XNORM = SQRT(TMP)
#endif
ELSE
XNORM = 0.0D0
ENDIF
ALPHA = CMPLX( ALPHR, ALPHI )
BETA = -SIGN( DLAPY3( ALPHR, ALPHI, XNORM ), ALPHR )
TAUI = CMPLX( (BETA-ALPHR)/BETA, -ALPHI/BETA )
ALPHA = ZLADIV( ONE, ALPHA-BETA )
IF(NI1.GT.0) THEN
CALL ZSCAL( NI1, ALPHA, AP( I2, I ), 1 )
ENDIF
ALPHA = BETA
DO J = 1, KNT
ALPHA = ALPHA*SAFMIN
END DO
ELSE
TAUI = CMPLX( (BETA-ALPHR)/BETA, -ALPHI/BETA )
ALPHA = ZLADIV( ONE, ALPHA-BETA )
IF(NI1.GT.0) THEN
CALL ZSCAL( NI1, ALPHA, AP( I2, I ), 1 )
ENDIF
ALPHA = BETA
END IF
END IF
ENDIF
!
E( I ) = ALPHA
!
IF( TAUI.NE.ZERO ) THEN
!
! Apply H(i) from both sides to A(i+1:n,i+1:n)
!
! ... AP( I+1, I ) = ONE
IF (OW(I+1).EQ.ME) THEN
AP( IL(I+1), I ) = ONE
END IF
!
! Compute y := tau * A * v storing y in TAU(i:n-1)
!
! ... broadcast A(K,I)
IF(OW(I+1).EQ.ME) THEN
I1 = IL(I+1)
ELSE
I1 = IL(I+1) + 1 ! I+2
ENDIF
#if defined __PARA
DO J = I+1, N
CTMPV(J) = ZERO
END DO
DO JL = I1, NRL
J = ME + (JL-1)*NPROC + 1
CTMPV(J) = AP(JL,I)
END DO
# if defined __MPI
CALL MPI_ALLREDUCE(CTMPV(I+1),APKI(I+1),N-I, &
& MPI_DOUBLE_COMPLEX, MPI_SUM, comm,IERR)
# endif
#else
DO J = I+1,N
APKI(J) = AP(J,I)
ENDDO
#endif
DO J = I+1, N+1
TAU(J-1) = ZERO
END DO
DO JL = I1, NRL
J = ME + (JL-1)*NPROC + 1
TAU(J-1) = ZERO
DO K = I+1, J
TAU(J-1) = TAU(J-1) + TAUI * AP(JL,K) * APKI(K)
END DO
END DO
DO J = I+1, N
IF(OW(J+1).EQ.ME) THEN
J1 = IL(J+1)
ELSE
J1 = IL(J+1) + 1 ! I+2
ENDIF
DO KL = J1, NRL
K = ME + (KL-1)*NPROC + 1
TAU(J-1) = TAU(J-1) + TAUI * CONJG(AP(KL,J)) * APKI(K)
END DO
END DO
#if defined __PARA
! ... parallel sum TAU
# if defined __MPI
CALL MPI_ALLREDUCE(TAU(I),CTMPV(I),N-I+1, &
& MPI_DOUBLE_COMPLEX,MPI_SUM,comm,IERR)
# endif
DO J = I, N
TAU(J) = CTMPV(J)
END DO
#endif
!
! Compute w := y - 1/2 * tau * (y'*v) * v
!
! ... ALPHA = -HALF*TAUI*ZDOTC(N-I,TAU(I),1,AP(I+1,I),1)
JL = 1
DO J = I, N
IF(OW(J+1).EQ.ME) THEN
TAUL(JL) = TAU(J)
JL = JL + 1
END IF
END DO
IF(OW(I+1).EQ.ME) THEN
I1 = IL(I+1)
ELSE
I1 = IL(I+1) + 1 ! I+1
ENDIF
NI1 = NRL - I1 + 1 ! N-I
ALPHA = -HALF*TAUI*ZDOTC(NI1,TAUL(1),1,AP(I1,I),1)
#if defined __PARA
# if defined __MPI
CALL MPI_ALLREDUCE(ALPHA,CTMP,1, &
& MPI_DOUBLE_COMPLEX, MPI_SUM, comm,IERR)
# endif
ALPHA = CTMP
#endif
#if defined __PARA
CALL ZAXPY(NI1,ALPHA,AP(I1,I),1,TAUL(1),1)
JL = 1
DO J = I, N
CTMPV(J) = ZERO
IF(OW(J+1).EQ.ME) THEN
CTMPV(J) = TAUL(JL)
JL = JL + 1
END IF
END DO
# if defined __MPI
CALL MPI_ALLREDUCE(CTMPV(I),TAU(I),N-I+1, &
& MPI_DOUBLE_COMPLEX, MPI_SUM, comm,IERR)
# endif
#else
CALL ZAXPY(N-I,ALPHA,AP(I+1,I),1,TAU(I),1)
#endif
!
! Apply the transformation as a rank-2 update:
! A := A - v * w' - w * v'
!
! ... broadcast A(K,I)
IF(OW(I+1).EQ.ME) THEN
I1 = IL(I+1)
ELSE
I1 = IL(I+1) + 1 ! I+2
ENDIF
#if defined __PARA
DO J = I+1, N
CTMPV(J) = ZERO
END DO
DO JL = I1, NRL
J = ME + (JL-1)*NPROC + 1
CTMPV(J) = AP(JL,I)
END DO
# if defined __MPI
CALL MPI_ALLREDUCE(CTMPV(I+1),APKI(I+1),N-I, &
& MPI_DOUBLE_COMPLEX, MPI_SUM, comm,IERR)
# endif
#else
DO J = I+1, N
APKI(J) = AP(J,I)
END DO
#endif
DO K = I+1,N
DO JL = I1,NRL
J = ME + (JL-1)*NPROC + 1
AP(JL,K) = AP(JL,K) - ONE * AP(JL,I) * CONJG(TAU(K-1)) - &
& CONJG(ONE) * TAU(J-1) * CONJG(APKI(K))
END DO
END DO
!
END IF
IF(OW(I+1).EQ.ME) THEN
AP(IL(I+1),I) = E( I )
END IF
IF(OW(I).EQ.ME) THEN
D( I ) = DBLE(AP( IL(I),I ))
END IF
#if defined __PARA
# if defined __MPI
CALL MPI_BCAST(D(I),1,MPI_DOUBLE_PRECISION,OW(I), comm,IERR)
# endif
#endif
TAU( I ) = TAUI
END DO
IF(OW(I).EQ.ME) THEN
D( N ) = DBLE(AP( IL(I),I ))
END IF
#if defined __PARA
# if defined __MPI
CALL MPI_BCAST(D(N),1,MPI_DOUBLE_PRECISION,OW(I), comm,IERR)
# endif
#endif
!
RETURN
!
! End of ZHPTRD
!
END SUBROUTINE pzhptrd
!==----------------------------------------------==!
SUBROUTINE pzupgtr( n, nrl, ap, lda, tau, q, ldq, nproc, me, comm)
!
! Parallel MPI version of the LAPACK routine ZUPGTR
!
! Carlo Cavazzoni (carlo.cavazzoni@cineca.it) -- CINECA
! December 12, 1999
!
! REFERENCES :
!
! NUMERICAL RECIPES, THE ART OF SCIENTIFIC COMPUTING.
! W.H. PRESS, B.P. FLANNERY, S.A. TEUKOLSKY, AND W.T. VETTERLING,
! CAMBRIDGE UNIVERSITY PRESS, CAMBRIDGE.
!
! PARALLEL NUMERICAL ALGORITHMS,
! T.L. FREEMAN AND C.PHILLIPS,
! PRENTICE HALL INTERNATIONAL (1992).
!
! LAPACK routine (version 2.0) --
! Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
! Courant Institute, Argonne National Lab, and Rice University
USE kinds
IMPLICIT NONE
!
! .. Scalar Arguments ..
INTEGER INFO, LDQ, N, LDA, NRL, NPROC, ME, comm
! ..
! .. Array Arguments ..
COMPLEX(DP) AP(LDA, * ), Q( LDQ, * ), TAU( * )
! ..
!
! Purpose
! =======
!
! PZUPGTR generates a complex unitary matrix Q which is defined as the
! product of n-1 elementary reflectors H(i) of order n, as returned by
! PZHPTRD :
!
! Q = H(1) H(2) . . . H(n-1).
!
! Arguments
! =========
!
! N (input) INTEGER
! The order of the global matrix AP. N >= 0.
!
! NRL (input) INTEGER
! The number of local rows of the matrix AP. NRL >= 0.
!
! AP (input) COMPLEX(DP) array, dimension (LDA,N)
! The vectors which define the elementary reflectors, as
! returned by PZHPTRD.
! The rows of the matrix are distributed among processors
! with blocking factor 1.
! Example for NPROC = 4 :
! ROW | PE
! 1 | 0
! 2 | 1
! 3 | 2
! 4 | 3
! 5 | 0
! 6 | 1
! .. | ..
!
! LDA (input) INTEGER
! Leading dimension of the local matrix AP, LDA > NRL
!
! TAU (input) COMPLEX(DP) array, dimension (N-1)
! TAU(i) must contain the scalar factor of the elementary
! reflector H(i), as returned by PZHPTRD.
!
! Q (output) COMPLEX(DP) array, dimension (LDQ,N)
! The N-by-N unitary matrix Q.
! The rows of the matrix are distributed among processors
! in the same way as the matrix AP
!
! LDQ (input) INTEGER
! The leading dimension of the array Q. LDQ >= max(1,NRL).
!
! NPROC (input) INTEGER
! Number of processors
!
! ME (input) INTEGER
! Index of the local processor ( 0, 1, 2, ..., NPROC-1 )
!
! =====================================================================
!
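!  Note: Q(2:N,2:N) is accumulated by applying the reflectors in
!  reverse order, I = N-1, ..., 1; each H(I) updates only columns
!  I+1:N and only the locally owned rows, so the only communication
!  per step is the allreduce of w = C'*v.
!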
! .. Parameters ..
COMPLEX(DP) ONE, ZERO
PARAMETER ( ONE = (1.0D+0,0.0D+0), ZERO = (0.0D+0,0.0D+0) )
INTEGER QI
INTEGER IL(N+1)
INTEGER OW(N+1)
COMPLEX(DP) CTMP
COMPLEX(DP) CTMPV(N+1)
COMPLEX(DP) WORK(N+1)
! ..
! .. Local Scalars ..
INTEGER I, IINFO, J, K, JL, KL, J1, I1, I2, NI1, L, IERR
! ..
! .. Executable Statements ..
!
! Test the input arguments
!
! Quick return if possible
!
IF( N.EQ.0 ) THEN
RETURN
END IF
!
DO I = 1,N+1
QI = (I-1)/NPROC
OW(I) = MOD((I-1),NPROC)
IF(ME .le. OW(I) ) then
IL(I) = QI + 1
ELSE
IL(I) = QI
END IF
END DO
!
! Unpack the vectors which define the elementary reflectors and
! set the first row and column of Q equal to those of the unit
! matrix
!
IF(OW(1).EQ.ME) THEN
Q( IL(1), 1 ) = ONE
DO KL = 2, NRL
Q( KL, 1 ) = ZERO
END DO
DO J = 2, N
Q( IL(1), J ) = ZERO
END DO
ELSE
DO KL = 1, NRL
Q( KL, 1 ) = ZERO
END DO
ENDIF
DO J = 2, N
IF(OW(J+1).EQ.ME) THEN
J1 = IL(J+1)
ELSE
J1 = IL(J+1) + 1
ENDIF
DO KL = J1, NRL
Q( KL, J ) = AP( KL, J-1 )
END DO
END DO
IF( N.GT.1 ) THEN
!
! Generate Q(2:n,2:n)
!
DO I = N-1, 1, -1
!
! Apply H(i) to A(i:m,i:n) from the left
!
IF( I.LT.(N-1) ) THEN
IF(OW(I+1).EQ.ME) THEN
Q( IL(I+1), I+1 ) = ONE
END IF
!
! Form H * C
!
IF( TAU(I).NE.ZERO ) THEN
!
! w := C' * v
!
IF(OW(I+1).EQ.ME) THEN
I1 = IL(I+1)
ELSE
I1 = IL(I+1) + 1
ENDIF
DO J = 1, N-1-I
CTMP = ZERO
DO KL = I1, NRL
CTMP = CTMP + CONJG( Q( KL, J+I+1 ) ) * Q( KL,I+1 )
END DO
WORK(J) = CTMP
END DO
#if defined __PARA
# if defined __MPI
CALL MPI_ALLREDUCE(WORK,CTMPV,N-1-I,MPI_DOUBLE_COMPLEX, &
& MPI_SUM, comm,IERR)
# endif
DO J = 1,N-1-I
WORK(J) = CTMPV(J)
END DO
#endif
!
! C := C - v * w'
!
DO J = 1, N-1-I
CTMP = -TAU(I) * CONJG( WORK( J ) )
DO KL = I1, NRL
Q( KL, J+I+1 ) = Q( KL, J+I+1 ) + CTMP * Q(KL, I+1)
END DO
END DO
END IF
END IF
IF( I.LT.(N-1) ) THEN
IF(OW(I+2).EQ.ME) THEN
I2 = IL(I+2) ! I+2
ELSE
I2 = IL(I+2) + 1 ! local ind. of the first element > I+2
ENDIF
NI1 = NRL - I2 + 1 ! N-I-1
CALL ZSCAL( NI1, -TAU( I ), Q( I2, I+1 ), 1 )
END IF
IF(OW(I+1).EQ.ME) THEN
Q( IL(I+1), I+1 ) = ONE - TAU( I )
END IF
!
! Set A(1:i-1,i) to zero
!
DO L = 1, I - 1
IF(OW(L+1).EQ.ME) THEN
Q( IL(L+1), I+1 ) = ZERO
END IF
END DO
END DO
END IF
RETURN
!
! End of ZUPGTR
!
END SUBROUTINE pzupgtr
!==----------------------------------------------==!
SUBROUTINE pzsteqr( compz, n, nrl, d, e, z, ldz, nproc, me, comm )
!
! Parallel MPI version of the LAPACK routine ZSTEQR
!
! Carlo Cavazzoni (carlo.cavazzoni@cineca.it) -- CINECA
! December 12, 1999
!
! REFERENCES :
!
! NUMERICAL RECIPES, THE ART OF SCIENTIFIC COMPUTING.
! W.H. PRESS, B.P. FLANNERY, S.A. TEUKOLSKY, AND W.T. VETTERLING,
! CAMBRIDGE UNIVERSITY PRESS, CAMBRIDGE.
!
! PARALLEL NUMERICAL ALGORITHMS,
! T.L. FREEMAN AND C.PHILLIPS,
! PRENTICE HALL INTERNATIONAL (1992).
!
! LAPACK routine (version 2.0) --
! Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
! Courant Institute, Argonne National Lab, and Rice University
!
USE kinds
IMPLICIT NONE
! .. Scalar Arguments ..
CHARACTER COMPZ
INTEGER LDZ, N, NRL, NPROC, ME, comm
! ..
! .. Array Arguments ..
REAL(DP) D( * ), E( * )
COMPLEX(DP) Z( LDZ, * )
! ..
!
! Purpose
! =======
!
! PZSTEQR computes all eigenvalues and, optionally, eigenvectors of a
! symmetric tridiagonal matrix using the implicit QL or QR method.
! The eigenvectors of a full or band complex Hermitian matrix can also
! be found if PZHPTRD has been used to reduce this
! matrix to tridiagonal form.
!
! Arguments
! =========
!
! COMPZ (input) CHARACTER*1
! = 'N': Compute eigenvalues only.
! = 'V': Compute eigenvalues and eigenvectors of the original
! Hermitian matrix. On entry, Z must contain the
! unitary matrix used to reduce the original matrix
! to tridiagonal form.
! = 'I': Compute eigenvalues and eigenvectors of the
! tridiagonal matrix. Z is initialized to the identity
! matrix.
!
! N (input) INTEGER
! The order of the global matrix AP. N >= 0.
!
! NRL (input) INTEGER
! The number of local rows of the matrix AP. NRL >= 0.
!
! D (input/output) DOUBLE PRECISION array, dimension (N)
! On entry, the diagonal elements of the tridiagonal matrix.
! On exit, if INFO = 0, the eigenvalues in ascending order.
!
! E (input/output) DOUBLE PRECISION array, dimension (N-1)
! On entry, the (n-1) subdiagonal elements of the tridiagonal
! matrix.
! On exit, E has been destroyed.
!
! Z (input/output) COMPLEX(DP) array, dimension (LDZ, N)
! On entry, if COMPZ = 'V', then Z contains the unitary
! matrix used in the reduction to tridiagonal form.
! On exit if COMPZ = 'V', Z contains the
! orthonormal eigenvectors of the original Hermitian matrix,
! and if COMPZ = 'I', Z contains the orthonormal eigenvectors
! of the symmetric tridiagonal matrix.
! If COMPZ = 'N', then Z is not referenced.
! The rows of the matrix are distributed among processors
! with blocking factor 1, i.e. for NPROC = 4 :
! ROW Index | Processor index owning the row
! 1 | 0
! 2 | 1
! 3 | 2
! 4 | 3
! 5 | 0
! 6 | 1
! .. | ..
!
! LDZ (input) INTEGER
! The leading dimension of the array Z. LDZ >= 1, and if
! eigenvectors are desired, then LDZ >= max(1,NRL).
!
! NPROC (input) INTEGER
! Number of processors
!
! ME (input) INTEGER
! Index of the local processor ( 0, 1, 2, ..., NPROC-1 )
!
! =====================================================================
!
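!  Note: the QL/QR iteration on the tridiagonal pair (D,E) is carried
!  out redundantly on every processor, while the accumulated rotations
!  are applied only to the NRL locally owned rows of Z, so this routine
!  performs no communication.
!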
! .. Parameters ..
REAL(DP) ZERO, ONE, TWO, THREE, CTEMP, STEMP
PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0, &
& THREE = 3.0D0 )
COMPLEX(DP) CZERO, CONE,ZTEMP
PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), CONE = ( 1.0D0, 0.0D0 ) )
INTEGER MAXIT
PARAMETER ( MAXIT = 30 )
! ..
INTEGER QI, KL, INFO
INTEGER IL(N+1)
INTEGER OW(N+1)
REAL(DP) WORK(2*N)
! .. Local Scalars ..
INTEGER I, ICOMPZ, II, ISCALE, J, JTOT, K, L, L1, LEND, &
& LENDM1, LENDP1, LENDSV, LM1, LSV, M, MM, MM1, &
& NM1, NMAXIT
REAL(DP) ANORM, B, C, EPS, EPS2, F, G, P, R, RT1, RT2, &
& S, SAFMAX, SAFMIN, SSFMAX, SSFMIN, TST
! ..
! .. External Functions ..
LOGICAL LSAME
REAL(DP) DLAMCH, DLANST, DLAPY2
EXTERNAL LSAME, DLAMCH, DLANST, DLAPY2
! ..
! .. External Subroutines ..
EXTERNAL DLAE2, DLAEV2, DLARTG, DLASCL, DLASRT, XERBLA, &
& ZLASET, ZLASR, ZSWAP
! ..
! .. Intrinsic Functions ..
INTRINSIC DABS, MAX, SIGN, SQRT
! ..
! .. Executable Statements ..
!
! Test the input parameters.
!
INFO = 0
!
IF( LSAME( COMPZ, 'N' ) ) THEN
ICOMPZ = 0
ELSE IF( LSAME( COMPZ, 'V' ) ) THEN
ICOMPZ = 1
ELSE IF( LSAME( COMPZ, 'I' ) ) THEN
ICOMPZ = 2
ELSE
ICOMPZ = -1
END IF
IF( ICOMPZ.LT.0 ) THEN
INFO = -1
ELSE IF( N.LT.0 ) THEN
INFO = -2
ELSE IF( (LDZ.LT.1) .OR. ( ICOMPZ.GT.0 .AND. LDZ.LT.MAX(1,NRL) ) ) THEN
INFO = -6
END IF
IF( INFO.NE.0 ) THEN
CALL XERBLA( 'ZSTEQR', -INFO )
RETURN
END IF
!
! Quick return if possible
!
IF(N.LE.0) THEN
RETURN
END IF
!
DO I = 1,N+1
QI = (I-1)/NPROC
OW(I) = MOD((I-1),NPROC)
IF(ME .le. OW(I) ) then
IL(I) = QI + 1
ELSE
IL(I) = QI
END IF
END DO
IF( N.EQ.1 ) THEN
IF( ICOMPZ.EQ.2 .AND. OW(1).EQ.ME ) Z( IL(1), 1 ) = CONE
RETURN
END IF
!
! Determine the unit roundoff and over/underflow thresholds.
!
EPS = DLAMCH( 'E' )
EPS2 = EPS**2
SAFMIN = DLAMCH( 'S' )
SAFMAX = ONE / SAFMIN
SSFMAX = SQRT( SAFMAX ) / THREE
SSFMIN = SQRT( SAFMIN ) / EPS2
!
! Compute the eigenvalues and eigenvectors of the tridiagonal
! matrix.
!
IF( ICOMPZ.EQ.2 ) THEN
CALL ZLASET( 'Full', NRL, N, CZERO, CZERO, Z, LDZ )
DO J = 1, N
IF(OW(J).EQ.ME) THEN
Z( IL(J), J ) = CONE
END IF
END DO
END IF
!
NMAXIT = N*MAXIT
JTOT = 0
!
! Determine where the matrix splits and choose QL or QR iteration
! for each block, according to whether top or bottom diagonal
! element is smaller.
!
L1 = 1
NM1 = N - 1
!
10 CONTINUE
IF( L1.GT.N ) GO TO 160
IF( L1.GT.1 ) E( L1-1 ) = ZERO
IF( L1.LE.NM1 ) THEN
DO 20 M = L1, NM1
TST = DABS( E( M ) )
IF( TST.EQ.ZERO ) GO TO 30
IF( TST.LE.( SQRT(DABS(D(M)))*SQRT(DABS(D(M+1))))*EPS ) THEN
E( M ) = ZERO
GO TO 30
END IF
20 CONTINUE
END IF
M = N
!
30 CONTINUE
L = L1
LSV = L
LEND = M
LENDSV = LEND
L1 = M + 1
IF( LEND.EQ.L ) GO TO 10
!
! Scale submatrix in rows and columns L to LEND
!
ANORM = DLANST( 'I', LEND-L+1, D( L ), E( L ) )
ISCALE = 0
IF( ANORM.EQ.ZERO ) GO TO 10
IF( ANORM.GT.SSFMAX ) THEN
ISCALE = 1
CALL DLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L+1, 1, D( L ), N, INFO )
CALL DLASCL( 'G', 0, 0, ANORM, SSFMAX, LEND-L, 1, E( L ), N, INFO )
ELSE IF( ANORM.LT.SSFMIN ) THEN
ISCALE = 2
CALL DLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L+1, 1, D( L ), N, INFO )
CALL DLASCL( 'G', 0, 0, ANORM, SSFMIN, LEND-L, 1, E( L ), N, INFO )
END IF
!
! Choose between QL and QR iteration
!
IF( DABS( D( LEND ) ).LT.DABS( D( L ) ) ) THEN
LEND = LSV
L = LENDSV
END IF
!
IF( LEND.GT.L ) THEN
!
! QL Iteration
!
! Look for small subdiagonal element.
!
40 CONTINUE
IF( L.NE.LEND ) THEN
LENDM1 = LEND - 1
DO 50 M = L, LENDM1
TST = DABS( E( M ) )**2
IF( TST.LE.( EPS2*DABS(D(M)) )*DABS(D(M+1))+ SAFMIN )GO TO 60
50 CONTINUE
END IF
!
M = LEND
!
60 CONTINUE
IF( M.LT.LEND ) E( M ) = ZERO
P = D( L )
IF( M.EQ.L ) GO TO 80
!
! If remaining matrix is 2-by-2, use DLAE2 or DLAEV2
! to compute its eigensystem.
!
IF( M.EQ.L+1 ) THEN
IF( ICOMPZ.GT.0 ) THEN
CALL DLAEV2( D( L ), E( L ), D( L+1 ), RT1, RT2, C, S )
WORK( L ) = C
WORK( N-1+L ) = S
CTEMP = WORK( L )
STEMP = WORK( N-1+L )
IF( ( CTEMP.NE.ONE ) .OR. ( STEMP.NE.ZERO ) ) THEN
DO KL = 1, NRL
ZTEMP = Z( KL, 1+L )
Z( KL, 1+L ) = CTEMP*ZTEMP - STEMP*Z( KL, L )
Z( KL, L ) = STEMP*ZTEMP + CTEMP*Z( KL, L )
END DO
END IF
ELSE
CALL DLAE2( D( L ), E( L ), D( L+1 ), RT1, RT2 )
END IF
D( L ) = RT1
D( L+1 ) = RT2
E( L ) = ZERO
L = L + 2
IF( L.LE.LEND ) GO TO 40
GO TO 140
END IF
!
IF( JTOT.EQ.NMAXIT ) GO TO 140
JTOT = JTOT + 1
!
! Form shift.
!
G = ( D( L+1 )-P ) / ( TWO*E( L ) )
R = DLAPY2( G, ONE )
G = D( M ) - P + ( E( L ) / ( G+SIGN( R, G ) ) )
!
S = ONE
C = ONE
P = ZERO
!
! Inner loop
!
MM1 = M - 1
DO 70 I = MM1, L, -1
F = S*E( I )
B = C*E( I )
CALL DLARTG( G, F, C, S, R )
IF( I.NE.M-1 ) E( I+1 ) = R
G = D( I+1 ) - P
R = ( D( I )-G )*S + TWO*C*B
P = S*R
D( I+1 ) = G + P
G = C*R - B
!
! If eigenvectors are desired, then save rotations.
!
IF( ICOMPZ.GT.0 ) THEN
WORK( I ) = C
WORK( N-1+I ) = -S
END IF
!
70 CONTINUE
!
! If eigenvectors are desired, then apply saved rotations.
!
IF( ICOMPZ.GT.0 ) THEN
DO J = M - L + 1 - 1, 1, -1
CTEMP = WORK( L + J -1)
STEMP = WORK( N-1+L + J-1)
IF( ( CTEMP.NE.ONE ) .OR. ( STEMP.NE.ZERO ) ) THEN
DO KL = 1, NRL
ZTEMP = Z( KL, J+1+L-1 )
Z( KL, J+1+L-1 ) = CTEMP*ZTEMP - STEMP*Z( KL, J+L-1 )
Z( KL, J+L-1 ) = STEMP*ZTEMP + CTEMP*Z( KL, J+L-1 )
END DO
END IF
END DO
END IF
!
D( L ) = D( L ) - P
E( L ) = G
GO TO 40
!
! Eigenvalue found.
!
80 CONTINUE
D( L ) = P
!
L = L + 1
IF( L.LE.LEND ) GO TO 40
GO TO 140
!
ELSE
!
! QR Iteration
!
! Look for small superdiagonal element.
!
90 CONTINUE
IF( L.NE.LEND ) THEN
LENDP1 = LEND + 1
DO 100 M = L, LENDP1, -1
TST = DABS( E( M-1 ) )**2
IF( TST.LE.(EPS2*DABS(D(M)))*DABS(D(M-1))+ SAFMIN )GO TO 110
100 CONTINUE
END IF
!
M = LEND
!
110 CONTINUE
IF( M.GT.LEND ) E( M-1 ) = ZERO
P = D( L )
IF( M.EQ.L ) GO TO 130
!
! If remaining matrix is 2-by-2, use DLAE2 or DLAEV2
! to compute its eigensystem.
!
IF( M.EQ.L-1 ) THEN
IF( ICOMPZ.GT.0 ) THEN
CALL DLAEV2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2, C, S )
WORK( M ) = C
WORK( N-1+M ) = S
CTEMP = WORK( M )
STEMP = WORK( N-1+M )
IF( ( CTEMP.NE.ONE ) .OR. ( STEMP.NE.ZERO ) ) THEN
DO KL = 1, NRL
ZTEMP = Z( KL, L)
Z( KL, L ) = CTEMP*ZTEMP - STEMP*Z( KL, L-1 )
Z( KL, L-1 ) = STEMP*ZTEMP + CTEMP*Z( KL, L-1 )
END DO
END IF
ELSE
CALL DLAE2( D( L-1 ), E( L-1 ), D( L ), RT1, RT2 )
END IF
D( L-1 ) = RT1
D( L ) = RT2
E( L-1 ) = ZERO
L = L - 2
IF( L.GE.LEND ) GO TO 90
GO TO 140
END IF
!
IF( JTOT.EQ.NMAXIT ) GO TO 140
JTOT = JTOT + 1
!
! Form shift.
!
G = ( D( L-1 )-P ) / ( TWO*E( L-1 ) )
R = DLAPY2( G, ONE )
G = D( M ) - P + ( E( L-1 ) / ( G+SIGN( R, G ) ) )
!
S = ONE
C = ONE
P = ZERO
!
! Inner loop
!
LM1 = L - 1
DO 120 I = M, LM1
F = S*E( I )
B = C*E( I )
CALL DLARTG( G, F, C, S, R )
IF( I.NE.M ) E( I-1 ) = R
G = D( I ) - P
R = ( D( I+1 )-G )*S + TWO*C*B
P = S*R
D( I ) = G + P
G = C*R - B
!
! If eigenvectors are desired, then save rotations.
!
IF( ICOMPZ.GT.0 ) THEN
WORK( I ) = C
WORK( N-1+I ) = S
END IF
!
120 CONTINUE
!
! If eigenvectors are desired, then apply saved rotations.
!
IF( ICOMPZ.GT.0 ) THEN
DO J = 1, L - M
CTEMP = WORK( M+J-1 )
STEMP = WORK( N-1+M+J-1 )
IF( ( CTEMP.NE.ONE ) .OR. ( STEMP.NE.ZERO ) ) THEN
DO KL = 1, NRL
ZTEMP = Z( KL, J+M )
Z( KL, J+M ) = CTEMP*ZTEMP - STEMP*Z(KL, J+M-1)
Z( KL, J+M-1 ) = STEMP*ZTEMP + CTEMP*Z(KL, J+M-1)
END DO
END IF
END DO
END IF
!
D( L ) = D( L ) - P
E( LM1 ) = G
GO TO 90
!
! Eigenvalue found.
!
130 CONTINUE
D( L ) = P
!
L = L - 1
IF( L.GE.LEND ) GO TO 90
GO TO 140
!
END IF
!
! Undo scaling if necessary
!
140 CONTINUE
IF( ISCALE.EQ.1 ) THEN
CALL DLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV+1, 1, &
& D( LSV ), N, INFO )
CALL DLASCL( 'G', 0, 0, SSFMAX, ANORM, LENDSV-LSV, 1, E( LSV ), &
& N, INFO )
ELSE IF( ISCALE.EQ.2 ) THEN
CALL DLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV+1, 1, &
& D( LSV ), N, INFO )
CALL DLASCL( 'G', 0, 0, SSFMIN, ANORM, LENDSV-LSV, 1, E( LSV ), &
& N, INFO )
END IF
!
! Check for no convergence to an eigenvalue after a total
! of N*MAXIT iterations.
!
IF( JTOT.EQ.NMAXIT ) THEN
DO 150 I = 1, N - 1
IF( E( I ).NE.ZERO ) INFO = INFO + 1
150 CONTINUE
RETURN
END IF
GO TO 10
!
! Order eigenvalues and eigenvectors.
!
160 CONTINUE
IF( ICOMPZ.EQ.0 ) THEN
!
! Use Quick Sort
!
CALL DLASRT( 'I', N, D, INFO )
!
ELSE
!
! Use Selection Sort to minimize swaps of eigenvectors
!
DO 180 II = 2, N
I = II - 1
K = I
P = D( I )
DO 170 J = II, N
IF( D( J ).LT.P ) THEN
K = J
P = D( J )
END IF
170 CONTINUE
IF( K.NE.I ) THEN
D( K ) = D( I )
D( I ) = P
CALL ZSWAP( NRL, Z( 1, I ), 1, Z( 1, K ), 1 )
END IF
180 CONTINUE
END IF
RETURN
!
! End of pzsteqr
!
END SUBROUTINE pzsteqr
!==----------------------------------------------==!
SUBROUTINE zhpev_drv( JOBZ, UPLO, N, AP, W, Z, LDZ )
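!
! Serial driver for the eigen-decomposition of a complex Hermitian
! matrix stored in packed form: on AIX the ESSL-style ZHPEV interface
! is used, otherwise the LAPACK ZHPEV routine is called.
!
! Illustrative sketch of a call (commented out; the array names ap, w
! and z are hypothetical and must be sized by the caller as
! ap(n*(n+1)/2), w(n), z(ldz,n)):
!
!    CALL zhpev_drv( 'V', 'L', n, ap, w, z, ldz )
!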
USE kinds
IMPLICIT NONE
CHARACTER :: JOBZ, UPLO
INTEGER :: IOPT, INFO, LDZ, N
COMPLEX(DP) :: AP( * ), Z( LDZ, * )
REAL(DP) :: W( * )
REAL(DP), ALLOCATABLE :: RWORK(:)
COMPLEX(DP), ALLOCATABLE :: ZWORK(:)
#if defined __AIX
IOPT = 0
IF((JOBZ .EQ. 'V') .OR. (JOBZ .EQ. 'v') ) iopt = iopt + 1
IF((UPLO .EQ. 'U') .OR. (UPLO .EQ. 'u') ) iopt = iopt + 20
ALLOCATE( rwork( 4*n ) )
CALL ZHPEV(IOPT, ap, w, z, ldz, n, rwork, 4*n)
DEALLOCATE( rwork )
#else
ALLOCATE( rwork( MAX(1, 3*n-2) ), zwork( MAX(1, 2*n-1)) )
CALL ZHPEV(jobz, uplo, n, ap, w, z, ldz, zwork, rwork, INFO)
DEALLOCATE( rwork, zwork )
IF( info .NE. 0 ) THEN
CALL errore( ' zhpev_drv ', ' diagonalization failed ', info )
END IF
#endif
RETURN
END SUBROUTINE zhpev_drv
!==----------------------------------------------==!
SUBROUTINE pzhpev_drv( jobz, ap, lda, w, z, ldz, &
nrl, n, nproc, mpime, comm_in )
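!
! Parallel driver for the eigen-decomposition of a distributed
! Hermitian matrix: the matrix is tridiagonalized with pzhptrd, the
! unitary transformation is generated with pzupgtr when eigenvectors
! are requested (jobz = 'V'), and the tridiagonal problem is solved
! with pzsteqr.
!
! Illustrative sketch of a call (commented out; nrl is the number of
! locally stored rows and intra_comm a hypothetical MPI communicator):
!
!    CALL pzhpev_drv( 'V', ap, lda, w, z, ldz, nrl, n, nproc, mpime, intra_comm )
!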
USE kinds
IMPLICIT NONE
CHARACTER :: JOBZ
INTEGER :: lda, ldz, nrl, n, nproc, mpime
INTEGER, OPTIONAL, INTENT(IN) :: comm_in
COMPLEX(DP) :: ap( lda, * ), z( ldz, * )
REAL(DP) :: w( * )
REAL(DP) :: rwork( n )
COMPLEX(DP) :: cwork( n )
INTEGER :: comm
!
IF ( PRESENT( comm_in ) ) THEN
comm = comm_in
#ifdef __MPI
ELSE
comm = MPI_COMM_WORLD
#endif
END IF
CALL pzhptrd( n, nrl, ap, lda, w, rwork, cwork, nproc, mpime, comm)
IF( jobz == 'V' .OR. jobz == 'v' ) THEN
CALL pzupgtr( n, nrl, ap, lda, cwork, z, ldz, nproc, mpime, comm)
END IF
CALL pzsteqr( jobz, n, nrl, w, rwork, z, ldz, nproc, mpime, comm)
RETURN
END SUBROUTINE pzhpev_drv
!==----------------------------------------------==!
!
! My parallel blas
!
!==----------------------------------------------==!
SUBROUTINE mattr_drv( m, k, a, lda, b, ldb, nb, dims, coor, comm )
!
! Compute B as the transpose of matrix A
! A and B are distributed on a 2D Cartesian processor
! grid in a block-cyclic way (as in ScaLAPACK),
! using a block size of NB
!
! B := A'
!
! A is a K by M matrix
! B is an M by K matrix
!
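! Illustrative sketch of a call (commented out): with hypothetical
! local buffers a_loc and b_loc sized with NUMROC as in matsplit_drv
! below, a possible usage would be
!
!    CALL mattr_drv( m, k, a_loc, SIZE( a_loc, 1 ), b_loc, SIZE( b_loc, 1 ), &
!                    nb, dims, coor, comm )
!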
implicit none
!
INTEGER, INTENT(IN) :: m, k
INTEGER, INTENT(IN) :: lda, ldb
REAL*8 :: a(lda,*), b(ldb,*)
INTEGER, INTENT(IN) :: nb, dims(2), coor(2), comm
!
#if defined __MPI
include 'mpif.h'
!
integer ierr
integer ndims, rowid, colid
integer coosrc(2), coodst(2), ipsrc, ipdst, mpime
integer ihsnd, ihrcv
logical periods(2)
!
integer :: iu
integer :: i, j, nk, nm
integer :: ii, jj
integer :: isrc, jsrc
integer :: idst, jdst
integer :: itag
integer :: nmb, nkb
integer :: istatus( MPI_STATUS_SIZE )
real*8, allocatable :: abuf(:,:)
!
integer :: numroc
integer :: indxg2l
external :: numroc, indxg2l
!
CALL GRID2D_RANK( dims(1), dims(2), coor(1), coor(2), mpime )
!
iu = 200 + mpime
!
! Compute the global number of blocks for matrix dimension
!
nmb = ( m + nb - 1 ) / nb
nkb = ( k + nb - 1 ) / nb
!
! WRITE(iu,*) 'TR',coor(1),coor(2),' m,k,nmb,nkb,nb = ',m,k,nmb,nkb,nb
! WRITE(iu,*) 'TR',coor(1),coor(2),' ---------------- '
!
ALLOCATE( abuf( nb, nb ) )
!
DO i = 1, nmb
DO j = 1, nkb
!
itag = j + nkb * (i-1)
!
coosrc(1) = MOD( (j-1), dims(1) )
coosrc(2) = MOD( (i-1), dims(2) )
!
coodst(1) = MOD( (i-1), dims(1) )
coodst(2) = MOD( (j-1), dims(2) )
!
CALL GRID2D_RANK( dims(1), dims(2), coosrc(1), coosrc(2), ipsrc )
CALL GRID2D_RANK( dims(1), dims(2), coodst(1), coodst(2), ipdst )
!
jsrc = INDXG2L( 1 + (j-1)*nb, nb, coor(1), 0, dims(1) )
isrc = INDXG2L( 1 + (i-1)*nb, nb, coor(2), 0, dims(2) )
!
jdst = INDXG2L( 1 + (j-1)*nb, nb, coor(2), 0, dims(2) )
idst = INDXG2L( 1 + (i-1)*nb, nb, coor(1), 0, dims(1) )
!
nk = MIN( nb, k - (j-1)*nb ) ! number of elements in the block
nm = MIN( nb, m - (i-1)*nb ) ! number of elements in the block
!
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' cccsrc,ipsrc = ', coosrc(1), coosrc(2), ipsrc
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' cccdst,ipdst = ', coodst(1), coodst(2), ipdst
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' i,j,nm,nk = ', i,j,nm,nk
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' isrc,jsrc = ', isrc,jsrc
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' idst,jdst = ', idst,jdst
!
IF( ipsrc == ipdst ) THEN
IF( ipsrc == mpime ) THEN
DO ii = 1, nm
DO jj = 1, nk
b( idst + ii - 1, jdst + jj - 1 ) = a( jsrc + jj - 1, isrc + ii - 1 )
END DO
END DO
END IF
ELSE
IF( ipsrc == mpime ) THEN
DO ii = 1, nm
DO jj = 1, nk
abuf( ii, jj ) = a( jsrc + jj - 1, isrc + ii - 1 )
!
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' ii,jj,abuf_snd = ', ii,jj,abuf( ii, jj )
!
END DO
END DO
CALL MPI_ISEND( abuf, nb*nb, MPI_DOUBLE_PRECISION, ipdst, itag, comm, ihsnd, ierr )
CALL mpi_wait(ihsnd, istatus, ierr)
ELSE IF( ipdst == mpime ) THEN
CALL MPI_IRECV( abuf, nb*nb, MPI_DOUBLE_PRECISION, ipsrc, itag, comm, ihrcv, ierr )
CALL mpi_wait(ihrcv, istatus, ierr)
DO jj = 1, nk
DO ii = 1, nm
!
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' ii,jj,abuf_rcv = ', ii,jj,abuf( ii, jj )
!
b( idst + ii - 1, jdst + jj - 1 ) = abuf( ii, jj )
END DO
END DO
END IF
END IF
!
! WRITE(iu,*) 'TR',coor(1),coor(2),itag,' ---------------- '
!
END DO
END DO
#else
INTEGER :: i, j
DO j = 1, k
DO i = 1, m
B( i, j ) = A( j, i )
END DO
END DO
#endif
RETURN
END SUBROUTINE mattr_drv
! ---------------------------------------------------------------------------------
SUBROUTINE matsplit_drv( m, k, ar, ldar, a, lda, nb, dims, coor, comm )
!
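! Scatter a matrix replicated on every processor ( ar ) into the
! block-cyclically distributed local matrix ( a ), using block size
! nb on the dims(1) x dims(2) processor grid.
!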
implicit none
!
INTEGER, INTENT(IN) :: m, k
INTEGER, INTENT(IN) :: ldar
REAL*8 :: ar(ldar,*) ! matrix to be split, replicated on all processors
INTEGER, INTENT(IN) :: lda
REAL*8 :: a(lda,*)
INTEGER, INTENT(IN) :: nb, coor(2), dims(2), comm
!
INTEGER :: i, j, nra, nca, ii, jj
!
INTEGER :: numroc, INDXL2G
EXTERNAL :: numroc, INDXL2G
nra = NUMROC( m, nb, coor(1), 0, dims(1) ) ! total number of local rows of A
nca = NUMROC( k, nb, coor(2), 0, dims(2) ) ! total number of local columns of A
do j = 1, nca
jj = INDXL2G( j, NB, coor(2), 0, dims(2) )
do i = 1, nra
ii = INDXL2G( i, NB, coor(1), 0, dims(1) )
a( i, j ) = ar( ii, jj )
end do
end do
RETURN
END SUBROUTINE matsplit_drv
! ---------------------------------------------------------------------------------
SUBROUTINE matmerge_drv( m, k, a, lda, ar, ldar, nb, dims, coor, comm )
!
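! Gather the block-cyclically distributed local matrix ( a ) into the
! matrix ( ar ), which on exit is replicated on every processor of the
! grid: each block is broadcast from its owner to all tasks.
!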
implicit none
!
INTEGER, INTENT(IN) :: m, k
INTEGER, INTENT(IN) :: ldar
REAL*8 :: ar(ldar,*) ! merged matrix, replicated on all processors on exit
INTEGER, INTENT(IN) :: lda
REAL*8 :: a(lda,*)
INTEGER, INTENT(IN) :: nb, coor(2), dims(2), comm
!
INTEGER :: i, j, ii, jj, ierr
#if defined __MPI
include 'mpif.h'
!
INTEGER :: jsrc, isrc, ipsrc, coosrc(2)
INTEGER :: nmb, nkb, nk, nm, mpime
REAL*8, ALLOCATABLE :: buf(:,:)
!
INTEGER :: INDXG2L
EXTERNAL :: INDXG2L
!
CALL GRID2D_RANK( dims(1), dims(2), coor(1), coor(2), mpime )
nmb = ( m + nb - 1 ) / nb
nkb = ( k + nb - 1 ) / nb
ALLOCATE( buf( nb, nb ) )
DO j = 1, nkb
DO i = 1, nmb
!
coosrc(1) = MOD( (i-1), dims(1) )
coosrc(2) = MOD( (j-1), dims(2) )
!
CALL GRID2D_RANK( dims(1), dims(2), coosrc(1), coosrc(2), ipsrc )
!
isrc = INDXG2L( 1 + (i-1)*nb, nb, coor(1), 0, dims(1) )
jsrc = INDXG2L( 1 + (j-1)*nb, nb, coor(2), 0, dims(2) )
!
nm = MIN( nb, m - (i-1)*nb ) ! number of elements in the block
nk = MIN( nb, k - (j-1)*nb ) ! number of elements in the block
IF( ipsrc == mpime ) THEN
DO jj = 1, nk
DO ii = 1, nm
buf( ii, jj ) = a( isrc + ii - 1, jsrc + jj - 1 )
END DO
END DO
ENDIF
!
CALL MPI_BCAST( buf, nb*nb, MPI_DOUBLE_PRECISION, ipsrc, comm, ierr )
!
do jj = 1, nk
do ii = 1, nm
ar( ii + (i-1)*nb, jj + (j-1)*nb ) = buf( ii, jj )
end do
end do
!
END DO
END DO
!
DEALLOCATE( buf )
#else
DO j = 1, k
DO i = 1, m
ar( i, j ) = a( i, j )
END DO
END DO
#endif
RETURN
END SUBROUTINE matmerge_drv
! ---------------------------------------------------------------------------------
SUBROUTINE matscal_drv( m, n, beta, c, ldc, nb, dims, coor, comm )
!
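! Scale the locally stored part of the distributed matrix C by beta
! ( C := beta * C ); when beta is zero the local block is simply
! cleared.
!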
implicit none
!
INTEGER, INTENT(IN) :: m, n
REAL*8, INTENT(IN) :: beta
INTEGER, INTENT(IN) :: ldc
REAL*8 :: c(ldc,*)
INTEGER, INTENT(IN) :: nb, coor(2), dims(2), comm
!
INTEGER :: i, j, nr, nc, ierr
!
INTEGER :: numroc
EXTERNAL :: numroc
nr = NUMROC( m, nb, coor(1), 0, dims(1) ) ! local rows of C
nc = NUMROC( n, nb, coor(2), 0, dims(2) ) ! local columns of C
IF( beta == 0.0d0 ) THEN
do j = 1, nc
do i = 1, nr
c(i,j) = 0.0d0
end do
end do
ELSE
do j = 1, nc
do i = 1, nr
c(i,j) = beta * c(i,j)
end do
end do
END IF
RETURN
END SUBROUTINE matscal_drv
! ---------------------------------------------------------------------------------
SUBROUTINE matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, nb, dims, coor, comm )
!
implicit none
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: m, n, k
REAL*8, INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
REAL*8 :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: nb, dims(2), coor(2), comm
!
! DGEMM PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
!
! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!
! WHERE OP( X ) IS ONE OF
!
! OP( X ) = X OR OP( X ) = X',
!
! ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
! AN M BY K MATRIX, OP( B ) A K BY N MATRIX AND C AN M BY N MATRIX.
!
!
!
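! Here the operation is carried out on matrices distributed
! block-cyclically (block size nb) over the dims(1) x dims(2)
! processor grid, as in mattr_drv above.
!
! Illustrative sketch of a call (commented out; the local arrays a, b
! and c are hypothetical and assumed already distributed):
!
!    CALL matmul_drv( 'N', 'N', m, n, k, 1.0d0, a, lda, b, ldb, &
!                     0.0d0, c, ldc, nb, dims, coor, comm )
!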
#if defined __MPI
include 'mpif.h'
!
integer ierr
integer ndims, rowid, colid
integer comm_row, comm_col
!
integer :: ib, jb, kb, ibl, kbl, jbl
integer :: i, j, kk, ni, nj, nk, nm, il, jl
integer :: nnb, nmb, nkb
integer :: nr, nra, nca, nc, nrb, ncb, ii, jj
integer :: nrt, ncat, nct, nrbt
real*8, allocatable :: abuf(:,:), bbuf(:,:)
real*8, allocatable :: at(:,:)
real*8, allocatable :: bt(:,:)
!
integer :: numroc
integer :: indxg2l
external :: numroc, indxg2l
!
IF( dims(1) * dims(2) == 1 ) THEN
! with a single processor there is no need for the parallel algorithm
call dgemm( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
RETURN
END IF
!
CALL MPI_COMM_SPLIT( COMM, coor(2), coor(1), COMM_COL, IERR )
CALL MPI_COMM_RANK( COMM_COL, rowid, IERR )
!
CALL MPI_COMM_SPLIT( COMM, coor(1), coor(2), COMM_ROW, IERR )
CALL MPI_COMM_RANK( COMM_ROW, colid, IERR )
!
! Compute the global number of blocks for matrix dimension
!
nmb = ( m + nb - 1 ) / nb
!
nnb = ( n + nb - 1 ) / nb
!
nkb = ( k + nb - 1 ) / nb
!
! Compute the number of local rows and columns of the matrices
!
nr = NUMROC( m, nb, coor(1), 0, dims(1) ) ! local rows of C
!
nra = NUMROC( m, nb, coor(1), 0, dims(1) ) ! local rows of OP( A )
nca = NUMROC( k, nb, coor(2), 0, dims(2) ) ! local columns of OP( A )
!
nrb = NUMROC( k, nb, coor(1), 0, dims(1) ) ! local rows of OP( B )
ncb = NUMROC( n, nb, coor(2), 0, dims(2) ) ! local columns of OP( B )
!
nc = NUMROC( n, nb, coor(2), 0, dims(2) ) ! local columns of C
!
IF( transa == 'T' .OR. transa == 't' ) THEN
!
ALLOCATE( at( nra, nca ) )
!
CALL mattr_drv( m, k, a, lda, at, nra, nb, dims, coor, comm )
!
END IF
!
IF( transb == 'T' .OR. transb == 't' ) THEN
!
ALLOCATE( bt( nrb, ncb ) )
!
CALL mattr_drv( k, n, b, ldb, bt, nrb, nb, dims, coor, comm )
!
END IF
!
! Scale matrix C
!
CALL matscal_drv( m, n, beta, c, ldc, nb, dims, coor, comm )
!
! loop over the rows/columns blocks of matrix OP(A)/OP(B)
!
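! For each block index kb: the processor column that owns the kb-th
! block column of OP(A) packs its local panel into abuf and broadcasts
! it along the processor rows; the processor row that owns the kb-th
! block row of OP(B) packs bbuf and broadcasts it along the processor
! columns; every task then accumulates the corresponding local dgemm
! contributions into its blocks of C.
!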
do kb = 1, nkb
!
kk = ( kb - 1 ) * nb + 1 ! first element of the block (global index)
nk = MIN( nb, k - kk + 1 ) ! number of elements in the block
colid = MOD( (kb-1), dims(2) ) ! processor column owning the block
rowid = MOD( (kb-1), dims(1) ) ! processor row owning the block
allocate( abuf( nr, nk ) )
if( colid == coor(2) ) then
nrt = 0
ibl = 0
kbl = INDXG2L( 1 + (kb-1)*nb, nb, coor(2), 0, dims(2) )
do ib = 1 + coor(1), nmb, dims(1)
i = ( ib - 1 ) * nb + 1
ni = MIN( nb, m - i + 1 )
IF( transa == 'T' .OR. transa == 't' ) THEN
do jj = 1, nk
do ii = 1, ni
abuf( ii + nrt, jj ) = at( ii + ibl*nb, jj + kbl - 1 )
end do
end do
ELSE
do jj = 1, nk
do ii = 1, ni
abuf( ii + nrt, jj ) = a( ii + ibl*nb, jj + kbl - 1 )
end do
end do
END IF
nrt = nrt + ni
ibl = ibl + 1
end do
end if
CALL MPI_BCAST( abuf(1,1), nr*nk, MPI_DOUBLE_PRECISION, colid, COMM_ROW, IERR )
allocate( bbuf( nk, nc ) )
if( rowid == coor(1) ) then
nct = 0
jbl = 0
kbl = INDXG2L( 1 + (kb-1)*nb, nb, coor(1), 0, dims(1) )
do jb = 1 + coor(2), nnb, dims(2)
j = ( jb - 1 ) * nb + 1
nj = MIN( nb, n - j + 1 )
IF( transb == 'T' .OR. transb == 't' ) THEN
do jj = 1, nj
do ii = 1, nk
bbuf( ii, jj + nct ) = bt( ii + kbl - 1, jj + jbl*nb )
end do
end do
ELSE
do jj = 1, nj
do ii = 1, nk
bbuf( ii, jj + nct ) = b( ii + kbl - 1, jj + jbl*nb )
end do
end do
END IF
nct = nct + nj
jbl = jbl + 1
end do
end if
CALL MPI_BCAST( bbuf(1,1), nk*nc, MPI_DOUBLE_PRECISION, rowid, COMM_COL, IERR )
ii = 1
do ib = 1 + coor(1), nmb, dims(1)
i = ( ib - 1 ) * nb + 1
il = INDXG2L( i, nb, coor(1), 0, dims(1) )
ni = MIN( nb, m - i + 1 )
jj = 1
do jb = 1 + coor(2), nnb, dims(2)
j = ( jb - 1 ) * nb + 1
jl = INDXG2L( j, nb, coor(2), 0, dims(2) )
nj = MIN( nb, n - j + 1 )
call dgemm( 'n', 'n', ni, nj, nk, alpha, abuf( ii, 1 ), nra, bbuf( 1, jj ), nk, 1.0d0, c( il, jl ), ldc )
jj = jj + nj
end do
ii = ii + ni
end do
deallocate( abuf )
deallocate( bbuf )
end do
IF( ALLOCATED( at ) ) DEALLOCATE( at )
IF( ALLOCATED( bt ) ) DEALLOCATE( bt )
#else
! if we are not compiling with __MPI this is equivalent to a BLAS call
call dgemm( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
#endif
RETURN
END SUBROUTINE matmul_drv
!==----------------------------------------------==!
!
! Copyright (C) 2005 Carlo Cavazzoni
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
SUBROUTINE rep_matmul_drv( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC, comm )
!
! Parallel matrix multiplication with replicated matrix
!
implicit none
!
CHARACTER(LEN=1), INTENT(IN) :: transa, transb
INTEGER, INTENT(IN) :: m, n, k
REAL*8, INTENT(IN) :: alpha, beta
INTEGER, INTENT(IN) :: lda, ldb, ldc
REAL*8 :: a(lda,*), b(ldb,*), c(ldc,*)
INTEGER, INTENT(IN) :: comm
!
! DGEMM PERFORMS ONE OF THE MATRIX-MATRIX OPERATIONS
!
! C := ALPHA*OP( A )*OP( B ) + BETA*C,
!
! WHERE OP( X ) IS ONE OF
!
! OP( X ) = X OR OP( X ) = X',
!
! ALPHA AND BETA ARE SCALARS, AND A, B AND C ARE MATRICES, WITH OP( A )
! AN M BY K MATRIX, OP( B ) A K BY N MATRIX AND C AN M BY N MATRIX.
!
!
!
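! Here A, B and C are replicated on every processor of "comm": each
! task computes a different slice of the rows of C, and the slices are
! then broadcast so that on exit C is replicated again.
!
! Illustrative sketch of a call (commented out; intra_comm is a
! hypothetical MPI communicator over which A, B and C are replicated):
!
!    CALL rep_matmul_drv( 'N', 'N', m, n, k, 1.0d0, a, lda, b, ldb, &
!                         0.0d0, c, ldc, intra_comm )
!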
#if defined __MPI
include 'mpif.h'
!
INTEGER :: ME, I, II, J, JJ, IP, SOUR, DEST, INFO, IERR, ioff, ldx
INTEGER :: NB, IB_S, NB_SOUR, IB_SOUR, IBUF
INTEGER :: nproc, mpime, q, r
REAL*8, ALLOCATABLE :: auxa( : )
REAL*8, ALLOCATABLE :: auxc( : )
!
! ... BODY
!
CALL MPI_COMM_SIZE(comm, NPROC, IERR)
CALL MPI_COMM_RANK(comm, MPIME, IERR)
IF ( NPROC == 1 ) THEN
! with a single processor there is no need for the parallel algorithm
CALL DGEMM(TRANSA, TRANSB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)
RETURN
END IF
ME = MPIME + 1
Q = INT( m / NPROC )
R = MOD( m , NPROC )
! ... Find out the number of elements in the local block
! along "M" first dimension os matrix A
NB = Q
IF( ME <= R ) NB = NB + 1
! ... Find out the global index of the local first row
IF( ME <= R ) THEN
ib_s = (Q+1)*(ME-1) + 1
ELSE
ib_s = Q*(ME-1) + R + 1
END IF
ldx = m / nproc + 1
ALLOCATE( auxa( MAX( n, m ) * ldx ) )
ALLOCATE( auxc( MAX( n, m ) * ldx ) )
IF( TRANSA == 'N' .OR. TRANSA == 'n' ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
ELSE
ibuf = 0
ioff = ib_s - 1
DO J = 1, k
DO I = 1, NB
auxa( ibuf + I ) = A( J, I + ioff )
END DO
ibuf = ibuf + ldx
END DO
!ioff = ib_s - 1
!call mytranspose( A( 1, ioff + 1 ), lda, auxa(1), ldx, m, nb)
END IF
IF( beta /= 0.0d0 ) THEN
ibuf = 0
ioff = ib_s - 1
DO J = 1, n
DO I = 1, NB
auxc( ibuf + I ) = C( I + ioff, J )
END DO
ibuf = ibuf + ldx
END DO
END IF
CALL DGEMM( 'N', transb, nb, n, k, alpha, auxa(1), ldx, B, ldb, beta, auxc(1), ldx )
! ... Here processors exchange blocks
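! ... each processor in turn copies its computed slice of the rows
! ... of C into auxa and broadcasts it; every task then unpacks the
! ... slice into the corresponding rows of the replicated matrix C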
DO IP = 0, NPROC-1
! ... Find out the number of elements in the block of processor SOUR
NB_SOUR = q
IF( (IP+1) .LE. r ) NB_SOUR = NB_SOUR+1
! ... Find out the global index of the first row owned by SOUR
IF( (IP+1) .LE. r ) THEN
ib_sour = (Q+1)*IP + 1
ELSE
ib_sour = Q*IP + R + 1
END IF
IF( mpime == ip ) auxa(1:n*ldx) = auxc(1:n*ldx)
CALL MPI_BCAST( auxa(1), ldx*n, mpi_double_precision, ip, comm, IERR)
IBUF = 0
ioff = IB_SOUR - 1
DO J = 1, N
DO I = 1, NB_SOUR
C( I + ioff, J ) = AUXA( IBUF + I )
END DO
IBUF = IBUF + ldx
END DO
END DO
DEALLOCATE( auxa, auxc )
#else
! if we are not compiling with __MPI this is equivalent to a BLAS call
CALL DGEMM(TRANSA, TRANSB, m, N, k, alpha, A, lda, B, ldb, beta, C, ldc)
#endif
RETURN
END SUBROUTINE rep_matmul_drv
!==----------------------------------------------==!
END MODULE parallel_toolkit
!==----------------------------------------------==!