quantum-espresso/PW/s_gemm.f90

152 lines
3.7 KiB
Fortran

!
! Copyright (C) 2001 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#ifdef __BENCHLIB
SUBROUTINE S_gemm (FORMA, FORMB, L, N, M, alpha, A, LDA, B, LDB, &
beta, C, LDC)
! driver for SGEMM routine
! it transposes matrixes before calling the SGEMM subroutine in the T/N
! or T T mode in order to exploit cache and stream buffer mechanics
! written by C.Cavazzoni and S.Cozzini 18/12/97
! the calling parameter are the same as in the actual sgemm routines
! to use it, just replace SGEMM with S_gemm in the original code...
IMPLICIT NONE
CHARACTER (LEN=1) :: FORMA, FORMB
real (8) :: alpha, beta
REAL (8) :: A, B, C
real (8) , allocatable::auxa (:, :)
real (8) , allocatable::auxb (:, :)
real (8) , allocatable::auxc (:, :)
integer :: i, j
INTEGER :: LDA, LDB, LDC, L, M, N
DIMENSION A (LDA, * ), B (LDB, * ), C (LDC, * )
INTEGER :: IRTC
INTEGER :: MM, NN, KK
INTEGER :: S2, S3, itype
INTEGER :: INFO (2)
MM = L
NN = N
KK = M
! defines the algorithm to transpose matrix
itype = 0
if (forma.eq.'t'.or.forma.eq.'T') then
if (formb.eq.'t'.or.formb.eq.'T') then
allocate (auxc (MM, NN) )
S2 = IRTC ()
CALL SGEMM ('N', 'N', MM, NN, KK, alpha, B, LDB, A, LDA, &
beta, AUXC, MM)
S3 = IRTC ()
call r_transpose (AUXC, MM, C, LDC, NN, MM, itype, INFO (2) )
deallocate (auxc)
else
allocate (auxa (MM, KK) )
call r_transpose (A, LDA, AUXA, MM, MM, KK, itype, INFO (2) )
S2 = IRTC ()
CALL SGEMM ('N', 'N', MM, NN, KK, alpha, auxa, MM, B, LDB, &
beta, C, LDC)
S3 = IRTC ()
deallocate (auxa)
endif
elseif (formb.eq.'t'.or.formb.eq.'T') then
allocate (auxb (KK, NN) )
call r_transpose (B, LDB, AUXB, KK, KK, NN, itype, INFO (2) )
S2 = IRTC ()
CALL SGEMM ('N', 'N', MM, NN, KK, alpha, A, LDA, auxb, KK, &
beta, C, LDC)
S3 = IRTC ()
deallocate (auxb)
else
S2 = IRTC ()
CALL SGEMM (forma, formb, MM, NN, KK, alpha, A, LDA, B, LDB, &
beta, C, LDC)
S3 = IRTC ()
INFO (2) = 0
endif
RETURN
END SUBROUTINE S_gemm
subroutine r_transpose (A, LDA, B, LDB, N, M, ITYPE, INFO)
IMPLICIT NONE
INCLUDE "mpp/shmem.fh"
integer :: lda, ldb, n, m, itype, info
real (8) :: A (LDA, * ), B (LDB, * )
! DRIVER FOR MATRIX TRASPOSITIN
!
! A (INPUT ) MxN MATRIX TO BE TRANSPOSED
! B (OUTPUT) NxM TRANSPOSED MATRIX
!
! ITYPE (INPUT ) TYPE OF TRANSPOSITION METHOD
! INFO (OUTPUT) ROUTINE EXECUTION TIME ( MICROSECONDS )
integer :: irtc, my_pe, lputp
integer :: i, j, jn, mype
integer :: s2, s3
if (itype.eq.0) then
S2 = IRTC ()
do j = 1, N, 480
jn = min (n - j + 1, 480)
do i = 1, M
call lgetv (A (I, J), LDA, JN)
call lputv (B (J, I), 1, JN)
enddo
enddo
10 IF (LPUTP () .NE.0) GoTo 10
S3 = IRTC ()
elseif (itype.eq.1) then
! traspose with shmem
MYPE = MY_PE ()
S2 = IRTC ()
DO J = 1, M
CALL SHMEM_IGET (B (1, J), A (J, 1), 1, LDA, N, MYPE)
ENDDO
S3 = IRTC ()
elseif (itype.eq.2) then
S2 = irtc ()
DO J = 1, M
!DIR$ CACHE_BYPASS A,B
DO I = 1, N
B (I, J) = A (J, I)
ENDDO
ENDDO
S3 = IRTC ()
else
WRITE( stdout, * ) ' *** r_traspose : PARAMETER ITYPE ', 'OUT OF RANGE'
stop
endif
INFO = INT (DBLE (S3 - S2) * 3.333d-3)
return
end subroutine r_transpose
#else
subroutine sgemmdummy
return
end subroutine sgemmdummy
#endif