! ! Copyright (C) 2002 FPMD group ! This file is distributed under the terms of the ! GNU General Public License. See the file `License' ! in the root directory of the present distribution, ! or http://www.gnu.org/copyleft/gpl.txt . ! #include "f_defs.h" !---------------------------------------------------------------------- ! FFT base Module. ! Written by Carlo Cavazzoni !---------------------------------------------------------------------- ! #if defined __HPM # include "/cineca/prod/hpm/include/f_hpm.h" #endif #undef __FFT_BASE_TS1 !#define __FFT_BASE_TS1 !=----------------------------------------------------------------------=! MODULE fft_base !=----------------------------------------------------------------------=! USE kinds, ONLY: DP USE mp, ONLY: mp_max, mp_sum, mp_barrier USE parallel_include USE fft_types, ONLY: fft_dlay_descriptor IMPLICIT NONE TYPE ( fft_dlay_descriptor ) :: dfftp ! fft descriptor for potentials TYPE ( fft_dlay_descriptor ) :: dffts ! fft descriptor for potentials SAVE PRIVATE PUBLIC :: fft_transpose, fft_scatter, fft_timing PUBLIC :: dfftp, dffts, fft_dlay_descriptor INTEGER, ALLOCATABLE :: stmask(:) REAL(DP) :: fft_timing( 4, 2 ) = 0.0d0 !=----------------------------------------------------------------------=! CONTAINS !=----------------------------------------------------------------------=! !=----------------------------------------------------------------------=! ! ... FFT inizialization ! SUBROUTINE transpose_setup( dfft, me, nproc ) IMPLICIT NONE TYPE (fft_dlay_descriptor), INTENT(IN) :: dfft INTEGER, INTENT(IN) :: me ! processor index starting from 1 INTEGER, INTENT(IN) :: nproc ! number of processor INTEGER :: ierr ! ! ... Subroutine Body ! ierr = 0 IF( ALLOCATED(stmask) ) DEALLOCATE(stmask, STAT=ierr) IF( ierr /= 0 ) & CALL errore(' fft_base_setup ' , ' deallocation of stmask failed ', ierr) ALLOCATE( stmask ( SIZE( dfft%ismap ) ), STAT=ierr) IF( ierr /= 0 ) & CALL errore(' fft_base_setup ' , ' allocation of stmask failed ', ierr) ! ... the stick mask is copied locally to increase the efficiency stmask( : ) = dfft%ismap( : ) RETURN END SUBROUTINE transpose_setup ! ! !======================================================================= ! #if defined __PARA # if defined __FFT_BASE_TS1 SUBROUTINE fft_transpose( zstick, ldz, r, ldx, ldy, dfft, me, nproc, iopt) USE mp_buffers, ONLY: mp_allocate_buffers, & mp_snd_buffer, mp_rcv_buffer, mp_sendrecv_buffers, & mp_barrier_buffers, mp_deallocate_buffers, mp_alltoall_buffers, & mp_p_snd_buffer, mp_p_rcv_buffer, mp_bufsize_msgmax IMPLICIT NONE COMPLEX (DP) :: zstick( * ) COMPLEX (DP) :: r( * ) TYPE (fft_dlay_descriptor), INTENT(IN) :: dfft INTEGER, INTENT(IN) :: me ! processor index starting from 1 INTEGER, INTENT(IN) :: nproc INTEGER, INTENT(IN) :: iopt, ldz, ldx, ldy INTEGER :: i, j, k, ipz, offset, k_start, k_end, is INTEGER :: npz, nz_l, ns_l, ns_lp, nz_lx INTEGER :: nsx_l, msgsiz INTEGER :: mc1, mc2, mc3, mc4, is_offset, ns1 COMPLEX (DP) :: zero COMPLEX (DP) :: bswp( ldz * 4 ) INTEGER, SAVE :: dfft_id = -1 #if defined __HPM CALL f_hpmstart( 11, 'transpose' ) #endif ! ! ... SUBROUTINE BODY ! IF( iopt == 0 ) THEN CALL transpose_setup( dfft, me, nproc ) dfft_id = dfft%id RETURN END IF IF( dfft_id /= dfft%id ) THEN CALL transpose_setup( dfft, me, nproc ) dfft_id = dfft%id END IF npz = nproc nz_lx = MAXVAL( dfft%npp( 1 : nproc ) ) IF( ABS( iopt ) == 2 ) THEN ns_l = dfft%nsw( me ) nsx_l = MAXVAL( dfft%nsw( : ) ) ELSE ns_l = dfft%nsp( me ) nsx_l = MAXVAL( dfft%nsp( : ) ) END IF zero = 0.0d0 msgsiz = nsx_l * nz_lx CALL mp_allocate_buffers( msgsiz * npz ) IF ( iopt < 1 ) THEN r( 1 : ldx * ldy * dfft%npp( me ) ) = 0.0d0 k_start = 1 DO ipz = 1, npz nz_l = dfft%npp( ipz ) k_end = k_start + nz_l - 1 offset = ( ipz - 1 ) * msgsiz - k_start + 1 DO is = 1, ns_l is_offset = ( is - 1 ) * ldz DO k = k_start , k_end mp_snd_buffer(k + offset) = zstick( k + is_offset ) END DO offset = offset + nz_l END DO k_start = k_start + nz_l END DO CALL mp_alltoall_buffers(mp_snd_buffer, mp_rcv_buffer) DO ipz = 1, npz nz_l = dfft%npp( me ) offset = ( ipz - 1 ) * msgsiz is_offset = dfft%iss( ipz ) IF( ABS( iopt ) == 1 ) THEN ns_lp = dfft%nsp(ipz) ELSE ns_lp = dfft%nsw(ipz) END IF ns1 = MOD( ns_lp, 4 ) IF( ns1 /= 0 ) THEN DO is = 1, ns1 mc1 = stmask( is + is_offset ) DO k = 1 , nz_l r( mc1 + (k-1)*ldx*ldy ) = mp_rcv_buffer( k + offset ) END DO offset = offset + nz_l END DO END IF IF( ns_lp >= 4 ) THEN ns1 = ns1 + 1 DO is = ns1, ns_lp, 4 mc1 = stmask( is + is_offset ) mc2 = stmask( is+1 + is_offset ) mc3 = stmask( is+2 + is_offset ) mc4 = stmask( is+3 + is_offset ) DO k = 1 , nz_l bswp( k ) = mp_rcv_buffer( k + offset ) END DO offset = offset + nz_l DO k = 1 , nz_l bswp( k + nz_l ) = mp_rcv_buffer( k + offset ) END DO offset = offset + nz_l DO k = 1 , nz_l bswp( k + 2*nz_l ) = mp_rcv_buffer( k + offset ) END DO offset = offset + nz_l DO k = 1 , nz_l bswp( k + 3*nz_l ) = mp_rcv_buffer( k + offset ) END DO offset = offset + nz_l DO k = 1 , nz_l r( mc1 + (k-1)*ldx*ldy ) = bswp( k ) r( mc2 + (k-1)*ldx*ldy ) = bswp( k + nz_l ) r( mc3 + (k-1)*ldx*ldy ) = bswp( k + 2*nz_l ) r( mc4 + (k-1)*ldx*ldy ) = bswp( k + 3*nz_l ) END DO END DO END IF END DO ELSE IF ( iopt > 0 ) THEN DO ipz = 1, npz nz_l = dfft%npp( me ) offset = ( ipz - 1 ) * msgsiz is_offset = dfft%iss( ipz ) IF( ABS( iopt ) == 1 ) THEN ns_lp = dfft%nsp(ipz) ELSE ns_lp = dfft%nsw(ipz) END IF ns1 = MOD( ns_lp, 4 ) IF( ns1 /= 0 ) THEN DO is = 1, ns1 mc1 = stmask( is + is_offset ) DO k = 1 , nz_l mp_snd_buffer( k + offset ) = r( mc1 + (k-1)*ldx*ldy ) END DO offset = offset + nz_l END DO END IF IF( ns_lp >= 4 ) THEN ns1 = ns1 + 1 DO is = ns1, ns_lp, 4 mc1 = stmask( is + is_offset ) mc2 = stmask( is+1 + is_offset ) mc3 = stmask( is+2 + is_offset ) mc4 = stmask( is+3 + is_offset ) DO k = 1, nz_l bswp( k ) = r( mc1 + (k-1)*ldx*ldy ) bswp( k + nz_l ) = r( mc2 + (k-1)*ldx*ldy ) bswp( k + 2*nz_l ) = r( mc3 + (k-1)*ldx*ldy ) bswp( k + 3*nz_l ) = r( mc4 + (k-1)*ldx*ldy ) END DO DO k = 1 , nz_l mp_snd_buffer( k + offset ) = bswp( k ) END DO offset = offset + nz_l DO k = 1 , nz_l mp_snd_buffer( k + offset ) = bswp( k + nz_l ) END DO offset = offset + nz_l DO k = 1 , nz_l mp_snd_buffer( k + offset ) = bswp( k + 2*nz_l ) END DO offset = offset + nz_l DO k = 1 , nz_l mp_snd_buffer( k + offset ) = bswp( k + 3*nz_l ) END DO offset = offset + nz_l END DO END IF END DO call mp_alltoall_buffers(mp_snd_buffer, mp_rcv_buffer) k_start = 1 DO IPZ = 1, NPZ nz_l = dfft%npp( ipz ) k_end = k_start + nz_l - 1 offset = (ipz-1) * msgsiz - k_start + 1 DO is = 1, ns_l is_offset = ( is - 1 ) * ldz DO k = k_start , k_end zstick( k + is_offset ) = mp_rcv_buffer( k + offset ) END DO offset = offset + nz_l END DO k_start = k_start + nz_l END DO END IF CALL mp_deallocate_buffers() #if defined __HPM CALL f_hpmstop( 11 ) #endif RETURN END SUBROUTINE fft_transpose # else SUBROUTINE fft_transpose( zstick, ldz, r, ldx, ldy, dfft, me, nproc, iopt) IMPLICIT NONE COMPLEX (DP) :: zstick( * ) COMPLEX (DP) :: r( * ) TYPE (fft_dlay_descriptor), INTENT(IN) :: dfft INTEGER, INTENT(IN) :: me ! processor index starting from 1 INTEGER, INTENT(IN) :: nproc INTEGER, INTENT(IN) :: iopt INTEGER, INTENT(IN) :: ldz, ldx, ldy INTEGER :: i, j, k, ipz, offset, k_start, k_end, is, is_start, is_end INTEGER :: npz, nz_l, ns_l, ns_lp, nbuf, ierr, itag, mype, nsx_l, nz_lx INTEGER, ALLOCATABLE :: ishand( : ) INTEGER, ALLOCATABLE :: irhand( : ) INTEGER, ALLOCATABLE :: istatus( :, : ) LOGICAL, ALLOCATABLE :: rtest( : ) LOGICAL, ALLOCATABLE :: rdone( : ) COMPLEX(DP), ALLOCATABLE :: sndbuf(:,:) COMPLEX(DP), ALLOCATABLE :: rcvbuf(:,:) INTEGER :: i1, i2, j1, j2, mc1, mc2, is_offset INTEGER, SAVE :: dfft_id = -1 ! ! ... SUBROUTINE BODY ! #if defined __HPM CALL f_hpmstart( 12, 'transpose' ) #endif IF( iopt == 0 ) THEN CALL transpose_setup( dfft, me, nproc ) dfft_id = dfft%id RETURN END IF IF( dfft_id /= dfft%id ) THEN CALL transpose_setup( dfft, me, nproc ) dfft_id = dfft%id END IF npz = nproc mype = me - 1 nz_l = dfft%npp( me ) nz_lx = MAXVAL( dfft%npp( 1 : nproc ) ) IF( ABS( iopt ) == 2 ) THEN ns_l = dfft%nsw( me ) nsx_l = MAXVAL( dfft%nsw( : ) ) ELSE ns_l = dfft%nsp( me ) nsx_l = MAXVAL( dfft%nsp( : ) ) END IF ALLOCATE( sndbuf( nsx_l * nz_lx, npz ), ishand( npz ) ) ALLOCATE( rcvbuf( nsx_l * nz_lx, npz ), irhand( npz ) ) ALLOCATE( rtest( npz ), rdone( npz ) ) ALLOCATE( istatus( MPI_STATUS_SIZE, npz ) ) nbuf = nsx_l * nz_lx IF ( iopt < 0 ) THEN DO ipz = 1, npz itag = mype + 1 + npz * ( ipz - 1 ) call mpi_irecv(rcvbuf(1,ipz), nbuf, MPI_DOUBLE_COMPLEX, ipz-1, itag, & MPI_COMM_WORLD, irhand(ipz), ierr ) END DO k_start = 1 DO ipz = 1, npz nz_l = dfft%npp( ipz ) k_end = k_start + nz_l - 1 offset = - k_start + 1 DO is = 1, ns_l DO k = k_start , k_end sndbuf(k + offset, ipz) = zstick( k + (is-1)*ldz ) END DO offset = offset + nz_l END DO itag = ipz + npz * mype CALL mpi_isend( sndbuf(1,ipz), nbuf, MPI_DOUBLE_COMPLEX, ipz-1, itag, & MPI_COMM_WORLD, ishand(ipz), ierr ) k_start = k_start + nz_l END DO r( 1 : ldx*ldy*dfft%npp( me ) ) = 0.0d0 rdone = .FALSE. 111 CONTINUE DO IPZ = 1, NPZ nz_l = dfft%npp( me ) call mpi_test(irhand(ipz), rtest(ipz), istatus(1,ipz), ierr) IF( rtest(ipz) .AND. .NOT. rdone(ipz) ) THEN offset = 0 is_offset = dfft%iss( ipz ) IF( ABS( iopt ) == 2 ) THEN ns_lp = dfft%nsw( ipz ) ELSE ns_lp = dfft%nsp( ipz ) END IF DO is = 1, ns_lp - 1, 2 !mc1 = dfft%ismap( is + is_offset ) !mc2 = dfft%ismap( is+1 + is_offset ) mc1 = stmask( is + is_offset ) mc2 = stmask( is+1 + is_offset ) DO k = 1 , nz_l r( mc1 + (k-1)*ldx*ldy ) = rcvbuf(k + offset, ipz) END DO offset = offset + nz_l DO k = 1 , nz_l r( mc2 + (k-1)*ldx*ldy ) = rcvbuf(k + offset, ipz) END DO offset = offset + nz_l END DO IF( MOD( ns_lp, 2 ) /= 0 ) THEN is = ns_lp ! mc1 = dfft%ismap( is + is_offset ) mc1 = stmask( is + is_offset ) DO k = 1 , nz_l r( mc1 + (k-1)*ldx*ldy ) = rcvbuf(k + offset, ipz) END DO offset = offset + nz_l END IF rdone( ipz ) = .TRUE. END IF END DO IF( .NOT. ALL( rtest ) ) GO TO 111 ELSE IF ( iopt > 0 ) THEN DO IPZ = 1, NPZ itag = mype + 1 + npz * ( ipz - 1 ) call mpi_irecv(rcvbuf(1,ipz), nbuf, MPI_DOUBLE_COMPLEX, ipz-1, itag, & MPI_COMM_WORLD, irhand(ipz), ierr ) END DO DO ipz = 1, npz nz_l = dfft%npp( me ) offset = 0 is_offset = dfft%iss( ipz ) IF( ABS( iopt ) == 2 ) THEN ns_lp = dfft%nsw( ipz ) ELSE ns_lp = dfft%nsp( ipz ) END IF DO is = 1, ns_lp ! mc1 = dfft%ismap( is + is_offset ) mc1 = stmask( is + is_offset ) DO k = 1 , nz_l sndbuf( k + offset, ipz ) = r( mc1 + (k-1)*ldx*ldy ) END DO offset = offset + nz_l END DO itag = ipz + npz * mype call mpi_isend(sndbuf(1,ipz), nbuf, MPI_DOUBLE_COMPLEX, ipz-1, itag, & MPI_COMM_WORLD, ishand(ipz), ierr ) END DO rdone = .FALSE. 112 CONTINUE k_start = 1 DO IPZ = 1, NPZ nz_l = dfft%npp( ipz ) call mpi_test(irhand(ipz), rtest(ipz), istatus(1,ipz), ierr) IF( rtest(ipz) .AND. .NOT. rdone(ipz) ) THEN k_end = k_start + nz_l - 1 offset = - k_start + 1 DO is = 1, ns_l DO k = k_start , k_end zstick( k + (is-1)*ldz ) = rcvbuf( k + offset, ipz ) END DO offset = offset + nz_l END DO rdone( ipz ) = .TRUE. END IF k_start = k_start + nz_l END DO IF( .NOT. ALL( rtest ) ) GO TO 112 END IF DO ipz = 1, npz call mpi_wait(ishand(ipz), istatus(1,ipz), ierr) END DO DEALLOCATE(sndbuf, rcvbuf, ishand, irhand, rtest, rdone, istatus) #if defined __HPM CALL f_hpmstop( 12 ) #endif RETURN END SUBROUTINE fft_transpose # endif #else ! Scalar code SUBROUTINE fft_transpose( zstick, ldz, r, ldx, ldy, dfft, me, nproc, iopt) IMPLICIT NONE COMPLEX (DP) :: zstick( * ) COMPLEX (DP) :: r( * ) TYPE (fft_dlay_descriptor), INTENT(IN) :: dfft INTEGER, INTENT(IN) :: me ! processor index starting from 1 INTEGER, INTENT(IN) :: nproc INTEGER, INTENT(IN) :: iopt, ldx, ldy, ldz INTEGER :: i, j, k, is, nz, ns, mc1 INTEGER, SAVE :: dfft_id = -1 ! ! ... SUBROUTINE BODY ! IF( iopt == 0 ) THEN CALL transpose_setup( dfft, me, nproc ) dfft_id = dfft%id RETURN END IF IF( dfft_id /= dfft%id ) THEN CALL transpose_setup( dfft, me, nproc ) dfft_id = dfft%id END IF nz = dfft%npp( me ) IF ( iopt < 0 ) THEN r( 1 : ldx*ldy*nz ) = CMPLX(0.0d0,0.0d0) IF( iopt == 2 ) THEN ns = dfft%nsw( me ) ELSE ns = dfft%nsp( me ) END IF DO is = 1, ns ! mc1 = dfft%ismap( is ) mc1 = stmask( is ) DO k = 1 , nz r( mc1 + (k-1)*ldx*ldy ) = zstick( k + (is-1)*ldz ) END DO END DO ELSE IF ( iopt > 0 ) THEN IF( iopt == -2 ) THEN ns = dfft%nsw( me ) ELSE ns = dfft%nsp( me ) END IF DO is = 1, ns ! mc1 = dfft%ismap( is ) mc1 = stmask( is ) DO k = 1 , nz zstick( k + (is-1)*ldz ) = r( mc1 + (k-1)*ldx*ldy ) END DO END DO END IF RETURN END SUBROUTINE fft_transpose #endif ! ! Copyright (C) 2001 PWSCF group ! This file is distributed under the terms of the ! GNU General Public License. See the file `License' ! in the root directory of the present distribution, ! or http://www.gnu.org/copyleft/gpl.txt . ! ! !----------------------------------------------------------------------- subroutine fft_scatter (f_in, nrx3, nxx_, f_aux, ncp_, npp_, sign) !----------------------------------------------------------------------- ! ! transpose the fft grid across nodes ! a) From columns to planes (sign > 0) ! ! "columns" (or "pencil") representation: ! processor "me" has ncp_(me) contiguous columns along z ! Each column has nrx3 elements for a fft of order nr3 ! nrx3 can be =nr3+1 in order to reduce memory conflicts. ! ! The transpose take places in two steps: ! 1) on each processor the columns are divided into slices along z ! that are stored contiguously. On processor "me", slices for ! processor "proc" are npp_(proc)*ncp_(me) big ! 2) all processors communicate to exchange slices ! (all columns with z in the slice belonging to "me" ! must be received, all the others must be sent to "proc") ! Finally one gets the "planes" representation: ! processor "me" has npp_(me) complete xy planes ! ! b) From planes to columns (sign < 0) ! ! Quite the same in the opposite direction ! ! The output is overwritten on f_in ; f_aux is used as work space ! #ifdef __PARA USE parallel_include #endif use mp_global, ONLY: nproc_pool, me_pool, intra_pool_comm, nproc, my_image_id USE kinds, only : DP implicit none integer, intent(in) :: nrx3, nxx_, sign, ncp_ (:), npp_ (:) complex (DP) :: f_in (nxx_), f_aux (nxx_) #ifdef __PARA integer :: dest, from, k, offset1 (nproc), sendcount (nproc), & sdispls (nproc), recvcount (nproc), rdispls (nproc), & proc, ierr, me, nprocp ! #if defined __HPM ! CALL f_hpmstart( 10, 'scatter' ) #endif me = me_pool + 1 nprocp = nproc_pool ! if (nprocp.eq.1) return ! call start_clock ('fft_scatter') ! ! sendcount(proc): amount of data processor "me" must send to processor ! recvcount(proc): amount of data processor "me" must receive from ! do proc = 1, nprocp sendcount (proc) = npp_ (proc) * ncp_ (me) recvcount (proc) = npp_ (me) * ncp_ (proc) enddo ! ! offset1(proc) is used to locate the slices to be sent to proc ! sdispls(proc)+1 is the beginning of data that must be sent to proc ! rdispls(proc)+1 is the beginning of data that must be received from pr ! offset1 (1) = 1 sdispls (1) = 0 rdispls (1) = 0 do proc = 2, nprocp offset1 (proc) = offset1 (proc - 1) + npp_ (proc - 1) sdispls (proc) = sdispls (proc - 1) + sendcount (proc - 1) rdispls (proc) = rdispls (proc - 1) + recvcount (proc - 1) enddo ! ierr = 0 if (sign.gt.0) then ! ! "forward" scatter from columns to planes ! ! step one: store contiguously the slices ! do proc = 1, nprocp from = offset1 (proc) dest = 1 + sdispls (proc) do k = 1, ncp_ (me) call DCOPY (2 * npp_ (proc), f_in (from + (k - 1) * nrx3), & 1, f_aux (dest + (k - 1) * npp_ (proc) ), 1) enddo enddo ! ! maybe useless; ensures that no garbage is present in the output ! f_in = 0.0d0 ! ! step two: communication ! call mpi_barrier (intra_pool_comm, ierr) call mpi_alltoallv (f_aux(1), sendcount, sdispls, MPI_DOUBLE_COMPLEX, f_in(1), & recvcount, rdispls, MPI_DOUBLE_COMPLEX, intra_pool_comm, ierr) if( ABS(ierr) /= 0 ) call errore ('fft_scatter', 'info<>0', ABS(ierr) ) ! else ! ! "backward" scatter from planes to columns ! ! step two: communication ! call mpi_barrier (intra_pool_comm, ierr) call mpi_alltoallv (f_in(1), recvcount, rdispls, MPI_DOUBLE_COMPLEX, f_aux(1), & sendcount, sdispls, MPI_DOUBLE_COMPLEX, intra_pool_comm, ierr) if( ABS(ierr) /= 0 ) call errore ('fft_scatter', 'info<>0', ABS(ierr) ) ! ! step one: store contiguously the columns ! f_in = 0.0d0 ! do proc = 1, nprocp from = 1 + sdispls (proc) dest = offset1 (proc) do k = 1, ncp_ (me) call DCOPY (2 * npp_ (proc), f_aux (from + (k - 1) * npp_ ( & proc) ), 1, f_in (dest + (k - 1) * nrx3), 1) enddo enddo endif call stop_clock ('fft_scatter') #endif #if defined __HPM ! CALL f_hpmstop( 10 ) #endif return end subroutine fft_scatter !=----------------------------------------------------------------------=! END MODULE fft_base !=----------------------------------------------------------------------=!