Merge branch 'develop' of gitlab.com:giannozz/q-e into bader

This commit is contained in:
Paolo Giannozzi 2021-10-17 09:37:51 +02:00
commit cc6bb68bac
546 changed files with 37880 additions and 20931 deletions

.gitignore vendored

@ -48,3 +48,4 @@ tempdir
tags
EPW/src/tmp
LAXlib/*.fh
KS_Solvers/*.fh


@ -49,6 +49,7 @@ build:cmake-gnu:
- mkdir build
- cd build
- cmake -DBUILD_SHARED_LIBS=ON -DCMAKE_Fortran_COMPILER=gfortran -DCMAKE_C_COMPILER=gcc .. && make
- make pw ph hp pwcond neb pp pwall cp tddfpt gwl ld1 upf xspectra couple epw all_currents
build:pgi:
tags: [docker]
@ -71,7 +72,7 @@ build:cmake-nvhpc:
- cmake --version
- mkdir build
- cd build
- cmake -DBUILD_SHARED_LIBS=ON -DCMAKE_Fortran_COMPILER=mpif90 -DCMAKE_C_COMPILER=mpicc
- cmake -DBUILD_SHARED_LIBS=OFF -DCMAKE_Fortran_COMPILER=mpif90 -DCMAKE_C_COMPILER=mpicc
-DQE_ENABLE_CUDA=ON -DQE_ENABLE_OPENACC=ON .. && make
#build:centos:


@ -13,7 +13,7 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
set(CMAKE_POLICY_DEFAULT_CMP0048 NEW)
project(qe
VERSION 6.7.1
VERSION 6.8
DESCRIPTION "ESPRESSO: opEn-Source Package for Research in Electronic Structure, Simulation, and Optimization"
LANGUAGES Fortran C)
@ -32,16 +32,15 @@ endif()
# Define the paths for static libraries and executables
##########################################################
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${qe_BINARY_DIR}/lib
CACHE
CACHE
PATH "Single output directory for building all libraries.")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${qe_BINARY_DIR}/bin
CACHE
CACHE
PATH "Single output directory for building all executables.")
###########################################################
# Build helpers
###########################################################
set(PROJECT_CMAKE ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
include(cmake/qeHelpers.cmake)
@ -111,7 +110,7 @@ option(QE_ENABLE_STATIC_BUILD
"enable fully static build of executables" OFF)
option(QE_ENABLE_DOC
"enable documentation building" OFF)
set(QE_FFTW_VENDOR "AUTO" CACHE
set(QE_FFTW_VENDOR "AUTO" CACHE
STRING "select a specific FFTW library [Intel_DFTI, Intel_FFTW3, ArmPL, IBMESSL, FFTW3, Internal]")
set(QE_ENABLE_SANITIZER "none" CACHE STRING "none,asan,ubsan,tsan,msan")
@ -215,14 +214,35 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES "PGI" OR CMAKE_Fortran_COMPILER_ID MATCHES
endif()
############################################################
## Compiler vendor specific options
# C preprocessor
# Note: rely on the compiler preprocessor whenever possible
############################################################
if(DEFINED ENV{CPP})
set(QE_CPP_DEFAULT $ENV{CPP})
else()
set(QE_CPP_DEFAULT cpp)
endif()
# QE_CPP_DEFAULT is only effective when cached QE_CPP doesn't exist.
set(QE_CPP ${QE_CPP_DEFAULT} CACHE
STRING "C preprocessor for qe_preprocess_source in qeHelpers.cmake")
find_program(QE_CPP_FULL_PATH NAMES ${QE_CPP} DOC "C preprocessor full path")
if(QE_CPP_FULL_PATH)
message(STATUS "C preprocessor used by qe_preprocess_source in qeHelpers.cmake: ${QE_CPP_FULL_PATH}")
else()
set(QE_CPP_SAVED ${QE_CPP})
unset(QE_CPP CACHE)
message(FATAL_ERROR "C preprocessor ${QE_CPP_SAVED} not found. Pass a working one to CMake via QE_CPP!")
endif()
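# Usage note (illustrative example; cpp-12 is just a placeholder name):
#   cmake -DQE_CPP=cpp-12 ..
# selects a specific preprocessor. Exporting CPP has the same effect, but only
# before the first CMake run, since a cached QE_CPP takes precedence afterwards.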
############################################################
# Compiler vendor specific options
############################################################
if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
include(${PROJECT_CMAKE}/GNUFortranCompiler.cmake)
include(GNUFortranCompiler)
elseif(CMAKE_Fortran_COMPILER_ID MATCHES "PGI" OR CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC")
include(${PROJECT_CMAKE}/NVFortranCompiler.cmake)
include(NVFortranCompiler)
elseif(CMAKE_Fortran_COMPILER_ID MATCHES "XL")
include(${PROJECT_CMAKE}/IBMFortranCompiler.cmake)
include(IBMFortranCompiler)
endif()
if(QE_ENABLE_STATIC_BUILD)
@ -248,6 +268,7 @@ if(QE_ENABLE_CUDA OR QE_ENABLE_PROFILE_NVTX)
set_target_properties(CUDA::nvToolsExt PROPERTIES INTERFACE_LINK_LIBRARIES "-cuda;libnvToolsExt.so")
set(CMAKE_REQUIRED_LIBRARIES "-cuda;libnvToolsExt.so")
check_function_exists(nvtxRangePushEx NVTX_FOUND)
unset(CMAKE_REQUIRED_LIBRARIES)
if(NOT NVTX_FOUND)
message(FATAL_ERROR "Check nvtxRangePushEx in libnvToolsExt.so failed")
endif()
@ -348,12 +369,14 @@ if(NOT QE_LAPACK_INTERNAL)
set(CMAKE_REQUIRED_LINK_OPTIONS ${OpenMP_Fortran_FLAGS})
endif()
find_package(LAPACK)
unset(CMAKE_REQUIRED_LINK_OPTIONS)
endif()
else()
if(QE_ENABLE_OPENMP)
set(CMAKE_REQUIRED_LINK_OPTIONS ${OpenMP_Fortran_FLAGS})
endif()
find_package(LAPACK)
unset(CMAKE_REQUIRED_LINK_OPTIONS)
endif()
if(LAPACK_FOUND)
list(APPEND _lapack_libs
@ -369,6 +392,7 @@ if(NOT QE_LAPACK_INTERNAL)
target_link_libraries(qe_lapack INTERFACE ${_lapack_libs})
set(CMAKE_REQUIRED_LIBRARIES ${_lapack_libs})
check_fortran_function_exists(zhpev ZHPEV_FOUND)
unset(CMAKE_REQUIRED_LIBRARIES)
if(NOT ZHPEV_FOUND)
unset(ZHPEV_FOUND CACHE)
message(FATAL_ERROR "Incomplete LAPACK! function zhpev not found!")
@ -552,7 +576,6 @@ endif(QE_ENABLE_PROFILE_NVTX)
# Components
###########################################################
add_subdirectory(external)
add_subdirectory(clib)
add_subdirectory(FFTXlib)
add_subdirectory(UtilXlib)
add_subdirectory(Modules)
@ -660,9 +683,6 @@ add_custom_target(ph
qe_phonon_fdef_exe
qe_phonon_fdifc_exe
qe_phonon_postahc_exe
qe_plotphon_kforbands_exe
qe_plotphon_bandstognuplot_exe
qe_plotphon_eminmax_exe
COMMENT
"phonon code, Gamma-only and third-order derivatives")


@ -9,11 +9,7 @@ MPILIBS=-lgfortran -lmpi_mpifh -lmpi
# location of required libraries
PWOBJS = ../src/libqecouple.a ../../PW/src/libpw.a
CPOBJS = ../src/libqecouple.a ../../CPV/src/libcp.a
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/Davidson/libdavid.a \
../../KS_Solvers/CG/libcg.a ../../FFTXlib/libqefft.a \
../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
../../dft-d3/libdftd3qe.a
LIBOBJS = ../../clib/clib.a ../../iotk/src/libiotk.a
QEMODS=../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a $(BASEMODS)
TLDEPS=couple
@ -21,16 +17,16 @@ TLDEPS=couple
all : tldeps c2pw.x f2pw.x c2cp.x f2cp.x
f2pw.x : f2pw.o $(PWOBJS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
c2pw.x : c2pw.o $(PWOBJS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
f2cp.x : f2cp.o $(CPOBJS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
c2cp.x : c2cp.o $(CPOBJS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
%.o: %.cpp
$(MPICXX) -I../include -c $(MPICXXFLAGS) $< -o $@


@ -9,11 +9,7 @@ MPILIBS=-Wl,-Bstatic,-lifport,-lifcore,-limf,-Bdynamic -lmpi_mpifh -lmpi
# location of required libraries
PWOBJS = ../src/libqecouple.a ../../PW/src/libpw.a
CPOBJS = ../src/libqecouple.a ../../CPV/src/libcp.a
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/Davidson/libdavid.a \
../../KS_Solvers/CG/libcg.a ../../FFTXlib/libqefft.a \
../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
../../dft-d3/libdftd3qe.a
LIBOBJS = ../../clib/clib.a ../../iotk/src/libiotk.a
QEMODS=../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a $(BASEMODS)
TLDEPS=couple
@ -21,16 +17,16 @@ TLDEPS=couple
all : tldeps c2pw.x f2pw.x c2cp.x f2cp.x
f2pw.x : f2pw.o $(PWOBJS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
c2pw.x : c2pw.o $(PWOBJS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
f2cp.x : f2cp.o $(CPOBJS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
c2cp.x : c2cp.o $(CPOBJS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
%.o: %.cpp
$(MPICXX) -I../include -c $(MPICXXFLAGS) $< -o $@


@ -911,7 +911,7 @@ input_description -distribution {Quantum Espresso} -package CP -program cp.x {
var ortho_max -type INTEGER {
default { 20 }
default {300 }
info {
maximum number of iterations for orthonormalization
meaningful only if orthogonalization = 'ortho'


@ -3,42 +3,36 @@
Introduction
============
This guide covers the usage of the `CP` package, version 6.6, a core
This guide covers the usage of the `CP` package, version 6.8, a core
component of the Quantum ESPRESSO distribution. Further documentation,
beyond what is provided in this guide, can be found in the directory
`CPV/Doc/`, containing a copy of this guide.
*Important notice: due to the lack of time and of manpower, this manual
is only partially updated and may contain outdated information.*
This guide assumes that you know the physics that `CP` describes and the
This guide assumes that you know the physics that `CP` describes and the
methods it implements. It also assumes that you have already installed,
or know how to install, Quantum ESPRESSO. If not, please read the
general User's Guide for Quantum ESPRESSO, found in directory `Doc/` two
levels above the one containing this guide; or consult the web site:\
levels above the one containing this guide; or consult the web site:
`http://www.quantum-espresso.org`.
People who want to modify or contribute to `CP` should read the
Developer Manual:\
`Doc/developer_man.pdf`.
People who want to modify or contribute to `CP` should read the
Developer Manual: `https://gitlab.com/QEF/q-e/-/wikis/home`.
`CP` can perform Car-Parrinello molecular dynamics, including
variable-cell dynamics, and free-energy surface calculation at fixed
cell through meta-dynamics, if patched with PLUMED.
The `CP` package is based on the original code written by Roberto Car
and Michele Parrinello. `CP` was developed by Alfredo Pasquarello (EPF
`CP` can perform Car-Parrinello molecular dynamics, including
variable-cell dynamics. The `CP` package is based on the original code
written by Roberto Car
and Michele Parrinello. `CP` was developed by Alfredo Pasquarello (EPF
Lausanne), Kari Laasonen (Oulu), Andrea Trave, Roberto Car (Princeton),
Nicola Marzari (EPF Lausanne), Paolo Giannozzi, and others. FPMD, later
merged with `CP`, was developed by Carlo Cavazzoni, Gerardo Ballabio
(CINECA), Sandro Scandolo (ICTP), Guido Chiarotti, Paolo Focher, and
others. We quote in particular:
merged with `CP`, was developed by Carlo Cavazzoni (Leonardo), Gerardo
Ballabio (CINECA), Sandro Scandolo (ICTP), Guido Chiarotti, Paolo Focher,
and others. We quote in particular:
- Federico Grasselli and Riccardo Bertossa (SISSA) for bug fixes,
extensions to Autopilot;
- Biswajit Santra, Hsin-Yu Ko, Marcus Calegari Andrade (Princeton) for
SCAN functional;
various contributions, notably the SCAN functional;
- Robert DiStasio (Cornell), Biswajit Santra, and Hsin-Yu Ko for
hybrid functionals with MLWF; (maximally localized Wannier
@ -50,21 +44,21 @@ others. We quote in particular:
- Paolo Umari (Univ. Padua) for finite electric fields and conjugate
gradients;
- Paolo Umari and Ismaila Dabo for ensemble-DFT;
- Paolo Umari and Ismaila Dabo (Penn State) for ensemble-DFT;
- Xiaofei Wang (Princeton) for META-GGA;
- The Autopilot feature was implemented by Targacept, Inc.
This guide has been mostly writen by Gerardo Ballabio and Carlo
Cavazzoni.
The original version of this guide was mostly written by Gerardo Ballabio
and Carlo Cavazzoni.
`CP` is free software, released under the GNU General Public License.\
`CP` is free software, released under the GNU General Public License.\
See `http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt`, or the file
License in the distribution).
`License` in the distribution.
We shall greatly appreciate if scientific work done using the Quantum
ESPRESSO distribution will contain an acknowledgment to the following
ESPRESSO distribution will contain an acknowledgment to the following
references:
> P. Giannozzi, S. Baroni, N. Bonini, M. Calandra, R. Car, C. Cavazzoni,
@ -95,22 +89,22 @@ Users of the GPU-enabled version should also cite the following paper:
> Ferretti, N. Marzari, I. Timrov, A. Urru, S. Baroni, J. Chem. Phys.
> 152, 154105 (2020)
Note the form Quantum ESPRESSO for textual citations of the code. Please
also see package-specific documentation for further recommended
citations. Pseudopotentials should be cited as (for instance)
Note the form `Quantum ESPRESSO` (in small caps) for textual citations
of the code. Please also see other package-specific documentation for
further recommended citations. Pseudopotentials should be cited as
(for instance)
> \[ \] We used the pseudopotentials C.pbe-rrjkus.UPF and O.pbe-vbc.UPF
> from\
> `http://www.quantum-espresso.org`.
> from `http://www.quantum-espresso.org`.
Compilation
===========
`CP` is included in the core Quantum ESPRESSO distribution. Instruction
`CP` is included in the core Quantum ESPRESSO distribution. Instruction
on how to install it can be found in the general documentation (User's
Guide) for Quantum ESPRESSO.
Typing `make cp` from the main Quantum ESPRESSO directory or `make` from
Typing `make cp` from the main Quantum ESPRESSO directory or `make` from
the `CPV/` subdirectory produces the following codes in `CPV/src`:
- `cp.x`: Car-Parrinello Molecular Dynamics code
@ -125,52 +119,44 @@ Symlinks to executable programs will be placed in the `bin/`
subdirectory.
As a final check that compilation was successful, you may want to run
some or all of the tests and examples. Automated tests for `cp.x` are in
some or all of the tests and examples. Automated tests for `cp.x` are in
directory `test-suite/` and can be run via the `Makefile` found there.
Please see the general User's Guide for their setup.
You may take the tests and examples distributed with `CP` as templates
You may take the tests and examples distributed with `CP` as templates
for writing your own input files. Input files for tests are contained in
subdirectories `test-suite/cp_` with file type `*.in1`, `*.in2`, \... .
subdirectories `test-suite/cp_*` with file type `*.in1`, `*.in2`, \... .
Input files for examples are produced, if you run the examples, in the
`results/` subdirectories, with names ending with `.in`.
For general information on parallelism and how to run in parallel
execution, please see the general User's Guide. `CP` currently can take
advantage of both MPI and OpenMP parallelization. The "plane-wave",
"linear-algebra" and "task-group" parallelization levels are
implemented.
execution, please see the general User's Guide. `CP` currently can take
advantage of both MPI and OpenMP parallelization, and of GPU acceleration.
The "plane-wave", "linear-algebra" and "task-group" parallelization levels
are implemented.
Input data
==========
Input data for `cp.x` is organized into several namelists, followed by
other fields ("cards") introduced by keywords. The namelists are
other fields ("cards") introduced by keywords. The namelists are:
------------------- ----------------------------------------------------------
&CONTROL: general variables controlling the run
&SYSTEM: structural information on the system under investigation
&ELECTRONS: electronic variables, electron dynamics
&IONS : ionic variables, ionic dynamics
&CELL (optional): variable-cell dynamics
------------------- ----------------------------------------------------------
> &CONTROL: general variables controlling the run\
> &SYSTEM: structural information on the system under investigation\
> &ELECTRONS: electronic variables, electron dynamics\
> &IONS : ionic variables, ionic dynamics\
> &CELL (optional): variable-cell dynamics\
\
The `&CELL` namelist may be omitted for fixed-cell calculations. This
depends on the value of variable `calculation` in namelist &CONTROL.
Most variables in namelists have default values. Only the following
Most variables in namelists have default values. Only the following
variables in &SYSTEM must always be specified:
----------- --------------------- -----------------------------------------------
`ibrav` (integer) Bravais-lattice index
`celldm` (real, dimension 6) crystallographic constants
`nat` (integer) number of atoms in the unit cell
`ntyp` (integer) number of types of atoms in the unit cell
`ecutwfc` (real) kinetic energy cutoff (Ry) for wavefunctions.
----------- --------------------- -----------------------------------------------
\
).
> `ibrav` (integer) Bravais-lattice index\
> `celldm` (real, dimension 6) crystallographic constants\
> `nat` (integer) number of atoms in the unit cell\
> `ntyp` (integer) number of types of atoms in the unit cell\
> `ecutwfc` (real) kinetic energy cutoff (Ry) for wavefunctions
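As an illustration only (the numbers below are placeholders, not converged
settings), a `&SYSTEM` namelist satisfying these requirements for a 3-atom
molecule in a cubic box could read:

```
&SYSTEM
  ibrav = 1, celldm(1) = 12.0,
  nat = 3, ntyp = 2,
  ecutwfc = 60.0,
/
```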
Explanations for the meaning of variables `ibrav` and `celldm`, as well
as on alternative ways to input structural data, are contained in files
@ -178,34 +164,31 @@ as on alternative ways to input structural data, are contained in files
describe a large number of other variables as well. Almost all variables
have default values, which may or may not fit your needs.
Comment lines in namelists can be introduced by a \"!\", exactly as in
fortran code.
After the namelists, you have several fields ("cards") introduced by
keywords with self-explanatory names:
> ATOMIC\_SPECIES\
> ATOMIC\_POSITIONS\
> CELL\_PARAMETERS (optional)\
> OCCUPATIONS (optional)\
> OCCUPATIONS (optional)
The keywords may be followed on the same line by an option. Unknown
fields are ignored. See the files mentioned above for details on the
available "cards".
Comments lines in "cards" can be introduced by either a "!" or a "\#"
character in the first position of a line.
Comment lines in namelists can be introduced by a \"!\", exactly as in
fortran code. Comments lines in "cards" can be introduced by either a "!"
or a "\#" character in the first position of a line.
Data files
----------
The output data files are written in the directory specified by variable
`outdir`, with names specified by variable `prefix` (a string that is
prepended to all file names, whose default value is: `prefix=pwscf`).
The `iotk` toolkit is used to write the file in a XML format, whose
definition can be found in the Developer Manual. In order to use the
data directory on a different machine, you need to convert the binary
files to formatted and back, using the `bin/iotk` script.
prepended to all file names, whose default value is `prefix=cp_$ndw`,
where `ndw` is an integer specified in input).
In order to use the data on a different machine, you may need to
compile `CP` with HDF5 enabled.
The execution stops if you create a file `prefix.EXIT` either in the
working directory (i.e. where the program is executed), or in the
@ -215,58 +198,13 @@ this procedure is that all files are properly closed, whereas just
killing the process may leave data and output files in an unusable
state.
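For example (assuming `prefix='mysys'` in input), a clean stop of a running
job can be requested from the shell:

```
# create the stop file in the directory where cp.x was launched
touch mysys.EXIT
```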
Format of arrays containing charge density, potential, etc.
-----------------------------------------------------------
The index of arrays used to store functions defined on 3D meshes is
actually a shorthand for three indices, following the FORTRAN convention
(\"leftmost index runs faster\"). An example will explain this better.
Suppose you have a 3D array `psi(nr1x,nr2x,nr3x)`. FORTRAN compilers
store this array sequentially in the computer RAM in the following way:
psi( 1, 1, 1)
psi( 2, 1, 1)
...
psi(nr1x, 1, 1)
psi( 1, 2, 1)
psi( 2, 2, 1)
...
psi(nr1x, 2, 1)
...
...
psi(nr1x,nr2x, 1)
...
psi(nr1x,nr2x,nr3x)
etc
Let `ind` be the position of the `(i,j,k)` element in the above list:
the following relation
ind = i + (j - 1) * nr1x + (k - 1) * nr2x * nr1x
holds. This should clarify the relation between 1D and 3D indexing. In
real space, the `(i,j,k)` point of the FFT grid with dimensions `nr1`
( $`\le`$ `nr1x`), `nr2` ( $`\le`$ `nr2x`), , `nr3` ( $`\le`$ `nr3x`), is
```math
r_{ijk}=\frac{i-1}{nr1} \tau_1 + \frac{j-1}{nr2} \tau_2 + \frac{k-1}{nr3} \tau_3
```
where the $`\tau_i`$ are the basis vectors of the
Bravais lattice. The latter are stored row-wise in the `at` array:
$`\tau_1 =`$ `at(:, 1)`, $`\tau_2 =`$ `at(:, 2)`, $`\tau_3 =`$ `at(:, 3)`.
The distinction between the dimensions of the FFT grid, `(nr1,nr2,nr3)`
and the physical dimensions of the array, `(nr1x,nr2x,nr3x)` is done
only because it is computationally convenient in some cases that the two
sets are not the same. In particular, it is often convenient to have
`nrx1`=`nr1`+1 to reduce memory conflicts.
The format of arrays containing charge density, potential, etc.
is described in the developer manual.
Output files
==========
The `cp.x` code produces many output file, that together build up the trajectory.
The `cp.x` code produces many output files that together build up the trajectory.
You have a file for the positions, called `prefix.pos`, where `prefix` is defined in
the input file, that is formatted like:
@ -280,35 +218,40 @@ the input file, that is formatted like:
0.42395189282719E+01 0.55766875434652E+01 0.31291744042209E+01
0.45445534106843E+01 0.36049553522533E+01 0.55864387532281E+01
where in the first line there is an header with, in order, the number of the step and
the time in ps of this step. Later you found the positions of all the atoms, in the
same order of the input file (note that this behaviour emerged in v6.6 -- previously
atoms were sorted by type). In this example we have 3 atoms.
The type must be deduced from the input file. After the first 4 lines
you find the same structure for the second step. The units of the position are Bohr's
radius. Note that the atoms coordinates are unwrapped, so it is possible that they go
outside the simulation cell.
where the first line contains the step number and elapsed time, in ps, at this
step; the following lines contain the positions, in Bohr radii, of all the
atoms (3 in this example), in the same order as in the input file (since v6.6
-- previously, atoms were sorted by type; the type must be deduced from the
input file). The same structure is repeated for the second step and so on.
The printout is made every `iprint` steps (10 in this case, so at step 10, 20,
etc.). Note that the atomic coordinates are not wrapped into the simulation
cell, so it is possible that they lie outside it.
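As a quick check (a sketch relying on the layout just described: two numbers
on each header line, three on each atomic line), the step index and time of
every frame can be listed with:

```
# print step number and time (ps) for each frame of the trajectory
awk 'NF == 2 { print $1, $2 }' mysys.pos
```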
The velocities are written in a similar file named `prefix.vel`, where `prefix` is defined in
the input file, that is formatted like the `.pos` file. The units are the usual Hartree
atomic units (note again that the velocity in the pw code differs by a factor of 2).
The velocities are written in a similar file named `prefix.vel`, where `prefix`
is defined in the input file, that is formatted like the `.pos` file. The units
are the usual Hartree atomic units (note that the velocities in the `pw.x` code
are in _Rydberg_ a.u. and differ by a factor 2).
The `prefix.for` file is formatted like the previous two. Contains the computed forces
and has Hartree atomic units too.
It is written only if `tprnfor = .true.` is set in the input file.
The `prefix.for` file, formatted like the previous two, contains the computed
forces, in Hartree atomic units as well. It is written only if a molecular
dynamics calculation is performed, or if `tprnfor = .true.` is set in input.
The file `prefix.evp` has one line per printed step and contains some thermodynamic data.
The file `prefix.evp` has one line per printed step and contains some
thermodynamical data.
The first line of the file names the columns:
```
# nfi time(ps) ekinc T\_cell(K) Tion(K) etot enthal econs econt Volume Pressure(GPa
# nfi time(ps) ekinc Tcell(K) Tion(K) etot enthal econs econt Volume Pressure(GPa)
```
where:
- `ekinc` $`K_{ELECTRONS}`$, the electron's fake kinetic energy
- `enthal` $`E_{DFT}+PV`$
- `etot` $`E_{DFT}`$ potential energy of the system, the DFT energy
- `econs` $`E_{DFT} + K_{NUCLEI}`$ this is something that is a constant of motion in the limit where the electronic fictitious mass is zero. It has a physical meaning.
- `econt` $`E_{DFT} + K_{IONS} + K_{ELECTRONS}`$ this is a constant of motion of the lagrangian. If the dt is small enough this will be up to a very good precision a constant. It is not a physical quantity, since $`K_{ELECTRONS}`$ has _nothing_ to do with the quantum kinetic energy of the electrons.
- `ekinc` is the fictitious kinetic energy of the electrons, $`K_{ELECTRONS}`$
- `enthal` is the enthalpy, $`E_{DFT}+PV`$
- `etot` is the DFT (potential) energy of the system, $`E_{DFT}`$
- `econs` is a physically meaningful constant of motion, $`E_{DFT} + K_{NUCLEI}`$,
in the limit of zero electronic fictitious mass
- `econt` is the constant of motion of the Lagrangian, $`E_{DFT} + K_{IONS} + K_{ELECTRONS}`$.
If the time step `dt` is small enough, this is conserved to very good precision.
It is not a physical quantity, since $`K_{ELECTRONS}`$ has _nothing_ to do with the quantum
kinetic energy of the electrons.
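For instance (column positions taken from the header line shown above; check
them in your own file), the conserved quantities can be extracted for plotting
with:

```
# skip the header, keep time (ps), econs and econt
awk '!/^#/ { print $2, $8, $9 }' mysys.evp > conserved.dat
```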
Using `CP`
@ -317,7 +260,7 @@ Using `CP`
It is important to understand that a CP simulation is a sequence of
different runs, some of them used to \"prepare\" the initial state of
the system, and other performed to collect statistics, or to modify the
state of the system itself, i.e. modify the temperature or the pressure.
state of the system itself, i.e. to modify the temperature or the pressure.
To prepare and run a CP simulation you should first of all define the
system:
@ -393,8 +336,7 @@ An example of input file (Benzene Molecule):
H -2.2 2.2 0.0
H 2.2 2.2 0.0
You can find the description of the input variables in file
`Doc/INPUT_CP.*`.
You can find the description of the input variables in file `Doc/INPUT_CP.*`.
Reaching the electronic ground state
------------------------------------
@ -403,7 +345,7 @@ The first run, when starting from scratch, is always an electronic
minimization, with fixed ions and cell, to bring the electronic system
on the ground state (GS) relative to the starting atomic configuration.
This step is conceptually very similar to self-consistency in a
`pw.x` run.
`pw.x` run.
Sometimes a single run is not enough to reach the GS. In this case, you
need to re-run the electronic minimization stage. Use the input of the
@ -428,14 +370,12 @@ $`< 10^{-5}`$. You could check the value of the fictitious kinetic energy
on the standard output (column EKINC).
Different strategies are available to minimize electrons, but the most
used ones are:
- steepest descent: `electron_dynamics = sd`
- damped dynamics: `electron_dynamics = damp`, `electron_damping` =
a number typically ranging from 0.1 and 0.5
frequently used is _damped dynamics_: `electron_dynamics = damp` and
`electron_damping` = a number typically ranging between 0.1 and 0.5.
See the input description to compute the optimal damping factor.
Steepest descent (`electron_dynamics = sd`) is also available, but it is
typically slower than damped dynamics and should be used only to
start the minimization.
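A sketch of the corresponding `&ELECTRONS` settings (the damping value is only
an indicative starting point; combine it with `restart_mode='restart'` in
`&CONTROL` when continuing a previous run):

```
&ELECTRONS
  electron_dynamics = 'damp',
  electron_damping  = 0.3,
/
```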
Relax the system
----------------
@ -860,14 +800,6 @@ ranges between 4 and 7.
All the other parameters have the same meaning in the usual `CP` input,
and they are discussed above.
### Free-energy surface calculations
Once `CP` is patched with `PLUMED` plug-in, it becomes possible to
turn-on most of the PLUMED functionalities running `CP` as:
`./cp.x -plumed` plus the other usual `CP` arguments. The PLUMED input
file has to be located in the specified `outdir` with the fixed name
`plumed.dat`.
### Treatment of USPPs
The cutoff `ecutrho` defines the resolution on the real space FFT mesh
@ -1030,99 +962,62 @@ An example input is listed as following:
O 16.0D0 O_HSCV_PBE-1.0.UPF
H 2.0D0 H_HSCV_PBE-1.0.UPF
Performances
============
Parallel Performances
=====================
`cp.x` can run in principle on any number of processors. The
effectiveness of parallelization is ultimately judged by the "scaling",
i.e. how the time needed to perform a job scales with the number of
processors, and depends upon:
processors. Ideally one would like to have linear scaling, i.e.
$`T \sim T_0/N_p`$ for $`N_p`$ processors, where $`T_0`$ is the estimated
time for serial execution. In addition, one would like to have linear
scaling of the RAM per processor: $`O_N \sim O_0/N_p`$, so that large-memory
systems fit into the RAM of each processor.
- the size and type of the system under study;
We refer to the "Parallelization" section of the general User's Guide for
a description of MPI and OpenMP parallelization paradigms, of the various
MPI parallelization levels, and on how to activate them.
- the judicious choice of the various levels of parallelization
(detailed in
Sec.[\[SubSec:para\]](#SubSec:para){reference-type="ref"
reference="SubSec:para"});
A judicious choice of the various levels of parallelization, together
with the availability of suitable hardware (e.g. fast communications)
is fundamental to reach good performances. _VERY IMPORTANT_: For each
system there is an optimal range for the number of processors on which to
run the job. Too large a number of processors, or a bad parallelization
style, will yield performance degradation.
- the availability of fast interprocess communications (or lack of
it).
For `CP` with hybrid functionals, see the related section above this one.
For all other cases, the relevant MPI parallelization levels are:
Ideally one would like to have linear scaling, i.e. $`T \sim T_0/N_p`$ for
$`N_p`$ processors, where $`T_0`$ is the estimated time for serial
execution. In addition, one would like to have linear scaling of the RAM
per processor: $`O_N \sim O_0/N_p`$, so that large-memory systems fit into
the RAM of each processor.
- "plane waves" (PW);
- "tasks" (activated by command-line option `-nt N`);
- "linear algebra" (`-nd N`);
- "bands" parallelization (`-nb N`), to be used only in
special cases;
- "images" parallelization (`-ni N`), used only in code `manycp.x`
(see the header of `CPV/src/manycp.f90` for documentation).
As a general rule, image parallelization:
As a rule of thumb:
- start with PW parallelization only (e.g. `mpirun -np N cp.x ...` with
no other parallelization options); the code will scale well unless `N`
exceeds the third FFT dimensions `nr3` and/or `nr3s`.
- To further increase the number of processors, use "task groups",
typically 4 to 8 (e.g. `mpirun -np N cp.x -nt 8 ...`).
- Alternatively, or in addition, you may compile with OpenMP:
`./configure --enable-openmp ...`, then `export OMP_NUM_THREADS=n`
and run on `n` threads (4 to 8 typically).
_Beware conflicts between MPI and OpenMP threads_!
Don't do this unless you know what you are doing.
- Finally, the optimal number of processors for \"linear-algebra\"
parallelization can be found by observing the performances of `ortho`
in the final time report for different numbers of processors in the
linear-algebra group (must be a square integer, not larger than the
number of processors for plane-wave parallelization). Linear-algebra
parallelization distributes $`M\times M`$ matrices, with `M` the number of
bands, so it may be useful when memory is a constraint. A command sketch
combining these options follows this list.
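A hypothetical command combining these options for a run on 128 MPI tasks
(all numbers are indicative only and must be tuned for your system and
machine):

```
export OMP_NUM_THREADS=4
mpirun -np 128 cp.x -nt 8 -nd 64 -inp mysys.in > mysys.out
```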
- may give good scaling, but the slowest image will determine the
overall performances ("load balancing" may be a problem);
Note: optimal serial performances are achieved when the data are kept in
the cache as much as possible. As a side effect, PW parallelization may
yield superlinear (better than linear) scaling, thanks to the increase in
serial speed coming from the reduction of data size (making it easier for
the machine to keep data in the cache).
- requires very little communications (suitable for ethernet
communications);
- does not reduce the required memory per processor (unsuitable for
large-memory jobs).
Parallelization on k-points:
- guarantees (almost) linear scaling if the number of k-points is a
multiple of the number of pools;
- requires little communications (suitable for ethernet
communications);
- does not reduce the required memory per processor (unsuitable for
large-memory jobs).
Parallelization on PWs:
- yields good to very good scaling, especially if the number of
processors in a pool is a divisor of $`N_3`$ and $`N_{r3}`$ (the
dimensions along the z-axis of the FFT grids, `nr3` and `nr3s`,
which coincide for NCPPs);
- requires heavy communications (suitable for Gigabit ethernet up to
4, 8 CPUs at most, specialized communication hardware needed for 8
or more processors );
- yields almost linear reduction of memory per processor with the
number of processors in the pool.
A note on scaling: optimal serial performances are achieved when the
data are as much as possible kept into the cache. As a side effect, PW
parallelization may yield superlinear (better than linear) scaling,
thanks to the increase in serial speed coming from the reduction of data
size (making it easier for the machine to keep data in the cache).
VERY IMPORTANT: For each system there is an optimal range of number of
processors on which to run the job. A too large number of processors
will yield performance degradation. If the size of pools is especially
delicate: $`N_p`$ should not exceed $`N_3`$ and $`N_{r3}`$, and should ideally
be no larger than $`1/2\div1/4 N_3`$ and/or $`N_{r3}`$. In order to increase
scalability, it is often convenient to further subdivide a pool of
processors into "task groups". When the number of processors exceeds the
number of FFT planes, data can be redistributed to \"task groups\" so
that each group can process several wavefunctions at the same time.
The optimal number of processors for \"linear-algebra\" parallelization,
taking care of multiplication and diagonalization of $`M\times M`$
matrices, should be determined by observing the performances of
`cdiagh/rdiagh` (`pw.x`) or `ortho` (`cp.x`) for different numbers of
processors in the linear-algebra group (must be a square integer).
Actual parallel performances will also depend on the available software
(MPI libraries) and on the available communication hardware. For PC
clusters, OpenMPI (`http://www.openmpi.org/`) seems to yield better
performances than other implementations (info by Kostantin Kudin). Note
however that you need a decent communication hardware (at least Gigabit
ethernet) in order to have acceptable performances with PW
parallelization. Do not expect good scaling with cheap hardware: PW
calculations are by no means an \"embarrassing parallel\" problem.
Also note that multiprocessor motherboards for Intel Pentium CPUs
typically have just one memory bus for all processors. This dramatically
slows down any code doing massive access to memory (as most codes in the
Quantum ESPRESSO distribution do) that runs on processors of the same
motherboard.


@ -114,33 +114,33 @@ makov_payne.o
LOBJS = \
entropy.o
QEMODS=../../Modules/libqemod.a ../../upflib/libupf.a ../../FFTXlib/libqefft.a ../../LAXlib/libqela.a ../../UtilXlib/libutil.a ../../XClib/xc_lib.a
QEMODS=$(BASEMODS)
TLDEPS= bindir libs mods
all : tldeps libcp.a manycp.x cp.x wfdd.x cppp.x
manycp.x : manycp.o libcp.a $(QEMODS) $(LIBOBJS)
manycp.x : manycp.o libcp.a $(QEMODS)
$(LD) $(LDFLAGS) -o manycp.x manycp.o \
libcp.a $(QEMODS) $(LIBOBJS) $(QELIBS)
libcp.a $(QEMODS) $(QELIBS)
- ( cd ../../bin ; ln -fs ../CPV/src/manycp.x . )
cp.x : cprstart.o libcp.a $(QEMODS) $(LIBOBJS)
cp.x : cprstart.o libcp.a $(QEMODS)
$(LD) $(LDFLAGS) -o cp.x cprstart.o \
libcp.a $(QEMODS) $(LIBOBJS) $(QELIBS)
libcp.a $(QEMODS) $(QELIBS)
- ( cd ../../bin ; ln -fs ../CPV/src/cp.x . )
libcp.a : $(FOBJS) $(LOBJS)
$(AR) $(ARFLAGS) $@ $?
$(RANLIB) $@
cppp.x : cppp.o $(QEMODS) $(LIBOBJS)
cppp.x : cppp.o $(QEMODS)
$(LD) $(LDFLAGS) -o cppp.x cppp.o \
$(QEMODS) $(LIBOBJS) $(QELIBS)
$(QEMODS) $(QELIBS)
- (cd ../../bin ; ln -fs ../CPV/src/cppp.x . )
wfdd.x : wfdd.o
$(LD) $(LDFLAGS) -o $@ wfdd.o $(QEMODS) $(LIBOBJS) $(QELIBS)
$(LD) $(LDFLAGS) -o $@ wfdd.o $(QEMODS) $(QELIBS)
- ( cd ../../bin ; ln -fs ../CPV/src/$@ . )
tldeps :


@ -12,7 +12,8 @@
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac, fion, ema0bg, becdr, &
lambdap, lambda, nlam, vpot, c0, cm, phi, dbec,l_cprestart )
!! please see https://journals.aps.org/rmp/abstract/10.1103/RevModPhys.64.1045
!! please see https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.79.1337 (ensemble DFT)
!! and https://journals.aps.org/rmp/abstract/10.1103/RevModPhys.64.1045 (conjugate gradient)
use kinds, only: dp
use control_flags, only: tpre, iverbosity, tfor, tprnfor
@ -52,11 +53,11 @@
use cp_electronic_mass, ONLY : emass_cutoff
use orthogonalize_base, ONLY : calphi_bgrp
use cp_interfaces, ONLY : rhoofr, dforce, compute_stress, vofrho, nlfl_bgrp, prefor
use cp_interfaces, ONLY : nlsm2_bgrp, calbec, caldbec_bgrp, nlfq_bgrp
use cp_interfaces, ONLY : nlsm2_bgrp, calbec, caldbec_bgrp, nlfq_bgrp, runcp_uspp
USE cp_main_variables, ONLY : idesc, drhor, drhog
USE mp_global, ONLY: me_image, my_image_id, nbgrp
USE fft_base, ONLY: dffts, dfftp
use wave_gauge, only: project_parallel_gauge_2
!
implicit none
@ -100,6 +101,11 @@
complex(dp), allocatable :: c3(:)
real(dp) :: gamma, entmp, sta
complex(dp),allocatable :: hpsi(:,:), hpsi0(:,:), gi(:,:), hi(:,:)
#if defined(__CUDA)
complex(dp), allocatable, DEVICE :: hpsi_dummy(:,:), c0_dummy(:,:), gi_dummy(:,:)
#else
complex(dp), allocatable :: hpsi_dummy(:,:), c0_dummy(:,:), gi_dummy(:,:)
#endif
real(DP), allocatable:: s_minus1(:,:)!factors for inverting US S matrix
real(DP), allocatable:: k_minus1(:,:)!factors for inverting US preconditioning matrix
real(DP), allocatable :: lambda_repl(:,:) ! replicated copy of lambda
@ -308,34 +314,10 @@
call newd(vpot,rhovan,fion,.true.)
call prefor(eigr,betae)!ATTENZIONE
do i=1,nbsp,2
call dforce( i, bec, betae, c0,c2,c3,rhos, dffts%nnr, ispin,f,nbsp,nspin)
if(tefield .and. (evalue.ne.0.d0)) then
call dforceb(c0, i, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
c2(1:ngw)=c2(1:ngw)+evalue*df(1:ngw)
call dforceb(c0, i+1, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
c3(1:ngw)=c3(1:ngw)+evalue*df(1:ngw)
endif
if(tefield2 .and. (evalue2.ne.0.d0)) then
call dforceb(c0, i, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
c2(1:ngw)=c2(1:ngw)+evalue2*df(1:ngw)
call dforceb(c0, i+1, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
c3(1:ngw)=c3(1:ngw)+evalue2*df(1:ngw)
endif
hpsi(1:ngw, i)=c2(1:ngw)
if(i+1 <= nbsp) then
hpsi(1:ngw,i+1)=c3(1:ngw)
endif
if (gstart==2) then
hpsi(1, i)=CMPLX(DBLE(hpsi(1, i)), 0.d0,kind=DP)
if(i+1 <= nbsp) then
hpsi(1,i+1)=CMPLX(DBLE(hpsi(1,i+1)), 0.d0,kind=DP)
endif
end if
enddo
call prefor(eigr,betae)
! this puts the gradient inside the array hpsi
call runcp_uspp(0,0.d0,0.d0,ema0bg, 0.d0, rhos, bec, &
c0, c0_dummy, hpsi, hpsi_dummy, .false., .false., .true.)
if(pre_state) call ave_kin(c0,SIZE(c0,1),nbsp,ave_ene)
@ -864,47 +846,9 @@
#endif
call prefor(eigr,betae)
do i=1,nbsp,2
call dforce(i,bec,betae,c0,c2,c3,rhos,dffts%nnr,ispin,f,nbsp,nspin)
if(tefield.and.(evalue .ne. 0.d0)) then
call dforceb &
(c0, i, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
do ig=1,ngw
c2(ig)=c2(ig)+evalue*df(ig)
enddo
call dforceb &
(c0, i+1, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
do ig=1,ngw
c3(ig)=c3(ig)+evalue*df(ig)
enddo
endif
if(tefield2.and.(evalue2 .ne. 0.d0)) then
call dforceb &
(c0, i, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
do ig=1,ngw
c2(ig)=c2(ig)+evalue2*df(ig)
enddo
call dforceb &
(c0, i+1, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
do ig=1,ngw
c3(ig)=c3(ig)+evalue2*df(ig)
enddo
endif
do ig=1,ngw
gi(ig, i)=c2(ig)
if(i+1 <= nbsp) then
gi(ig,i+1)=c3(ig)
endif
end do
if (gstart==2) then
gi(1, i)=CMPLX(DBLE(gi(1, i)),0.d0,kind=DP)
if(i+1 <= nbsp) then
gi(1,i+1)=CMPLX(DBLE(gi(1,i+1)),0.d0,kind=DP)
endif
end if
enddo
! this puts the gradient inside the array gi
call runcp_uspp(0,0.d0,0.d0,ema0bg, 0.d0, rhos, bec, &
c0, c0_dummy, gi, gi_dummy, .false., .false., .true.)
ALLOCATE( lambda_repl( nudx, nudx ) )
!
do is = 1, nspin
@ -940,55 +884,19 @@
!if required project c0 on previous manifold of occupied states
!NOT IMPLEMENTED YET FOR ENSEMBLE DFT AND NSPIN==2
!NOT IMPLEMENTED FOR US PSEUDOPOTENTIALS
lambda_repl=0.d0
do i = 1, nss
do j = 1, nss
ii = i + istart - 1
jj = j + istart - 1
do ig = 1, ngw
lambda_repl( i, j ) = lambda_repl( i, j ) + &
2.d0 * DBLE( CONJG( c0old( ig, ii ) ) * c0( ig, jj) )
enddo
if( gstart == 2 ) then
lambda_repl( i, j ) = lambda_repl( i, j ) - &
DBLE( CONJG( c0old( 1, ii ) ) * c0( 1, jj ) )
endif
enddo
enddo
CALL mp_sum( lambda_repl, intra_bgrp_comm )
cm(:,:)=c0(:,:)
c0=(0.d0,0.d0)
do i=1,nss
do j=1,nss
c0(1:ngw,i)=c0(1:ngw,i)+lambda_repl(i,j)*cm(1:ngw,j)
enddo
enddo
call project_parallel_gauge_2(c0old, cm, c0, &
nss, ngw, ngw,gstart)
call calbec (nbsp,betae,c0,bec)
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
call calbec(nbsp, betae,c0,bec)
call runcp_uspp(0,0.d0,0.d0,ema0bg, 0.d0, rhos, bec, &
c0, c0_dummy, gi, gi_dummy, .false., .false., .true.)
do i=1,nbsp,2
call dforce(i,bec,betae,c0,c2,c3,rhos,dffts%nnr,ispin,f,nbsp,nspin)
do ig=1,ngw
gi(ig, i)=c2(ig)
if(i+1 <= nbsp) then
gi(ig,i+1)=c3(ig)
endif
end do
if (gstart==2) then
gi(1, i)=CMPLX(DBLE(gi(1, i)),0.d0,kind=DP)
if(i+1 <= nbsp) then
gi(1,i+1)=CMPLX(DBLE(gi(1,i+1)),0.d0,kind=DP)
endif
end if
enddo
lambda_repl = 0.d0
do i = 1, nss


@ -1099,7 +1099,7 @@ SUBROUTINE rhov(rhovan,rhog,rhor)
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,isup), rhog(:,isdw) )
!
IF( iverbosity > 1 ) THEN
WRITE( stdout,'(a,2f12.8,/,a,2f12.8)') &
WRITE( stdout,'(a,f12.8,/,a,f12.8)') &
& ' rhov: n_v(g=0) up = ',omega*DBLE (rhog(1,isup)), &
& ' rhov: n_v(g=0) down = ',omega*DBLE(rhog(1,isdw))
END IF


@ -336,10 +336,10 @@
END SUBROUTINE writefile_x
END INTERFACE
!
INTERFACE runcp_uspp
SUBROUTINE runcp_uspp_x &
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart, compute_only_gradient )
USE kinds, ONLY: DP
IMPLICIT NONE
integer, intent(in) :: nfi
@ -350,7 +350,7 @@
complex(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
complex(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
logical, optional, intent(in) :: fromscra
logical, optional, intent(in) :: restart
logical, optional, intent(in) :: restart, compute_only_gradient
END SUBROUTINE
END INTERFACE


@ -29,6 +29,7 @@ cg_sub.o : ../../Modules/ions_base.o
cg_sub.o : ../../Modules/kind.o
cg_sub.o : ../../Modules/mp_global.o
cg_sub.o : ../../Modules/recvec.o
cg_sub.o : ../../Modules/wave_gauge.o
cg_sub.o : ../../UtilXlib/mp.o
cg_sub.o : ../../upflib/uspp.o
cg_sub.o : cg.o


@ -37,7 +37,7 @@
IMPLICIT NONE
! input
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
REAL(DP) vr(dfftp%nnr,nspin)
REAL(DP), INTENT(IN) :: vr(dfftp%nnr,nspin)
LOGICAL, INTENT(IN) :: tprint
! output
REAL(DP) fion(3,nat)


@ -18,7 +18,7 @@
SUBROUTINE runcp_uspp_x &
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart, compute_only_gradient )
!
! This subroutine performs a Car-Parrinello or Steepest-Descent step
! on the electronic variables, computing forces on electrons
@ -30,6 +30,8 @@
! on output:
! cm_bgrp wave functions at time t + dt, not yet othogonalized
!
! if compute_only_gradient is true, this routine only puts the gradient
! in the array cm_*
USE parallel_include
USE kinds, ONLY : DP
USE mp_global, ONLY : me_bgrp, &
@ -40,7 +42,7 @@
use control_flags, only : lwf, tsde, many_fft
use uspp, only : deeq, vkb, vkb_d
use gvect, only : gstart
use electrons_base, only : nbsp_bgrp, ispin_bgrp, f_bgrp, nspin, nupdwn_bgrp, iupdwn_bgrp
use electrons_base, only : nbsp_bgrp, ispin_bgrp, f_bgrp , nspin, nupdwn_bgrp, iupdwn_bgrp
use wannier_subroutines, only : ef_potential
use efield_module, only : dforce_efield, tefield, dforce_efield2, tefield2
use gvecw, only : ngw, ngwx
@ -59,6 +61,7 @@
COMPLEX(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
LOGICAL, OPTIONAL, INTENT(IN) :: fromscra
LOGICAL, OPTIONAL, INTENT(IN) :: restart
LOGICAL, OPTIONAL, INTENT(IN) :: compute_only_gradient
!
!
real(DP) :: verl1, verl2, verl3
@ -78,7 +81,7 @@
integer :: i, nsiz, incr, idx, idx_in, ierr
integer :: iwfc, nwfc, is, ii, tg_rhos_siz, c2_siz
integer :: iflag
logical :: ttsde
logical :: ttsde, only_gradient
INTEGER :: omp_get_num_threads
#if defined (__CUDA)
@ -96,6 +99,12 @@
IF( PRESENT( restart ) ) THEN
IF( restart ) iflag = 2
END IF
IF(PRESENT( compute_only_gradient) ) then
only_gradient = compute_only_gradient
ELSE
only_gradient = .false.
END IF
IF( dffts%has_task_groups ) THEN
tg_rhos_siz = dffts%nnr_tg
@ -115,12 +124,14 @@
verl2 = 1.0d0 - verl1
verl3 = 1.0d0 * fccc
ALLOCATE( emadt2( ngw ) )
ALLOCATE( emaver( ngw ) )
ccc = fccc * dt2bye
emadt2 = dt2bye * ema0bg
emaver = emadt2 * verl3
IF( .not. only_gradient) then
ALLOCATE( emadt2( ngw ) )
ALLOCATE( emaver( ngw ) )
ccc = fccc * dt2bye
emadt2 = dt2bye * ema0bg
emaver = emadt2 * verl3
END IF
IF( iflag == 0 ) THEN
ttsde = tsde
@ -279,7 +290,7 @@
CALL dforce_efield2 ( bec_bgrp, i, c0_bgrp, c2, c3, rhos)
END IF
IF( iflag == 2 ) THEN
IF( iflag == 2 .and. .not. only_gradient ) THEN
DO idx = 1, incr, 2
IF( i + idx - 1 <= nbsp_bgrp ) THEN
cm_bgrp( :, i+idx-1) = c0_bgrp(:,i+idx-1)
@ -293,16 +304,25 @@
DO idx = 1, incr, 2
idx_in = idx/2+1
IF( i + idx - 1 <= nbsp_bgrp ) THEN
IF (tsde) THEN
CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2(:), ngw, idx_in )
CALL wave_steepest( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), emaver, c3(:), ngw, idx_in )
IF( .not. only_gradient) then
IF (tsde) THEN
CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2(:), ngw, idx_in )
CALL wave_steepest( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), emaver, c3(:), ngw, idx_in )
ELSE
CALL wave_verlet( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), verl1, verl2, emaver, c2(:), ngw, idx_in )
CALL wave_verlet( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), verl1, verl2, emaver, c3(:), ngw, idx_in )
ENDIF
IF ( gstart == 2 ) THEN
cm_bgrp(1,i+idx-1) = CMPLX(real(cm_bgrp(1,i+idx-1)),0.0d0,kind=dp)
cm_bgrp(1,i+idx ) = CMPLX(real(cm_bgrp(1,i+idx )),0.0d0,kind=dp)
END IF
ELSE
CALL wave_verlet( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), verl1, verl2, emaver, c2(:), ngw, idx_in )
CALL wave_verlet( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), verl1, verl2, emaver, c3(:), ngw, idx_in )
ENDIF
IF ( gstart == 2 ) THEN
cm_bgrp(1,i+idx-1) = CMPLX(real(cm_bgrp(1,i+idx-1)),0.0d0,kind=dp)
cm_bgrp(1,i+idx ) = CMPLX(real(cm_bgrp(1,i+idx )),0.0d0,kind=dp)
cm_bgrp(:, i+idx-1) = c2(:)
cm_bgrp(:, i+idx) = c3(:)
IF ( gstart == 2 ) THEN
cm_bgrp(1, i+idx-1) = CMPLX(dble(cm_bgrp(1, i+idx-1)), 0.0d0, kind=dp)
cm_bgrp(1, i+idx) = CMPLX(dble(cm_bgrp(1, i+idx)), 0.0d0, kind=dp)
END IF
END IF
END IF
END DO
@ -319,9 +339,10 @@
END IF
END IF
DEALLOCATE( emadt2 )
DEALLOCATE( emaver )
IF (.not. only_gradient) then
DEALLOCATE( emadt2 )
DEALLOCATE( emaver )
END IF
#if defined (__CUDA)
DEALLOCATE( rhos_d )
#endif


@ -1,3 +1,28 @@
New in development version:
* RMM-DIIS for CPU (S. Nisihara) and GPU (E. de Paoli, P. Delugas)
* DFT-D3: MPI parallelization and GPU acceleration with OpenACC
* projwfc.x can be used to compute the PDOS in a local basis (I. Timrov)
Fixed in development version:
* Possible out-of-bound error (gfortran only) could crash DFT+U
* incorrect exx factor multiplication in the gga term of polarized cx0
functional (v.6.8 only)
* Some build problems occurring under special circumstances
* Some PP files were not correctly read since v.6.7
* DFT-D3 with dftd3_version=4 or 6 could produce NaN's in parallel runs
due to missing zero initialization of some work arrays
* Ensemble-DFT in CP ("cg") wasn't working any longer for norm-conserving PPs
* In DFT+U (lda_plus_u_kind = 0 and 1) the pw.x code was printing the squared
eigenvectors instead of simply eigenvectors. Now it prints the
eigenvectors (consistent with lda_plus_u_kind = 2).
* plotband.x wasn't correctly plotting the bands, under some not-so-special
circumstances
* CP with DFT+U could crash when writing the xml file (since v.6.6)
Incompatible changes in development version:
* Changes to Makefiles and to file "make.inc"
* clib/ deleted, files clib/*.c moved to UtilXlib/ or to Modules/
Known problems in 6.8 version:
* electron-phonon calculation in the non-colinear/spinorbit case is broken
* some obscure problem still affects variable-cell with hybrid functionals


@ -524,7 +524,7 @@ Some environment variables that are relevant to \configure\ are:
\texttt{LIBDIRS}& extra directories where to search for libraries\\
\end{tabular}\\
(note that \texttt{F90} is an ``historical'' name -- we actually use
Fortran 2003 -- and that it should be used only together with option
Fortran 2008 -- and that it should be used only together with option
\texttt{--disable-parallel}. In fact, the value of F90 must be
consistent with the parallel Fortran compiler which is determined by
\configure\ and stored in the \texttt{MPIF90} variable).
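For instance, a purely serial build with an explicitly chosen compiler could
be configured as (illustrative only):
\begin{verbatim}
   ./configure F90=gfortran --disable-parallel
\end{verbatim}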
@ -865,13 +865,8 @@ A few \libxc\ functional routines provides the energy and some others the potent
%
\subsubsection{XC test}
\label{SubSec:XCtest}
A testing program, \texttt{xclib\_test.x}, for the \texttt{XClib} library of \qe\ is available. Three options:
\begin{itemize}
\item \texttt{dft-info}: infos on the input dft are provided. If the functionals are from \libxc\ the external parameters, when present, are listed with a brief description and their default value.
\item \texttt{exe-benchmark} (\texttt{gen-benchmark}): the program gets a data file generated previously as input (by running the same program with the \texttt{gen-benchmark} option) and compares the output data, namely energy and potential on a selected number of grid points. It also compares total energy and potential summed up on a large grid in order to better span the input domain. This option shoud be used to test modifications in the \texttt{XClib} library or to check the output matching among different parallelization schemes.
\item \texttt{dft-comparison}: comparison between two different dfts on a large grid of points. Max, min and average percentage difference between the two dft outputs (energy and potential) are provided and the points of the grid where the two output differ are shown. This option can be used, for example, to find differences between internal \qe\ functionals and the \libxc\ ones.
\end{itemize}
The testing program is available for LDA, GGA and MGGA functionals. It also tests the potential derivatives for LDA (\texttt{dmxc}) and GGA (\texttt{dgcxc}).
A testing program, \texttt{xclib\_test.x}, for the \texttt{XClib} library of \qe\ is available. The program is available for LDA, GGA and MGGA functionals (both QE and Libxc). It also tests the potential derivatives for LDA (\texttt{dmxc}) and GGA (\texttt{dgcxc}).\\
See XClib/README.TEST file for further details.
\subsection{Compilation}
\label{SubSec:Compilation}


@ -10,42 +10,40 @@ MODFLAGS= $(BASEMOD_FLAGS) \
PHAUXOBJS = ../../../PHonon/PH/libphaux.a
PHOBJS = ../../../PHonon/PH/libph.a
PWOBJS = ../../../PW/src/libpw.a
PWOBJS = ../../../PW/src/libpw.a ../../../KS_Solvers/libks_solvers.a ../../../dft-d3/libdftd3qe.a
LRMODS = ../../../LR_Modules/liblrmod.a
QEMODS = ../../../Modules/libqemod.a ../../../KS_Solvers/libks_solvers.a ../../../upflib/libupf.a ../../../XClib/xc_lib.a \
../../../FFTXlib/libqefft.a ../../../LAXlib/libqela.a ../../../UtilXlib/libutil.a ../../../dft-d3/libdftd3qe.a
LIBOBJS = ../../../clib/clib.a
QEMODS = $(BASEMODS)
all : ZG.x disca.x pp_disca.x pp_spctrlfn.x bands_unfold.x epsilon_Gaus.x
ZG.x : ZG.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS) $(LIBOBJS)
ZG.x : ZG.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
ZG.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
ZG.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
disca.x : disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS) $(LIBOBJS)
disca.x : disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
disca.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
disca.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
pp_disca.x : pp_disca.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
pp_disca.x : pp_disca.o $(PWOBJS) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
pp_disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
pp_disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
bands_unfold.x : bands_unfold.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
bands_unfold.x : bands_unfold.o $(PWOBJS) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
bands_unfold.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
bands_unfold.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
pp_spctrlfn.x : pp_spctrlfn.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
pp_spctrlfn.x : pp_spctrlfn.o $(PWOBJS) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
pp_spctrlfn.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
pp_spctrlfn.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
epsilon_Gaus.x : epsilon_Gaus.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
epsilon_Gaus.x : epsilon_Gaus.o $(PWOBJS) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
epsilon_Gaus.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
epsilon_Gaus.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
clean :


@ -78,13 +78,10 @@ EPWOBJS += ephblochkq.o wfc_elec.o test_tools.o
PHOBJS = ../../PHonon/PH/libph.a
PWOBJS = ../../PW/src/libpw.a
W90LIB = ../../wannier90-3.1.0/libwannier.a
LRMODS = ../../LR_Modules/liblrmod.a
PWOBJS = ../../PW/src/libpw.a
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
../../upflib/libupf.a ../../FFTXlib/libqefft.a ../../dft-d3/libdftd3qe.a
LIBOBJS =../../LAXlib/libqela.a ../../UtilXlib/libutil.a ../../clib/clib.a
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
QEMODS = $(BASEMODS)
TLDEPS= pw ph pp
@ -94,9 +91,9 @@ libepw.a : $(EPWOBJS)
$(AR) $(ARFLAGS) $@ $?
$(RANLIB) $@
epw.x : epw.o libepw.a $(PHOBJS) $(LRMODS) $(PWOBJS) $(W90LIB) $(QEMODS) $(LIBOBJS)
epw.x : epw.o libepw.a $(PHOBJS) $(LRMODS) $(PWOBJS) $(W90LIB) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
epw.o libepw.a $(PHOBJS) $(LRMODS) $(W90LIB) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
epw.o libepw.a $(PHOBJS) $(LRMODS) $(W90LIB) $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../bin ; ln -fs ../src/epw.x . )
pw :


@ -130,8 +130,8 @@
SUBROUTINE close_final
!------------------------------------------------------------------
!
USE units_lr, ONLY : iuwfc
USE units_ph, ONLY : iudwf, iudrho
USE units_lr, ONLY : iuwfc, iudwf
USE units_ph, ONLY : iudrho
USE phcom, ONLY : fildrho
USE mp_global, ONLY : me_pool,root_pool
USE io_var, ONLY : iunepmatwe


@ -606,7 +606,6 @@
!
USE kinds, ONLY : DP
USE ions_base, ONLY : nat, ityp, ntyp => nsp
USE spin_orb, ONLY : lspinorb
USE cell_base, ONLY : tpiba2, omega, tpiba
USE gvect, ONLY : ngm, gg, g, eigts1, eigts2, eigts3, mill
USE scf, ONLY : v, vltot


@ -198,4 +198,6 @@ if(QE_ENABLE_TEST)
add_unit_test(test_qe_fftx-r1-t3 1 3 $<TARGET_FILE:qe_fftx_test>)
add_unit_test(test_qe_fftx-r3-t1 3 1 $<TARGET_FILE:qe_fftx_test>)
add_unit_test(test_qe_fftx-r3-t2 3 2 $<TARGET_FILE:qe_fftx_test>)
add_subdirectory(tests)
endif()


@ -160,16 +160,22 @@
SUBROUTINE init_plan()
implicit none
!
COMPLEX(DP), ALLOCATABLE :: c_test(:)
!
ALLOCATE(c_test, mold=c)
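! note: planning with FFTW_MEASURE runs trial transforms that may overwrite
! the arrays handed to the planner, hence a scratch array shaped like c is used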
!
IF( C_ASSOCIATED(fw_planz( icurrent)) ) CALL fftw_destroy_plan( fw_planz( icurrent) )
IF( C_ASSOCIATED(bw_planz( icurrent)) ) CALL fftw_destroy_plan( bw_planz( icurrent) )
idir = -1
fw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c, &
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_ESTIMATE)
fw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c_test, &
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_MEASURE)
idir = 1
bw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c, &
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_ESTIMATE)
bw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c_test, &
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_MEASURE)
!
DEALLOCATE(c_test)
!
zdims(1,icurrent) = nz; zdims(2,icurrent) = nsl; zdims(3,icurrent) = ldz;
ip = icurrent
icurrent = MOD( icurrent, ndims ) + 1
@ -309,42 +315,47 @@
SUBROUTINE init_plan()
implicit none
COMPLEX(DP), ALLOCATABLE :: f_test(:)
!
ALLOCATE(f_test,mold=r)
!
IF ( ldx /= nx .OR. ldy /= ny ) THEN
IF( C_ASSOCIATED(fw_plan(2,icurrent)) ) CALL fftw_destroy_plan( fw_plan(2,icurrent) )
IF( C_ASSOCIATED(bw_plan(2,icurrent)) ) CALL fftw_destroy_plan( bw_plan(2,icurrent) )
idir = -1
fw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, r(1:), &
(/ldx*ldy/), ldx, 1, r(1:), (/ldx*ldy/), ldx, 1, idir, &
FFTW_ESTIMATE)
fw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, f_test(1:), &
(/ldx*ldy/), ldx, 1, f_test(1:), (/ldx*ldy/), ldx, 1, idir, &
FFTW_MEASURE)
idir = 1
bw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, r(1:), &
(/ldx*ldy/), ldx, 1, r(1:), (/ldx*ldy/), ldx, 1, idir, &
FFTW_ESTIMATE)
bw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, f_test(1:), &
(/ldx*ldy/), ldx, 1, f_test(1:), (/ldx*ldy/), ldx, 1, idir, &
FFTW_MEASURE)
IF( C_ASSOCIATED(fw_plan(1,icurrent)) ) CALL fftw_destroy_plan( fw_plan(1,icurrent) )
IF( C_ASSOCIATED(bw_plan(1,icurrent)) ) CALL fftw_destroy_plan( bw_plan(1,icurrent) )
idir = -1
fw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, r(1:), &
(/ldx*ldy/), 1, ldx, r(1:), (/ldx*ldy/), 1, ldx, idir, &
FFTW_ESTIMATE)
fw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, f_test(1:), &
(/ldx*ldy/), 1, ldx, f_test(1:), (/ldx*ldy/), 1, ldx, idir, &
FFTW_MEASURE)
idir = 1
bw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, r(1:), &
(/ldx*ldy/), 1, ldx, r(1:), (/ldx*ldy/), 1, ldx, idir, &
FFTW_ESTIMATE)
bw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, f_test(1:), &
(/ldx*ldy/), 1, ldx, f_test(1:), (/ldx*ldy/), 1, ldx, idir, &
FFTW_MEASURE)
ELSE
IF( C_ASSOCIATED(fw_plan( 1, icurrent)) ) CALL fftw_destroy_plan( fw_plan( 1, icurrent) )
IF( C_ASSOCIATED(bw_plan( 1, icurrent)) ) CALL fftw_destroy_plan( bw_plan( 1, icurrent) )
idir = -1
fw_plan(1, icurrent) = fftw_plan_many_dft(2, (/ny, nx/), nzl,&
r(1:), (/ny, nx/), 1, nx*ny, r(1:), (/ny, nx/), 1, nx*ny, idir,&
FFTW_ESTIMATE)
f_test(1:), (/ldy, ldx/), 1, ldx*ldy, f_test(1:), (/ldy, ldx/), 1, ldx*ldy, idir,&
FFTW_MEASURE)
idir = 1
bw_plan(1, icurrent) = fftw_plan_many_dft(2, (/ny, nx/), nzl,&
r(1:), (/ny, nx/), 1, nx*ny, r(1:), (/ny, nx/), 1, nx*ny, idir,&
FFTW_ESTIMATE)
f_test(1:), (/ldy, ldx/), 1, ldx*ldy, f_test(1:), (/ldy, ldx/), 1, ldx*ldy, idir,&
FFTW_MEASURE)
END IF
!
DEALLOCATE(f_test)
!
dims(1,icurrent) = ny; dims(2,icurrent) = ldx;
dims(3,icurrent) = nx; dims(4,icurrent) = nzl;
ip = icurrent
@ -450,15 +461,21 @@
SUBROUTINE init_plan()
implicit none
COMPLEX(DP), ALLOCATABLE :: f_test(:)
IF ( nx /= ldx .or. ny /= ldy .or. nz /= ldz ) &
call fftx_error__('cfft3','not implemented',3)
IF( C_ASSOCIATED(fw_plan(icurrent)) ) CALL fftw_destroy_plan( fw_plan(icurrent) )
IF( C_ASSOCIATED(bw_plan(icurrent)) ) CALL fftw_destroy_plan( bw_plan(icurrent) )
!
ALLOCATE(f_test,mold=f)
!
idir = -1
fw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f(1:), f(1:), idir, FFTW_ESTIMATE)
fw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f_test(1:), f_test(1:), idir, FFTW_MEASURE)
idir = 1
bw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f(1:), f(1:), idir, FFTW_ESTIMATE)
bw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f_test(1:), f_test(1:), idir, FFTW_MEASURE)
!
DEALLOCATE(f_test)
!
dims(1,icurrent) = nx; dims(2,icurrent) = ny; dims(3,icurrent) = nz
ip = icurrent
icurrent = MOD( icurrent, ndims ) + 1
@ -630,7 +647,9 @@ SUBROUTINE cfft3ds (f, nx, ny, nz, ldx, ldy, ldz, howmany, isign, &
SUBROUTINE init_plan()
implicit none
!
COMPLEX(DP), ALLOCATABLE :: f_test(:)
!
IF( C_ASSOCIATED(fw_plan( 1, icurrent)) ) &
CALL fftw_destroy_plan( fw_plan( 1, icurrent) )
IF( C_ASSOCIATED(bw_plan( 1, icurrent)) ) &
@ -643,25 +662,30 @@ SUBROUTINE cfft3ds (f, nx, ny, nz, ldx, ldy, ldz, howmany, isign, &
CALL fftw_destroy_plan( fw_plan( 3, icurrent) )
IF( C_ASSOCIATED(bw_plan( 3, icurrent)) ) &
CALL fftw_destroy_plan( bw_plan( 3, icurrent) )
!
ALLOCATE(f_test, mold=f)
!
idir = -1
fw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f(1:), (/ldz, ldy, ldx/), 1, ldx, &
f(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_ESTIMATE)
fw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f_test(1:), (/ldz, ldy, ldx/), 1, ldx, &
f_test(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_MEASURE)
idir = 1
bw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f(1:), (/ldz, ldy, ldx/), 1, ldx, &
f(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_ESTIMATE)
bw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f_test(1:), (/ldz, ldy, ldx/), 1, ldx, &
f_test(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_MEASURE)
idir = -1
fw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_ESTIMATE)
fw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_MEASURE)
idir = 1
bw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_ESTIMATE)
bw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_MEASURE)
idir = -1
fw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_ESTIMATE)
fw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_MEASURE)
idir = 1
bw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_ESTIMATE)
bw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_MEASURE)
!
DEALLOCATE(f_test)
!
dims(1,icurrent) = nx; dims(2,icurrent) = ny; dims(3,icurrent) = nz
ip = icurrent
icurrent = MOD( icurrent, ndims ) + 1
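For reference, the change above follows a single pattern: plans are now created with FFTW_MEASURE, which runs trial transforms and overwrites the arrays handed to the planner, so a throw-away scratch buffer (allocated with MOLD= to match the data array) is used instead of the live data. A minimal sketch of that pattern, assuming the standard FFTW3 Fortran 2003 interface (fftw3.f03); the routine and variable names below are placeholders:

SUBROUTINE plan_with_measure(nz, howmany, plan)
  USE, INTRINSIC :: iso_c_binding
  IMPLICIT NONE
  INCLUDE 'fftw3.f03'
  INTEGER, INTENT(IN)      :: nz, howmany
  TYPE(C_PTR), INTENT(OUT) :: plan
  COMPLEX(C_DOUBLE_COMPLEX), ALLOCATABLE :: scratch(:)
  !
  ! FFTW_MEASURE overwrites the planning buffers, so plan on a scratch array
  ! and throw it away afterwards; the live data are never touched.
  ALLOCATE(scratch(nz*howmany))
  plan = fftw_plan_many_dft(1, (/nz/), howmany, scratch, (/nz/), 1, nz, &
                            scratch, (/nz/), 1, nz, FFTW_FORWARD, FFTW_MEASURE)
  DEALLOCATE(scratch)
END SUBROUTINE plan_with_measure

FFTW_MEASURE typically produces faster plans than FFTW_ESTIMATE at the cost of longer planning time, which is presumably why the extra scratch allocation is considered worthwhile here.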

View File

@ -67,7 +67,9 @@ program test
!!
!!-ntg Number of task groups
!!
!!-gamma Enables gamma point trick. Should be about 2 times faster.
!!-gamma Enables gamma point trick. Should be about 2 times faster
!!
!!-pd If .true. uses pencil decomposition, otherwise uses slab decomposition
!!
!!-av1 x y z First lattice vector, in atomic units. N.B.: when using -av1, -alat is ignored!
!!
@ -90,8 +92,19 @@ program test
USE fft_helper_subroutines
USE fft_interfaces, ONLY:fwfft, invfft
USE timers
!
IMPLICIT NONE
!
ENUM, BIND(C)
ENUMERATOR :: STAMP_BEGIN = 1
ENUMERATOR :: STAMP_PSI = 2
ENUMERATOR :: STAMP_INVFFT = 3
ENUMERATOR :: STAMP_VLOC = 4
ENUMERATOR :: STAMP_FWFFT = 5
ENUMERATOR :: STAMP_HPSI = 6
END ENUM
INTEGER, PARAMETER :: NUM_STAMPS = 6
!
TYPE(fft_type_descriptor) :: dfftp, dffts, dfft3d
!
TYPE(sticks_map) :: smap
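The enumerators above give names to the stamp indices 1..6 previously used as magic numbers into the timing arrays, and NUM_STAMPS sizes those arrays exactly. A self-contained sketch of the same pattern (the timer names here are invented):

PROGRAM stamp_demo
  IMPLICIT NONE
  ENUM, BIND(C)
    ENUMERATOR :: STAMP_BEGIN = 1
    ENUMERATOR :: STAMP_WORK  = 2
    ENUMERATOR :: STAMP_END   = 3
  END ENUM
  INTEGER, PARAMETER :: NUM_STAMPS = 3
  REAL*8 :: time(NUM_STAMPS), elapsed
  INTEGER :: i
  CALL cpu_time(time(STAMP_BEGIN))
  CALL cpu_time(time(STAMP_WORK))
  CALL cpu_time(time(STAMP_END))
  ! accumulate interval lengths exactly as the test program does
  elapsed = 0.d0
  DO i = STAMP_WORK, STAMP_END
    elapsed = elapsed + (time(i) - time(i-1))
  END DO
  PRINT *, 'elapsed (s):', elapsed
END PROGRAM stamp_demo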
@ -121,17 +134,21 @@ program test
!! cut-off for the wave-function
REAL*8 :: tpiba, alat, alat_in
!! lattice parameters
REAL*8 :: time(100)
REAL*8 :: my_time(100)
REAL*8 :: time_min(100)
REAL*8 :: time_max(100)
REAL*8 :: time_avg(100)
REAL*8 :: time(NUM_STAMPS)
REAL*8 :: my_time(NUM_STAMPS)
REAL*8 :: time_min(NUM_STAMPS)
REAL*8 :: time_max(NUM_STAMPS)
REAL*8 :: time_avg(NUM_STAMPS)
REAL*8 :: wall
REAL*8 :: wall_avg
!
LOGICAL :: gamma_only = .false.
LOGICAL :: use_tg
!! if calculations require only gamma point
LOGICAL :: use_tg
!! if calculations use task group
LOGICAL :: use_pd = .false.
!! if calculations use pencil decomposition
LOGICAL :: lpara
REAL*8 :: at(3, 3), bg(3, 3)
REAL(DP), PARAMETER :: pi = 4.0_DP * atan(1.0_DP)
!
@ -247,6 +264,10 @@ program test
CALL get_command_argument(i + 1, arg)
READ (arg, *) gamma_only
END IF
IF (TRIM(arg) == '-pd') THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) use_pd
END IF
IF ((TRIM(arg) == '-howmany').or.(TRIM(arg) == '-nh')) THEN
CALL get_command_argument(i + 1, arg)
READ (arg, *) many_fft
@ -357,6 +378,7 @@ program test
write (*, *) 'Num Task Group = ', ntgs
write (*, *) 'Num Many FFTs = ', many_fft
write (*, *) 'Gamma trick = ', gamma_only
write (*, *) 'Pencil decomp = ', use_pd
end if
!
nx = 2*int(sqrt(gcutm)*sqrt(at(1, 1)**2 + at(2, 1)**2 + at(3, 1)**2)) + 1
@ -371,12 +393,16 @@ program test
IF (gamma_only) incr = 2
dffts%has_task_groups = (ntgs > 1)
use_tg = dffts%has_task_groups
lpara = (npes > 1)
!
dffts%rho_clock_label='ffts' ; dffts%wave_clock_label='fftw'
CALL fft_type_init(dffts, smap, "wave", gamma_only, .true., comm, at, bg, gkcut, gcutms/gkcut, nyfft=ntgs, nmany=many_fft)
dffts%rho_clock_label='ffts'
dffts%wave_clock_label='fftw'
!
CALL fft_type_init(dffts, smap, "wave", gamma_only, lpara, comm, at, bg, gkcut, gcutms/gkcut, &
nyfft=ntgs, nmany=many_fft, use_pd=use_pd)
dfftp%rho_clock_label='fft'
CALL fft_type_init(dfftp, smap, "rho", gamma_only, .true., comm, at, bg, gcutm, 4.d0, nyfft=ntgs, nmany=many_fft)
CALL fft_type_init(dfftp, smap, "rho", gamma_only, lpara, comm, at, bg, gcutm, 4.d0, nyfft=ntgs, &
nmany=many_fft, use_pd=use_pd)
!
CALL fft_base_info(mype == 0, dffts, dfftp)
if (mype == 0) then
@ -431,7 +457,7 @@ program test
g, gg, mill, ig_l2g, gstart, .TRUE. )
ELSE
CALL ggen( dfftp, gamma_only, at, bg, gcutm, ngm_g, ngm, &
g, gg, mill, ig_l2g, gstart, .FALSE. )
g, gg, mill, ig_l2g, gstart, .FALSE. )
END IF
CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms )
!
@ -500,13 +526,13 @@ program test
IF (use_tg) THEN
DO ib = 1, nbnd, incr
!
time(1) = mpi_wall_time()
time(STAMP_BEGIN) = mpi_wall_time()
!
call prepare_psi_tg(ib, nbnd, ngms, psi, tg_psic, dffts, gamma_only)
time(2) = mpi_wall_time()
time(STAMP_PSI) = mpi_wall_time()
!
CALL invfft('tgWave', tg_psic, dffts);
time(3) = mpi_wall_time()
CALL invfft('tgWave', tg_psic, dffts)
time(STAMP_INVFFT) = mpi_wall_time()
!
CALL tg_get_group_nr3(dffts, right_nr3)
!
@ -514,13 +540,13 @@ program test
tg_psic(j) = tg_psic(j)*tg_v(j)
ENDDO
!
time(4) = mpi_wall_time()
time(STAMP_VLOC) = mpi_wall_time()
!
CALL fwfft('tgWave', tg_psic, dffts);
time(5) = mpi_wall_time()
CALL fwfft('tgWave', tg_psic, dffts)
time(STAMP_FWFFT) = mpi_wall_time()
!
CALL accumulate_hpsi_tg(ib, nbnd, ngms, hpsi, tg_psic, dffts, gamma_only)
time(6) = mpi_wall_time()
time(STAMP_HPSI) = mpi_wall_time()
!
DO i = 2, 6
my_time(i) = my_time(i) + (time(i) - time(i - 1))
@ -531,6 +557,8 @@ program test
ENDDO
ELSEIF (many_fft > 1) THEN
DO ib = 1, nbnd, many_fft
!
time(STAMP_BEGIN) = mpi_wall_time()
!
group_size = MIN(many_fft, nbnd - (ib -1))
!
@ -538,24 +566,24 @@ program test
!call prepare_psi(ib, nbnd, ngms, psi, psic(1+(k-1)*dffts%nnr:), dffts, gamma_only)
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
ENDDO
time(2) = mpi_wall_time()
time(STAMP_PSI) = mpi_wall_time()
!
CALL invfft('Wave', psic, dffts, howmany=group_size)
time(3) = mpi_wall_time()
time(STAMP_INVFFT) = mpi_wall_time()
!
DO j = 1, dffts%nnr
psic(j) = psic(j)*v(j)
ENDDO
time(4) = mpi_wall_time()
time(STAMP_VLOC) = mpi_wall_time()
!
CALL fwfft('Wave', psic, dffts, howmany=group_size)
time(5) = mpi_wall_time()
time(STAMP_FWFFT) = mpi_wall_time()
!
DO k=0, group_size - 1
!CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic(1+(k-1)*dffts%nnr:), dffts, gamma_only)
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, dffts, gamma_only)
ENDDO
time(6) = mpi_wall_time()
time(STAMP_HPSI) = mpi_wall_time()
!
DO i = 2, 6
my_time(i) = my_time(i) + (time(i) - time(i - 1))
@ -567,21 +595,24 @@ program test
ELSE
DO ib = 1, nbnd, incr
!
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
time(2) = mpi_wall_time()
time(STAMP_BEGIN) = mpi_wall_time()
!
CALL invfft('Wave', psic, dffts); time(3) = mpi_wall_time()
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
time(STAMP_PSI) = mpi_wall_time()
!
CALL invfft('Wave', psic, dffts)
time(STAMP_INVFFT) = mpi_wall_time()
!
DO j = 1, dffts%nnr
psic(j) = psic(j)*v(j)
ENDDO
time(4) = mpi_wall_time()
time(STAMP_VLOC) = mpi_wall_time()
!
CALL fwfft('Wave', psic, dffts);
time(5) = mpi_wall_time()
CALL fwfft('Wave', psic, dffts)
time(STAMP_FWFFT) = mpi_wall_time()
!
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, dffts, gamma_only)
time(6) = mpi_wall_time()
time(STAMP_HPSI) = mpi_wall_time()
!
DO i = 2, 6
my_time(i) = my_time(i) + (time(i) - time(i - 1))
@ -639,21 +670,21 @@ program test
write(*,100)
write(*,1)
write(*,100)
write(*,2) time_min(2), time_max(2), time_avg(2)
write(*,3) time_min(3), time_max(3), time_avg(3)
write(*,4) time_min(4), time_max(4), time_avg(4)
write(*,5) time_min(5), time_max(5), time_avg(5)
write(*,6) time_min(6), time_max(6), time_avg(6)
write(*,2) time_min(STAMP_PSI), time_max(STAMP_PSI), time_avg(STAMP_PSI)
write(*,3) time_min(STAMP_INVFFT), time_max(STAMP_INVFFT), time_avg(STAMP_INVFFT)
write(*,4) time_min(STAMP_VLOC), time_max(STAMP_VLOC), time_avg(STAMP_VLOC)
write(*,5) time_min(STAMP_FWFFT), time_max(STAMP_FWFFT), time_avg(STAMP_FWFFT)
write(*,6) time_min(STAMP_HPSI), time_max(STAMP_HPSI), time_avg(STAMP_HPSI)
write(*,7) wall
write(*,100)
100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
1 FORMAT(' |FFT TEST subroutine | sec. min | sec. max | sec. avg |' )
2 FORMAT(' |prepare_psi | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
3 FORMAT(' |invfft | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
4 FORMAT(' |workload | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
5 FORMAT(' |fwfft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
6 FORMAT(' |accumulate_hpsi | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
2 FORMAT(' |prepare_psi | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
3 FORMAT(' |invfft | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
4 FORMAT(' |workload | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
5 FORMAT(' |fwfft | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
6 FORMAT(' |accumulate_hpsi | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
7 FORMAT(' |wall time | ', D14.5, ' |')
end if
@ -1241,27 +1272,27 @@ subroutine print_clock(mype, npes, ncount)
end if
10100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
101 FORMAT(' |FFT subroutine | sec. min | sec. max | sec. avg |' )
102 FORMAT(' |cft_1z | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
103 FORMAT(' |cft_2xy | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
104 FORMAT(' |cgather | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
105 FORMAT(' |cgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
106 FORMAT(' |cscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
107 FORMAT(' |cscatter_sym | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
108 FORMAT(' |fft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
109 FORMAT(' |fft_scatt_tg | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1010 FORMAT(' |fft_scatt_xy | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1011 FORMAT(' |fft_scatt_yz | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1012 FORMAT(' |fftb | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1013 FORMAT(' |fftc | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1014 FORMAT(' |fftcw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1015 FORMAT(' |ffts | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1016 FORMAT(' |fftw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1017 FORMAT(' |rgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1018 FORMAT(' |rscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1019 FORMAT(' |fft_scatter | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1020 FORMAT(' |ALLTOALL | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1021 FORMAT(' |fft_scatt_many_yz | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
1022 FORMAT(' |fft_scatt_many_xy | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
102 FORMAT(' |cft_1z | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
103 FORMAT(' |cft_2xy | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
104 FORMAT(' |cgather | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
105 FORMAT(' |cgather_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
106 FORMAT(' |cscatter_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
107 FORMAT(' |cscatter_sym | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
108 FORMAT(' |fft | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
109 FORMAT(' |fft_scatt_tg | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1010 FORMAT(' |fft_scatt_xy | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1011 FORMAT(' |fft_scatt_yz | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1012 FORMAT(' |fftb | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1013 FORMAT(' |fftc | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1014 FORMAT(' |fftcw | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1015 FORMAT(' |ffts | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1016 FORMAT(' |fftw | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1017 FORMAT(' |rgather_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1018 FORMAT(' |rscatter_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1019 FORMAT(' |fft_scatter | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1020 FORMAT(' |ALLTOALL | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1021 FORMAT(' |fft_scatt_many_yz | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
1022 FORMAT(' |fft_scatt_many_xy | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
end subroutine

View File

@ -0,0 +1,26 @@
set(test_common_src tester.f90 utils.f90)
qe_add_library(qe_fftx_test_common ${test_common_src})
target_link_libraries(qe_fftx_test_common PRIVATE qe_mpi_fortran)
set(source_names fft_scalar_gpu fft_scatter_mod_gpu fwinv_gpu)
foreach(NAME ${source_names})
set(TEST_SOURCE_FILE test_${NAME}.f90)
qe_enable_cuda_fortran("${TEST_SOURCE_FILE}")
qe_add_executable(test_qe_fftx_${NAME} ${TEST_SOURCE_FILE})
set_target_properties(test_qe_fftx_${NAME}
PROPERTIES
OUTPUT_NAME test_qe_fftx_${NAME}.x
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../bin)
target_link_libraries(test_qe_fftx_${NAME}
PRIVATE
qe_openmp_fortran
qe_mpi_fortran
qe_fftx_test_common
qe_fftx)
add_unit_test(test_qe_fftx_${NAME}-r1-t1 1 1 $<TARGET_FILE:test_qe_fftx_${NAME}>)
add_unit_test(test_qe_fftx_${NAME}-r1-t3 1 3 $<TARGET_FILE:test_qe_fftx_${NAME}>)
add_unit_test(test_qe_fftx_${NAME}-r3-t1 3 1 $<TARGET_FILE:test_qe_fftx_${NAME}>)
add_unit_test(test_qe_fftx_${NAME}-r3-t2 3 2 $<TARGET_FILE:test_qe_fftx_${NAME}>)
endforeach()

View File

@ -2,11 +2,11 @@
include ../../make.inc
MODFLAGS= $(MOD_FLAG).. $(MOD_FLAG).
MODFLAGS = $(MOD_FLAG).. $(MOD_FLAG).
SRCS = test_fft_scalar_gpu.f90 \
test_fft_scatter_mod_gpu.f90 \
test_fwinv_gpu.f90
test_fft_scatter_mod_gpu.f90 \
test_fwinv_gpu.f90
EXECS = $(SRCS:.f90=.x)
@ -15,16 +15,10 @@ all: common $(EXECS)
common: tester.o utils.o
%.x: %.o
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(QELIBS)
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(LD_LIBS)
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
# .PHONY forces execution of a rule irrespective of the presence of an
# updated file with the same name of the rule. In this way, the script
# that generates version.f90 always runs, updating the version if you
# execute "svn update". The update_version script takes care of not
# changing the file if the svn version did not change
clean:
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x rnd_seed*
.PHONY: all clean

View File

@ -271,3 +271,20 @@ end program test_fft_scalar_gpu
program test_fft_scalar_gpu
end program test_fft_scalar_gpu
#endif
!
! Dummy
SUBROUTINE stop_clock(label)
CHARACTER(*) :: label
END SUBROUTINE stop_clock
!
SUBROUTINE start_clock(label)
CHARACTER(*) :: label
END SUBROUTINE start_clock
!
SUBROUTINE stop_clock_gpu(label)
CHARACTER(*) :: label
END SUBROUTINE stop_clock_gpu
!
SUBROUTINE start_clock_gpu(label)
CHARACTER(*) :: label
END SUBROUTINE start_clock_gpu

View File

@ -1,10 +1,13 @@
#if defined(__CUDA)
program test_fft_scatter_mod_gpu
#if defined(__MPI)
#if defined(__MPI) && defined(__MPI_MODULE)
USE mpi
#endif
USE tester
IMPLICIT NONE
#if defined(__MPI) && ! defined(__MPI_MODULE)
INCLUDE 'mpif.h'
#endif
! MPI type
type mpi_t
integer :: me, n, root, comm
@ -87,7 +90,8 @@ program test_fft_scatter_mod_gpu
bg = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(bg))
bg = 2.d0*pi
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, nyfft=nyfft)
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, &
& nyfft=nyfft, nmany=1)
!
END SUBROUTINE fft_desc_init
@ -124,8 +128,8 @@ program test_fft_scatter_mod_gpu
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_xy
USE scatter_mod_gpu, ONLY : fft_scatter_xy_gpu
USE fft_scatter, ONLY : fft_scatter_xy
USE fft_scatter_gpu, ONLY : fft_scatter_xy_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
@ -196,8 +200,8 @@ program test_fft_scatter_mod_gpu
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_yz
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu
USE fft_scatter, ONLY : fft_scatter_yz
USE fft_scatter_gpu, ONLY : fft_scatter_yz_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
@ -265,8 +269,8 @@ program test_fft_scatter_mod_gpu
USE fft_param, ONLY : DP
USE fft_types, ONLY : fft_type_descriptor
USE stick_base, ONLY : sticks_map
USE scatter_mod, ONLY : fft_scatter_yz
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
USE fft_scatter, ONLY : fft_scatter_yz
USE fft_scatter_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
implicit none
TYPE(mpi_t) :: mp
TYPE(tester_t) :: test
@ -392,15 +396,24 @@ program test_fft_scatter_mod_gpu
END SUBROUTINE test_fft_scatter_many_yz_gpu_1
end program test_fft_scatter_mod_gpu
! dummy subroutines
subroutine stop_clock( label )
character(len=*) :: label
end subroutine stop_clock
subroutine start_clock( label )
character(len=*) :: label
end subroutine start_clock
!
#else
program test_fft_scatter_mod_gpu
end program test_fft_scatter_mod_gpu
#endif
!
! Dummy
SUBROUTINE stop_clock(label)
CHARACTER(*) :: label
END SUBROUTINE stop_clock
!
SUBROUTINE start_clock(label)
CHARACTER(*) :: label
END SUBROUTINE start_clock
!
SUBROUTINE stop_clock_gpu(label)
CHARACTER(*) :: label
END SUBROUTINE stop_clock_gpu
!
SUBROUTINE start_clock_gpu(label)
CHARACTER(*) :: label
END SUBROUTINE start_clock_gpu

View File

@ -1,10 +1,13 @@
#if defined(__CUDA)
program test_fwinv_gpu
#if defined(__MPI)
#if defined(__MPI) && defined(__MPI_MODULE)
USE mpi
#endif
USE tester
IMPLICIT NONE
#if defined(__MPI) && ! defined(__MPI_MODULE)
INCLUDE 'mpif.h'
#endif
! MPI type
type mpi_t
integer :: me, n, root, comm
@ -40,6 +43,7 @@ program test_fwinv_gpu
CALL test_invfft_gpu_1(mp, test, .false., i)
END IF
END DO
!
CALL test_fwfft_many_gpu_1(mp, test, .true., 1)
CALL test_fwfft_many_gpu_1(mp, test, .false., 1)
!
@ -108,7 +112,8 @@ program test_fwinv_gpu
at = RESHAPE((/10.d0, 0.d0, 0.d0, 0.d0, 10.d0, 0.d0, 0.d0, 0.d0, 10.d0/), shape(at))
CALL calc_bg(at, bg)
!
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, nyfft=nyfft)
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, &
& nyfft=nyfft, nmany=1)
!
END SUBROUTINE fft_desc_init
@ -410,15 +415,24 @@ program test_fwinv_gpu
END SUBROUTINE test_invfft_many_gpu_1
end program test_fwinv_gpu
! dummy subroutines
subroutine stop_clock( label )
character(len=*) :: label
end subroutine stop_clock
subroutine start_clock( label )
character(len=*) :: label
end subroutine start_clock
!
#else
program test_fwinv_gpu
end program test_fwinv_gpu
#endif
!
! Dummy
SUBROUTINE stop_clock(label)
CHARACTER(*) :: label
END SUBROUTINE stop_clock
!
SUBROUTINE start_clock(label)
CHARACTER(*) :: label
END SUBROUTINE start_clock
!
SUBROUTINE stop_clock_gpu(label)
CHARACTER(*) :: label
END SUBROUTINE stop_clock_gpu
!
SUBROUTINE start_clock_gpu(label)
CHARACTER(*) :: label
END SUBROUTINE start_clock_gpu

View File

@ -1,9 +1,12 @@
SUBROUTINE collect_results(test)
#if defined(__MPI)
#if defined(__MPI) && defined(__MPI_MODULE)
USE mpi
#endif
USE tester
IMPLICIT NONE
#if defined(__MPI) && ! defined(__MPI_MODULE)
INCLUDE 'mpif.h'
#endif
!
TYPE(tester_t) :: test
INTEGER :: itottests, itoterr, ierr, me
@ -75,11 +78,14 @@ END SUBROUTINE save_random_seed
SUBROUTINE no_test
#if defined(__MPI)
#if defined(__MPI) && defined(__MPI_MODULE)
USE mpi
#endif
USE tester
IMPLICIT NONE
#if defined(__MPI) && ! defined(__MPI_MODULE)
INCLUDE 'mpif.h'
#endif
!TYPE(tester_t) :: test
INTEGER :: ierr
!
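The guard added in these test helpers selects between the Fortran mpi module and the legacy mpif.h header via the __MPI_MODULE macro, so the tests also build against MPI installations that do not ship a usable Fortran module. A minimal sketch of the same guard in an isolated routine (it assumes MPI_Init has already been called when __MPI is defined):

SUBROUTINE mpi_rank_demo(me)
#if defined(__MPI) && defined(__MPI_MODULE)
  USE mpi
#endif
  IMPLICIT NONE
#if defined(__MPI) && ! defined(__MPI_MODULE)
  INCLUDE 'mpif.h'
#endif
  INTEGER, INTENT(OUT) :: me
  INTEGER :: ierr
#if defined(__MPI)
  CALL mpi_comm_rank(MPI_COMM_WORLD, me, ierr)
#else
  me = 0   ! serial build: single task
#endif
END SUBROUTINE mpi_rank_demo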

View File

@ -71,7 +71,10 @@ set(src_pw4gww
pw4gww/hpsi_pw4gww.f90
pw4gww/cgsolve_all_gamma.f90
pw4gww/realus.f90
pw4gww/operator_1_vp.f90)
pw4gww/operator_1_vp.f90
pw4gww/lanczos_chains.f90
pw4gww/convergence.f90
pw4gww/easy_gw.f90)
qe_enable_cuda_fortran("${src_pw4gww}")
set(src_bse
@ -243,11 +246,11 @@ target_link_libraries(qe_gww_simpleip
###########################################################
set(src_graph_x util/graph.f90)
qe_add_executable(qe_gww_util_grap_exe ${src_graph_x})
set_target_properties(qe_gww_util_grap_exe
PROPERTIES
set_target_properties(qe_gww_util_grap_exe
PROPERTIES
OUTPUT_NAME graph.x)
set_target_properties(qe_gww_util_grap_exe
PROPERTIES
set_target_properties(qe_gww_util_grap_exe
PROPERTIES
OUTPUT_NAME graph.x)
###########################################################
@ -255,11 +258,11 @@ set_target_properties(qe_gww_util_grap_exe
###########################################################
set(src_abcoeff_to_eps_x util/abcoeff_to_eps.f90)
qe_add_executable(qe_gww_util_abcoefftoeps_exe ${src_abcoeff_to_eps_x})
set_target_properties(qe_gww_util_abcoefftoeps_exe
PROPERTIES
set_target_properties(qe_gww_util_abcoefftoeps_exe
PROPERTIES
OUTPUT_NAME abcoeff_to_eps.x)
set_target_properties(qe_gww_util_abcoefftoeps_exe
PROPERTIES
set_target_properties(qe_gww_util_abcoefftoeps_exe
PROPERTIES
OUTPUT_NAME abcoeff_to_eps.x)
###########################################################
@ -267,11 +270,11 @@ set_target_properties(qe_gww_util_abcoefftoeps_exe
###########################################################
set(src_memory_pw4gww_x util/memory_pw4gww.f90)
qe_add_executable(qe_gww_util_memorypw4gww_exe ${src_memory_pw4gww_x})
set_target_properties(qe_gww_util_memorypw4gww_exe
PROPERTIES
set_target_properties(qe_gww_util_memorypw4gww_exe
PROPERTIES
OUTPUT_NAME memory_pw4gww.x)
set_target_properties(qe_gww_util_memorypw4gww_exe
PROPERTIES
set_target_properties(qe_gww_util_memorypw4gww_exe
PROPERTIES
OUTPUT_NAME memory_pw4gww.x)
#############################################################
@ -279,8 +282,8 @@ set_target_properties(qe_gww_util_memorypw4gww_exe
############################################################
set(src_bse_main_x bse/bse_main.f90)
qe_add_executable(qe_gww_bse_bse_main_exe ${src_bse_main_x})
set_target_properties(qe_gww_bse_bse_main_exe
PROPERTIES
set_target_properties(qe_gww_bse_bse_main_exe
PROPERTIES
OUTPUT_NAME bse_main.x)
target_link_libraries(qe_gww_bse_bse_main_exe
PRIVATE
@ -298,8 +301,8 @@ target_link_libraries(qe_gww_bse_bse_main_exe
################################################################
set(src_gww_x gww/gww.f90)
qe_add_executable(qe_gww_gww_exe ${src_gww_x})
set_target_properties(qe_gww_gww_exe
PROPERTIES
set_target_properties(qe_gww_gww_exe
PROPERTIES
OUTPUT_NAME gww.x)
target_link_libraries(qe_gww_gww_exe
PRIVATE
@ -309,42 +312,59 @@ target_link_libraries(qe_gww_gww_exe
qe_modules)
################################################################
# gww_fit.x
# gww_fit.x
################################################################
set(src_gww_fit_x gww/gww_fit.f90)
qe_add_executable(qe_gww_gww_fit_exe ${src_gww_fit_x})
set_target_properties(qe_gww_gww_fit_exe
PROPERTIES
OUTPUT_NAME gww.x)
set_target_properties(qe_gww_gww_fit_exe
PROPERTIES
OUTPUT_NAME gww_fit.x)
target_link_libraries(qe_gww_gww_fit_exe
PRIVATE
PRIVATE
qe_gww
qe_gww_minpack
qe_modules)
################################################################
# head.x
# head.x
################################################################
set(src_head_x head/head.f90)
qe_add_executable(qe_gww_head_exe ${src_head_x})
set_target_properties(qe_gww_head_exe
PROPERTIES
set_target_properties(qe_gww_head_exe
PROPERTIES
OUTPUT_NAME head.x)
target_link_libraries(qe_gww_head_exe
PRIVATE
PRIVATE
qe_gww_head
qe_gww_minpack
qe_phonon_ph
qe_modules
qe_pw)
###########################################################
# simple.x
###########################################################
set(src_simple_x simple/simple.f90)
qe_add_executable(qe_gww_simple_exe ${src_simple_x})
set_target_properties(qe_gww_simple_exe
PROPERTIES
OUTPUT_NAME simple.x)
target_link_libraries(qe_gww_simple_exe
PRIVATE
qe_gww_simple
qe_fftx
qe_xclib
qe_pw
qe_upflib
qe_modules)
###########################################################
# simple_bse.x
###########################################################
set(src_simple_bse_x simple_bse/simple_bse.f90)
qe_add_executable(qe_gww_simple_bse_exe ${src_simple_bse_x})
set_target_properties(qe_gww_simple_bse_exe
PROPERTIES
set_target_properties(qe_gww_simple_bse_exe
PROPERTIES
OUTPUT_NAME simple_bse.x)
target_link_libraries(qe_gww_simple_bse_exe
PRIVATE
@ -356,15 +376,32 @@ target_link_libraries(qe_gww_simple_bse_exe
# simple_ip.x
################################################################
set(src_simple_ip_x simple_ip/simple_ip.f90)
qe_add_executable(qe_gww_simple_ip_exe ${src_simple_ip_x})
set_target_properties(qe_gww_simple_ip_exe
PROPERTIES
qe_add_executable(qe_gww_simple_ip_exe ${src_simple_ip_x})
set_target_properties(qe_gww_simple_ip_exe
PROPERTIES
OUTPUT_NAME simple_ip.x)
target_link_libraries(qe_gww_simple_ip_exe
PRIVATE
qe_gww_simpleip
qe_modules)
################################################################
# pw4gww.x
################################################################
set(src_pw4gww_pw4gww_x pw4gww/pw4gww.f90)
qe_add_executable(qe_gww_pw4gww_exe ${src_pw4gww_pw4gww_x})
set_target_properties(qe_gww_pw4gww_exe
PROPERTIES
OUTPUT_NAME pw4gww.x)
target_link_libraries(qe_gww_pw4gww_exe
PRIVATE
qe_gww_pw4gww
qe_phonon_gamma
qe_fftx
qe_xclib
qe_pw
qe_modules)
###########################################################
qe_install_targets(
@ -378,6 +415,7 @@ qe_install_targets(
qe_gww_simplebse
qe_gww_simpleip
# Executables
qe_gww_pw4gww_exe
qe_gww_util_grap_exe
qe_gww_util_abcoefftoeps_exe
qe_gww_util_memorypw4gww_exe
@ -385,5 +423,6 @@ qe_install_targets(
qe_gww_gww_exe
qe_gww_gww_fit_exe
qe_gww_head_exe
qe_gww_simple_exe
qe_gww_simple_bse_exe
qe_gww_simple_ip_exe)

View File

@ -45,10 +45,8 @@ qpcorrections.o
#zvscal_test.o\
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
../../upflib/libupf.a ../../KS_Solvers/libks_solvers.a \
../../LAXlib/libqela.a ../../UtilXlib/libutil.a ../../XClib/xc_lib.a
PWOBJS = ../../PW/src/libpw.a
QEMODS = $(BASEMODS)
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a
PW4GWWOBJ = ../pw4gww/fft_custom.o ../pw4gww/stop_pp.o ../pw4gww/mp_wave_parallel.o
GWWOBJ = ../gww/libgww.a ../minpack/minpacklib.a
@ -58,7 +56,7 @@ all : tldeps bse_main.x
bse_main.x : bse_main.o libbse.a $(BSEOBJS) $(PWOBJS) $(QEMODS) $(GWWOBJ)
$(LD) $(LDFLAGS) -o $@ \
bse_main.o libbse.a $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS) $(PW4GWWOBJ) $(GWWOBJ)
bse_main.o libbse.a $(PWOBJS) $(QEMODS) $(QELIBS) $(PW4GWWOBJ) $(GWWOBJ)
- ( cd ../../bin ; ln -fs ../GWW/bse/$@ . )
tldeps:

View File

@ -43,9 +43,7 @@ times_gw.o \
vcprim.o
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
../../LAXlib/libqela.a ../../UtilXlib/libutil.a
QEMODS = ../../KS_Solvers/libks_solvers.a $(BASEMODS)
LIBMIN= ../minpack/minpacklib.a
@ -53,14 +51,14 @@ TLDEPS=phlibs
all : tldeps gww.x gww_fit.x libgww.a
gww.x : gww.o $(GWWOBJS) $(LIBOBJS) $(QEMODS) $(LIBMIN)
gww.x : gww.o $(GWWOBJS) $(QEMODS) $(LIBMIN)
$(MPIF90) $(LDFLAGS) -o $@ \
gww.o $(GWWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
gww.o $(GWWOBJS) $(QEMODS) $(LIBMIN) $(QELIBS)
- ( cd ../../bin; ln -fs ../GWW/gww/$@ . )
gww_fit.x : gww_fit.o $(GWWOBJS) $(LIBOBJS) $(QEMODS) $(LIBMIN)
gww_fit.x : gww_fit.o $(GWWOBJS) $(QEMODS) $(LIBMIN)
$(MPIF90) $(LDFLAGS) -o $@ \
gww_fit.o $(GWWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
gww_fit.o $(GWWOBJS) $(QEMODS) $(LIBMIN) $(QELIBS)
- ( cd ../../bin; ln -fs ../GWW/gww/$@ . )
libgww.a : $(GWWOBJS)

View File

@ -376,8 +376,8 @@ subroutine do_polarization_lanczos(tf,options,ispin)
call read_compact_q_lanczos(cql, 1)!just for obtaining numpw,numt...poor man solution
allocate(cql_save(1,1,1))
endif
allocate(vtl_save(1,1,1))
allocate(ttl_save(1,1,1))
else
!put all matrices vtl and ttl in memory, distributed according to valence state
@ -581,9 +581,9 @@ subroutine do_polarization_lanczos(tf,options,ispin)
if(options%l_t_wannier) then
deallocate(cql_save)
call free_memory_compact_q_lanczos(cql)
else
deallocate(vtl_save,ttl_save)
endif
deallocate(vtl_save,ttl_save)
! deallocate(e_mat)
deallocate(af,occ)

View File

@ -19,11 +19,10 @@ openfilq.o \
phq_readin.o \
solve_head.o
QEMODS = ../../Modules/libqemod.a ../../upflib/libupf.a \
../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
../../FFTXlib/libqefft.a ../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
../../dft-d3/libdftd3qe.a
LIBPWPH = ../../PHonon/PH/libph.a ../..//LR_Modules/liblrmod.a ../../PW/src/libpw.a
QEMODS = $(BASEMODS)
LIBPWPH = ../../PHonon/PH/libph.a ../../LR_Modules/liblrmod.a \
../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
LIBMIN= ../minpack/minpacklib.a
@ -31,9 +30,9 @@ TLDEPS= phlibs
all : tldeps head.x
head.x : head.o $(LIBOBJS) $(HEADOBJS) $(LIBPWPH) $(LIBMIN)
head.x : head.o $(HEADOBJS) $(LIBPWPH) $(LIBMIN)
$(LD) $(LDFLAGS) -o head.x head.o \
$(HEADOBJS) $(LIBPWPH) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
$(HEADOBJS) $(LIBPWPH) $(QEMODS) $(LIBMIN) $(QELIBS)
- ( cd ../../bin ; ln -fs ../GWW/head/head.x . )
tldeps :

View File

@ -36,7 +36,8 @@ subroutine bcast_ph_input ( )
USE io_global, ONLY : ionode_id
USE run_info, ONLY : title
USE wannier_gw, ONLY : l_head, omega_gauss, n_gauss, grid_type, nsteps_lanczos,&
&second_grid_n,second_grid_i,l_scissor,scissor,len_head_block_freq,len_head_block_wfc
&second_grid_n,second_grid_i,l_scissor,scissor,len_head_block_freq,&
l_easy,len_head_block_wfc
USE control_lr, ONLY : lgamma, lrpa
@ -106,6 +107,7 @@ subroutine bcast_ph_input ( )
call mp_bcast(scissor, ionode_id, world_comm)
call mp_bcast(len_head_block_freq, ionode_id, world_comm)
call mp_bcast(len_head_block_wfc, ionode_id,world_comm)
call mp_bcast(l_easy , ionode_id,world_comm)
#endif
return

View File

@ -18,9 +18,9 @@ SUBROUTINE close_phq( flag )
!
USE io_global, ONLY : ionode, stdout
USE uspp, ONLY : okvan
USE units_ph, ONLY : iudwf, iubar, iudrhous, iuebar, iudrho, &
USE units_ph, ONLY : iubar, iudrhous, iuebar, iudrho, &
iudvscf, iucom, iudvkb3
USE units_lr, ONLY : iuwfc
USE units_lr, ONLY : iuwfc, iudwf
USE control_ph, ONLY : zue, epsil
USE recover_mod, ONLY : clean_recover
USE output, ONLY : fildrho, fildvscf

View File

@ -149,13 +149,19 @@ PROGRAM head
!
! ... cleanup of the variables for the next q point
!
write(stdout,*) 'DEBUG 1'
CALL clean_pw_ph(iq)
write(stdout,*) 'DEBUG 2'
!
END DO
CALL ph_writefile('init',0,0,ierr)
write(stdout,*) 'DEBUG 3'
CALL ph_writefile('init',0,0,ierr)
write(stdout,*) 'DEBUG 4'
CALL collect_grid_files()
write(stdout,*) 'DEBUG 5'
CALL destroy_status_run()
write(stdout,*) 'DEBUG 6'
!
IF (bands_computed) CALL print_clock_pw()
!

View File

@ -28,6 +28,7 @@ subroutine lanczos_state_k(ik,nstates, nsteps,in_states,d,f,omat,dpsi_ipol, t_ou
USE uspp, ONLY : vkb, nkb, okvan
USE klist, ONLY : xk,igk_k, ngk
USE noncollin_module, ONLY : noncolin, npol
USE wvfct, ONLY : current_k
implicit none
@ -59,7 +60,9 @@ subroutine lanczos_state_k(ik,nstates, nsteps,in_states,d,f,omat,dpsi_ipol, t_ou
allocate(alpha(nstates),beta(nstates),gamma(nstates),n_1(nstates),delta(nstates))
allocate(c(nstates))
allocate(spsi(npwx,nstates))
current_k=ik
npw = ngk(ik)
t_out(:,:,:)=(0.d0,0.d0)

View File

@ -93,20 +93,28 @@ solve_head.o : ../../LR_Modules/lrcom.o
solve_head.o : ../../Modules/becmod.o
solve_head.o : ../../Modules/cell_base.o
solve_head.o : ../../Modules/constants.o
solve_head.o : ../../Modules/control_flags.o
solve_head.o : ../../Modules/fft_base.o
solve_head.o : ../../Modules/io_base.o
solve_head.o : ../../Modules/io_files.o
solve_head.o : ../../Modules/io_global.o
solve_head.o : ../../Modules/ions_base.o
solve_head.o : ../../Modules/kind.o
solve_head.o : ../../Modules/mp_bands.o
solve_head.o : ../../Modules/mp_images.o
solve_head.o : ../../Modules/mp_pools.o
solve_head.o : ../../Modules/mp_wave.o
solve_head.o : ../../Modules/mp_world.o
solve_head.o : ../../Modules/noncol.o
solve_head.o : ../../Modules/recvec.o
solve_head.o : ../../Modules/wannier_gw.o
solve_head.o : ../../Modules/wavefunctions.o
solve_head.o : ../../PHonon/PH/phcom.o
solve_head.o : ../../PW/src/buffers.o
solve_head.o : ../../PW/src/ldaU.o
solve_head.o : ../../PW/src/pwcom.o
solve_head.o : ../../PW/src/scf_mod.o
solve_head.o : ../../PW/src/symme.o
solve_head.o : ../../UtilXlib/mp.o
solve_head.o : ../../upflib/paw_variables.o
solve_head.o : ../../upflib/uspp.o

View File

@ -17,11 +17,11 @@ SUBROUTINE openfilq()
!
USE kinds, ONLY : DP
USE control_flags, ONLY : io_level, modenum
USE units_ph, ONLY : iudwf, iubar, iucom, iudvkb3, &
USE units_ph, ONLY : iubar, iucom, iudvkb3, &
iudrhous, iuebar, iudrho, iudyn, iudvscf, &
lrdwf, lrbar, lrcom, lrdvkb3, &
lrbar, lrcom, lrdvkb3, &
lrdrhous, lrebar, lrdrho
USE units_lr, ONLY : iuwfc, lrwfc
USE units_lr, ONLY : iuwfc, lrwfc, iudwf, lrdwf
! USE io_files, ONLY : tmp_dir
USE control_ph, ONLY : epsil, zue, ext_recover, trans, &
tmp_dir_ph, start_irr, last_irr

View File

@ -79,7 +79,7 @@ SUBROUTINE phq_readin()
USE ahc, ONLY : elph_ahc, ahc_dir, ahc_nbnd, ahc_nbndskip, &
skip_upperfan
USE wannier_gw, ONLY : l_head, omega_gauss, n_gauss, grid_type, nsteps_lanczos,second_grid_n,second_grid_i,&
&l_scissor,scissor, len_head_block_freq, len_head_block_wfc
&l_scissor,scissor, len_head_block_freq, len_head_block_wfc, l_easy
!
IMPLICIT NONE
!
@ -128,7 +128,7 @@ SUBROUTINE phq_readin()
wpot_dir, ahc_dir, ahc_nbnd, ahc_nbndskip, &
skip_upperfan, &
l_head, omega_gauss, n_gauss, grid_type,nsteps_lanczos,l_scissor,scissor,&
second_grid_n,second_grid_i,len_head_block_wfc,len_head_block_freq
second_grid_n,second_grid_i,len_head_block_wfc,len_head_block_freq, l_easy
! tr2_ph : convergence threshold
! amass : atomic masses
@ -348,6 +348,8 @@ SUBROUTINE phq_readin()
d2ns_type = 'full'
len_head_block_freq=0
len_head_block_wfc=0
l_easy=.false.
!
! ... reading the namelist inputph
!

View File

@ -1,5 +1,5 @@
!
! Copyright (C) 2001-2013 Quantum ESPRESSO group
! Copyright (C) 2001-2021 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
@ -27,7 +27,7 @@ subroutine solve_head
USE wannier_gw, ONLY : n_gauss, omega_gauss, grid_type,&
nsteps_lanczos,second_grid_n,second_grid_i,&
l_scissor,scissor,len_head_block_freq, &
len_head_block_wfc
len_head_block_wfc, l_easy
USE control_ph, ONLY : tr2_ph
USE gvect, ONLY : ngm, ngm_g, ig_l2g, gstart, g
USE gvecs, ONLY : doublegrid
@ -45,6 +45,7 @@ subroutine solve_head
use qpoint, ONLY : npwq, nksq
use control_lr, ONLY : nbnd_occ, lgamma
use SCF
implicit none
@ -92,6 +93,7 @@ subroutine solve_head
COMPLEX(kind=DP), ALLOCATABLE :: z_dl(:),z_d(:),z_du(:),z_b(:)
COMPLEX(kind=DP) :: csca, csca1
COMPLEX(kind=DP), ALLOCATABLE :: t_out(:,:,:), psi_tmp(:)
TYPE(scf_type) :: wing
INTEGER :: n
INTEGER :: npwx_g
@ -144,6 +146,9 @@ subroutine solve_head
do i=1,n_gauss
freqs(1+i)=omega_gauss*dble(i)/dble(n_gauss)
enddo
if(l_easy) then
freqs(1)=omega_gauss/dble(n_gauss)/4.d0
endif
else if(grid_type==4) then!equally spaced grid shifted of 1/2
freqs(1) = 0.d0
do i=1,n_gauss
@ -471,18 +476,19 @@ subroutine solve_head
#else
call syme (pola_charge(:,:,:,i))
#endif
call create_scf_type ( wing, .true. )
do ipol=1,3
CALL fwfft ('Rho', pola_charge(1:dfftp%nnr,1,ipol,i), dfftp)
tmp_g(:)=(0.d0,0.d0)
tmp_g(gstart:ngm)=pola_charge(dfftp%nl(gstart:ngm),1,ipol,i)
wing%of_g(1:ngm,1)=-4.d0*tmp_g(1:ngm)
call write_wing ( wing, nspin,ipol,i)
!loop on frequency
do ig=gstart,ngm
e_head_pol(ig,i,ipol)=-4.d0*tmp_g(ig)
enddo
enddo
call destroy_scf_type (wing )
enddo
@ -568,9 +574,74 @@ subroutine solve_head
call mp_barrier( world_comm )
write(stdout,*) 'ATT2'
write(stdout,*) 'THIS IS THE END'
call stop_clock ('solve_head')
return
end subroutine solve_head
SUBROUTINE write_wing ( rho, nspin,ipol,iw)
USE kinds, ONLY : DP
USE io_files, ONLY : create_directory
USE io_base, ONLY : write_rhog, read_rhog
!
USE paw_variables, ONLY : okpaw
USE ldaU, ONLY : lda_plus_u
USE noncollin_module, ONLY : noncolin
USE spin_orb, ONLY : domag
USE scf, ONLY : scf_type
!
USE cell_base, ONLY : bg, tpiba
USE gvect, ONLY : ig_l2g, mill
USE control_flags, ONLY : gamma_only
USE io_files, ONLY : seqopn, tmp_dir, prefix, postfix
USE io_global, ONLY : ionode, ionode_id, stdout
USE mp_pools, ONLY : my_pool_id
USE mp_bands, ONLY : my_bgrp_id, root_bgrp_id, &
root_bgrp, intra_bgrp_comm
USE mp_images, ONLY : intra_image_comm
USE mp, ONLY : mp_bcast
!
IMPLICIT NONE
TYPE(scf_type), INTENT(IN) :: rho
INTEGER, INTENT(IN) :: nspin
INTEGER, INTENT(IN) :: ipol!direction
INTEGER, INTENT(IN) :: iw !frequency
!
CHARACTER (LEN=256) :: dirname
LOGICAL :: lexist
INTEGER :: nspin_, iunocc, iunpaw, ierr
INTEGER, EXTERNAL :: find_free_unit
CHARACTER(5) :: nfile
CHARACTER :: npol
write(nfile,'(5i1)') &
& iw/10000,mod(iw,10000)/1000,mod(iw,1000)/100,mod(iw,100)/10,mod(iw,10)
write(npol,'(1i1)') ipol
dirname = TRIM(tmp_dir) // TRIM(prefix) // postfix
CALL create_directory( dirname )
! in the following case do not read or write polarization
IF ( noncolin .AND. .NOT.domag ) THEN
nspin_ = 1
ELSE
nspin_ = nspin
ENDIF
! Write G-space density
IF ( my_pool_id == 0 .AND. my_bgrp_id == root_bgrp_id ) &
CALL write_rhog( TRIM(dirname) // "wing_" // npol // "_" //nfile, &
root_bgrp, intra_bgrp_comm, &
bg(:,1)*tpiba, bg(:,2)*tpiba, bg(:,3)*tpiba, &
gamma_only, mill, ig_l2g, rho%of_g(:,1:nspin_) )
RETURN
END SUBROUTINE write_wing
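As an aside, the digit-by-digit '(5i1)' construction of nfile above produces the same zero-padded string as a single i5.5 edit descriptor for 0 <= iw <= 99999; a tiny sketch:

PROGRAM suffix_demo
  IMPLICIT NONE
  CHARACTER(5) :: nfile
  INTEGER :: iw
  iw = 42
  WRITE(nfile,'(i5.5)') iw      ! gives '00042', same as the mod/div chain
  PRINT *, 'wing_1_'//nfile
END PROGRAM suffix_demo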

View File

@ -47,25 +47,22 @@ hpsi_pw4gww.o \
cgsolve_all_gamma.o \
realus.o \
operator_1_vp.o \
operator_debug.o
operator_debug.o \
lanczos_chains.o \
convergence.o \
easy_gw.o
QEMODS = ../../Modules/libqemod.a ../../upflib/libupf.a \
../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
../../FFTXlib/libqefft.a ../../LAXlib/libqela.a \
../../UtilXlib/libutil.a ../../dft-d3/libdftd3qe.a
# dft-d3 required by xlf for obscure reasons
PWOBJS = ../../PW/src/libpw.a
PHOBJS = ../../PHonon/Gamma/libphcg.a
LIBMIN=
QEMODS = $(BASEMODS)
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
PHOBJS = ../../PHonon/Gamma/libphcg.a ../../LR_Modules/liblrmod.a
TLDEPS= phlibs
all : tldeps pw4gww.x
pw4gww.x : pw4gww.o libpw4gww.a $(PW4GWWOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN)
pw4gww.x : pw4gww.o libpw4gww.a $(PW4GWWOBJS) $(PWOBJS) $(QEMODS)
$(LD) $(LDFLAGS) -o $@ \
pw4gww.o libpw4gww.a $(PWOBJS) $(PHOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
pw4gww.o libpw4gww.a $(PHOBJS) $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../bin ; ln -fs ../GWW/pw4gww/$@ . )
tldeps :

2528
GWW/pw4gww/convergence.f90 Normal file

File diff suppressed because it is too large

270
GWW/pw4gww/easy_gw.f90 Normal file
View File

@ -0,0 +1,270 @@
!this subroutine performs a GW calculation with the EASY strategy
!for states from s_first_state to s_last_state
SUBROUTINE easy_gw
USE wannier_gw
USE wvfct, ONLY : npw
USE io_global, ONLY : ionode, stdout
USE mp, ONLY : mp_barrier, mp_bcast,mp_sum
USE mp_world, ONLY : world_comm, mpime
USE io_files, ONLY : prefix, tmp_dir, nwordwfc,iunwfc
USE fft_base, ONLY : dffts
USE convergence_gw
USE wvfct, ONLY : nbnd, et, npwx
USE wavefunctions, ONLY : evc
USE constants, ONLY : rytoev
USE lsda_mod, ONLY : lsda, nspin,current_spin,isk
USE io_files, ONLY : create_directory
IMPLICIT NONE
REAL(kind=DP), ALLOCATABLE :: v_states(:,:,:)!valence states in real space
INTEGER :: ii,iw,jj,kk
TYPE(convergence_tests) :: ct!all data for testing convergence of basis sets
TYPE(self_energy) :: se!for putting all the calculated data
COMPLEX(kind=DP), ALLOCATABLE :: freq(:)
REAL(kind=DP), ALLOCATABLE :: e_xc(:,:),e_h(:,:),e_x(:,:)
COMPLEX(kind=DP), ALLOCATABLE :: ks_wfcs(:,:,:)
INTEGER, EXTERNAL :: find_free_unit
INTEGER :: iun
INTEGER :: ix,iy,iz,ir,im
INTEGER :: passed
INTEGER :: nr_counter
INTEGER :: ix_start,iy_start,iz_start
REAL(kind=DP) :: disterr,mindisterr,disterrmin
INTEGER :: is
INTEGER :: i_restart,i_res
INTEGER :: n_total, i_first, i_last, n_local, i_stop
CHARACTER(4) :: nfilex,nfiley,nfilez,nfile_orb
CHARACTER(5) :: nfilel2
LOGICAL :: lex, l_old_restart
INTERFACE
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin, v_states )
USE kinds, ONLY : DP
USE fft_base, ONLY : dffts
USE lsda_mod, ONLY : nspin
INTEGER :: lda, n, m
COMPLEX(kind=DP) :: psi(lda,m)
REAL(kind=DP) :: e_xc(m), e_h(m)
INTEGER, INTENT(in) :: ispin !spin 1,2
REAL(kind=DP), OPTIONAL :: v_states(dffts%nnr,m, nspin)
END SUBROUTINE energies_xc
END INTERFACE
call start_clock('easy_gw')
allocate(v_states(dffts%nnr,num_nbnds,nspin))
if(.not.l_truncated_coulomb) call calculate_vg0()
if(nspin==2) then
CALL davcio(evc,2*nwordwfc,iunwfc,2,-1)
call evc_to_real(num_nbnds, v_states(1,1,2))
CALL davcio(evc,2*nwordwfc,iunwfc,1,-1)
endif
call evc_to_real(num_nbnds, v_states(1,1,1))
allocate(e_xc(nbnd,nspin),e_h(nbnd,nspin),e_x(nbnd,nspin))
allocate(ks_wfcs(npw,nbnd,nspin))
do is=1,nspin
IF (lsda) current_spin = isk(is)
if(nspin/=1) CALL davcio(evc,2*nwordwfc,iunwfc,is,-1)!read wfcs for spin channel is
call energies_xc( npwx, npw, nbnd, evc, e_xc(:,is),e_h(:,is),is ,v_states)
ks_wfcs(1:npw,1:nbnd,is)=evc(1:npw,1:nbnd)
enddo
CALL dft_exchange(num_nbndv,nbnd,nset,e_x,ks_wfcs)
allocate(freq(n_gauss))
do iw=1,n_gauss
freq(iw)=(0.d0,1.d0)*(omega_gauss/dble(n_gauss)*(dble(iw)) -omega_gauss)
if(abs(aimag(freq(iw))) < 1d-10) freq(iw)=0.d0
enddo
call initialize_memory(se)
call set_se_energies(se, et,e_xc,e_x)
nr_counter=1
if(easy_grid_type==0) then
do is=s_first_spin,s_last_spin
do ii=s_first_state,s_last_state
call start_convergence(ct,ii,is,v_states,.true.,0,0,0,n_gauss,freq,ks_wfcs)
call calculate_convergence(ct,v_states,se,nr_counter)
call free_memory(ct)
enddo
enddo
elseif(easy_grid_type==1) then
!create directories
do is=1,nspin
do ii=s_first_state,s_last_state
write(nfile_orb,'(4i1)') &
& ii/1000,mod(ii,1000)/100,mod(ii,100)/10, mod(ii,10)
if(is==1) then
call create_directory(trim(prefix)//'-gwl_orbital_1_'//nfile_orb)
else
call create_directory(trim(prefix)//'-gwl_orbital_2_'//nfile_orb)
endif
enddo
enddo
!calculate total number of points
ix_start=1+easy_grid_param(1)
iy_start=1+easy_grid_param(2)
iz_start=1+easy_grid_param(3)
n_total=0
do iz=iz_start,dffts%nr3,easy_grid_param(4)
do iy=iy_start,dffts%nr2,easy_grid_param(4)
do ix=ix_start,dffts%nr1,easy_grid_param(4)
passed=0
if(ix<easy_grid_param(5) .and. iy<easy_grid_param(5) .and. iz<easy_grid_param(5)) then
if( iz>dffts%nr3p_offset(mpime+1) .and. iz <=( dffts%nr3p_offset(mpime+1)+dffts%my_nr3p)) then
ii=(iz-dffts%nr3p_offset(mpime+1)-1)*dffts%nr2*dffts%nr1+&
(iy-1)*dffts%nr1+ix
do is=s_first_spin,s_last_spin
do jj=s_first_state,s_last_state
if(abs(v_states(ii,jj,is))> easy_psi_thrs) passed=1
enddo
enddo
endif
endif
call mp_sum(passed,world_comm)
if(passed>0) then
n_total=n_total+1
endif
enddo
enddo
enddo
write(stdout,*) 'TOTAL NUMBER OF POINTS:', n_total
n_local=n_total/easy_split_calc_n
if(n_local*easy_split_calc_n< n_total) n_local=n_local+1
i_first=(easy_split_calc_i-1)*n_local+1
i_last=i_first+n_local-1
if(i_last>n_total) i_last=n_total
if(restart_gww>10000) then
l_old_restart=.false.
restart_gww=0
else
l_old_restart=.true.
endif
if(restart_gww>1) then
i_restart=restart_gww+i_first
write(stdout,*) 'RESTARTING FROM POINT:', i_restart
else
i_restart=i_first
endif
i_res=0
i_stop=i_last
write(stdout,*) 'DOING POINTS RANGE', i_restart,i_stop
ix_start=1+easy_grid_param(1)
iy_start=1+easy_grid_param(2)
iz_start=1+easy_grid_param(3)
do iz=iz_start,dffts%nr3,easy_grid_param(4)
do iy=iy_start,dffts%nr2,easy_grid_param(4)
do ix=ix_start,dffts%nr1,easy_grid_param(4)
write(stdout,*) 'COORDINATES:',ix,iy,iz
passed=0
!write(stdout,*) 'OFFSET', dffts%nr3p_offset(mpime+1),mpime
!NOTE OFFSET and MPIME start from 0
if(ix<easy_grid_param(5) .and. iy<easy_grid_param(5) .and. iz<easy_grid_param(5)) then
if( iz>dffts%nr3p_offset(mpime+1) .and. iz <=( dffts%nr3p_offset(mpime+1)+dffts%my_nr3p)) then
ii=(iz-dffts%nr3p_offset(mpime+1)-1)*dffts%nr2*dffts%nr1+&
(iy-1)*dffts%nr1+ix
do is=s_first_spin,s_last_spin
do jj=s_first_state,s_last_state
if(abs(v_states(ii,jj,is))> easy_psi_thrs) passed=1
enddo
enddo
end if
endif
call mp_sum(passed,world_comm)
write(stdout,*) 'PASSED', passed
if(passed>0) then
i_res=i_res+1
if(l_old_restart) then
if(i_res>=i_restart) then
call start_convergence(ct,0,0,v_states,.false.,ix,iy,iz,n_gauss,freq,ks_wfcs)
call calculate_convergence(ct,v_states,se,nr_counter)
call free_memory(ct)
endif
else
kk=s_first_state
write(nfilel2,'(5i1)') &
& kk/10000,mod(kk,10000)/1000,mod(kk,1000)/100,mod(kk,100)/10,mod(kk,10)
write(nfilex,'(4i1)') &
& ix/1000,mod(ix,1000)/100,mod(ix,100)/10, mod(ix,10)
write(nfiley,'(4i1)') &
& iy/1000,mod(iy,1000)/100,mod(iy,100)/10, mod(iy,10)
write(nfilez,'(4i1)') &
& iz/1000,mod(iz,1000)/100,mod(iz,100)/10, mod(iz,10)
!call create_directory(trim(prefix)//'-gwl_orbital_1_'//nfile_orb)
write(nfile_orb,'(4i1)') &
& kk/1000,mod(kk,1000)/100,mod(kk,100)/10, mod(kk,10)
inquire( file=trim(prefix)//'-gwl_orbital_1_'//nfile_orb//'/im_on_im_r_'//nfilex//'_'//nfiley&
&//'_'//nfilez//'__'//nfilel2,exist=lex)
! inquire( file=trim(prefix)//'-'//'im_on_im_r_'//nfilex//'_'//nfiley&
! &//'_'//nfilez//'__'//nfilel2,exist=lex)
if(.not.lex) then
call start_convergence(ct,0,0,v_states,.false.,ix,iy,iz,n_gauss,freq,ks_wfcs)
call calculate_convergence(ct,v_states,se,nr_counter)
call free_memory(ct)
endif
endif
if(i_res==i_stop) then
exit
end if
endif
enddo
if(i_res==i_stop) then
exit
end if
enddo
if(i_res==i_stop) then
exit
end if
enddo
end if
!if(easy_grid_type==1.or.easy_grid_type==2) call average_self_energy(se)
!if(easy_grid_type==0 .and. l_whole_s) call solve_off_diagonal(se)
if(ionode .and. (easy_grid_type==0 .or. i_restart==1)) then
iun = find_free_unit()
open( unit=iun, file=trim(prefix)//'-'//'bands.dat', status='unknown')
do is=1,nspin
write(iun,*) num_nbndv(is)
do ii=1,num_nbnds
write(iun,*) ii,et(ii,is)*rytoev,0.d0,0.d0,(et(ii,is)-e_xc(ii,is)+e_x(ii,is))*rytoev
enddo
enddo
close(iun)
endif
call stop_clock('easy_gw')
call print_clock('easy_gw')
call print_clock('gzero_complex')
call print_clock('vpv_complex')
call print_clock('vpv_lanczos')
call print_clock('fft')
call print_clock('ffts')
call print_clock('fftw')
call free_memory(se)
deallocate(e_xc,e_h,e_x)
deallocate(v_states,ks_wfcs)
return
END SUBROUTINE easy_gw
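The linear index used twice above maps a global grid point (ix,iy,iz) to its position in the local real-space slab: only the task whose z-planes contain iz computes it, with x running fastest. A small self-contained sketch of that mapping (the grid sizes and offset below are made-up numbers):

PROGRAM slab_index_demo
  IMPLICIT NONE
  INTEGER :: nr1, nr2, iz_offset
  nr1 = 45; nr2 = 45; iz_offset = 10          ! assumed local-slab layout
  PRINT *, slab_index(3, 2, 12, nr1, nr2, iz_offset)
CONTAINS
  PURE INTEGER FUNCTION slab_index(ix, iy, iz, nr1, nr2, iz_offset)
    INTEGER, INTENT(IN) :: ix, iy, iz         ! global grid coordinates, 1-based
    INTEGER, INTENT(IN) :: nr1, nr2           ! grid extent along x and y
    INTEGER, INTENT(IN) :: iz_offset          ! planes owned by lower-ranked tasks
    slab_index = (iz - iz_offset - 1)*nr2*nr1 + (iy - 1)*nr1 + ix
  END FUNCTION slab_index
END PROGRAM slab_index_demo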

View File

@ -9,7 +9,7 @@
!
!----------------------------------------------------------------------------
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin, v_states )
!----------------------------------------------------------------------------
!
! computes the expectation values of the exchange and correlation potential
@ -45,6 +45,7 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
USE exx, ONLY : vexx !Suriano
USE xc_lib, ONLY : exx_is_active, xclib_dft_is
USE klist, ONLY : igk_k
!
IMPLICIT NONE
@ -56,6 +57,7 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
COMPLEX(DP) :: psi(lda,m)
REAL(kind=DP) :: e_xc(m), e_h(m)
INTEGER, INTENT(in) :: ispin !spin 1,2
REAL(kind=DP), OPTIONAL :: v_states(dffts%nnr,m, nspin)
REAL(kind=DP), ALLOCATABLE :: vr(:,:)
!
@ -347,8 +349,9 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
allocate(psi_r(dfftp%nnr),psi_rs(dfftp%nnr))
iunwfcreal=find_free_unit()
CALL diropn( iunwfcreal, 'real_whole', dffts%nnr, exst )
if(.not. present(v_states)) then
CALL diropn( iunwfcreal, 'real_whole', dffts%nnr, exst )
endif
!calculate xc potential on fine grid
@ -404,16 +407,19 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
do ibnd=1,m!loop on states
!read from disk wfc on coarse grid
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
if(doublegrid) then
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
else
psi_r(:)=psi_rs(:)
endif
do ir=1,dfftp%nnr
psi_r(ir)=psi_r(ir)**2.d0
enddo
if(.not. present(v_states)) then
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
if(doublegrid) then
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
else
psi_r(:)=psi_rs(:)
endif
else
psi_r(1:dffts%nnr)=v_states(1:dffts%nnr,ibnd,ispin)
endif
do ir=1,dfftp%nnr
psi_r(ir)=psi_r(ir)**2.d0
enddo
!if(okvan) call adduspos_gamma_r(ibnd,ibnd,psi_r,1,becp_gw(:,ibnd),becp_gw(:,ibnd))
@ -454,11 +460,15 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
do ibnd=1,m!loop on states
!read from disk wfc on coarse grid
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
if(doublegrid) then
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
if(.not. present(v_states)) then
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
if(doublegrid) then
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
else
psi_r(:)=psi_rs(:)
endif
else
psi_r(:)=psi_rs(:)
psi_r(1:dffts%nnr)=v_states(1:dffts%nnr,ibnd,ispin)
endif
do ir=1,dfftp%nnr
@ -487,7 +497,7 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
deallocate(psi_r,psi_rs)
deallocate(exact_x)
close(iunwfcreal)
if(.not. present(v_states) ) close(iunwfcreal)
deallocate(e_hub)
if(l_whole_s) then
!NOT_TO_BE_INCLUDED_START
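The change above turns v_states into an OPTIONAL dummy argument: when the caller provides the real-space states the file read is skipped, otherwise the old davcio path is used. A minimal sketch of that optional-argument pattern (the names and the zero-fill fallback are placeholders for the actual file read):

PROGRAM optional_demo
  IMPLICIT NONE
  INTEGER, PARAMETER :: dp = KIND(1.d0)
  REAL(dp) :: states(8, 2), psi_r(8)
  states = 1.0_dp
  CALL load_orbital(psi_r, 8, 2, states)   ! in-memory path
  CALL load_orbital(psi_r, 8, 2)           ! fallback path
CONTAINS
  SUBROUTINE load_orbital(psi_r, nnr, ibnd, v_states)
    INTEGER,  INTENT(IN)           :: nnr, ibnd
    REAL(dp), INTENT(OUT)          :: psi_r(nnr)
    REAL(dp), INTENT(IN), OPTIONAL :: v_states(:, :)
    IF (PRESENT(v_states)) THEN
       psi_r(1:nnr) = v_states(1:nnr, ibnd)  ! states already in memory
    ELSE
       psi_r(:) = 0.0_dp                     ! stand-in for the davcio read in the real code
    END IF
  END SUBROUTINE load_orbital
END PROGRAM optional_demo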

View File

@ -75,10 +75,141 @@ subroutine cg_psi_pw4gww (lda, n, m, psi, h_diag)
!
do k = 1, m
do i = 1, n
psi (i, k) = psi (i, k) * 1.d0/(1.d0+h_diag (i, k))!-et(k,1))
!psi (i, k) = psi (i, k) * 1.d0/(h_diag (i, k)-100.d0)
psi (i, k) = psi (i, k) * 1.d0/(1.d0+h_diag (i, k))
enddo
enddo
return
end subroutine cg_psi_pw4gww
subroutine hpsi_pw4gww2( ndim,psi,ppsi,et,ik,numv)
USE kinds, ONLY : DP
USE wvfct, ONLY : npwx, npw, nbnd
USE mp, ONLY : mp_sum, mp_barrier, mp_bcast
USE mp_world, ONLY : mpime, nproc
implicit none
INTEGER, INTENT(in) :: ndim !leading dimension of psi and ppsi
INTEGER, INTENT(in) :: numv!number of bands
INTEGER, INTENT(in) :: ik !dummy integer
COMPLEX(kind=DP), INTENT(in) :: psi(ndim,numv)
COMPLEX(kind=DP), INTENT(out) :: ppsi(ndim,numv)
REAL(kind=DP) :: et(numv)
INTEGER :: iv
!apply h_psi
call h_psi( ndim, npw, numv, psi, ppsi )
do iv=1,numv
ppsi(1:npw,iv)=ppsi(1:npw,iv)-et(iv)*psi(1:npw,iv)
enddo
return
end subroutine hpsi_pw4gww2
subroutine hpsi_square( ndim,psi,ppsi,et,ik,numv)
USE kinds, ONLY : DP
USE wvfct, ONLY : npwx, npw, nbnd
USE mp, ONLY : mp_sum, mp_barrier, mp_bcast
USE mp_world, ONLY : mpime, nproc
implicit none
INTEGER, INTENT(in) :: ndim !leading dimension of psi and ppsi
INTEGER, INTENT(in) :: numv!number of bands
INTEGER, INTENT(in) :: ik !dummy integer
REAL(kind=DP),SAVE :: freq_real=0.d0,freq_im=0.d0
LOGICAL, SAVE :: lproj=.false. !if true, project onto the empty manifold
COMPLEX(kind=DP), INTENT(in) :: psi(ndim,numv)
COMPLEX(kind=DP), INTENT(out) :: ppsi(ndim,numv)
REAL(kind=DP) :: et(numv)
INTEGER :: iv
COMPLEX(kind=DP),ALLOCATABLE :: psi1(:,:)
!apply h_psi
if(ik==-1) then
freq_real=et(1)
freq_im=et(2)
elseif(ik==-2) then
lproj=.true.
elseif(ik==-3) then
lproj=.false.
else
if(lproj) then
do iv=1,numv
call pc_operator(psi(1,iv),1,.false.)
enddo
endif
allocate(psi1(ndim,numv))
call h_psi( ndim, npw, numv, psi, psi1 )
do iv=1,numv
psi1(1:npw,iv)=psi1(1:npw,iv)-(freq_real+et(iv))*psi(1:npw,iv)
enddo
call h_psi( ndim, npw, numv, psi1, ppsi )
do iv=1,numv
ppsi(1:npw,iv)=ppsi(1:npw,iv)-(freq_real+et(iv))*psi1(1:npw,iv)
enddo
do iv=1,numv
ppsi(1:npw,iv)=ppsi(1:npw,iv)+(freq_im**2.d0)*psi(1:npw,iv)
enddo
if(lproj)then
do iv=1,numv
call pc_operator(ppsi(1,iv),1,.false.)
enddo
endif
deallocate(psi1)
endif
return
end subroutine hpsi_square
!-----------------------------------------------------------------
subroutine cg_psi_pw4gww_square (lda, n, m, psi, h_diag)
!-----------------------------------------------------------------
!
! This routine applies a preconditioner to the linear-system solver.
! The preconditioner is diagonal in reciprocal space
!
USE kinds, only : DP
USE wvfct, ONLY : et
implicit none
integer :: lda, n, m
! input: the leading dimension of the psi vector
! input: the real dimension of the vector
! input: the number of vectors
complex(DP) :: psi (lda, m)
! inp/out: the vector to be preconditioned
real(DP) :: h_diag (lda, m)
! input: the preconditioning vector
integer :: k, i
! counter on bands
! counter on the elements of the vector
do k = 1, m
do i = 1, n
psi (i, k) = psi (i, k)* 1.d0/(h_diag (i, k))
enddo
enddo
return
end subroutine cg_psi_pw4gww_square
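Both cg_psi_pw4gww routines are elementwise (diagonal) preconditioners in reciprocal space: each G-component of the residual is rescaled by 1/(1+h_diag), or by 1/h_diag in the _square variant. A minimal usage sketch, assuming a cgsolve_all-style driver of the kind called later in this merge; the pairing below and the actual-argument names (dvpsi, dpsi, h_diag, thresh, ...) are placeholders, not taken from this hunk:
! hedged sketch: operator and preconditioner are passed as procedure arguments
call cgsolve_all( hpsi_pw4gww2, cg_psi_pw4gww, et(1,1), dvpsi, dpsi, h_diag, &
                  npwx, npw, thresh, ik, lter, conv_root, anorm, numv, npol )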

File diff suppressed because it is too large.

View File

@ -28,6 +28,34 @@ contour_terms.o : ../../Modules/wannier_gw.o
contour_terms.o : ../../Modules/wavefunctions.o
contour_terms.o : ../../PW/src/pwcom.o
contour_terms.o : ../../UtilXlib/mp.o
convergence.o : ../../FFTXlib/fft_interfaces.o
convergence.o : ../../Modules/becmod.o
convergence.o : ../../Modules/cell_base.o
convergence.o : ../../Modules/constants.o
convergence.o : ../../Modules/fft_base.o
convergence.o : ../../Modules/gvecw.o
convergence.o : ../../Modules/io_base.o
convergence.o : ../../Modules/io_files.o
convergence.o : ../../Modules/io_global.o
convergence.o : ../../Modules/kind.o
convergence.o : ../../Modules/mp_bands.o
convergence.o : ../../Modules/mp_global.o
convergence.o : ../../Modules/mp_images.o
convergence.o : ../../Modules/mp_pools.o
convergence.o : ../../Modules/mp_wave.o
convergence.o : ../../Modules/mp_world.o
convergence.o : ../../Modules/noncol.o
convergence.o : ../../Modules/recvec.o
convergence.o : ../../Modules/wannier_gw.o
convergence.o : ../../Modules/wavefunctions.o
convergence.o : ../../PW/src/g_psi_mod.o
convergence.o : ../../PW/src/ldaU.o
convergence.o : ../../PW/src/pwcom.o
convergence.o : ../../PW/src/scf_mod.o
convergence.o : ../../UtilXlib/mp.o
convergence.o : ../../upflib/paw_variables.o
convergence.o : ../../upflib/uspp.o
convergence.o : lanczos_chains.o
dft_exchange.o : ../../FFTXlib/fft_interfaces.o
dft_exchange.o : ../../Modules/cell_base.o
dft_exchange.o : ../../Modules/constants.o
@ -61,6 +89,17 @@ diago_cg_g.o : ../../Modules/mp_bands.o
diago_cg_g.o : ../../Modules/mp_world.o
diago_cg_g.o : ../../Modules/random_numbers.o
diago_cg_g.o : ../../UtilXlib/mp.o
easy_gw.o : ../../Modules/constants.o
easy_gw.o : ../../Modules/fft_base.o
easy_gw.o : ../../Modules/io_files.o
easy_gw.o : ../../Modules/io_global.o
easy_gw.o : ../../Modules/kind.o
easy_gw.o : ../../Modules/mp_world.o
easy_gw.o : ../../Modules/wannier_gw.o
easy_gw.o : ../../Modules/wavefunctions.o
easy_gw.o : ../../PW/src/pwcom.o
easy_gw.o : ../../UtilXlib/mp.o
easy_gw.o : convergence.o
energies_xc.o : ../../FFTXlib/fft_interfaces.o
energies_xc.o : ../../Modules/becmod.o
energies_xc.o : ../../Modules/cell_base.o
@ -152,6 +191,16 @@ hpsi_pw4gww.o : ../../Modules/kind.o
hpsi_pw4gww.o : ../../Modules/mp_world.o
hpsi_pw4gww.o : ../../PW/src/pwcom.o
hpsi_pw4gww.o : ../../UtilXlib/mp.o
lanczos_chains.o : ../../FFTXlib/fft_interfaces.o
lanczos_chains.o : ../../Modules/fft_base.o
lanczos_chains.o : ../../Modules/io_global.o
lanczos_chains.o : ../../Modules/kind.o
lanczos_chains.o : ../../Modules/mp_world.o
lanczos_chains.o : ../../Modules/recvec.o
lanczos_chains.o : ../../Modules/wannier_gw.o
lanczos_chains.o : ../../Modules/wavefunctions.o
lanczos_chains.o : ../../PW/src/pwcom.o
lanczos_chains.o : ../../UtilXlib/mp.o
matrix_wannier_gamma.o : ../../Modules/cell_base.o
matrix_wannier_gamma.o : ../../Modules/constants.o
matrix_wannier_gamma.o : ../../Modules/fft_base.o
@ -295,6 +344,7 @@ produce_wannier_gamma.o : ../../Modules/fft_base.o
produce_wannier_gamma.o : ../../Modules/gvecw.o
produce_wannier_gamma.o : ../../Modules/io_files.o
produce_wannier_gamma.o : ../../Modules/io_global.o
produce_wannier_gamma.o : ../../Modules/kind.o
produce_wannier_gamma.o : ../../Modules/mp_world.o
produce_wannier_gamma.o : ../../Modules/recvec.o
produce_wannier_gamma.o : ../../Modules/wannier_gw.o

View File

@ -74,6 +74,19 @@
TYPE(exchange_cus) :: exx_cus
INTERFACE
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin, v_states )
USE kinds, ONLY : DP
USE fft_base, ONLY : dffts
USE lsda_mod, ONLY : nspin
INTEGER :: lda, n, m
COMPLEX(kind=DP) :: psi(lda,m)
REAL(kind=DP) :: e_xc(m), e_h(m)
INTEGER, INTENT(in) :: ispin !spin 1,2
REAL(kind=DP), OPTIONAL :: v_states(dffts%nnr,m, nspin)
END SUBROUTINE energies_xc
END INTERFACE
! interface
! subroutine fake_conduction_wannier(fcw_n,fcw_s,fcw_m,cut,s_cut)
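The explicit interface above makes v_states an OPTIONAL argument: when it is passed, energies_xc uses the real-space valence states supplied by the caller instead of reading them from iunwfcreal (cf. the close(iunwfcreal) guard added earlier). A call sketch under that assumption, with placeholder actual arguments:
call energies_xc( npwx, npw, nbnd, evc, e_xc, e_h, ispin )            ! states read from file, as before
call energies_xc( npwx, npw, nbnd, evc, e_xc, e_h, ispin, v_states ) ! states handed over by the caller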

View File

@ -154,7 +154,24 @@ program gwl_punch
l_no_GW_just_screening,& ! JDE
l_no_GW_bare_Coulomb,& ! JDE
no_GW_cg_maxit,& ! JDE
no_GW_cg_threshold ! JDE
no_GW_cg_threshold,& ! JDE
l_easy, &
l_easy_lanczos_g, &
easy_grid_type, &
easy_grid_param, &
easy_average_type, &
easy_psi_thrs,&
l_easy_update_basis_w, &
l_easy_dielectric_constant, &
easy_w_update_alpha, &
easy_w_update_lanczos, &
easy_w_thrs, &
s_first_spin,&
s_last_spin, &
easy_split_calc_n, &
easy_split_calc_i, &
l_easy_w_disk
USE exchange_custom, ONLY : exchange_fast_dual
@ -192,7 +209,13 @@ program gwl_punch
l_cond_pol_base,l_semicore,n_semicore,l_semicore_read, l_verbose, l_contour,&
l_real,exchange_fast_dual,l_bse,s_bse,dual_bse,l_big_system,extra_pw_cutoff,&
l_list,l_scissor,scissor,l_full,n_full,l_simple,&
l_no_GW_just_screening, l_no_GW_bare_Coulomb, no_GW_cg_maxit, no_GW_cg_threshold ! JDE
l_no_GW_just_screening, l_no_GW_bare_Coulomb, no_GW_cg_maxit, no_GW_cg_threshold,&
l_easy, easy_grid_type, easy_grid_param,easy_average_type, easy_psi_thrs,&
l_easy_lanczos_g, &
l_easy_update_basis_w, l_easy_dielectric_constant, easy_w_update_alpha, &
easy_w_update_lanczos, easy_w_thrs, s_first_spin, s_last_spin, easy_split_calc_n,&
easy_split_calc_i, l_easy_w_disk
!
@ -294,6 +317,28 @@ program gwl_punch
l_no_GW_bare_coulomb=.false. ! JDE
no_GW_cg_maxit=30 ! JDE
no_GW_cg_threshold=1.d-10 ! JDE
l_easy=.false.
l_easy_lanczos_g=.false.
easy_grid_type=0
easy_grid_param(1)=0
easy_grid_param(2)=0
easy_grid_param(3)=0
easy_grid_param(4)=5
easy_grid_param(5)=100000
easy_average_type=0
easy_psi_thrs=0.d0
l_easy_update_basis_w=.false.
l_easy_dielectric_constant=.false.
easy_w_update_alpha=0.1d0
easy_w_update_lanczos=0.5
easy_w_thrs=1d-20
s_first_spin=0
s_last_spin=0
easy_split_calc_n=1
easy_split_calc_i=1
l_easy_w_disk=.false.
!
! Reading input file
!
@ -418,7 +463,26 @@ program gwl_punch
CALL mp_bcast(l_no_GW_bare_coulomb, ionode_id, world_comm) ! JDE
CALL mp_bcast(no_GW_cg_maxit, ionode_id, world_comm) ! JDE
CALL mp_bcast(no_GW_cg_threshold, ionode_id, world_comm) ! JDE
CALL mp_bcast(l_easy, ionode_id, world_comm)
CALL mp_bcast(l_easy_lanczos_g, ionode_id, world_comm)
CALL mp_bcast(easy_grid_type, ionode_id, world_comm)
CALL mp_bcast(easy_grid_param, ionode_id, world_comm)
CALL mp_bcast(easy_average_type, ionode_id, world_comm)
CALL mp_bcast(easy_psi_thrs, ionode_id, world_comm)
CALL mp_bcast(l_easy_update_basis_w, ionode_id,world_comm)
CALL mp_bcast(l_easy_dielectric_constant, ionode_id,world_comm)
CALL mp_bcast(easy_w_update_alpha, ionode_id,world_comm)
CALL mp_bcast(easy_w_update_lanczos , ionode_id,world_comm)
CALL mp_bcast(easy_w_thrs, ionode_id,world_comm)
CALL mp_bcast(s_first_spin, ionode_id, world_comm)
CALL mp_bcast(s_last_spin, ionode_id, world_comm)
CALL mp_bcast(easy_split_calc_n, ionode_id, world_comm)
CALL mp_bcast(easy_split_calc_i, ionode_id, world_comm)
CALL mp_bcast( l_easy_w_disk, ionode_id, world_comm)
if(s_first_spin==0) s_first_spin=1
if(s_last_spin==0) s_last_spin=nspin
call read_file
@ -443,9 +507,11 @@ program gwl_punch
call summary()
!
! init some quantities ...
!
CALL hinit0()
if(lda_plus_u) then
CALL init_ns()
endif
@ -498,14 +564,12 @@ program gwl_punch
ENDIF
if(l_easy) then
CALL easy_gw()
else
CALL produce_wannier_gamma
endif
if(l_verbose) write(stdout,*) 'BEFORE produce_wannier_gamma'
FLUSH( stdout )
CALL produce_wannier_gamma
if(l_verbose) write(stdout,*) 'AFTER produce_wannier_gamma'
FLUSH( stdout )
! ENDIF
!
!
@ -749,6 +813,10 @@ subroutine read_export (pp_file,kunit,uspp_spsi, ascii, single_file, raw)
ALLOCATE( sevc(npwx,nbnd), STAT=ierr )
IF (ierr/=0) CALL errore( ' read_export ',' Unable to allocate SEVC ', ABS(ierr) )
CALL init_us_1
!CALL init_at_1
CALL allocate_bec_type (nkb,nbnd,becp)
do ik = 1, nkstot

View File

@ -22,13 +22,8 @@ SIMPLEOBJS = \
commutator.o
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
../../upflib/libupf.a ../../KS_Solvers/libks_solvers.a \
../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
../../dft-d3/libdftd3qe.a ../../XClib/xc_lib.a
# dft-d3 required by xlf for obscure reasons
PWOBJS = ../../PW/src/libpw.a
QEMODS = $(BASEMODS)
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
GWWOBJ = ../gww/libgww.a
TLDEPS= pwlibs gwwlib
@ -37,7 +32,7 @@ all : tldeps simple.x
simple.x : simple.o libsimple.a $(SIMPLEOBJS) $(PWOBJS) $(QEMODS) $(GWWOBJ)
$(LD) $(LDFLAGS) -o $@ \
simple.o libsimple.a $(PWOBJS) $(GWWOBJ) $(QEMODS) $(LIBOBJS) $(QELIBS) $(LIBMIN)
simple.o libsimple.a $(PWOBJS) $(GWWOBJ) $(QEMODS) $(QELIBS) $(LIBMIN)
- ( cd ../../bin ; ln -fs ../GWW/simple/$@ . )
tldeps :

View File

@ -14,7 +14,6 @@ subroutine product_basis
USE uspp_param, ONLY : upf, nh
USE noncollin_module, ONLY: npol, noncolin
USE mp_world, ONLY : world_comm
USE spin_orb, ONLY: lspinorb
USE ions_base, ONLY : nat, nsp, ityp
USE io_global, ONLY : stdout, ionode
USE input_simple
@ -290,7 +289,6 @@ SUBROUTINE optimal_gram_schmidt_nc(num_in,wfcs,thres,num_out)
USE noncollin_module, ONLY: npol, noncolin
USE input_simple, ONLY : npw_max,vkb_max
USE noncollin_module, ONLY: npol, noncolin
USE spin_orb, ONLY: lspinorb
USE ions_base, ONLY : nat, nsp, ityp

View File

@ -9,7 +9,6 @@ subroutine v_product
USE klist, ONLY : nks,ngk,xk
USE noncollin_module, ONLY: npol, noncolin
USE mp_world, ONLY : world_comm,mpime
USE spin_orb, ONLY: lspinorb
USE io_global, ONLY : stdout, ionode
USE input_simple
USE gvect, ONLY : ngm, gstart,gg, g

View File

@ -12,7 +12,6 @@ subroutine wfc_basis
USE uspp_param, ONLY : upf, nh
USE noncollin_module, ONLY: npol, noncolin
USE mp_world, ONLY : world_comm
USE spin_orb, ONLY: lspinorb
USE ions_base, ONLY : nat, nsp, ityp
USE io_global, ONLY : stdout, ionode
USE input_simple
@ -541,7 +540,6 @@ SUBROUTINE optimal_gram_schmidt_z(num_in,wfcs,ithres,thres,num_out)
USE uspp, ONLY : nkb, vkb, becsum, nhtol, nhtoj, indv, okvan
USE uspp_param, ONLY : upf, nh
USE noncollin_module, ONLY: npol, noncolin
USE spin_orb, ONLY: lspinorb
USE ions_base, ONLY : nat, nsp, ityp

View File

@ -19,17 +19,15 @@ spectrum.o \
lanczos.o \
build_eemat.o
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
../../KS_Solvers/libks_solvers.a \
../../LAXlib/libqela.a ../../UtilXlib/libutil.a
QEMODS = $(BASEMODS)
TLDEPS=bindir libs mods gwwlib
all : tldeps simple_bse.x libsimple_exc.a
simple_bse.x : simple_bse.o $(EXCOBJS) $(LIBOBJS) $(QEMODS)
simple_bse.x : simple_bse.o $(EXCOBJS) $(QEMODS)
$(MPIF90) $(LDFLAGS) -o $@ \
simple_bse.o $(EXCOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
simple_bse.o $(EXCOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../bin; ln -fs ../GWW/simple_bse/$@ . )

View File

@ -16,10 +16,8 @@ diagonalization.o \
dielectric.o
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
../../KS_Solvers/libks_solvers.a \
../../LAXlib/libqela.a ../../UtilXlib/libutil.a
PWOBJS = ../../PW/src/libpw.a
QEMODS = $(BASEMODS)
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a
LIBMIN=
@ -27,9 +25,9 @@ TLDEPS=pwlibs
all : tldeps simple_ip.x libsimple_ip.a
simple_ip.x : simple_ip.o $(SIMPLEIPOBJS) $(LIBOBJS) $(PWOBJS) $(QEMODS) $(LIBMIN)
simple_ip.x : simple_ip.o $(SIMPLEIPOBJS) $(PWOBJS) $(QEMODS) $(LIBMIN)
$(MPIF90) $(LDFLAGS) -o $@ \
simple_ip.o $(SIMPLEIPOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
simple_ip.o $(SIMPLEIPOBJS) $(PWOBJS) $(QEMODS) $(LIBMIN) $(QELIBS)
- ( cd ../../bin; ln -fs ../GWW/simple_ip/$@ . )
libsimple_ip.a : $(SIMPLEIPOBJS)

View File

@ -13,7 +13,6 @@ set(src_hp
src/hp_dealloc_1.f90
src/hp_dealloc_2.f90
src/hp_dvpsi_pert.f90
src/hp_efermi_shift.f90
src/hp_find_inequiv_sites.f90
src/hp_generate_grids.f90
src/hp_init.f90

View File

@ -22,7 +22,6 @@ hp_dealloc_q.o \
hp_dealloc_1.o \
hp_dealloc_2.o \
hp_dvpsi_pert.o \
hp_efermi_shift.o \
hp_find_inequiv_sites.o \
hp_generate_grids.o \
hp_init.o \
@ -53,10 +52,8 @@ hp_write_chi_full.o \
hp_write_dnsq.o
LRMODS = ../../LR_Modules/liblrmod.a
PWOBJS = ../../PW/src/libpw.a
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/libks_solvers.a \
../../upflib/libupf.a ../../FFTXlib/libqefft.a ../../LAXlib/libqela.a\
../../UtilXlib/libutil.a ../../dft-d3/libdftd3qe.a ../../XClib/xc_lib.a
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
QEMODS = $(BASEMODS)
TLDEPS= hplibs
@ -64,9 +61,9 @@ all : tldeps libs-hp hp.x
libs-hp : libhp.a
hp.x : hp_main.o libhp.a $(PWOBJS) $(LRMODS) $(LIBOBJS)
hp.x : hp_main.o libhp.a $(PWOBJS) $(LRMODS)
$(LD) $(LDFLAGS) -o $@ hp_main.o libhp.a \
$(LRMODS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
$(LRMODS) $(PWOBJS) $(QEMODS) $(QELIBS)
- ( cd ../../bin ; ln -fs ../HP/src/hp.x . )
tldeps :

View File

@ -26,8 +26,7 @@ subroutine hp_allocate_q
USE eqv, ONLY : dpsi, evq, dmuxc, dvpsi
USE control_lr, ONLY : lgamma
USE ldaU, ONLY : Hubbard_lmax, nwfcU
USE ldaU_hp, ONLY : this_pert_is_on_file, &
swfcatomk, swfcatomkpq
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
!
IMPLICIT NONE
INTEGER :: ik
@ -44,9 +43,6 @@ subroutine hp_allocate_q
ALLOCATE (dpsi(npwx*npol,nbnd))
ALLOCATE (dmuxc(dfftp%nnr,nspin_mag,nspin_mag))
!
ALLOCATE (this_pert_is_on_file(nksq))
this_pert_is_on_file(:) = .FALSE.
!
IF (okvan) THEN
ALLOCATE (eigqts(nat))
ALLOCATE (becp1(nksq))

View File

@ -16,8 +16,8 @@ SUBROUTINE hp_close_q ( flag )
!
USE buffers, ONLY : close_buffer
USE io_files, ONLY : iunhub
USE units_lr, ONLY : iuwfc, iuatswfc
USE ldaU_hp, ONLY : iudwfc, iudvwfc
USE units_lr, ONLY : iuwfc, iuatswfc, iudwf
USE ldaU_hp, ONLY : iudvwfc
USE control_lr, ONLY : lgamma
!
IMPLICIT NONE
@ -28,10 +28,10 @@ SUBROUTINE hp_close_q ( flag )
CALL close_buffer(iuwfc,'delete')
!
IF (flag) THEN
CALL close_buffer(iudwfc,'delete')
CALL close_buffer(iudwf,'delete')
CALL close_buffer(iudvwfc,'delete')
ELSE
CALL close_buffer(iudwfc,'keep')
CALL close_buffer(iudwf,'keep')
CALL close_buffer(iudvwfc,'keep')
ENDIF
!

View File

@ -14,7 +14,8 @@ SUBROUTINE hp_dealloc_1()
!
USE lr_symm_base, ONLY : rtau
USE start_k, ONLY : xk_start, wk_start
USE ldaU_hp, ONLY : Rvect, dnsscf, dns0, dnsscf_tot, dns0_tot, &
USE ldaU_lr, ONLY : dnsscf
USE ldaU_hp, ONLY : Rvect, dns0, dnsscf_tot, dns0_tot, &
x_q, comp_iq
!
IMPLICIT NONE

View File

@ -22,8 +22,7 @@ SUBROUTINE hp_dealloc_q()
& dvxc_s, vsgga, segni
USE eqv, ONLY : dmuxc, dpsi, dvpsi, evq
USE control_lr, ONLY : lgamma, nbnd_occ
USE ldaU_hp, ONLY : this_pert_is_on_file, &
swfcatomk, swfcatomkpq
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
!
IMPLICIT NONE
INTEGER :: ik
@ -42,9 +41,6 @@ SUBROUTINE hp_dealloc_q()
if (allocated(ikqs)) deallocate (ikqs)
if (allocated(m_loc)) deallocate (m_loc)
!
if (allocated(this_pert_is_on_file)) &
& deallocate (this_pert_is_on_file)
!
IF (okvan) THEN
if (allocated(eigqts)) deallocate (eigqts)
if (allocated(becp1)) then

View File

@ -38,13 +38,13 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
USE control_flags, ONLY : iverbosity
USE qpoint, ONLY : nksq, ikks, ikqs
USE control_lr, ONLY : lgamma, nbnd_occ
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc, iudwf, lrdwf
USE lr_symm_base, ONLY : nsymq
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard, offsetU, nwfcU
USE ldaU_hp, ONLY : conv_thr_chi, trace_dns_tot_old, &
conv_thr_chi_best, iter_best, iudwfc, lrdwfc, &
swfcatomk, swfcatomkpq
USE hp_efermi_shift, ONLY : def
conv_thr_chi_best, iter_best
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
USE efermi_shift, ONLY : def
!
IMPLICIT NONE
!
@ -120,7 +120,7 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
!
! At each SCF iteration for each ik read dpsi from file
!
CALL get_buffer (dpsi, lrdwfc, iudwfc, ik)
CALL get_buffer (dpsi, lrdwf, iudwf, ik)
!
! Loop on Hubbard atoms
!
@ -149,10 +149,8 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
!
ENDDO
!
#if defined (__MPI)
CALL mp_sum(proj1, intra_pool_comm)
CALL mp_sum(proj2, intra_pool_comm)
#endif
!
DO na = 1, nat
!
@ -185,7 +183,7 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
w1 = weight * wdelta
!
dnsq(m1, m2, current_spin, na) = dnsq(m1, m2, current_spin, na) + &
w1 * def * CONJG(proj1(ibnd,ihubst1)) * proj1(ibnd,ihubst2)
w1 * def(1) * CONJG(proj1(ibnd,ihubst1)) * proj1(ibnd,ihubst2)
!
ENDIF
!

View File

@ -29,7 +29,8 @@ SUBROUTINE hp_dnstot_sum_q
USE control_flags, ONLY : iverbosity
USE lr_symm_base, ONLY : nsymq, invsymq, minus_q, rtau
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard
USE ldaU_hp, ONLY : nqsh, Rvect, dnsscf, dns0, dnsscf_tot, dns0_tot, &
USE ldaU_lr, ONLY : dnsscf
USE ldaU_hp, ONLY : nqsh, Rvect, dns0, dnsscf_tot, dns0_tot, &
skip_equivalence_q, nq1, nq2, nq3, x_q, nqs
!
IMPLICIT NONE

View File

@ -20,8 +20,6 @@ subroutine hp_dvpsi_pert (ik)
!
! dvpsi is for a given "k", "q" and "J"
!
! dvpsi is READ from file if this_pert_is_on_file(ik) = .TRUE.
! otherwise dvpsi is COMPUTED and WRITTEN on file
! (evc, swfcatomk, swfcatomkpq must be set)
!
USE kinds, ONLY : DP
@ -38,8 +36,8 @@ subroutine hp_dvpsi_pert (ik)
USE units_lr, ONLY : iuatswfc
USE control_lr, ONLY : lgamma
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, offsetU, nwfcU
USE ldaU_hp, ONLY : nqsh, perturbed_atom, this_pert_is_on_file, &
iudvwfc, lrdvwfc, swfcatomk, swfcatomkpq
USE ldaU_hp, ONLY : nqsh, perturbed_atom, iudvwfc, lrdvwfc
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
!
IMPLICIT NONE
!
@ -59,23 +57,11 @@ subroutine hp_dvpsi_pert (ik)
DO na = 1, nat
IF (perturbed_atom(na)) counter = counter + 1
ENDDO
IF (counter.NE.1) CALL errore( 'hp_dvpsi_pert', "One perturbed atom must be specified", 1)
IF (counter /= 1) CALL errore( 'hp_dvpsi_pert', "One perturbed atom must be specified", 1)
!
dvpsi(:,:) = (0.0d0, 0.0d0)
!
! If this is not the first iteration, hence dvpsi was already
! computed before. So read it from file and exit.
!
IF (this_pert_is_on_file(ik)) THEN
!
CALL get_buffer(dvpsi, lrdvwfc, iudvwfc, ik)
CALL stop_clock ('hp_dvpsi_pert')
RETURN
!
ENDIF
!
! If this is a first iteration, then dvpsi must be computed
! and written on file.
! Compute dvpsi for ik and write on buffer iudvwfc
!
ALLOCATE (proj(nbnd,nwfcU))
!
@ -126,7 +112,6 @@ subroutine hp_dvpsi_pert (ik)
! Write dvpsi on file.
!
CALL save_buffer(dvpsi, lrdvwfc, iudvwfc, ik)
this_pert_is_on_file(ik) = .true.
!
DEALLOCATE (proj)
!

View File

@ -1,103 +0,0 @@
!
! Copyright (C) 2001-2018 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
MODULE hp_efermi_shift
USE kinds, ONLY : DP
COMPLEX(DP), SAVE, PUBLIC :: def
CONTAINS
!-----------------------------------------------------------------------
SUBROUTINE hp_ef_shift (drhoscf, ldos, ldoss, dos_ef, dbecsum, becsum1)
!-----------------------------------------------------------------------
!
! This routine takes care of the effects of a shift of Ef, due to the
! perturbation, that can take place in a metal at q=0.
! The pertubation is neutral.
!
USE kinds, ONLY : DP
USE ions_base, ONLY : nat
USE io_global, ONLY : stdout
USE wavefunctions, ONLY : evc
USE cell_base, ONLY : omega
USE fft_base, ONLY : dfftp, dffts
USE fft_interfaces, ONLY : fwfft, invfft
USE gvect, ONLY : gg
USE buffers, ONLY : get_buffer, save_buffer
USE lsda_mod, ONLY : nspin
USE klist, ONLY : degauss, ngauss, ngk
USE noncollin_module, ONLY : nspin_mag, nspin_lsda
USE mp_bands, ONLY : intra_bgrp_comm
USE mp, ONLY : mp_sum
USE uspp_param, ONLY : nhm
IMPLICIT NONE
!
COMPLEX(DP) :: drhoscf(dfftp%nnr,nspin_mag), &
ldos(dfftp%nnr,nspin_mag), &
ldoss(dffts%nnr,nspin_mag)
! input/output: the change of the charge (with augmentation)
! input: local DOS at Ef (with augmentation)
! input: local DOS at Ef
REAL(DP) :: dos_ef
! input: density of states at Ef
!
! The PAW case
!
COMPLEX(DP), OPTIONAL :: dbecsum ((nhm*(nhm+1))/2, nat, nspin_mag, 1)
! input: dbecsum = 2 <psi|beta> <beta|dpsi>
! output: dbecsum = 2 <psi|beta> <beta|dpsi> + def * becsum1
REAL(DP), OPTIONAL :: becsum1 ((nhm*(nhm+1))/2, nat, nspin_mag)
! input: becsum1 = wdelta * <psi|beta> <beta|psi>
! (where wdelta is a Dirac-delta-like function)
!
COMPLEX(DP) :: delta_n ! the change in electron number
INTEGER :: is ! counter on spin polarizations
!
CALL start_clock ('hp_ef_shift')
!
delta_n = (0.d0, 0.d0)
!
DO is = 1, nspin_lsda
!
! FFT to G-space
CALL fwfft ('Rho', drhoscf(:,is), dfftp)
!
IF (gg(1).lt.1.0d-8) delta_n = delta_n + omega * drhoscf(dfftp%nl(1),is)
!
! FFT to R-space
CALL invfft ('Rho', drhoscf(:,is), dfftp)
!
ENDDO
!
CALL mp_sum ( delta_n, intra_bgrp_comm )
!
def = - delta_n / dos_ef
!
WRITE( stdout, '(6x,"Fermi energy shift (Ry) =", 2e12.4)') def
!
! Corrects the density response accordingly:
! drhoscfh = drhoscfh + ldos * def
! See Eq.(75) in Rev. Mod. Phys. 73, 515 (2001).
!
CALL zaxpy (dfftp%nnr*nspin_mag, def, ldos, 1, drhoscf, 1)
!
! In the PAW case there is also a metallic term
!
IF (PRESENT(dbecsum) .AND. PRESENT(becsum1)) &
dbecsum(:,:,:,1) = dbecsum(:,:,:,1) + &
def * CMPLX(becsum1(:,:,:), 0.0_DP, kind=DP)
!
CALL stop_clock ('hp_ef_shift')
!
RETURN
!
END SUBROUTINE hp_ef_shift
END MODULE hp_efermi_shift
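The module removed above is superseded by the generic ef_shift in LR_Modules (see the hp_dnsq and hp_solve_linear_system changes elsewhere in this merge); the physics it encoded is unchanged. With Delta n the G = 0 component of the response density and N(eps_F) the density of states at the Fermi level, the routine computed
\[ \delta\varepsilon_F \equiv \texttt{def} = -\,\frac{\Delta n}{N(\varepsilon_F)}, \qquad \Delta\rho \;\rightarrow\; \Delta\rho + \delta\varepsilon_F\,\rho_{\mathrm{LDOS}}(\varepsilon_F), \]
i.e. Eq. (75) of Rev. Mod. Phys. 73, 515 (2001), plus the analogous PAW correction to dbecsum.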

View File

@ -20,7 +20,8 @@ SUBROUTINE hp_generate_grids()
USE qpoint, ONLY : xq
USE lr_symm_base, ONLY : rtau
USE ldaU, ONLY : Hubbard_lmax
USE ldaU_hp, ONLY : dnsscf, dns0, dnsscf_tot, dns0_tot, nqsh, &
USE ldaU_lr, ONLY : dnsscf
USE ldaU_hp, ONLY : dns0, dnsscf_tot, dns0_tot, nqsh, &
start_q, last_q, tmp_dir_hp, nqs, comp_iq
!
IMPLICIT NONE

View File

@ -19,10 +19,10 @@ SUBROUTINE hp_openfil_q()
USE buffers, ONLY : open_buffer
USE qpoint, ONLY : nksq
USE control_lr, ONLY : lgamma
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc, iudwf, lrdwf
USE ldaU, ONLY : nwfcU
USE ldaU_hp, ONLY : recalc_sym, tmp_dir_save, tmp_dir_hp, &
iudwfc, lrdwfc, iudvwfc, lrdvwfc
iudvwfc, lrdvwfc
!
IMPLICIT NONE
LOGICAL :: exst, exst_mem
@ -58,9 +58,9 @@ SUBROUTINE hp_openfil_q()
!
! Open a file to write/read a solution of the linear system (dpsi)
!
iudwfc = 22
lrdwfc = nbnd * npwx * npol
CALL open_buffer (iudwfc, 'dwfc', lrdwfc, io_level, exst_mem, exst, tmp_dir)
iudwf = 22
lrdwf = nbnd * npwx * npol
CALL open_buffer (iudwf, 'dwfc', lrdwf, io_level, exst_mem, exst, tmp_dir)
!
! Open a file to write/read S*phi at k and k+q (atomic wfct's)
!

View File

@ -37,7 +37,7 @@ SUBROUTINE hp_print_clock
CALL print_clock ('hp_calc_chi')
CALL print_clock ('hp_postproc')
CALL print_clock ('hp_vpsifft')
CALL print_clock ('hp_ef_shift')
CALL print_clock ('ef_shift')
CALL print_clock ('hp_run_nscf')
CALL print_clock ('hp_postproc')
!
@ -51,6 +51,8 @@ SUBROUTINE hp_print_clock
WRITE( stdout, * ) ' PRINTING TIMING FROM LR MODULE: '
WRITE( stdout, * )
!
CALL print_clock ('sth_kernel')
CALL print_clock ('apply_dpot_b')
CALL print_clock ('ortho')
CALL print_clock ('cgsolve')
CALL print_clock ('ch_psi')

View File

@ -18,7 +18,8 @@ SUBROUTINE hp_read_dnsq()
USE io_files, ONLY : prefix, tmp_dir
USE lsda_mod, ONLY : nspin
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard
USE ldaU_hp, ONLY : nah_pert, dns0, dnsscf, nqs, tmp_dir_hp
USE ldaU_lr, ONLY : dnsscf
USE ldaU_hp, ONLY : nah_pert, dns0, nqs, tmp_dir_hp
!
IMPLICIT NONE
!

View File

@ -20,15 +20,13 @@ SUBROUTINE hp_solve_linear_system (na, iq)
USE io_global, ONLY : stdout
USE check_stop, ONLY : check_stop_now
USE wavefunctions, ONLY : evc
USE cell_base, ONLY : tpiba2
USE klist, ONLY : lgauss, ltetra, xk, wk, nelec, ngk, igk_k
USE gvect, ONLY : g
USE klist, ONLY : lgauss, ltetra, nelec, ngk
USE gvecs, ONLY : doublegrid
USE scf, ONLY : rho
USE fft_base, ONLY : dfftp, dffts
USE lsda_mod, ONLY : lsda, current_spin, isk
USE wvfct, ONLY : nbnd, npwx, g2kin, et
USE uspp, ONLY : okvan, vkb, nkb
USE wvfct, ONLY : nbnd, npwx
USE uspp, ONLY : okvan, nkb
USE uspp_param, ONLY : nhm
USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, becp
USE buffers, ONLY : save_buffer, get_buffer
@ -38,21 +36,21 @@ SUBROUTINE hp_solve_linear_system (na, iq)
USE paw_symmetry, ONLY : paw_dusymmetrize, paw_dumqsymmetrize
USE mp_pools, ONLY : inter_pool_comm, intra_pool_comm
USE mp, ONLY : mp_sum
USE hp_efermi_shift, ONLY : hp_ef_shift, def
USE eqv, ONLY : dvpsi, dpsi, evq
USE qpoint, ONLY : nksq, ikks, ikqs, xq
USE control_lr, ONLY : lgamma, nbnd_occ
USE qpoint, ONLY : nksq, ikks, xq
USE control_lr, ONLY : lgamma
USE units_lr, ONLY : iuwfc, lrwfc
USE lrus, ONLY : int3, int3_paw
USE dv_of_drho_lr, ONLY : dv_of_drho
USE fft_helper_subroutines
USE fft_interfaces, ONLY : fft_interpolate
USE lr_symm_base, ONLY : irotmq, minus_q, nsymq, rtau
USE ldaU_hp, ONLY : thresh_init, dnsscf, dns0, trace_dns_tot_old, &
USE ldaU_lr, ONLY : dnsscf
USE ldaU_hp, ONLY : thresh_init, dns0, trace_dns_tot_old, &
conv_thr_chi_best, iter_best, niter_max, nmix, &
alpha_mix, iudwfc, lrdwfc, code
USE apply_dpot_mod, ONLY : apply_dpot_allocate, apply_dpot_deallocate, &
apply_dpot_bands
alpha_mix, code, lrdvwfc, iudvwfc
USE apply_dpot_mod, ONLY : apply_dpot_allocate, apply_dpot_deallocate
USE efermi_shift, ONLY : ef_shift, def
USE response_kernels, ONLY : sternheimer_kernel
!
IMPLICIT NONE
!
@ -62,20 +60,18 @@ SUBROUTINE hp_solve_linear_system (na, iq)
REAL(DP), ALLOCATABLE :: h_diag (:,:) ! diagonal part of the Hamiltonian
!
REAL(DP) :: thresh, & ! convergence threshold
anorm, & ! the norm of the error
averlt, & ! average number of iterations
dr2 ! self-consistency error
!
REAL(DP) :: dos_ef, & ! density of states at the Fermi level
weight, & ! Misc variables for metals
aux_avg(2) ! Misc variables for metals
REAL(DP) :: dos_ef
!! density of states at the Fermi level
!
REAL(DP), ALLOCATABLE :: becsum1(:,:,:)
!
COMPLEX(DP), ALLOCATABLE, TARGET :: dvscfin(:,:)
! change of the scf potential (input)
!
COMPLEX(DP), POINTER :: dvscfins(:,:)
COMPLEX(DP), POINTER :: dvscfins(:,:,:)
! change of the scf potential (smooth part only)
!
COMPLEX(DP), ALLOCATABLE :: drhoscf (:,:), &
@ -88,42 +84,27 @@ SUBROUTINE hp_solve_linear_system (na, iq)
ldoss (:,:), & ! as above, without augmentation charges
dbecsum (:,:,:,:), & ! the derivative of becsum
aux2 (:,:), & ! auxiliary arrays
mixin(:), mixout(:), & ! auxiliary arrays for mixing of the response potential
tg_dv(:,:), & ! Task groups: auxiliary array for potential * wfct
tg_psic(:,:) ! Task groups: auxiliary array for wavefunctions
mixin(:), mixout(:) ! auxiliary arrays for mixing of the response potential
COMPLEX(DP), ALLOCATABLE :: t(:,:,:,:), tmq(:,:,:)
! PAW: auxiliary arrays
LOGICAL :: conv_root, & ! true if linear system is converged
exst, & ! used to open the recover file
lmetq0, & ! true if xq=(0,0,0) in a metal
LOGICAL :: all_conv
!! True if sternheimer_kernel is converged at all k points
LOGICAL :: lmetq0, & ! true if xq=(0,0,0) in a metal
convt, & ! not needed for HP
convt_chi ! used instead of convt to control the convergence
REAL(DP), PARAMETER :: tr2 = 1.D-30 ! threshold parameter
INTEGER :: ibnd, & ! counter on bands
iter, & ! counter on iterations
lter, & ! counter on iterations of linear system
ltaver, & ! average counter
lintercall, & ! average number of calls to cgsolve_all
INTEGER :: iter, & ! counter on iterations
ik, ikk, & ! counter on k points
ikq, & ! counter on k+q points
ig, & ! counter on G vectors
ndim, &
is, & ! counter on spin polarizations
nt, & ! counter on types
ios, & ! integer variable for I/O control
incr, & ! used for task groups
v_siz, & ! size of the potential
npw, & ! number of plane waves at k
npwq ! number of plane waves at k+q
npw ! number of plane waves at k
REAL(DP) :: tcpu, get_clock ! timing variables
CHARACTER(LEN=256) :: filename, &
flmixdpot = 'mixd'
EXTERNAL ch_psi_all, cg_psi
CHARACTER(LEN=256) :: flmixdpot = 'mixd'
!
CALL start_clock ('hp_solve_linear_system')
!
@ -133,15 +114,16 @@ SUBROUTINE hp_solve_linear_system (na, iq)
!
! Allocate arrays for the SCF density/potential
!
ALLOCATE (drhoscf (dffts%nnr, nspin_mag))
ALLOCATE (drhoscf (dfftp%nnr, nspin_mag))
ALLOCATE (drhoscfh(dfftp%nnr, nspin_mag))
ALLOCATE (dvscfin (dfftp%nnr, nspin_mag))
ALLOCATE (dvscfout(dfftp%nnr, nspin_mag))
!
dvscfin = (0.0_DP, 0.0_DP)
IF (doublegrid) THEN
ALLOCATE (dvscfins(dffts%nnr, nspin_mag))
ALLOCATE (dvscfins(dffts%nnr, nspin_mag, 1))
ELSE
dvscfins => dvscfin
dvscfins(1:dffts%nnr, 1:nspin_mag, 1:1) => dvscfin
ENDIF
!
! USPP-specific allocations
@ -175,16 +157,6 @@ SUBROUTINE hp_solve_linear_system (na, iq)
convt = .FALSE.
convt_chi = .FALSE.
!
incr = 1
IF ( dffts%has_task_groups ) THEN
!
v_siz = dffts%nnr_tg
ALLOCATE( tg_dv ( v_siz, nspin_mag ) )
ALLOCATE( tg_psic( v_siz, npol ) )
incr = fftx_ntgrp(dffts)
!
ENDIF
!
! If q=0 for a metal: allocate and compute local DOS and DOS at Ef
!
lmetq0 = (lgauss .OR. ltetra) .AND. lgamma
@ -197,158 +169,65 @@ SUBROUTINE hp_solve_linear_system (na, iq)
IF (.NOT.okpaw) DEALLOCATE (becsum1)
ENDIF
!
! Compute dV_bare * psi and write to buffer iudvwfc
!
DO ik = 1, nksq
!
ikk = ikks(ik)
npw = ngk(ikk)
!
IF (lsda) current_spin = isk(ikk)
!
! Read unperturbed KS wavefunctions psi(k) and psi(k+q)
!
IF (nksq > 1) THEN
CALL get_buffer(evc, lrwfc, iuwfc, ikk)
ENDIF
!
! Computes (iter=1) or reads (iter>1) the action of the perturbing
! potential on the unperturbed KS wavefunctions: |dvpsi> = dV_pert * |evc>
! See Eq. (46) in Ref. [1]
!
CALL hp_dvpsi_pert(ik)
!
ENDDO ! ik
!
! The loop of the linear-response calculation
!
DO iter = 1, niter_max
!
WRITE(stdout,'(/6x,"atom #",i3,3x,"q point #",i4,3x,"iter # ",i3)') na, iq, iter
!
ltaver = 0
lintercall = 0
!
drhoscf(:,:) = (0.d0, 0.d0)
dvscfout(:,:) = (0.d0, 0.d0)
dbecsum(:,:,:,:) = (0.d0, 0.d0)
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!! START OF THE K LOOP !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
IF ( iter == 1 ) THEN
! Starting threshold for iterative solution of the linear system.
! A strict threshold is needed for the first iteration,
! because dns0 must be computed to very high precision.
thresh = thresh_init * nelec
ELSE
! Threshold for iterative solution of the linear system.
! We start with a loose threshold at iter=2, and then
! tighten it as the iterations proceed.
thresh = MIN (1.D-1 * SQRT(dr2), 1.D-2)
ENDIF
!
DO ik = 1, nksq
!
ikk = ikks(ik)
ikq = ikqs(ik)
npw = ngk(ikk)
npwq = ngk(ikq)
!
IF (lsda) current_spin = isk(ikk)
!
! Read unperturbed KS wavefuctions psi(k) and psi(k+q)
!
IF (nksq.gt.1) THEN
IF (lgamma) THEN
CALL get_buffer (evc, lrwfc, iuwfc, ikk)
ELSE
CALL get_buffer (evc, lrwfc, iuwfc, ikk)
CALL get_buffer (evq, lrwfc, iuwfc, ikq)
ENDIF
ENDIF
!
! USPP: Compute the projectors vkb at k+q
!
CALL init_us_2 (npwq, igk_k(1,ikq), xk(1,ikq), vkb)
!
! Compute the kinetic energy at k+q
!
CALL g2_kin (ikq)
!
! Compute preconditioning matrix h_diag used by cgsolve_all
!
CALL h_prec (ik, evq, h_diag)
!
! Computes (iter=1) or reads (iter>1) the action of the perturbing
! potential on the unperturbed KS wavefunctions: |dvpsi> = dV_pert * |evc>
! See Eq. (46) in Ref. [1]
!
CALL hp_dvpsi_pert(ik)
!
IF ( iter > 1 ) THEN
!
! Add the contribution of the self consistent term.
! Calculates dvscf_q*psi(k) in G-space, for all bands, k=ik
! dvscf_q from previous iteration (mix_potential)
!
CALL start_clock ('hp_vpsifft')
CALL apply_dpot_bands(ik, nbnd_occ(ikk), dvscfins, evc, aux2)
dvpsi = dvpsi + aux2
CALL stop_clock ('hp_vpsifft')
!
! USPP: there is an additional self-consistent term proportional to int3
! |dvpsi> = |dvpsi> + dV_HXC*|evc> + int3 * |beta><beta|evc>
!
IF (okvan) CALL adddvscf(1, ik)
!
ENDIF
!
! Ortogonalize dvpsi to valence states: ps = <evq|dvpsi>
! Apply -P_c^+. See Eq. (A21) in Ref. [1]
!
CALL orthogonalize(dvpsi, evq, ikk, ikq, dpsi, npwq, .FALSE.)
!
IF ( iter == 1 ) THEN
!
! At the first iteration dpsi and dvscfin are set to zero
!
dpsi(:,:) = (0.d0, 0.d0)
dvscfin(:,:) = (0.d0, 0.d0)
!
! Starting threshold for iterative solution of the linear system.
! A strickt threshold for the first iteration is needed,
! because we need dns0 to very high precision.
!
thresh = thresh_init * nelec
!
ELSE
!
! Starting value for dpsi is read from file
!
CALL get_buffer( dpsi, lrdwfc, iudwfc, ik)
!
! Threshold for iterative solution of the linear system.
! We start with not a strict threshold for iter=2, and then
! it decreases with iterations.
!
thresh = MIN (1.D-1 * SQRT(dr2), 1.D-2)
!
ENDIF
!
! Iterative solution of the linear system:
! (H + Q - eS) * |dpsi> = |dvpsi>,
! where |dvpsi> = - P_c^+ (dV_HXC + dV_pert) * |evc>
! See Eq. (43) in Ref. [1]
!
CALL cgsolve_all (ch_psi_all, cg_psi, et(1,ikk), dvpsi, dpsi, h_diag, &
& npwx, npwq, thresh, ik, lter, conv_root, anorm, nbnd_occ(ikk), npol )
!
ltaver = ltaver + lter
!
lintercall = lintercall + 1
!
IF (.NOT.conv_root) THEN
WRITE( stdout, '(6x,"kpoint",i4, &
& " hp_solve_linear_system: root not converged, thresh < ",e10.3)') ik , anorm
IF (iter == 1) WRITE( stdout, '(6x,"Try to increase thresh_init...")')
ENDIF
!
! Writes dpsi on file for a given k
!
CALL save_buffer (dpsi, lrdwfc, iudwfc, ik)
!
! Setup the weight at point k (normalized by the number of k points)
!
weight = wk(ikk)
!
! Calculates the response charge density (sum over k)
! See Eq. (48) in Ref. [1]
!
CALL incdrhoscf (drhoscf(:,current_spin), weight, ik, &
& dbecsum(:,:,current_spin,1), dpsi)
!
ENDDO ! k points
! Compute drhoscf, the charge density response to the total potential
!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!! END OF THE K LOOP !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
CALL sternheimer_kernel(iter==1, .FALSE., 1, lrdvwfc, iudvwfc, &
thresh, dvscfins, all_conv, averlt, drhoscf, dbecsum, exclude_hubbard=.TRUE.)
!
#if defined (__MPI)
IF ((.NOT. all_conv) .AND. (iter == 1)) THEN
WRITE(stdout, '(6x, "sternheimer_kernel not converged. Try to increase thresh_init.")')
ENDIF
!
! USPP: The calculation of dbecsum is distributed across processors (see addusdbec)
! Sum over processors the contributions coming from each slice of bands
!
CALL mp_sum ( dbecsum, intra_pool_comm )
!
#endif
!
! Copy/interpolate the response density drhoscf -> drhoscfh
!
IF (doublegrid) THEN
@ -363,10 +242,9 @@ SUBROUTINE hp_solve_linear_system (na, iq)
!
IF (okvan) CALL lr_addusddens (drhoscfh, dbecsum)
!
#if defined (__MPI)
call mp_sum ( drhoscf, inter_pool_comm )
CALL mp_sum ( drhoscfh, inter_pool_comm )
IF (okpaw) CALL mp_sum ( dbecsum, inter_pool_comm )
#endif
!
! PAW: the factor of 2 is due to the presence of the CC term
! (see first two terms in Eq.(9) in PRB 81, 075123 (2010))
@ -381,14 +259,14 @@ SUBROUTINE hp_solve_linear_system (na, iq)
IF (lmetq0) THEN
!
IF (okpaw) THEN
CALL hp_ef_shift (drhoscfh, ldos, ldoss, dos_ef, dbecsum, becsum1)
CALL ef_shift(1, dos_ef, ldos, drhoscfh, dbecsum=dbecsum, becsum1=becsum1)
ELSE
CALL hp_ef_shift (drhoscfh, ldos, ldoss, dos_ef)
CALL ef_shift(1, dos_ef, ldos, drhoscfh)
ENDIF
!
! Check that def is not too large (it is in Ry).
!
IF ( ABS(DBLE(def)) > 5.0d0 ) THEN
IF ( ABS(DBLE(def(1))) > 5.0d0 ) THEN
!
WRITE( stdout, '(/6x,"WARNING: The Fermi energy shift is too big!")')
WRITE( stdout, '(6x, "This may happen in two cases:")')
@ -451,7 +329,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
!
IF (doublegrid) THEN
DO is = 1, nspin_mag
CALL fft_interpolate (dfftp, dvscfin(:,is), dffts, dvscfins(:,is))
CALL fft_interpolate (dfftp, dvscfin(:,is), dffts, dvscfins(:,is,1))
ENDDO
ENDIF
!
@ -478,16 +356,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
!
IF ( iter == 1 ) dns0(:,:,:,:,iq) = dnsscf(:,:,:,:,iq)
!
! Compute the average number of iterations
!
#if defined (__MPI)
aux_avg(1) = DBLE(ltaver)
aux_avg(2) = DBLE(lintercall)
CALL mp_sum ( aux_avg, inter_pool_comm )
averlt = aux_avg(1) / aux_avg(2)
#else
averlt = DBLE(ltaver) / DBLE(lintercall)
#endif
! Print the average number of iterations
!
tcpu = get_clock(code)
!
@ -506,12 +375,10 @@ SUBROUTINE hp_solve_linear_system (na, iq)
CALL hp_stop_smoothly (.TRUE.)
ENDIF
!
IF (convt_chi) goto 155
IF (convt_chi) EXIT
!
ENDDO ! loop over the iterations iter
!
155 CONTINUE
!
CALL apply_dpot_deallocate()
DEALLOCATE (h_diag)
DEALLOCATE (aux2)
@ -533,10 +400,6 @@ SUBROUTINE hp_solve_linear_system (na, iq)
DEALLOCATE (tmq)
ENDIF
CALL deallocate_bec_type (becp)
IF ( dffts%has_task_groups ) THEN
DEALLOCATE( tg_dv )
DEALLOCATE( tg_psic )
ENDIF
!
WRITE( stdout,*) " "
WRITE( stdout,*) " =--------------------------------------------="

View File

@ -18,7 +18,8 @@ SUBROUTINE hp_write_dnsq(iq)
USE lsda_mod, ONLY : nspin
USE io_files, ONLY : prefix, tmp_dir
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard
USE ldaU_hp, ONLY : nah_pert, dns0, dnsscf, x_q
USE ldaU_lr, ONLY : dnsscf
USE ldaU_hp, ONLY : nah_pert, dns0, x_q
!
IMPLICIT NONE
!

View File

@ -49,7 +49,6 @@ MODULE ldaU_hp
LOGICAL, ALLOCATABLE :: todo_atom(:), & ! Which atoms must be perturbed
perturbed_atom(:), & ! Controls which atom is perturbed in the HP
! calculation
this_pert_is_on_file(:), & ! The perturbation is written on file or not
comp_iq(:) ! If .true. this q point has to be calculated
!
INTEGER :: nath, & ! Number of (real) atoms in the primitive cell
@ -77,8 +76,6 @@ MODULE ldaU_hp
nq1, nq2, nq3, & ! Number of q points in each direction
nqs, & ! Number of q points to be calculated
start_q, last_q, & ! Initial and final q in the list
iudwfc, & ! Unit for response wavefunctions
lrdwfc, & ! Length of the record for response wavefunctions
iudvwfc, & ! Unit for the perturbing potential * wavefunctions
lrdvwfc ! Length of the record for the perturbing potential * wavefunctions
!
@ -116,7 +113,6 @@ MODULE ldaU_hp
!
COMPLEX(DP), ALLOCATABLE :: dns0(:,:,:,:,:), & ! Bare response occupation matrix
! (from 1st iteration)
dnsscf(:,:,:,:,:), & ! SCF response occupation matrix
dns0_tot(:,:,:,:,:), & ! Total bare response occupation matrix
! (summed over q)
dnsscf_tot(:,:,:,:,:), & ! Total SCF response occupation matrix
@ -126,7 +122,4 @@ MODULE ldaU_hp
!
INTEGER, ALLOCATABLE :: ityp_new(:) ! Types of atoms
!
COMPLEX(DP), ALLOCATABLE, TARGET :: swfcatomk(:,:) ! S * atomic wfc at k
COMPLEX(DP), POINTER :: swfcatomkpq(:,:) ! S * atomic wfc at k+q
!
END MODULE ldaU_hp

View File

@ -9,7 +9,6 @@ hp_allocate_q.o : ../../Modules/wavefunctions.o
hp_allocate_q.o : ../../PW/src/ldaU.o
hp_allocate_q.o : ../../PW/src/pwcom.o
hp_allocate_q.o : ../../upflib/uspp.o
hp_allocate_q.o : hpcom.o
hp_bcast_input.o : ../../LR_Modules/lrcom.o
hp_bcast_input.o : ../../Modules/check_stop.o
hp_bcast_input.o : ../../Modules/control_flags.o
@ -49,7 +48,7 @@ hp_dealloc_q.o : ../../LR_Modules/lrcom.o
hp_dealloc_q.o : ../../Modules/becmod.o
hp_dealloc_q.o : ../../Modules/noncol.o
hp_dealloc_q.o : ../../upflib/uspp.o
hp_dealloc_q.o : hpcom.o
hp_dnsq.o : ../../LR_Modules/efermi_shift.o
hp_dnsq.o : ../../LR_Modules/lrcom.o
hp_dnsq.o : ../../Modules/constants.o
hp_dnsq.o : ../../Modules/control_flags.o
@ -64,7 +63,6 @@ hp_dnsq.o : ../../PW/src/ldaU.o
hp_dnsq.o : ../../PW/src/pwcom.o
hp_dnsq.o : ../../UtilXlib/mp.o
hp_dnsq.o : ../../upflib/uspp.o
hp_dnsq.o : hp_efermi_shift.o
hp_dnsq.o : hpcom.o
hp_dnstot_sum_q.o : ../../LR_Modules/lrcom.o
hp_dnstot_sum_q.o : ../../Modules/cell_base.o
@ -88,20 +86,6 @@ hp_dvpsi_pert.o : ../../PW/src/ldaU.o
hp_dvpsi_pert.o : ../../PW/src/pwcom.o
hp_dvpsi_pert.o : ../../UtilXlib/mp.o
hp_dvpsi_pert.o : hpcom.o
hp_efermi_shift.o : ../../FFTXlib/fft_interfaces.o
hp_efermi_shift.o : ../../Modules/cell_base.o
hp_efermi_shift.o : ../../Modules/fft_base.o
hp_efermi_shift.o : ../../Modules/io_global.o
hp_efermi_shift.o : ../../Modules/ions_base.o
hp_efermi_shift.o : ../../Modules/kind.o
hp_efermi_shift.o : ../../Modules/mp_bands.o
hp_efermi_shift.o : ../../Modules/noncol.o
hp_efermi_shift.o : ../../Modules/recvec.o
hp_efermi_shift.o : ../../Modules/wavefunctions.o
hp_efermi_shift.o : ../../PW/src/buffers.o
hp_efermi_shift.o : ../../PW/src/pwcom.o
hp_efermi_shift.o : ../../UtilXlib/mp.o
hp_efermi_shift.o : ../../upflib/uspp.o
hp_find_inequiv_sites.o : ../../Modules/io_global.o
hp_find_inequiv_sites.o : ../../Modules/ions_base.o
hp_find_inequiv_sites.o : ../../Modules/kind.o
@ -195,6 +179,7 @@ hp_read_chi.o : ../../Modules/io_global.o
hp_read_chi.o : ../../Modules/ions_base.o
hp_read_chi.o : ../../Modules/kind.o
hp_read_chi.o : hpcom.o
hp_read_dnsq.o : ../../LR_Modules/lrcom.o
hp_read_dnsq.o : ../../Modules/io_files.o
hp_read_dnsq.o : ../../Modules/io_global.o
hp_read_dnsq.o : ../../Modules/ions_base.o
@ -259,9 +244,10 @@ hp_solve_linear_system.o : ../../FFTXlib/fft_helper_subroutines.o
hp_solve_linear_system.o : ../../FFTXlib/fft_interfaces.o
hp_solve_linear_system.o : ../../LR_Modules/apply_dpot_mod.o
hp_solve_linear_system.o : ../../LR_Modules/dv_of_drho.o
hp_solve_linear_system.o : ../../LR_Modules/efermi_shift.o
hp_solve_linear_system.o : ../../LR_Modules/lrcom.o
hp_solve_linear_system.o : ../../LR_Modules/response_kernels.o
hp_solve_linear_system.o : ../../Modules/becmod.o
hp_solve_linear_system.o : ../../Modules/cell_base.o
hp_solve_linear_system.o : ../../Modules/check_stop.o
hp_solve_linear_system.o : ../../Modules/fft_base.o
hp_solve_linear_system.o : ../../Modules/io_global.o
@ -279,7 +265,6 @@ hp_solve_linear_system.o : ../../PW/src/scf_mod.o
hp_solve_linear_system.o : ../../UtilXlib/mp.o
hp_solve_linear_system.o : ../../upflib/paw_variables.o
hp_solve_linear_system.o : ../../upflib/uspp.o
hp_solve_linear_system.o : hp_efermi_shift.o
hp_solve_linear_system.o : hpcom.o
hp_stop_smoothly.o : ../../Modules/environment.o
hp_stop_smoothly.o : ../../Modules/mp_global.o
@ -333,6 +318,7 @@ hp_write_chi.o : hpcom.o
hp_write_chi_full.o : ../../Modules/io_files.o
hp_write_chi_full.o : ../../Modules/ions_base.o
hp_write_chi_full.o : hpcom.o
hp_write_dnsq.o : ../../LR_Modules/lrcom.o
hp_write_dnsq.o : ../../Modules/io_files.o
hp_write_dnsq.o : ../../Modules/io_global.o
hp_write_dnsq.o : ../../Modules/ions_base.o

View File

@ -1,3 +1,17 @@
set(ks_headers
ks_solver_interfaces.h)
set (ks_include_dir ${CMAKE_CURRENT_BINARY_DIR}/include)
#FIXME maybe better to move this part to a helper function
foreach (in_h ${ks_headers})
get_filename_component(in_h_basename ${in_h} NAME_WE)
set (out_h "${ks_include_dir}/${in_h_basename}.fh")
qe_preprocess_source("${CMAKE_CURRENT_SOURCE_DIR}/${in_h}" ${out_h})
list (APPEND ks_out_headers ${out_h})
endforeach()
add_custom_target(qe_ks_headers
DEPENDS ${ks_out_headers}
VERBATIM )
set(src_davidson
Davidson_RCI/david_rci.f90
Davidson/cegterg.f90
@ -31,12 +45,22 @@ set(src_dense
DENSE/rotate_HSpsi_gamma.f90
DENSE/rotate_HSpsi_k.f90
DENSE/rotate_wfc_gamma.f90
DENSE/rotate_wfc_k.f90
DENSE/rotate_wfc_k.f90
DENSE/rotate_xpsi_k.f90
DENSE/rotate_xpsi_gamma.f90
DENSE/gram_schmidt_k.f90
DENSE/gram_schmidt_gamma.f90
DENSE/rotate_driver.f90
# GPU
DENSE/rotate_HSpsi_gamma_gpu.f90
DENSE/rotate_HSpsi_k_gpu.f90
DENSE/rotate_wfc_gamma_gpu.f90
DENSE/rotate_wfc_k_gpu.f90)
DENSE/rotate_xpsi_k_gpu.f90
DENSE/rotate_xpsi_gamma_gpu.f90
DENSE/gram_schmidt_k_gpu.f90
DENSE/gram_schmidt_gamma_gpu.f90
DENSE/rotate_wfc_k_gpu.f90
DENSE/rotate_driver_cuf.f90)
qe_enable_cuda_fortran("${src_dense}")
set(src_paro
@ -55,6 +79,14 @@ set(src_paro
ParO/paro_k_new_gpu.f90)
qe_enable_cuda_fortran("${src_paro}")
set(src_rmmdiis
RMM/crmmdiagg.f90
RMM/rrmmdiagg.f90
# GPU
RMM/crmmdiagg_gpu.f90
RMM/rrmmdiagg_gpu.f90)
qe_enable_cuda_fortran("${src_rmmdiis}")
qe_add_library(qe_kssolver_davidson ${src_davidson})
target_link_libraries(qe_kssolver_davidson
PRIVATE
@ -95,6 +127,15 @@ target_link_libraries(qe_kssolver_dense
qe_utilx
qe_mpi_fortran
qe_devxlib)
target_include_directories(qe_kssolver_dense
PUBLIC
$<BUILD_INTERFACE:${ks_include_dir}>
$<INSTALL_INTERFACE:include/qe>
)
set_target_properties(qe_kssolver_dense
PROPERTIES PUBLIC_HEADER ${ks_out_headers}
)
add_dependencies(qe_kssolver_dense qe_ks_headers)
qe_add_library(qe_kssolver_paro ${src_paro})
target_link_libraries(qe_kssolver_paro
@ -104,6 +145,14 @@ target_link_libraries(qe_kssolver_paro
qe_utilx
qe_mpi_fortran)
qe_add_library(qe_kssolver_rmmdiis ${src_rmmdiis})
target_link_libraries(qe_kssolver_rmmdiis
PRIVATE
qe_lax
qe_utilx
qe_devxlib
qe_mpi_fortran )
###########################################################
qe_install_targets(
@ -112,4 +161,5 @@ qe_install_targets(
qe_kssolver_cg
qe_kssolver_ppcg
qe_kssolver_paro
qe_kssolver_dense)
qe_kssolver_dense
qe_kssolver_rmmdiis)

View File

@ -3,17 +3,27 @@
include ../../make.inc
# location of needed modules and included files (if any)
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG).
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG) ../../Modules $(MOD_FLAG).
DENSE = \
rotate_HSpsi_gamma.o \
rotate_HSpsi_k.o \
rotate_wfc_gamma.o \
rotate_wfc_k.o \
rotate_driver.o \
rotate_HSpsi_gamma_gpu.o \
rotate_HSpsi_k_gpu.o \
rotate_wfc_gamma_gpu.o \
rotate_wfc_k_gpu.o
rotate_wfc_k_gpu.o \
rotate_driver_cuf.o \
rotate_xpsi_k.o \
rotate_xpsi_k_gpu.o \
rotate_xpsi_gamma.o \
rotate_xpsi_gamma_gpu.o \
gram_schmidt_gamma.o \
gram_schmidt_k.o \
gram_schmidt_gamma_gpu.o \
gram_schmidt_k_gpu.o
all : libdense.a

View File

@ -0,0 +1,462 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define ZERO ( 0._DP, 0._DP )
!
!--------------------------------------------------------------------------
SUBROUTINE gram_schmidt_gamma( npwx, npw, nbnd, psi, hpsi, spsi, e, &
uspp, eigen, reorder, nbsize )
!--------------------------------------------------------------------------
!
! ... Gram-Schmidt orthogonalization, for Gamma-only calculations.
! ... blocking algorithm is used.
!
USE util_param, ONLY : DP, eps16
USE mp, ONLY : mp_sum, mp_max, mp_bcast
USE mp_bands_util, ONLY : gstart, inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nbnd
COMPLEX(DP), INTENT(INOUT) :: psi (npwx,nbnd)
COMPLEX(DP), INTENT(INOUT) :: hpsi(npwx,nbnd)
COMPLEX(DP), INTENT(INOUT) :: spsi(npwx,nbnd)
REAL(DP), INTENT(OUT) :: e(nbnd)
LOGICAL, INTENT(IN) :: uspp
LOGICAL, INTENT(IN) :: eigen
LOGICAL, INTENT(IN) :: reorder
INTEGER, INTENT(IN) :: nbsize
!
! ... local variables
!
LOGICAL :: eigen_
INTEGER :: npw2, npwx2
INTEGER :: iblock, nblock
INTEGER :: iblock_start, iblock_end
INTEGER :: jblock_start, jblock_end
INTEGER :: ibnd_start, ibnd_end
INTEGER :: jbnd_start, jbnd_end
COMPLEX(DP), ALLOCATABLE :: phi(:,:), hphi(:,:), sphi(:,:)
INTEGER, ALLOCATABLE :: owner_bgrp_id(:)
!
eigen_ = eigen
!
IF ( reorder ) THEN
!
eigen_ = .TRUE.
!
END IF
!
npw2 = 2 * npw
npwx2 = 2 * npwx
!
nblock = nbnd / nbsize
IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
!
CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
!
IF ( my_bgrp_id >= nblock ) THEN
!
iblock_start = nblock + 1
iblock_end = nblock
!
END IF
!
ALLOCATE( phi ( npwx, nbnd ) )
IF ( eigen_ ) ALLOCATE( hphi( npwx, nbnd ) )
IF ( uspp ) ALLOCATE( sphi( npwx, nbnd ) )
ALLOCATE( owner_bgrp_id( nblock ) )
!
phi = ZERO
!
IF ( eigen_ ) hphi = ZERO
!
IF ( uspp ) sphi = ZERO
!
! ... Set owners of blocks
!
owner_bgrp_id = 0
!
DO iblock = 1, nblock
!
IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
owner_bgrp_id(iblock) = my_bgrp_id
!
END DO
!
CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
!
! ... Set Im[ psi(G=0) ] - needed for numerical stability
!
IF ( gstart == 2 ) psi(1,1:nbnd) = CMPLX( DBLE( psi(1,1:nbnd) ), 0._DP, kind=DP )
!
! ... Set initial : |phi_j> = |psi_j>
!
CALL DCOPY( npwx2 * nbnd, psi(1,1), 1, phi(1,1), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) phi(1,1:nbnd) = CMPLX( DBLE( phi(1,1:nbnd) ), 0._DP, kind=DP )
!
IF ( eigen_ ) THEN
!
CALL DCOPY( npwx2 * nbnd, hpsi(1,1), 1, hphi(1,1), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) hphi(1,1:nbnd) = CMPLX( DBLE( hphi(1,1:nbnd) ), 0._DP, kind=DP )
!
END IF
!
IF ( uspp ) THEN
!
CALL DCOPY( npwx2 * nbnd, spsi(1,1), 1, sphi(1,1), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) sphi(1,1:nbnd) = CMPLX( DBLE( sphi(1,1:nbnd) ), 0._DP, kind=DP )
!
END IF
!
! ... Blocking loop
!
DO iblock = 1, nblock
!
! ... Orthogonalize diagonal block by standard Gram-Schmidt
!
ibnd_start = ( iblock - 1 ) * nbsize + 1
ibnd_end = MIN( iblock * nbsize, nbnd )
!
IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
CALL gram_schmidt_diag( ibnd_start, ibnd_end )
!
! ... Bcast diagonal block
!
CALL mp_bcast( phi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( eigen_ ) &
CALL mp_bcast( hphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( uspp ) &
CALL mp_bcast( sphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
! ... Project off-diagonal block outside of diagonal block
!
jblock_start = MAX( iblock_start, iblock + 1 )
jblock_end = iblock_end
!
jbnd_start = ( jblock_start - 1 ) * nbsize + 1
jbnd_end = MIN( jblock_end * nbsize, nbnd )
!
IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
CALL project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
END DO
!
! ... Copy psi <- phi
!
CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 )
!
IF ( eigen_ ) &
CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
!
IF ( uspp ) &
CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
!
! ... Calculate energy eigenvalues
!
IF ( eigen_ ) CALL energyeigen( )
!
! ... Sort wave functions
!
IF ( reorder ) CALL sort_vectors( )
!
DEALLOCATE( phi )
IF ( eigen_ ) DEALLOCATE( hphi )
IF ( uspp ) DEALLOCATE( sphi )
DEALLOCATE( owner_bgrp_id )
!
RETURN
!
!
CONTAINS
!
!
SUBROUTINE gram_schmidt_diag( ibnd_start, ibnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
!
INTEGER :: ibnd
REAL(DP), ALLOCATABLE :: sr(:)
REAL(DP) :: norm
REAL(DP), EXTERNAL :: DDOT
!
ALLOCATE( sr( ibnd_start:ibnd_end ) )
!
DO ibnd = ibnd_start, ibnd_end
!
IF ( ibnd > ibnd_start ) THEN
!
! ... <phi_j| S |psi_i>
!
IF ( uspp ) THEN
!
CALL DGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi(1,ibnd_start), npwx2, &
spsi(1,ibnd), 1, 0._DP, sr(ibnd_start), 1 )
!
IF ( gstart == 2 ) &
CALL DAXPY( ibnd - ibnd_start, -spsi(1,ibnd), phi(1,ibnd_start), npwx2, &
sr(ibnd_start), 1 )
!
ELSE
!
CALL DGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi(1,ibnd_start), npwx2, &
psi(1,ibnd), 1, 0._DP, sr(ibnd_start), 1 )
!
IF ( gstart == 2 ) &
CALL DAXPY( ibnd - ibnd_start, -psi(1,ibnd), phi(1,ibnd_start), npwx2, &
sr(ibnd_start), 1 )
!
END IF
!
CALL mp_sum( sr, intra_bgrp_comm )
!
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
!
CALL DGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, phi(1,ibnd_start), npwx2, &
sr(ibnd_start), 1, 1._DP, phi(1,ibnd), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) phi(1,ibnd) = CMPLX( DBLE( phi(1,ibnd) ), 0._DP, kind=DP )
!
IF ( eigen_ ) THEN
!
CALL DGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi(1,ibnd_start), npwx2, &
sr(ibnd_start), 1, 1._DP, hphi(1,ibnd), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) hphi(1,ibnd) = CMPLX( DBLE( hphi(1,ibnd) ), 0._DP, kind=DP )
!
END IF
!
IF ( uspp ) THEN
!
CALL DGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi(1,ibnd_start), npwx2, &
sr(ibnd_start), 1, 1._DP, sphi(1,ibnd), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) sphi(1,ibnd) = CMPLX( DBLE( sphi(1,ibnd) ), 0._DP, kind=DP )
!
END IF
!
END IF
!
! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
!
IF ( uspp ) THEN
!
norm = 2._DP * DDOT( npw2, phi(1,ibnd), 1, sphi(1,ibnd), 1 )
!
IF ( gstart == 2 ) norm = norm - DBLE( phi(1,ibnd) ) * DBLE ( sphi(1,ibnd) )
!
ELSE
!
norm = 2._DP * DDOT( npw2, phi(1,ibnd), 1, phi(1,ibnd), 1 )
!
IF ( gstart == 2 ) norm = norm - DBLE( phi(1,ibnd) ) * DBLE ( phi(1,ibnd) )
!
END IF
!
CALL mp_sum( norm, intra_bgrp_comm )
!
norm = SQRT( MAX( norm, 0._DP ) )
!
IF ( norm < eps16 ) &
CALL errore( ' gram_schmidt_gamma ', ' vectors are linearly dependent ', 1 )
!
CALL DSCAL( npw2, 1._DP / norm, phi(1,ibnd), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) phi(1,ibnd) = CMPLX( DBLE( phi(1,ibnd) ), 0._DP, kind=DP )
!
IF ( eigen_ ) THEN
!
CALL DSCAL( npw2, 1._DP / norm, hphi(1,ibnd), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) hphi(1,ibnd) = CMPLX( DBLE( hphi(1,ibnd) ), 0._DP, kind=DP )
!
END IF
!
IF ( uspp ) THEN
!
CALL DSCAL( npw2, 1._DP / norm, sphi(1,ibnd), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) sphi(1,ibnd) = CMPLX( DBLE( sphi(1,ibnd) ), 0._DP, kind=DP )
!
END IF
!
END DO
!
DEALLOCATE( sr )
!
RETURN
!
END SUBROUTINE gram_schmidt_diag
!
!
SUBROUTINE project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
!
INTEGER :: ibnd_size
INTEGER :: jbnd_size
REAL(DP), ALLOCATABLE :: sr(:,:)
!
ibnd_size = ibnd_end - ibnd_start + 1
jbnd_size = jbnd_end - jbnd_start + 1
!
ALLOCATE( sr( ibnd_start:ibnd_end, jbnd_start:jbnd_end ) )
!
! ... <phi_i| S |psi_j>
!
IF ( uspp ) THEN
!
CALL DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi(1,ibnd_start), npwx2, &
spsi(1,jbnd_start), npwx2, 0._DP, sr(ibnd_start,jbnd_start), ibnd_size )
!
IF ( gstart == 2 ) &
CALL DGER( ibnd_size, jbnd_size, -1._DP, psi(1,ibnd_start), npwx2, &
spsi(1,jbnd_start), npwx2, sr(ibnd_start,jbnd_start), ibnd_size )
!
ELSE
!
CALL DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi(1,ibnd_start), npwx2, &
psi(1,jbnd_start), npwx2, 0._DP, sr(ibnd_start,jbnd_start), ibnd_size )
!
IF ( gstart == 2 ) &
CALL DGER( ibnd_size, jbnd_size, -1._DP, psi(1,ibnd_start), npwx2, &
psi(1,jbnd_start), npwx2, sr(ibnd_start,jbnd_start), ibnd_size )
!
END IF
!
CALL mp_sum( sr, intra_bgrp_comm )
!
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
!
CALL DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi(1,ibnd_start), npwx2, &
sr(ibnd_start,jbnd_start), ibnd_size, 1._DP, phi(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) phi(1,jbnd_start:jbnd_end) = &
CMPLX( DBLE( phi(1,jbnd_start:jbnd_end) ), 0._DP, kind=DP )
!
IF ( eigen_ ) THEN
!
CALL DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi(1,ibnd_start), npwx2, &
sr(ibnd_start,jbnd_start), ibnd_size, 1._DP, hphi(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) hphi(1,jbnd_start:jbnd_end) = &
CMPLX( DBLE( hphi(1,jbnd_start:jbnd_end) ), 0._DP, kind=DP )
!
END IF
!
IF ( uspp ) THEN
!
CALL DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi(1,ibnd_start), npwx2, &
sr(ibnd_start,jbnd_start), ibnd_size, 1._DP, sphi(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) sphi(1,jbnd_start:jbnd_end) = &
CMPLX( DBLE( sphi(1,jbnd_start:jbnd_end) ), 0._DP, kind=DP )
!
END IF
!
DEALLOCATE( sr )
!
RETURN
!
END SUBROUTINE project_offdiag
!
!
SUBROUTINE energyeigen( )
!
IMPLICIT NONE
!
INTEGER :: ibnd, ibnd_start, ibnd_end
!
REAL(DP), EXTERNAL :: DDOT
!
! ... <psi_i| H |psi_i>
!
e(:) = 0._DP
!
CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
!
DO ibnd = ibnd_start, ibnd_end
!
e(ibnd) = 2._DP * DDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
!
IF ( gstart == 2 ) e(ibnd) = e(ibnd) - DBLE( psi(1,ibnd) ) * DBLE ( hpsi(1,ibnd) )
!
END DO
!
CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
CALL mp_sum( e, inter_bgrp_comm )
!
RETURN
!
END SUBROUTINE energyeigen
!
!
SUBROUTINE sort_vectors( )
!
IMPLICIT NONE
!
INTEGER :: ibnd
INTEGER :: nswap
REAL(DP) :: e0
!
10 nswap = 0
!
DO ibnd = 2, nbnd
!
IF ( e(ibnd) < e(ibnd-1) ) THEN
!
nswap = nswap + 1
!
e0 = e(ibnd)
e(ibnd) = e(ibnd-1)
e(ibnd-1) = e0
!
CALL DSWAP( npw2, psi(1,ibnd), 1, psi(1,ibnd-1), 1 )
!
IF ( eigen_ ) &
CALL DSWAP( npw2, hpsi(1,ibnd), 1, hpsi(1,ibnd-1), 1 )
!
IF ( uspp ) &
CALL DSWAP( npw2, spsi(1,ibnd), 1, spsi(1,ibnd-1), 1 )
!
END IF
!
END DO
!
IF ( nswap > 0 ) GOTO 10
!
RETURN
!
END SUBROUTINE sort_vectors
!
!
END SUBROUTINE gram_schmidt_gamma
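The 2._DP prefactors and the gstart == 2 corrections in the DGEMV/DGEMM/DGER and DDOT calls above implement the Gamma-only storage convention, in which only half of the G sphere is kept and overlaps are real:
\[ \langle\phi_j|\hat S|\psi_i\rangle = 2\sum_{\mathbf G}\mathrm{Re}\bigl[\phi_j^{*}(\mathbf G)\,(S\psi_i)(\mathbf G)\bigr] \;-\; \phi_j^{*}(\mathbf 0)\,(S\psi_i)(\mathbf 0), \]
the subtraction removing the otherwise double-counted G = 0 term (hence the extra rank-1 updates when gstart == 2, and the repeated zeroing of Im[phi(G=0)] for numerical stability).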

View File

@ -0,0 +1,584 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define ZERO ( 0._DP, 0._DP )
!
!--------------------------------------------------------------------------
SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
uspp, eigen, reorder, nbsize )
!--------------------------------------------------------------------------
!
! ... Gram-Schmidt orthogonalization, for Gamma-only calculations.
! ... blocking algorithm is used.
!
USE util_param, ONLY : DP, eps16
USE mp, ONLY : mp_sum, mp_max, mp_bcast
USE mp_bands_util, ONLY : gstart, inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
USE device_memcpy_m, ONLY : dev_memcpy, dev_memset
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nbnd
COMPLEX(DP), INTENT(INOUT) :: psi_d (npwx,nbnd)
COMPLEX(DP), INTENT(INOUT) :: hpsi_d(npwx,nbnd)
COMPLEX(DP), INTENT(INOUT) :: spsi_d(npwx,nbnd)
REAL(DP), INTENT(OUT) :: e(nbnd)
LOGICAL, INTENT(IN) :: uspp
LOGICAL, INTENT(IN) :: eigen
LOGICAL, INTENT(IN) :: reorder
INTEGER, INTENT(IN) :: nbsize
!
! ... local variables
!
LOGICAL :: eigen_
INTEGER :: npw2, npwx2
INTEGER :: iblock, nblock
INTEGER :: iblock_start, iblock_end
INTEGER :: jblock_start, jblock_end
INTEGER :: ibnd_start, ibnd_end
INTEGER :: jbnd_start, jbnd_end
COMPLEX(DP), ALLOCATABLE :: phi(:,:), hphi(:,:), sphi(:,:)
INTEGER, ALLOCATABLE :: owner_bgrp_id(:)
!
! ... device variables
!
INTEGER :: ii, buf_start, buf_end, buf_size, info
COMPLEX(DP),ALLOCATABLE :: phi_d (:,:)
COMPLEX(DP),ALLOCATABLE :: hphi_d(:,:)
COMPLEX(DP),ALLOCATABLE :: sphi_d(:,:)
!
#if defined (__CUDA)
attributes(device) :: psi_d, hpsi_d, spsi_d
attributes(device) :: phi_d, hphi_d, sphi_d
#endif
!
COMPLEX(DP), ALLOCATABLE :: sr_d(:), sr2_d(:,:)
#if defined (__CUDA)
attributes(device) :: sr_d, sr2_d
#endif
!
!
CALL start_clock( 'gsorth' )
!
eigen_ = eigen
!
IF ( reorder ) THEN
!
eigen_ = .TRUE.
!
END IF
!
npw2 = 2 * npw
npwx2 = 2 * npwx
!
nblock = nbnd / nbsize
IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
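!
! ... number of blocks = ceiling( nbnd / nbsize ); the blocks are then
! ... distributed among the band groups by divide()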
!
CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
!
IF ( my_bgrp_id >= nblock ) THEN
!
iblock_start = nblock + 1
iblock_end = nblock
!
END IF
!
ALLOCATE( phi_d ( npwx, nbnd ) )
IF ( eigen_ ) ALLOCATE( hphi_d( npwx, nbnd ) )
IF ( uspp ) ALLOCATE( sphi_d( npwx, nbnd ) )
ALLOCATE( owner_bgrp_id( nblock ) )
!
phi_d = ZERO
!
IF ( eigen_ ) hphi_d = ZERO
!
IF ( uspp ) sphi_d = ZERO
!
!
! ... Set owners of blocks
!
owner_bgrp_id = 0
!
DO iblock = 1, nblock
!
IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
owner_bgrp_id(iblock) = my_bgrp_id
!
END DO
!
CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
!
! ... Set Im[ psi(G=0) ] - needed for numerical stability
!
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii =1,nbnd
psi_d(1,ii) = CMPLX( DBLE( psi_d(1,ii) ), 0._DP, kind=DP )
END DO
END IF
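!
! ... with Gamma-only storage psi(-G) = psi*(G), so the G=0 coefficient must
! ... be purely real: forcing Im[psi(G=0)] = 0 keeps the real-arithmetic dot
! ... products used below exact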
!
! ... Set initial : |phi_j> = |psi_j>
!
CALL DCOPY_gpu( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii =1,nbnd
phi_d(1,ii) = CMPLX( DBLE( phi_d(1,ii) ), 0._DP, kind=DP )
END DO
END IF
!
!
IF ( eigen_ ) THEN
!
CALL DCOPY_gpu( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii =1,nbnd
hphi_d(1,ii) = CMPLX( DBLE( hphi_d(1,ii) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
IF ( uspp ) THEN
!
CALL DCOPY_gpu( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii =1,nbnd
sphi_d(1,ii) = CMPLX( DBLE( sphi_d(1,ii) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
! ... Buffer allocation
!
buf_size = nbsize
!
ALLOCATE( sr_d(buf_size))
ALLOCATE( sr2_d(buf_size, buf_size))
!
! ... Blocking loop
!
DO iblock = 1, nblock
!
! ... Orthogonalize diagonal block by standard Gram-Schmidt
!
ibnd_start = ( iblock - 1 ) * nbsize + 1
ibnd_end = MIN( iblock * nbsize, nbnd )
!
IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
CALL gram_schmidt_diag_gpu( ibnd_start, ibnd_end )
!
! ... Bcast diagonal block
!
CALL mp_bcast( phi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( eigen_ ) &
CALL mp_bcast( hphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( uspp ) &
CALL mp_bcast( sphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
! ... Project off-diagonal block outside of diagonal block
!
jblock_start = MAX( iblock_start, iblock + 1 )
jblock_end = iblock_end
!
jbnd_start = ( jblock_start - 1 ) * nbsize + 1
jbnd_end = MIN( jblock_end * nbsize, nbnd )
!
IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
CALL project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
END DO
!
! ... Buffer release
!
DEALLOCATE (sr_d, sr2_d)
!
!
! ... Copy psi <- phi
!
!CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 )
CALL DCOPY_gpu( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
!
IF ( eigen_ ) &
CALL DCOPY_gpu( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
!CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
!
IF ( uspp ) &
CALL DCOPY_gpu( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
!CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
!
! ... Calculate energy eigenvalues
!
IF ( eigen_ ) CALL energyeigen_gpu( )
!
! ... Sort wave functions
!
IF ( reorder ) CALL sort_vectors_gpu( )
!
DEALLOCATE( owner_bgrp_id )
!
DEALLOCATE( phi_d )
IF ( eigen_ ) DEALLOCATE( hphi_d )
IF ( uspp ) DEALLOCATE( sphi_d )
!
CALL stop_clock( 'gsorth' )
!
RETURN
!
!
CONTAINS
!
!
SUBROUTINE gram_schmidt_diag_gpu( ibnd_start, ibnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
!
INTEGER :: ibnd
REAL(DP) :: norm
REAL(DP) :: psi_ibnd
REAL(DP), EXTERNAL :: gpu_DDOT
!
DO ibnd = ibnd_start, ibnd_end
!
IF ( ibnd > ibnd_start ) THEN
!
! ... <phi_j| S |psi_i>
!
IF ( uspp ) THEN
!
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
spsi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
!
IF ( gstart == 2 ) THEN
psi_ibnd = -spsi_d(1,ibnd)
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, &
sr_d(1), 1 )
END IF
!
ELSE
!
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
psi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
!
IF ( gstart == 2 ) THEN
psi_ibnd = -psi_d(1,ibnd)
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, &
sr_d(1), 1 )
END IF
!
END IF
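!
! ... sr_d(k) now holds <phi_k| S |psi_i> for k = ibnd_start ... ibnd-1,
! ... built with the Gamma-only factor 2 plus the G=0 correction applied
! ... by the DAXPY; the local contributions are reduced next.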
!
CALL mp_sum( sr_d, intra_bgrp_comm )
!
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
!
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, &
sr_d(1), 1, 1._DP, phi_d(1,ibnd), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
phi_d(1,ibnd) = CMPLX( DBLE( phi_d(1,ibnd) ), 0._DP, kind=DP )
END DO
END IF
!
IF ( eigen_ ) THEN
!
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, &
sr_d(1), 1, 1._DP, hphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
hphi_d(1,ibnd) = CMPLX( DBLE( hphi_d(1,ibnd) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
IF ( uspp ) THEN
!
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, &
sr_d(1), 1, 1._DP, sphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
sphi_d(1,ibnd) = CMPLX( DBLE( sphi_d(1,ibnd) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
END IF
!
! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
!
IF ( uspp ) THEN
!
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 )
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
norm = norm - DBLE( phi_d(1,ibnd) ) * DBLE ( sphi_d(1,ibnd) )
END DO
END IF
!
ELSE
!
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 )
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
norm = norm - DBLE( phi_d(1,ibnd) ) * DBLE ( phi_d(1,ibnd) )
END DO
END IF
!
END IF
!
CALL mp_sum( norm, intra_bgrp_comm )
!
norm = SQRT( MAX( norm, 0._DP ) )
!
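! ... a vanishing S-norm means the current vector is numerically a linear
! ... combination of the previous ones and cannot be orthonormalized
!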
IF ( norm < eps16 ) &
CALL errore( ' gram_schmidt_gamma ', ' vectors are linear dependent ', 1 )
!
CALL DSCAL_gpu( npw2, 1._DP / norm, phi_d(1,ibnd), 1 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
phi_d(1,ibnd) = CMPLX( DBLE( phi_d(1,ibnd) ), 0._DP, kind=DP )
END DO
END IF
!
IF ( eigen_ ) THEN
!
CALL DSCAL_gpu( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
hphi_d(1,ibnd) = CMPLX( DBLE( hphi_d(1,ibnd) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
IF ( uspp ) THEN
!
CALL DSCAL_gpu( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
sphi_d(1,ibnd) = CMPLX( DBLE( sphi_d(1,ibnd) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
END DO
!
RETURN
!
END SUBROUTINE gram_schmidt_diag_gpu
!
!
SUBROUTINE project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
!
INTEGER :: ibnd_size
INTEGER :: jbnd_size
!
!
ibnd_size = ibnd_end - ibnd_start + 1
jbnd_size = jbnd_end - jbnd_start + 1
!
! ... <phi_i| S |psi_j>
!
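! ... the whole (ibnd block) x (jbnd block) overlap matrix is formed in one
! ... DGEMM (factor 2 for the Gamma-only storage) followed by a rank-1 DGER
! ... update removing the doubly counted G=0 term, as in the CPU routine
!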
IF ( uspp ) THEN
!
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
spsi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), ibnd_size )
!
IF ( gstart == 2 ) &
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
spsi_d(1,jbnd_start), npwx2, sr2_d(1,1), ibnd_size )
!
ELSE
!
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
psi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), ibnd_size )
!
IF ( gstart == 2 ) &
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
psi_d(1,jbnd_start), npwx2, sr2_d(1,1), ibnd_size )
!
END IF
!
CALL mp_sum( sr2_d, intra_bgrp_comm )
!
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
!
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, &
sr2_d(1,1), ibnd_size, 1._DP, phi_d(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=jbnd_start, jbnd_end
phi_d(1, ii) = &
CMPLX( DBLE( phi_d(1, ii) ), 0._DP, kind=DP )
END DO
END IF
!
IF ( eigen_ ) THEN
!
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, &
sr2_d(1,1), ibnd_size, 1._DP, hphi_d(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii= jbnd_start, jbnd_end
hphi_d(1, ii) = &
CMPLX( DBLE( hphi_d(1, ii) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
IF ( uspp ) THEN
!
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, &
sr2_d(1,1), ibnd_size, 1._DP, sphi_d(1,jbnd_start), npwx2 )
!
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=jbnd_start, jbnd_end
sphi_d(1, ii ) = &
CMPLX( DBLE( sphi_d(1, ii) ), 0._DP, kind=DP )
END DO
END IF
!
END IF
!
RETURN
!
END SUBROUTINE project_offdiag_gpu
!
!
SUBROUTINE energyeigen_gpu( )
!
IMPLICIT NONE
!
INTEGER :: ibnd, ibnd_start, ibnd_end
REAL(DP) :: e_ibnd
!
REAL(DP), EXTERNAL :: gpu_DDOT
!
! ... <psi_i| H |psi_i>
!
e(:) = 0._DP
!
CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
!
DO ibnd = ibnd_start, ibnd_end
!
e(ibnd) = 2._DP * gpu_DDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 )
!
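! ... a one-trip CUF kernel evaluates the G=0 product on the device and
! ... returns it in the host scalar e_ibnd, so the Gamma-only correction
! ... can be applied on the host
!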
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO ii=1,1
e_ibnd = DBLE( psi_d(1,ibnd) ) * DBLE ( hpsi_d(1,ibnd) )
END DO
e(ibnd) = e(ibnd) - e_ibnd
END IF
!
END DO
!
CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
CALL mp_sum( e, inter_bgrp_comm )
!
RETURN
!
END SUBROUTINE energyeigen_gpu
!
!
SUBROUTINE sort_vectors_gpu( )
!
IMPLICIT NONE
!
INTEGER :: ibnd
INTEGER :: nswap
REAL(DP) :: e0
!
10 nswap = 0
!
DO ibnd = 2, nbnd
!
IF ( e(ibnd) < e(ibnd-1) ) THEN
!
nswap = nswap + 1
!
e0 = e(ibnd)
e(ibnd) = e(ibnd-1)
e(ibnd-1) = e0
!
CALL DSWAP_gpu( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
!
IF ( eigen_ ) &
CALL DSWAP_gpu( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
!
IF ( uspp ) &
CALL DSWAP_gpu( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
!
END IF
!
END DO
!
IF ( nswap > 0 ) GOTO 10
!
RETURN
!
END SUBROUTINE sort_vectors_gpu
!
!
END SUBROUTINE gram_schmidt_gamma_gpu


@ -0,0 +1,384 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define ZERO ( 0._DP, 0._DP )
#define ONE ( 1._DP, 0._DP )
#define MONE (-1._DP, 0._DP )
!
!--------------------------------------------------------------------------
SUBROUTINE gram_schmidt_k( npwx, npw, nbnd, npol, psi, hpsi, spsi, e, &
uspp, eigen, reorder, nbsize )
!--------------------------------------------------------------------------
!
! ... Gram-Schmidt orthogonalization, for k-point calculations.
! ... blocking algorithm is used.
!
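! ... the nbnd bands are split into blocks of nbsize: each diagonal block is
! ... orthonormalized by classical Gram-Schmidt on the band group that owns
! ... it and broadcast; the orthonormalized block is then projected out of
! ... the following blocks with a single ZGEMM (see project_offdiag below).
!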
USE util_param, ONLY : DP, eps16
USE mp, ONLY : mp_sum, mp_max, mp_bcast
USE mp_bands_util, ONLY : inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol
COMPLEX(DP), INTENT(INOUT) :: psi (npwx*npol,nbnd)
COMPLEX(DP), INTENT(INOUT) :: hpsi(npwx*npol,nbnd)
COMPLEX(DP), INTENT(INOUT) :: spsi(npwx*npol,nbnd)
REAL(DP), INTENT(INOUT) :: e(nbnd)
LOGICAL, INTENT(IN) :: uspp
LOGICAL, INTENT(IN) :: eigen
LOGICAL, INTENT(IN) :: reorder
INTEGER, INTENT(IN) :: nbsize
!
! ... local variables
!
LOGICAL :: eigen_
INTEGER :: kdim, kdmx
INTEGER :: iblock, nblock
INTEGER :: iblock_start, iblock_end
INTEGER :: jblock_start, jblock_end
INTEGER :: ibnd_start, ibnd_end
INTEGER :: jbnd_start, jbnd_end
COMPLEX(DP), ALLOCATABLE :: phi(:,:), hphi(:,:), sphi(:,:)
INTEGER, ALLOCATABLE :: owner_bgrp_id(:)
!
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx*npol
kdmx = npwx*npol
!
END IF
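!
! ... npol = 1 : only the first npw rows of each column carry data
! ...           (kdim = npw, leading dimension kdmx = npwx)
! ... npol > 1 : the spinor components are stored contiguously,
! ...           so kdim = kdmx = npwx*npol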
!
eigen_ = eigen
!
IF ( reorder ) THEN
!
eigen_ = .TRUE.
!
END IF
!
nblock = nbnd / nbsize
IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
!
CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
!
IF ( my_bgrp_id >= nblock ) THEN
!
iblock_start = nblock + 1
iblock_end = nblock
!
END IF
!
ALLOCATE( phi ( kdmx, nbnd ) )
IF ( eigen_ ) ALLOCATE( hphi( kdmx, nbnd ) )
IF ( uspp ) ALLOCATE( sphi( kdmx, nbnd ) )
ALLOCATE( owner_bgrp_id( nblock ) )
!
phi = ZERO
!
IF ( eigen_ ) hphi = ZERO
!
IF ( uspp ) sphi = ZERO
!
! ... Set owners of blocks
!
owner_bgrp_id = 0
!
DO iblock = 1, nblock
!
IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
owner_bgrp_id(iblock) = my_bgrp_id
!
END DO
!
CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
!
! ... Set initial : |phi_j> = |psi_j>
!
CALL ZCOPY( kdmx * nbnd, psi(1,1), 1, phi(1,1), 1 )
!
IF ( eigen_ ) &
CALL ZCOPY( kdmx * nbnd, hpsi(1,1), 1, hphi(1,1), 1 )
!
IF ( uspp ) &
CALL ZCOPY( kdmx * nbnd, spsi(1,1), 1, sphi(1,1), 1 )
!
! ... Blocking loop
!
DO iblock = 1, nblock
!
! ... Orthogonalize diagonal block by standard Gram-Schmidt
!
ibnd_start = ( iblock - 1 ) * nbsize + 1
ibnd_end = MIN( iblock * nbsize, nbnd )
!
IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
CALL gram_schmidt_diag( ibnd_start, ibnd_end )
!
! ... Bcast diagonal block
!
CALL mp_bcast( phi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( eigen_ ) &
CALL mp_bcast( hphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( uspp ) &
CALL mp_bcast( sphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
! ... Project off-diagonal block outside of diagonal block
!
jblock_start = MAX( iblock_start, iblock + 1 )
jblock_end = iblock_end
!
jbnd_start = ( jblock_start - 1 ) * nbsize + 1
jbnd_end = MIN( jblock_end * nbsize, nbnd )
!
IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
CALL project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
END DO
!
! ... Copy psi <- phi
!
CALL ZCOPY( kdmx * nbnd, phi(1,1), 1, psi(1,1), 1 )
!
IF ( eigen_ ) &
CALL ZCOPY( kdmx * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
!
IF ( uspp ) &
CALL ZCOPY( kdmx * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
!
! ... Calculate energy eigenvalues
!
IF ( eigen_ ) CALL energyeigen( )
!
! ... Sort wave functions
!
IF ( reorder ) CALL sort_vectors( )
!
DEALLOCATE( phi )
IF ( eigen_ ) DEALLOCATE( hphi )
IF ( uspp ) DEALLOCATE( sphi )
DEALLOCATE( owner_bgrp_id )
!
RETURN
!
!
CONTAINS
!
!
SUBROUTINE gram_schmidt_diag( ibnd_start, ibnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
!
INTEGER :: ibnd
COMPLEX(DP), ALLOCATABLE :: sc(:)
REAL(DP) :: norm
REAL(DP), EXTERNAL :: DDOT
!
ALLOCATE( sc( ibnd_start:ibnd_end ) )
!
DO ibnd = ibnd_start, ibnd_end
!
IF ( ibnd > ibnd_start ) THEN
!
! ... <phi_j| S |psi_i>
!
IF ( uspp ) THEN
!
CALL ZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi(1,ibnd_start), kdmx, &
spsi(1,ibnd), 1, ZERO, sc(ibnd_start), 1 )
!
ELSE
!
CALL ZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi(1,ibnd_start), kdmx, &
psi(1,ibnd), 1, ZERO, sc(ibnd_start), 1 )
!
END IF
!
CALL mp_sum( sc, intra_bgrp_comm )
!
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
!
CALL ZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, phi(1,ibnd_start), kdmx, &
sc(ibnd_start), 1, ONE, phi(1,ibnd), 1 )
!
IF ( eigen_ ) &
CALL ZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, hphi(1,ibnd_start), kdmx, &
sc(ibnd_start), 1, ONE, hphi(1,ibnd), 1 )
!
IF ( uspp ) &
CALL ZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, sphi(1,ibnd_start), kdmx, &
sc(ibnd_start), 1, ONE, sphi(1,ibnd), 1 )
!
END IF
!
! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
!
IF ( uspp ) THEN
!
norm = DDOT( 2*kdim, phi(1,ibnd), 1, sphi(1,ibnd), 1 )
!
ELSE
!
norm = DDOT ( 2*kdim, phi(1,ibnd), 1, phi(1,ibnd), 1 )
!
END IF
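!
! ... <phi_i| S |phi_i> is real, so DDOT over the 2*kdim real components
! ... already gives the squared S-norm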
!
CALL mp_sum( norm, intra_bgrp_comm )
!
norm = SQRT( MAX( norm, 0._DP ) )
!
IF ( norm < eps16 ) &
CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 )
!
CALL ZDSCAL( kdim, 1._DP / norm, phi(1,ibnd), 1 )
!
IF ( eigen_ ) &
CALL ZDSCAL( kdim, 1._DP / norm, hphi(1,ibnd), 1 )
!
IF ( uspp ) &
CALL ZDSCAL( kdim, 1._DP / norm, sphi(1,ibnd), 1 )
!
END DO
!
DEALLOCATE( sc )
!
RETURN
!
END SUBROUTINE gram_schmidt_diag
!
!
SUBROUTINE project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
!
INTEGER :: ibnd_size
INTEGER :: jbnd_size
COMPLEX(DP), ALLOCATABLE :: sc(:,:)
!
ibnd_size = ibnd_end - ibnd_start + 1
jbnd_size = jbnd_end - jbnd_start + 1
!
ALLOCATE( sc( ibnd_start:ibnd_end, jbnd_start:jbnd_end ) )
!
! ... <phi_i| S |psi_j>
!
IF ( uspp ) THEN
!
CALL ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi(1,ibnd_start), kdmx, &
spsi(1,jbnd_start), kdmx, ZERO, sc(ibnd_start,jbnd_start), ibnd_size )
!
ELSE
!
CALL ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi(1,ibnd_start), kdmx, &
psi(1,jbnd_start), kdmx, ZERO, sc(ibnd_start,jbnd_start), ibnd_size )
!
END IF
!
CALL mp_sum( sc, intra_bgrp_comm )
!
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
!
CALL ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi(1,ibnd_start), kdmx, &
sc(ibnd_start,jbnd_start), ibnd_size, ONE, phi(1,jbnd_start), kdmx )
!
IF ( eigen_ ) &
CALL ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi(1,ibnd_start), kdmx, &
sc(ibnd_start,jbnd_start), ibnd_size, ONE, hphi(1,jbnd_start), kdmx )
!
IF ( uspp ) &
CALL ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi(1,ibnd_start), kdmx, &
sc(ibnd_start,jbnd_start), ibnd_size, ONE, sphi(1,jbnd_start), kdmx )
!
DEALLOCATE( sc )
!
RETURN
!
END SUBROUTINE project_offdiag
!
!
SUBROUTINE energyeigen( )
!
IMPLICIT NONE
!
INTEGER :: ibnd, ibnd_start, ibnd_end
!
REAL(DP), EXTERNAL :: DDOT
!
! ... <psi_i| H |psi_i>
!
e(:) = 0._DP
!
CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
!
DO ibnd = ibnd_start, ibnd_end
!
e(ibnd) = DDOT( 2*kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
!
END DO
!
CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
CALL mp_sum( e, inter_bgrp_comm )
!
RETURN
!
END SUBROUTINE energyeigen
!
!
SUBROUTINE sort_vectors( )
!
IMPLICIT NONE
!
INTEGER :: ibnd
INTEGER :: nswap
REAL(DP) :: e0
!
10 nswap = 0
!
DO ibnd = 2, nbnd
!
IF ( e(ibnd) < e(ibnd-1) ) THEN
!
nswap = nswap + 1
!
e0 = e(ibnd)
e(ibnd) = e(ibnd-1)
e(ibnd-1) = e0
!
CALL ZSWAP( kdim, psi(1,ibnd), 1, psi(1,ibnd-1), 1 )
!
IF ( eigen_ ) &
CALL ZSWAP( kdim, hpsi(1,ibnd), 1, hpsi(1,ibnd-1), 1 )
!
IF ( uspp ) &
CALL ZSWAP( kdim, spsi(1,ibnd), 1, spsi(1,ibnd-1), 1 )
!
END IF
!
END DO
!
IF ( nswap > 0 ) GOTO 10
!
RETURN
!
END SUBROUTINE sort_vectors
!
!
END SUBROUTINE gram_schmidt_k


@ -0,0 +1,433 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#define ZERO ( 0._DP, 0._DP )
#define ONE ( 1._DP, 0._DP )
#define MONE (-1._DP, 0._DP )
!
!--------------------------------------------------------------------------
SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e, &
uspp, eigen, reorder, nbsize )
!--------------------------------------------------------------------------
!
! ... Gram-Schmidt orthogonalization, for k-point calculations.
! ... blocking algorithm is used.
!
USE util_param, ONLY : DP, eps16
USE mp, ONLY : mp_sum, mp_max, mp_bcast
USE mp_bands_util, ONLY : inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
USE device_fbuff_m, ONLY : buffer => dev_buf
USE device_memcpy_m, ONLY : dev_memcpy, dev_memset
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nbnd, npol
COMPLEX(DP), INTENT(INOUT) :: psi_d (npwx*npol,nbnd)
COMPLEX(DP), INTENT(INOUT) :: hpsi_d(npwx*npol,nbnd)
COMPLEX(DP), INTENT(INOUT) :: spsi_d(npwx*npol,nbnd)
REAL(DP), INTENT(INOUT) :: e(nbnd)
LOGICAL, INTENT(IN) :: uspp
LOGICAL, INTENT(IN) :: eigen
LOGICAL, INTENT(IN) :: reorder
INTEGER, INTENT(IN) :: nbsize
!
! ... local variables
!
LOGICAL :: eigen_
INTEGER :: kdim, kdmx
INTEGER :: iblock, nblock
INTEGER :: iblock_start, iblock_end
INTEGER :: jblock_start, jblock_end
INTEGER :: ibnd_start, ibnd_end
INTEGER :: jbnd_start, jbnd_end
INTEGER, ALLOCATABLE :: owner_bgrp_id(:)
INTEGER :: buf_start, buf_end, buf_size
!
! ... device variables
!
INTEGER :: ii, jj, kk, info
COMPLEX(DP), ALLOCATABLE :: phi_d(:,:), hphi_d(:,:), sphi_d(:,:)
#if defined (__CUDA)
attributes(device) :: psi_d, hpsi_d, spsi_d
attributes(device) :: phi_d, hphi_d, sphi_d
#endif
!
COMPLEX(DP), ALLOCATABLE :: sc_d(:), sc2_d(:,:)
#if defined (__CUDA)
attributes(device) :: sc_d, sc2_d
#endif
!
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx*npol
kdmx = npwx*npol
!
END IF
!
eigen_ = eigen
!
IF ( reorder ) THEN
!
eigen_ = .TRUE.
!
END IF
!
nblock = nbnd / nbsize
IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
!
CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
!
IF ( my_bgrp_id >= nblock ) THEN
!
iblock_start = nblock + 1
iblock_end = nblock
!
END IF
!
ALLOCATE( phi_d ( kdmx, nbnd ) )
IF ( eigen_ ) ALLOCATE( hphi_d( kdmx, nbnd ) )
IF ( uspp ) ALLOCATE( sphi_d( kdmx, nbnd ) )
!
ALLOCATE( owner_bgrp_id( nblock ) )
!
!$cuf kernel do(2)
DO ii = 1, kdmx
DO jj = 1, nbnd
phi_d(ii, jj) = ZERO
END DO
END DO
!
IF ( eigen_ ) THEN
!$cuf kernel do(2)
DO ii = 1, kdmx
DO jj = 1, nbnd
hphi_d(ii, jj) = ZERO
END DO
END DO
END IF
!
IF ( uspp ) THEN
!$cuf kernel do(2)
DO ii = 1, kdmx
DO jj = 1, nbnd
sphi_d(ii, jj) = ZERO
END DO
END DO
END IF
!
! ... Set owners of blocks
!
owner_bgrp_id = 0
!
DO iblock = 1, nblock
!
IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
owner_bgrp_id(iblock) = my_bgrp_id
!
END DO
!
CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
!
! ... Set initial : |phi_j> = |psi_j>
!
CALL ZCOPY_gpu( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
!
IF ( eigen_ ) &
CALL ZCOPY_gpu( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
!
IF ( uspp ) &
CALL ZCOPY_gpu( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
!
!
! ... Allocate buffers
!
buf_size = nbsize
ALLOCATE (sc_d(buf_size), sc2_d(buf_size, buf_size))
!
!
! ... Blocking loop
!
DO iblock = 1, nblock
!
! ... Orthogonalize diagonal block by standard Gram-Schmidt
!
ibnd_start = ( iblock - 1 ) * nbsize + 1
ibnd_end = MIN( iblock * nbsize, nbnd )
!
IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
   CALL gram_schmidt_diag( ibnd_start, ibnd_end )
!
!
! ... Bcast diagonal block
!
CALL mp_bcast( phi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( eigen_ ) &
CALL mp_bcast( hphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
IF ( uspp ) &
CALL mp_bcast( sphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
!
! ... Project off-diagonal block outside of diagonal block
!
jblock_start = MAX( iblock_start, iblock + 1 )
jblock_end = iblock_end
!
jbnd_start = ( jblock_start - 1 ) * nbsize + 1
jbnd_end = MIN( jblock_end * nbsize, nbnd )
!
IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
   CALL project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
!
END DO
!
! ... Buffer release
!
DEALLOCATE (sc_d, sc2_d)
!
!
! ... Copy psi <- phi
!
CALL ZCOPY_gpu( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
!
IF ( eigen_ ) &
CALL ZCOPY_gpu( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
!
IF ( uspp ) &
CALL ZCOPY_gpu( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
!
! ... Calculate energy eigenvalues
!
IF ( eigen_ ) CALL energyeigen_gpu( )
!
! ... Sort wave functions
!
IF ( reorder ) CALL sort_vectors_gpu( )
!
DEALLOCATE( phi_d )
IF ( eigen_ ) DEALLOCATE( hphi_d )
IF ( uspp ) DEALLOCATE( sphi_d )
!
DEALLOCATE( owner_bgrp_id )
!
RETURN
!
!
CONTAINS
!
!
SUBROUTINE gram_schmidt_diag( ibnd_start, ibnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
!
INTEGER :: ibnd
REAL(DP) :: norm
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
!
!
DO ibnd = ibnd_start, ibnd_end
!
IF ( ibnd > ibnd_start ) THEN
!
! ... <phi_j| S |psi_i>
!
IF ( uspp ) THEN
!
CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
spsi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
!
ELSE
!
CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
psi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
!
END IF
!
!
CALL mp_sum( sc_d, intra_bgrp_comm )
!
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
!
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, &
sc_d(1), 1, ONE, phi_d(1,ibnd), 1 )
!
!
IF ( eigen_ ) &
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, &
sc_d(1), 1, ONE, hphi_d(1,ibnd), 1 )
!
IF ( uspp ) &
CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, &
sc_d(1), 1, ONE, sphi_d(1,ibnd), 1 )
!
END IF
!
! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
!
IF ( uspp ) THEN
!
norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) )
!
ELSE
!
norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) )
!
END IF
!
CALL mp_sum( norm, intra_bgrp_comm )
!
norm = SQRT( MAX( norm, 0._DP ) )
!
IF ( norm < eps16 ) &
CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 )
!
CALL ZDSCAL_gpu( kdim, 1._DP / norm, phi_d(1,ibnd), 1 )
!
IF ( eigen_ ) &
CALL ZDSCAL_gpu( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 )
!
IF ( uspp ) &
CALL ZDSCAL_gpu( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 )
!
END DO
!
!
RETURN
!
END SUBROUTINE gram_schmidt_diag
!
!
SUBROUTINE project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
!
IMPLICIT NONE
!
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
!
INTEGER :: ibnd_size
INTEGER :: jbnd_size
!
ibnd_size = ibnd_end - ibnd_start + 1
jbnd_size = jbnd_end - jbnd_start + 1
!
! ... <phi_i| S |psi_j>
!
IF ( uspp ) THEN
!
CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
spsi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), ibnd_size )
!
ELSE
!
CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
psi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), ibnd_size )
!
END IF
!
CALL mp_sum( sc2_d, intra_bgrp_comm )
!
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
!
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, &
sc2_d(1,1), ibnd_size, ONE, phi_d(1,jbnd_start), kdmx )
!
IF ( eigen_ ) &
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, &
sc2_d(1,1), ibnd_size, ONE, hphi_d(1,jbnd_start), kdmx )
!
IF ( uspp ) &
CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, &
sc2_d(1,1), ibnd_size, ONE, sphi_d(1,jbnd_start), kdmx )
!
RETURN
!
END SUBROUTINE project_offdiag_gpu
!
!
SUBROUTINE energyeigen_gpu( )
!
IMPLICIT NONE
!
INTEGER :: ibnd, ibnd_start, ibnd_end
!
COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
!
! ... <psi_i| H |psi_i>
!
e(:) = 0._DP
!
CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
!
DO ibnd = ibnd_start, ibnd_end
!
e(ibnd) = DBLE( ZDOTC_gpu( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) )
!
END DO
!
CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
CALL mp_sum( e, inter_bgrp_comm )
!
RETURN
!
END SUBROUTINE energyeigen_gpu
!
!
SUBROUTINE sort_vectors_gpu( )
!
IMPLICIT NONE
!
INTEGER :: ibnd
INTEGER :: nswap
REAL(DP) :: e0
!
10 nswap = 0
!
DO ibnd = 2, nbnd
!
IF ( e(ibnd) < e(ibnd-1) ) THEN
!
nswap = nswap + 1
!
e0 = e(ibnd)
e(ibnd) = e(ibnd-1)
e(ibnd-1) = e0
!
CALL ZSWAP_gpu( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
!
IF ( eigen_ ) &
CALL ZSWAP_gpu( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
!
IF ( uspp ) &
CALL ZSWAP_gpu( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
!
END IF
!
END DO
!
IF ( nswap > 0 ) GOTO 10
!
RETURN
!
END SUBROUTINE sort_vectors_gpu
!
!
END SUBROUTINE gram_schmidt_k_gpu


@ -1,3 +1,15 @@
gram_schmidt_gamma.o : ../../UtilXlib/mp.o
gram_schmidt_gamma.o : ../../UtilXlib/mp_bands_util.o
gram_schmidt_gamma.o : ../../UtilXlib/util_param.o
gram_schmidt_gamma_gpu.o : ../../UtilXlib/mp.o
gram_schmidt_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
gram_schmidt_gamma_gpu.o : ../../UtilXlib/util_param.o
gram_schmidt_k.o : ../../UtilXlib/mp.o
gram_schmidt_k.o : ../../UtilXlib/mp_bands_util.o
gram_schmidt_k.o : ../../UtilXlib/util_param.o
gram_schmidt_k_gpu.o : ../../UtilXlib/mp.o
gram_schmidt_k_gpu.o : ../../UtilXlib/mp_bands_util.o
gram_schmidt_k_gpu.o : ../../UtilXlib/util_param.o
rotate_HSpsi_gamma.o : ../../UtilXlib/mp.o
rotate_HSpsi_gamma.o : ../../UtilXlib/mp_bands_util.o
rotate_HSpsi_gamma.o : ../../UtilXlib/util_param.o
@ -10,6 +22,8 @@ rotate_HSpsi_k.o : ../../UtilXlib/util_param.o
rotate_HSpsi_k_gpu.o : ../../UtilXlib/mp.o
rotate_HSpsi_k_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_HSpsi_k_gpu.o : ../../UtilXlib/util_param.o
rotate_driver.o : ../../UtilXlib/util_param.o
rotate_driver_cuf.o : ../../UtilXlib/util_param.o
rotate_wfc_gamma.o : ../../UtilXlib/mp.o
rotate_wfc_gamma.o : ../../UtilXlib/mp_bands_util.o
rotate_wfc_gamma.o : ../../UtilXlib/util_param.o
@ -22,3 +36,15 @@ rotate_wfc_k.o : ../../UtilXlib/util_param.o
rotate_wfc_k_gpu.o : ../../UtilXlib/mp.o
rotate_wfc_k_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_wfc_k_gpu.o : ../../UtilXlib/util_param.o
rotate_xpsi_gamma.o : ../../UtilXlib/mp.o
rotate_xpsi_gamma.o : ../../UtilXlib/mp_bands_util.o
rotate_xpsi_gamma.o : ../../UtilXlib/util_param.o
rotate_xpsi_gamma_gpu.o : ../../UtilXlib/mp.o
rotate_xpsi_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_xpsi_gamma_gpu.o : ../../UtilXlib/util_param.o
rotate_xpsi_k.o : ../../UtilXlib/mp.o
rotate_xpsi_k.o : ../../UtilXlib/mp_bands_util.o
rotate_xpsi_k.o : ../../UtilXlib/util_param.o
rotate_xpsi_k_gpu.o : ../../UtilXlib/mp.o
rotate_xpsi_k_gpu.o : ../../UtilXlib/mp_bands_util.o
rotate_xpsi_k_gpu.o : ../../UtilXlib/util_param.o


@ -0,0 +1,95 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_xpsi_driver &
( npwx, npw, nstart, nbnd, psi, npol, overlap, evc, hevc, sevc, e, use_para_diag, gamma_only )
!----------------------------------------------------------------------------
!
! ... Driver routine (maybe it should be an interface) for
! ... Hamiltonian diagonalization in the subspace spanned
! ... by nstart states psi ( atomic or random wavefunctions ).
! ... Produces on output nbnd eigenvectors ( nbnd <= nstart ) in evc.
! ... Calls h_psi, s_psi to calculate H|psi> and S|psi>,
! ... which are saved in hevc and sevc.
!
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
!! dimension of the matrix to be diagonalized
!! leading dimension of matrix psi, as declared in the calling pgm unit
!! input number of states
!! output number of states
!! number of spin polarizations
LOGICAL, INTENT(IN) :: overlap
!! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
!! vectors spanning the subspace
COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
!! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
!! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e(nbnd)
!! eigenvalues
LOGICAL,INTENT(IN) :: use_para_diag
!! if true parallel diagonalization will be used
LOGICAL,INTENT(IN) :: gamma_only
!! set to true when H is real
!
EXTERNAL :: h_psi, s_psi
! h_psi(npwx,npw,nbnd,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nbnd,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)
!
CALL start_clock( 'wfcrot' )
!
IF( use_para_diag ) THEN
!
! use data distributed subroutine
!
IF ( gamma_only ) THEN
!
CALL protate_xpsi_gamma ( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
!
ELSE
!
CALL protate_xpsi_k ( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
!
END IF
!
ELSE
!
! use serial subroutines
!
IF ( gamma_only ) THEN
!
CALL rotate_xpsi_gamma ( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
!
ELSE
!
CALL rotate_xpsi_k ( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
!
END IF
!
END IF
!
CALL stop_clock( 'wfcrot' )
!
END SUBROUTINE rotate_xpsi_driver


@ -0,0 +1,131 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_xpsi_driver_cuf &
( npwx, npw, nstart, nbnd, psi_d, npol, overlap, evc_d, hevc_d, sevc_d, e_d, use_para_diag, gamma_only )
!----------------------------------------------------------------------------
!
!! Driver routine for Hamiltonian diagonalization in the subspace
!! spanned by nstart states psi ( atomic or random wavefunctions ).
!! Produces on output nbnd eigenvectors ( nbnd <= nstart ) in evc.
!! Calls h_psi, s_psi to calculate H|psi> and S|psi>,
!! which are saved in hevc and sevc.
!
USE util_param, ONLY : DP
!
IMPLICIT NONE
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
!! dimension of the matrix to be diagonalized
!! leading dimension of matrix psi, as declared in the calling pgm unit
!! input number of states
!! output number of states
!! number of spin polarizations
LOGICAL, INTENT(IN) :: overlap
!! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart)
!! vectors spanning the subspace
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx*npol,nbnd)
!! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx*npol,nbnd), sevc_d(npwx*npol,nbnd)
!! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e_d(nbnd)
!! eigenvalues
LOGICAL, INTENT(IN) :: use_para_diag
!! if true, use parallel diagonalization
LOGICAL, INTENT(IN) :: gamma_only
!! set to true if H matrix is real
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
#endif
COMPLEX(DP), ALLOCATABLE :: psi_h(:,:)
COMPLEX(DP), ALLOCATABLE, TARGET :: evc_h(:,:)
COMPLEX(DP), ALLOCATABLE :: hevc_h(:,:)
COMPLEX(DP), POINTER :: sevc_h(:,:)
REAL(DP), ALLOCATABLE :: e_h(:)
!
EXTERNAL :: h_psi, s_psi, h_psi_gpu, s_psi_gpu
! h_psi(npwx,npw,nbnd,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nbnd,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)
!
CALL start_clock_gpu( 'wfcrot' ); !write (*,*) 'start wfcrot' ; FLUSH(6)
!write (*,*) 'gamma_only' , gamma_only; FLUSH(6)
!
IF( use_para_diag ) THEN
!
! Allocate host work arrays to work around the parallel case
!
ALLOCATE(psi_h(npwx*npol,nstart), evc_h(npwx*npol,nbnd), hevc_h(npwx*npol,nbnd), &
e_h(nbnd))
IF(overlap) THEN
ALLOCATE(sevc_h(npwx*npol,nbnd))
ELSE
sevc_h => evc_h
END IF
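!
! ... protate_xpsi_gamma / protate_xpsi_k operate on host arrays, so the
! ... device data are staged into the host copies, rotated there, and
! ... copied back to the device after the call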
!
psi_h(1:npwx*npol,1:nstart) = psi_d(1:npwx*npol,1:nstart)
evc_h(1:npwx*npol,1:nbnd) = evc_d(1:npwx*npol,1:nbnd)
!
! use data distributed subroutine
!
IF ( gamma_only ) THEN
!write (*,*) 'inside para gamma'; FLUSH(6)
!
CALL protate_xpsi_gamma ( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, psi_h, evc_h, hevc_h, sevc_h, e_h )
!
ELSE
!write (*,*) 'inside para k'; FLUSH(6)
!
CALL protate_xpsi_k ( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, npol, psi_h, evc_h, hevc_h, sevc_h, e_h )
!
END IF
psi_d(1:npwx*npol,1:nstart) = psi_h(1:npwx*npol,1:nstart)
evc_d(1:npwx*npol,1:nbnd) = evc_h(1:npwx*npol,1:nbnd)
hevc_d(1:npwx*npol,1:nbnd) = hevc_h(1:npwx*npol,1:nbnd)
e_d(1:nbnd) = e_h(1:nbnd)
!
DEALLOCATE(psi_h, evc_h, hevc_h, e_h)
IF(overlap) THEN
sevc_d(1:npwx*npol,1:nbnd) = sevc_h(1:npwx*npol,1:nbnd)
DEALLOCATE(sevc_h)
ELSE
NULLIFY(sevc_h)
END IF
!
ELSE
!
! use serial subroutines
!
IF ( gamma_only ) THEN
!write (*,*) 'inside serial gamma'; FLUSH(6)
!
CALL rotate_xpsi_gamma_gpu ( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, psi_d, evc_d, hevc_d, sevc_d, e_d )
!
ELSE
!write (*,*) 'inside serial k'; FLUSH(6)
!
CALL rotate_xpsi_k_gpu ( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, npol, psi_d, evc_d, hevc_d, sevc_d, e_d )
!
END IF
!
END IF
!
CALL stop_clock_gpu( 'wfcrot' )
!
END SUBROUTINE rotate_xpsi_driver_cuf


@ -19,7 +19,7 @@ SUBROUTINE rotate_wfc_gamma( h_psi, s_psi, overlap, &
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!


@ -0,0 +1,529 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_xpsi_gamma( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_xpsi for Gamma-only calculations
! ... This version assumes real wavefunctions (k=0) with only
! ... half plane waves stored: psi(-G)=psi*(G), except G=0
!
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! first G with nonzero norm
LOGICAL, INTENT(IN) :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi(npwx,nstart)
COMPLEX(DP), INTENT(OUT) :: evc(npwx,nbnd)
! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc(npwx,nbnd), sevc(npwx,nbnd)
! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e(nbnd)
! eigenvalues
!
! ... local variables:
!
INTEGER :: npw2, npwx2
INTEGER :: n_start, n_end, my_n
REAL(DP), ALLOCATABLE :: hr(:,:), sr(:,:), vr(:,:)
COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
REAL(DP), ALLOCATABLE :: en(:)
!
EXTERNAL :: h_psi, s_psi
! h_psi(npwx,npw,nbnd,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nbnd,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)
npw2 = 2 * npw
npwx2 = 2 * npwx
!
IF ( gstart == -1 ) CALL errore( 'rotxpsig', 'gstart variable not initialized', 1 )
!
CALL start_clock('rotxpsig')
!
ALLOCATE( tpsi( npwx, nstart ) )
ALLOCATE( hpsi( npwx, nstart ) )
IF ( overlap ) &
ALLOCATE( spsi( npwx, nstart ) )
ALLOCATE( hr( nstart, nstart ) )
ALLOCATE( sr( nstart, nstart ) )
ALLOCATE( vr( nstart, nstart ) )
ALLOCATE( en( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
! ... set Im[ psi(G=0) ] - needed for numerical stability
!
IF ( gstart == 2 ) &
psi(1,1:nstart) = CMPLX( DBLE( psi(1,1:nstart) ), 0.D0 ,kind=DP)
!
CALL start_clock('rotxpsig:hpsi')
!
CALL h_psi( npwx, npw, nstart, psi, hpsi )
!
CALL stop_clock('rotxpsig:hpsi')
!
IF ( overlap ) THEN
!
CALL start_clock('rotxpsig:spsi')
!
CALL s_psi( npwx, npw, nstart, psi, spsi )
!
CALL stop_clock('rotxpsig:spsi')
!
END IF
!
CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
my_n = n_end - n_start + 1
!
CALL start_clock('rotxpsig:hc')
!
hr = 0.D0
!
IF ( n_start <= n_end ) &
CALL DGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, &
psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nstart )
!
IF ( gstart == 2 ) &
CALL DGER( nstart, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nstart )
!
CALL mp_sum( hr , inter_bgrp_comm )
!
CALL mp_sum( hr , intra_bgrp_comm )
!
CALL stop_clock('rotxpsig:hc')
!
CALL start_clock('rotxpsig:sc')
!
sr = 0.D0
!
IF ( overlap ) THEN
!
IF ( n_start <= n_end ) &
CALL DGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, &
psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nstart )
!
IF ( gstart == 2 ) &
CALL DGER( nstart, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nstart )
!
ELSE
!
IF ( n_start <= n_end ) &
CALL DGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, &
psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nstart )
!
IF ( gstart == 2 ) &
CALL DGER( nstart, my_n, -1.D0, psi, npwx2, psi(1,n_start), npwx2, sr(1,n_start), nstart )
!
END IF
!
CALL mp_sum( sr , inter_bgrp_comm )
!
CALL mp_sum( sr , intra_bgrp_comm )
!
CALL stop_clock('rotxpsig:sc')
!
! ... Diagonalize
!
CALL start_clock('rotxpsig:diag')
!
CALL diaghg( nstart, nbnd, hr, sr, nstart, en, vr, me_bgrp, root_bgrp, intra_bgrp_comm )
!
e(:) = en(1:nbnd)
!
CALL stop_clock('rotxpsig:diag')
!
! ... update the basis set
!
CALL start_clock('rotxpsig:evc')
!
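! ... each band group applies its slice (columns n_start:n_end) of the
! ... rotation matrix vr: evc = tpsi*vr, hevc = hpsi*vr (and sevc = spsi*vr
! ... when overlap); the partial results are summed over inter_bgrp_comm
!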
tpsi = psi
!
evc = (0.D0, 0.D0)
hevc = (0.D0, 0.D0)
IF ( overlap ) &
sevc = (0.D0, 0.D0)
!
IF ( n_start <= n_end ) THEN
!
CALL DGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
tpsi(1,n_start), npwx2, vr(n_start,1), nstart, 0.D0, evc, npwx2 )
!
CALL DGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
hpsi(1,n_start), npwx2, vr(n_start,1), nstart, 0.D0, hevc, npwx2 )
!
IF ( overlap ) &
CALL DGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
spsi(1,n_start), npwx2, vr(n_start,1), nstart, 0.D0, sevc, npwx2 )
!
END IF
!
CALL mp_sum( evc, inter_bgrp_comm )
CALL mp_sum( hevc, inter_bgrp_comm )
IF ( overlap ) &
CALL mp_sum( sevc, inter_bgrp_comm )
!
CALL stop_clock('rotxpsig:evc')
!
DEALLOCATE( en )
DEALLOCATE( vr )
DEALLOCATE( sr )
DEALLOCATE( hr )
IF ( overlap ) &
DEALLOCATE( spsi )
DEALLOCATE( hpsi )
DEALLOCATE( tpsi )
!
CALL stop_clock('rotxpsig')
!
!CALL print_clock('rotxpsig')
!CALL print_clock('rotxpsig:hpsi')
!CALL print_clock('rotxpsig:spsi')
!CALL print_clock('rotxpsig:hc')
!CALL print_clock('rotxpsig:sc')
!CALL print_clock('rotxpsig:diag')
!CALL print_clock('rotxpsig:evc')
!
RETURN
!
END SUBROUTINE rotate_xpsi_gamma
!
!
!----------------------------------------------------------------------------
SUBROUTINE protate_xpsi_gamma( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
!----------------------------------------------------------------------------
!
! ... Parallel version of rotate_xpsi for Gamma-only calculations
! ... Subroutine with distributed matrices, written by Carlo Cavazzoni
! ... This version assumes real wavefunctions (k=0) with only
! ... half plane waves stored: psi(-G)=psi*(G), except G=0
!
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, &
nbgrp, root_bgrp_id, my_bgrp_id
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_bcast, mp_root_sum, mp_sum, mp_barrier
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! first G with nonzero norm
LOGICAL, INTENT(IN) :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi(npwx,nstart)
COMPLEX(DP), INTENT(OUT) :: evc(npwx,nbnd)
! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc(npwx,nbnd), sevc(npwx,nbnd)
! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e(nbnd)
! eigenvalues
!
! ... local variables
!
INTEGER :: npw2, npwx2
REAL(DP), ALLOCATABLE :: hr(:,:), sr(:,:), vr(:,:)
COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
REAL(DP), ALLOCATABLE :: en(:)
!
INTEGER :: idesc(LAX_DESC_SIZE)
! matrix distribution descriptors
INTEGER :: nx
! maximum local block dimension
LOGICAL :: la_proc
! flag to distinguish procs involved in linear algebra
LOGICAL :: do_distr_diag_inside_bgrp
INTEGER :: ortho_parent_comm
INTEGER, ALLOCATABLE :: idesc_ip( :, :, : )
INTEGER, ALLOCATABLE :: rank_ip( :, : )
!
EXTERNAL :: h_psi, s_psi
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
CALL start_clock('protxpsig')
!
CALL laxlib_getval( do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp, &
ortho_parent_comm = ortho_parent_comm )
CALL desc_init( nstart, nx, la_proc, idesc, rank_ip, idesc_ip )
!
npw2 = 2 * npw
npwx2 = 2 * npwx
!
IF ( gstart == -1 ) CALL errore( 'protxpsig', 'gstart variable not initialized', 1 )
!
ALLOCATE( tpsi( npwx, nstart ) )
ALLOCATE( hpsi( npwx, nstart ) )
IF ( overlap ) &
ALLOCATE( spsi( npwx, nstart ) )
ALLOCATE( hr( nx, nx ) )
ALLOCATE( sr( nx, nx ) )
ALLOCATE( vr( nx, nx ) )
ALLOCATE( en( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
! ... set Im[ psi(G=0) ] - needed for numerical stability
!
IF ( gstart == 2 ) &
psi(1,1:nstart) = CMPLX( DBLE( psi(1,1:nstart) ), 0.D0, kind=DP)
!
CALL start_clock('protxpsig:hpsi')
!
CALL h_psi( npwx, npw, nstart, psi, hpsi )
!
CALL stop_clock('protxpsig:hpsi')
!
IF ( overlap ) THEN
!
CALL start_clock('protxpsig:spsi')
!
CALL s_psi( npwx, npw, nstart, psi, spsi )
!
CALL stop_clock('protxpsig:spsi')
!
END IF
!
CALL start_clock('protxpsig:hc')
!
CALL compute_distmat( hr, psi, hpsi )
!
CALL stop_clock('protxpsig:hc')
!
CALL start_clock('protxpsig:sc')
!
IF ( overlap ) THEN
!
CALL compute_distmat( sr, psi, spsi )
!
ELSE
!
CALL compute_distmat( sr, psi, psi )
!
END IF
!
CALL stop_clock('protxpsig:sc')
!
! ... Diagonalize
!
CALL start_clock('protxpsig:diag')
!
IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg en and vr are the same across ortho_parent_comm
! only the first bgrp performs the diagonalization
IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nstart, hr, sr, nx, en, vr, idesc )
IF( nbgrp > 1 ) THEN ! results must be broadcast to the other band groups
CALL mp_bcast( vr, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( en, root_bgrp_id, inter_bgrp_comm )
ENDIF
ELSE
CALL pdiaghg( nstart, hr, sr, nx, en, vr, idesc )
END IF
!
e(:) = en(1:nbnd)
!
CALL stop_clock('protxpsig:diag')
!
! ... update the basis set
!
CALL start_clock('protxpsig:evc')
!
tpsi = psi
!
CALL refresh_evc()
!
CALL stop_clock('protxpsig:evc')
!
DEALLOCATE( idesc_ip )
DEALLOCATE( rank_ip )
DEALLOCATE( en )
DEALLOCATE( vr )
DEALLOCATE( sr )
DEALLOCATE( hr )
IF ( overlap ) &
DEALLOCATE( spsi )
DEALLOCATE( hpsi )
DEALLOCATE( tpsi )
!
CALL stop_clock('protxpsig')
!
!CALL print_clock('protxpsig')
!CALL print_clock('protxpsig:hpsi')
!CALL print_clock('protxpsig:spsi')
!CALL print_clock('protxpsig:hc')
!CALL print_clock('protxpsig:sc')
!CALL print_clock('protxpsig:diag')
!CALL print_clock('protxpsig:evc')
!
RETURN
!
CONTAINS
!
SUBROUTINE compute_distmat( dm, v, w )
!
! This subroutine computes <v_i|w_j> and stores the
! result in the distributed matrix dm
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
REAL(DP), INTENT(OUT) :: dm( :, : )
COMPLEX(DP) :: v(:,:), w(:,:)
REAL(DP), ALLOCATABLE :: work( :, : )
!
ALLOCATE( work( nx, nx ) )
!
work = 0.0d0
!
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
!
nc = idesc_ip( LAX_DESC_NC, 1, ipc )
ic = idesc_ip( LAX_DESC_IC, 1, ipc )
!
DO ipr = 1, ipc ! use symmetry for the loop on row procs
!
nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
!
! rank of the processor to which this block (ipr,ipc) is destined
!
root = rank_ip( ipr, ipc )
! use blas subs. on the matrix block
CALL DGEMM( 'T', 'N', nr, nc, npw2, 2.D0, v(1,ir), npwx2, w(1,ic), npwx2, 0.D0, work, nx )
IF ( gstart == 2 ) &
CALL DGER( nr, nc, -1.D0, v(1,ir), npwx2, w(1,ic), npwx2, work, nx )
! accumulate result on dm of root proc.
CALL mp_root_sum( work, dm, root, ortho_parent_comm )
END DO
!
END DO
if (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) dm = dm/nbgrp
!
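! only the upper triangle (ipr <= ipc) has been computed;
! laxlib_dsqmsym fills the lower triangle by symmetry
!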
CALL laxlib_dsqmsym( nstart, dm, nx, idesc )
!
DEALLOCATE( work )
!
RETURN
END SUBROUTINE compute_distmat
!
!
SUBROUTINE refresh_evc( )
!
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
REAL(DP), ALLOCATABLE :: vtmp( :, : )
REAL(DP) :: beta
ALLOCATE( vtmp( nx, nx ) )
!
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
!
nc = idesc_ip( LAX_DESC_NC, 1, ipc )
ic = idesc_ip( LAX_DESC_IC, 1, ipc )
!
IF( ic <= nbnd ) THEN
!
nc = min( nc, nbnd - ic + 1 )
!
beta = 0.0d0
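! the first row block initializes evc/hevc/sevc (beta = 0); the
! following row blocks accumulate onto them (beta = 1, set below)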
DO ipr = 1, idesc(LAX_DESC_NPR)
!
nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
!
root = rank_ip( ipr, ipc )
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends his block
!
CALL mp_bcast( vr(:,1:nc), root, ortho_parent_comm )
!
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
tpsi(1,ir), npwx2, vr, nx, beta, evc(1,ic), npwx2 )
!
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
hpsi(1,ir), npwx2, vr, nx, beta, hevc(1,ic), npwx2 )
!
IF ( overlap ) &
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
spsi(1,ir), npwx2, vr, nx, beta, sevc(1,ic), npwx2 )
!
ELSE
!
! all other procs receive
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
!
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
tpsi(1,ir), npwx2, vtmp, nx, beta, evc(1,ic), npwx2 )
!
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
hpsi(1,ir), npwx2, vtmp, nx, beta, hevc(1,ic), npwx2 )
!
IF ( overlap ) &
CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
spsi(1,ir), npwx2, vtmp, nx, beta, sevc(1,ic), npwx2 )
!
END IF
!
beta = 1.0d0
END DO
!
END IF
!
END DO
!
DEALLOCATE( vtmp )
RETURN
END SUBROUTINE refresh_evc
!
END SUBROUTINE protate_xpsi_gamma


@ -0,0 +1,233 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_xpsi_gamma_gpu( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, psi_d, evc_d, hevc_d, sevc_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_xpsi for Gamma-only calculations
! ... This version assumes real wavefunctions (k=0) with only
! ... half plane waves stored: psi_d(-G)=psi_d*(G), except G=0
!
#if defined(__CUDA)
USE cudafor
USE cublas
#else
#define cublasDGEMM dgemm
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! first G with nonzero norm
LOGICAL, INTENT(IN) :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nstart)
COMPLEX(DP), INTENT(OUT) :: evc_d(npwx,nbnd)
! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx,nbnd), sevc_d(npwx,nbnd)
! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e_d(nbnd)
! eigenvalues
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
#endif
!
! ... local variables:
!
INTEGER :: npw2, npwx2
INTEGER :: n_start, n_end, my_n, i
REAL(DP), ALLOCATABLE :: hr_d(:,:), sr_d(:,:), vr_d(:,:)
COMPLEX(DP), ALLOCATABLE :: tpsi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: hr_d, sr_d, vr_d, tpsi_d, hpsi_d, spsi_d, en_d
#endif
!
EXTERNAL :: h_psi_gpu, s_psi_gpu
! h_psi(npwx,npw,nbnd,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nbnd,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)
npw2 = 2 * npw
npwx2 = 2 * npwx
!
IF ( gstart == -1 ) CALL errore( 'rotxpsig', 'gstart variable not initialized', 1 )
!
ALLOCATE( tpsi_d( npwx, nstart ) )
ALLOCATE( hpsi_d( npwx, nstart ) )
IF ( overlap ) &
ALLOCATE( spsi_d( npwx, nstart ) )
ALLOCATE( hr_d( nstart, nstart ) )
ALLOCATE( sr_d( nstart, nstart ) )
ALLOCATE( vr_d( nstart, nstart ) )
ALLOCATE( en_d( nstart ) )
!
CALL start_clock('rotxpsig')
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
! ... set Im[ psi_d(G=0) ] - needed for numerical stability
!
IF ( gstart == 2 ) THEN
!$cuf kernel do(1)
DO i=1,nstart
psi_d(1,i) = CMPLX( DBLE( psi_d(1,i) ), 0.D0, kind=DP)
END DO
END IF
!
CALL start_clock('rotxpsig:hpsi')
!
CALL h_psi_gpu( npwx, npw, nstart, psi_d, hpsi_d)
!
CALL stop_clock('rotxpsig:hpsi')
!
IF ( overlap ) THEN
!
CALL start_clock('rotxpsig:spsi')
!
CALL s_psi_gpu( npwx, npw, nstart, psi_d, spsi_d )
!
CALL stop_clock('rotxpsig:spsi')
!
END IF
!
CALL start_clock('rotxpsig:hc')
hr_d = 0.D0
CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
my_n = n_end - n_start + 1
!
IF ( n_start .le. n_end ) &
CALL cublasDGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, hpsi_d(1,n_start), npwx2, 0.D0, hr_d(1,n_start), nstart )
!
IF ( gstart == 2 ) &
CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, hr_d(1,n_start), nstart )
!
CALL mp_sum( hr_d , inter_bgrp_comm )
!
CALL mp_sum( hr_d , intra_bgrp_comm )
!
CALL stop_clock('rotxpsig:hc')
!
CALL start_clock('rotxpsig:sc')
!
sr_d = 0.D0
!
IF ( overlap ) THEN
!
IF ( n_start .le. n_end ) &
CALL cublasDGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, spsi_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
!
IF ( gstart == 2 ) &
CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, npwx2, spsi_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
!
ELSE
!
IF ( n_start .le. n_end ) &
CALL cublasDGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, psi_d, &
npwx2, psi_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
!
IF ( gstart == 2 ) &
CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, npwx2, psi_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
!
END IF
!
CALL mp_sum( sr_d , inter_bgrp_comm )
!
CALL mp_sum( sr_d , intra_bgrp_comm )
!
CALL stop_clock('rotxpsig:sc')
!
! ... Diagonalize
!
CALL start_clock('rotxpsig:diag')
!
CALL diaghg( nstart, nbnd, hr_d, sr_d, nstart, en_d, vr_d, me_bgrp, root_bgrp, intra_bgrp_comm )
!
CALL stop_clock('rotxpsig:diag')
!
CALL start_clock('rotxpsig:evc')
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1, nbnd
e_d(i) = en_d(i)
END DO
!
! ... update the basis set
!
tpsi_d = psi_d
!
evc_d = (0.D0, 0.D0)
hevc_d = (0.D0, 0.D0)
IF ( overlap ) &
sevc_d = (0.D0, 0.D0)
!
IF ( n_start .le. n_end ) THEN
!
CALL cublasDGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
tpsi_d(1,n_start), npwx2, vr_d(n_start,1), nstart, 0.D0, evc_d, npwx2 )
!
CALL cublasDGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
hpsi_d(1,n_start), npwx2, vr_d(n_start,1), nstart, 0.D0, hevc_d, npwx2 )
!
IF ( overlap ) &
CALL cublasDGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
spsi_d(1,n_start), npwx2, vr_d(n_start,1), nstart, 0.D0, sevc_d, npwx2 )
!
END IF
!
CALL mp_sum( evc_d, inter_bgrp_comm )
CALL mp_sum( hevc_d, inter_bgrp_comm )
IF ( overlap ) &
CALL mp_sum( sevc_d, inter_bgrp_comm )
!
CALL stop_clock('rotxpsig:evc')
!
DEALLOCATE( en_d )
DEALLOCATE( vr_d )
DEALLOCATE( sr_d )
DEALLOCATE( hr_d )
IF ( overlap ) &
DEALLOCATE( spsi_d )
DEALLOCATE( hpsi_d )
DEALLOCATE( tpsi_d )
!
CALL stop_clock('rotxpsig')
!
!CALL print_clock('rotxpsig')
!CALL print_clock('rotxpsig:hpsi')
!CALL print_clock('rotxpsig:spsi')
!CALL print_clock('rotxpsig:hc')
!CALL print_clock('rotxpsig:sc')
!CALL print_clock('rotxpsig:diag')
!CALL print_clock('rotxpsig:evc')
!
RETURN
!
END SUBROUTINE rotate_xpsi_gamma_gpu

View File

@ -0,0 +1,515 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_xpsi_k( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
!----------------------------------------------------------------------------
!
! ... Serial version of rotate_xpsi for collinear, k-point calculations
!
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! number of spin polarizations
LOGICAL, INTENT(IN) :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e(nbnd)
! eigenvalues
!
! ... local variables
!
INTEGER :: kdim, kdmx
INTEGER :: n_start, n_end, my_n
COMPLEX(DP), ALLOCATABLE :: hc(:,:), sc(:,:), vc(:,:)
COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
REAL(DP), ALLOCATABLE :: en(:)
!
EXTERNAL :: h_psi, s_psi
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx*npol
kdmx = npwx*npol
!
END IF
!
CALL start_clock('rotxpsik')
!
ALLOCATE( tpsi( kdmx, nstart ) )
ALLOCATE( hpsi( kdmx, nstart ) )
IF ( overlap ) &
ALLOCATE( spsi(kdmx, nstart ) )
ALLOCATE( hc( nstart, nstart) )
ALLOCATE( sc( nstart, nstart) )
ALLOCATE( vc( nstart, nstart) )
ALLOCATE( en( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
CALL start_clock('rotxpsik:hpsi')
!
CALL h_psi( npwx, npw, nstart, psi, hpsi )
!
CALL stop_clock('rotxpsik:hpsi')
!
IF ( overlap ) THEN
!
CALL start_clock('rotxpsik:spsi')
!
CALL s_psi( npwx, npw, nstart, psi, spsi )
!
CALL stop_clock('rotxpsik:spsi')
!
END IF
!
CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
my_n = n_end - n_start + 1
!
CALL start_clock('rotxpsik:hc')
!
hc = (0.D0, 0.D0)
!
IF ( n_start <= n_end ) &
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), &
psi, kdmx, hpsi(1,n_start), kdmx, (0.D0, 0.D0), hc(1,n_start), nstart )
!
CALL mp_sum( hc, inter_bgrp_comm )
!
CALL mp_sum( hc, intra_bgrp_comm )
!
CALL stop_clock('rotxpsik:hc')
!
CALL start_clock('rotxpsik:sc')
!
sc = (0.D0, 0.D0)
!
IF ( overlap ) THEN
!
IF ( n_start <= n_end ) &
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), &
psi, kdmx, spsi(1,n_start), kdmx, (0.D0, 0.D0), sc(1,n_start), nstart )
!
ELSE
!
IF ( n_start <= n_end ) &
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), &
psi, kdmx, psi(1,n_start), kdmx, (0.D0, 0.D0), sc(1,n_start), nstart )
!
END IF
!
CALL mp_sum( sc, inter_bgrp_comm )
!
CALL mp_sum( sc, intra_bgrp_comm )
!
CALL stop_clock('rotxpsik:sc')
!
! ... Diagonalize
!
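! ... diaghg solves the generalized eigenproblem hc*x = en*sc*x in the
! ... nstart-dimensional subspace; the lowest nbnd eigenvectors are
! ... returned in the columns of vc
!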
CALL start_clock('rotxpsik:diag')
!
CALL diaghg( nstart, nbnd, hc, sc, nstart, en, vc, me_bgrp, root_bgrp, intra_bgrp_comm )
!
e(:) = en(1:nbnd)
!
CALL stop_clock('rotxpsik:diag')
!
! ... update the basis set
!
CALL start_clock('rotxpsik:evc')
!
tpsi = psi
!
evc = (0.D0, 0.D0)
hevc = (0.D0, 0.D0)
IF ( overlap ) &
sevc = (0.D0, 0.D0)
!
IF ( n_start <= n_end ) THEN
!
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
tpsi(1,n_start), kdmx, vc(n_start,1), nstart, (0.D0, 0.D0), evc, kdmx )
!
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
hpsi(1,n_start), kdmx, vc(n_start,1), nstart, (0.D0, 0.D0), hevc, kdmx )
!
IF ( overlap ) &
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
spsi(1,n_start), kdmx, vc(n_start,1), nstart, (0.D0, 0.D0), sevc, kdmx )
!
END IF
!
CALL mp_sum( evc, inter_bgrp_comm )
CALL mp_sum( hevc, inter_bgrp_comm )
IF ( overlap ) &
CALL mp_sum( sevc, inter_bgrp_comm )
!
CALL stop_clock('rotxpsik:evc')
!
DEALLOCATE( en )
DEALLOCATE( vc )
DEALLOCATE( sc )
DEALLOCATE( hc )
IF ( overlap ) &
DEALLOCATE( spsi )
DEALLOCATE( hpsi )
DEALLOCATE( tpsi )
!
CALL stop_clock('rotxpsik')
!
!CALL print_clock('rotxpsik')
!CALL print_clock('rotxpsik:hpsi')
!CALL print_clock('rotxpsik:spsi')
!CALL print_clock('rotxpsik:hc')
!CALL print_clock('rotxpsik:sc')
!CALL print_clock('rotxpsik:diag')
!CALL print_clock('rotxpsik:evc')
!
RETURN
!
END SUBROUTINE rotate_xpsi_k
!
!
!----------------------------------------------------------------------------
SUBROUTINE protate_xpsi_k( h_psi, s_psi, overlap, &
npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
!----------------------------------------------------------------------------
!
! ... Parallel version of rotate_xpsi for collinear, k-point calculations
! ... Subroutine with distributed matrices, written by Carlo Cavazzoni
!
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, &
nbgrp, root_bgrp_id, my_bgrp_id
USE mp, ONLY : mp_bcast, mp_root_sum, mp_sum, mp_barrier
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! number of spin polarizations
LOGICAL, INTENT(IN) :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e(nbnd)
! eigenvalues
!
! ... local variables
!
INTEGER :: kdim, kdmx
COMPLEX(DP), ALLOCATABLE :: hc(:,:), sc(:,:), vc(:,:)
COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
REAL(DP), ALLOCATABLE :: en(:)
!
INTEGER :: idesc(LAX_DESC_SIZE)
! matrix distribution descriptors
INTEGER :: nx
! maximum local block dimension
LOGICAL :: la_proc
! flag to distinguish procs involved in linear algebra
LOGICAL :: do_distr_diag_inside_bgrp
INTEGER :: ortho_parent_comm
INTEGER, ALLOCATABLE :: idesc_ip( :, :, : )
INTEGER, ALLOCATABLE :: rank_ip( :, : )
!
EXTERNAL :: h_psi, s_psi
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
CALL start_clock('protxpsik')
!
CALL laxlib_getval( do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp, &
ortho_parent_comm = ortho_parent_comm )
CALL desc_init( nstart, nx, la_proc, idesc, rank_ip, idesc_ip )
!
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx*npol
kdmx = npwx*npol
!
END IF
!
ALLOCATE( tpsi( kdmx, nstart ) )
ALLOCATE( hpsi( kdmx, nstart ) )
IF ( overlap ) &
ALLOCATE( spsi( kdmx, nstart ) )
ALLOCATE( hc( nx, nx) )
ALLOCATE( sc( nx, nx) )
ALLOCATE( vc( nx, nx) )
ALLOCATE( en( nstart ) )
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
CALL start_clock('protxpsik:hpsi')
!
CALL h_psi( npwx, npw, nstart, psi, hpsi )
!
CALL stop_clock('protxpsik:hpsi')
!
IF ( overlap ) THEN
!
CALL start_clock('protxpsik:spsi')
!
CALL s_psi( npwx, npw, nstart, psi, spsi )
!
CALL stop_clock('protxpsik:spsi')
!
END IF
!
CALL start_clock('protxpsik:hc')
!
CALL compute_distmat( hc, psi, hpsi )
!
CALL stop_clock('protxpsik:hc')
!
CALL start_clock('protxpsik:sc')
!
IF ( overlap ) THEN
!
CALL compute_distmat( sc, psi, spsi )
!
ELSE
!
CALL compute_distmat( sc, psi, psi )
!
END IF
!
CALL stop_clock('protxpsik:sc')
!
! ... Diagonalize
!
CALL start_clock('protxpsik:diag')
!
IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg en and vc are the same across ortho_parent_comm
! only the first bgrp performs the diagonalization
IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nstart, hc, sc, nx, en, vc, idesc )
IF( nbgrp > 1 ) THEN ! results must be broadcast to the other band groups
CALL mp_bcast( vc, root_bgrp_id, inter_bgrp_comm )
CALL mp_bcast( en, root_bgrp_id, inter_bgrp_comm )
ENDIF
ELSE
CALL pdiaghg( nstart, hc, sc, nx, en, vc, idesc )
END IF
!
e(:) = en(1:nbnd)
!
CALL stop_clock('protxpsik:diag')
!
! ... update the basis set
!
CALL start_clock('protxpsik:evc')
!
tpsi = psi
!
CALL refresh_evc()
!
CALL stop_clock('protxpsik:evc')
!
DEALLOCATE( en )
DEALLOCATE( vc )
DEALLOCATE( sc )
DEALLOCATE( hc )
IF ( overlap ) &
DEALLOCATE( spsi )
DEALLOCATE( hpsi )
DEALLOCATE( tpsi )
!
DEALLOCATE( idesc_ip )
DEALLOCATE( rank_ip )
!
CALL stop_clock('protxpsik')
!
!CALL print_clock('protxpsik')
!CALL print_clock('protxpsik:hpsi')
!CALL print_clock('protxpsik:spsi')
!CALL print_clock('protxpsik:hc')
!CALL print_clock('protxpsik:sc')
!CALL print_clock('protxpsik:diag')
!CALL print_clock('protxpsik:evc')
!
RETURN
!
!
CONTAINS
!
SUBROUTINE compute_distmat( dm, v, w )
!
! This subroutine computes <vi|wj> and stores the
! result in the distributed matrix dm
!
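! Only blocks with ipr <= ipc are computed explicitly; each block is reduced
! onto its owning processor with mp_root_sum, and laxlib_zsqmher then completes
! the Hermitian matrix from the computed triangle.
!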
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
COMPLEX(DP), INTENT(OUT) :: dm( :, : )
COMPLEX(DP) :: v(:,:), w(:,:)
COMPLEX(DP), ALLOCATABLE :: work( :, : )
!
ALLOCATE( work( nx, nx ) )
!
work = ( 0.0_DP, 0.0_DP )
!
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
!
nc = idesc_ip( LAX_DESC_NC, 1, ipc )
ic = idesc_ip( LAX_DESC_IC, 1, ipc )
!
DO ipr = 1, ipc ! desc%npr ! ipc ! use symmetry for the loop on row procs
!
nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
!
! rank of the processor for which this block (ipr,ipc) is destined
!
root = rank_ip( ipr, ipc )
! use blas subs. on the matrix block
CALL ZGEMM( 'C', 'N', nr, nc, kdim, ( 1.D0, 0.D0 ), v(1,ir), kdmx, w(1,ic), kdmx, ( 0.D0, 0.D0 ), work, nx )
! accumulate result on dm of root proc.
CALL mp_root_sum( work, dm, root, ortho_parent_comm )
END DO
!
END DO
if (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) dm = dm/nbgrp
!
CALL laxlib_zsqmher( nstart, dm, nx, idesc )
!
DEALLOCATE( work )
!
RETURN
END SUBROUTINE compute_distmat
SUBROUTINE refresh_evc( )
!
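! Rotate the basis block by block: for each column block of vc the owning
! processor broadcasts its block over ortho_parent_comm, then all processors
! accumulate tpsi*vc (and hpsi*vc, spsi*vc) into evc, hevc, sevc; beta switches
! from 0 to 1 after the first row block so that contributions are summed.
!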
INTEGER :: ipc, ipr
INTEGER :: nr, nc, ir, ic, root
COMPLEX(DP), ALLOCATABLE :: vtmp( :, : )
COMPLEX(DP) :: beta
ALLOCATE( vtmp( nx, nx ) )
!
DO ipc = 1, idesc(LAX_DESC_NPC)
!
nc = idesc_ip( LAX_DESC_NC, 1, ipc )
ic = idesc_ip( LAX_DESC_IC, 1, ipc )
!
IF( ic <= nbnd ) THEN
!
nc = min( nc, nbnd - ic + 1 )
!
beta = ( 0.D0, 0.D0 )
DO ipr = 1, idesc(LAX_DESC_NPR)
!
nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
!
root = rank_ip( ipr, ipc )
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
!
! this proc sends its block
!
CALL mp_bcast( vc(:,1:nc), root, ortho_parent_comm )
!
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
tpsi(1,ir), kdmx, vc, nx, beta, evc(1,ic), kdmx )
!
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
hpsi(1,ir), kdmx, vc, nx, beta, hevc(1,ic), kdmx )
!
IF ( overlap ) &
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
spsi(1,ir), kdmx, vc, nx, beta, sevc(1,ic), kdmx )
!
ELSE
!
! all other procs receive
!
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
!
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
tpsi(1,ir), kdmx, vtmp, nx, beta, evc(1,ic), kdmx )
!
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
hpsi(1,ir), kdmx, vtmp, nx, beta, hevc(1,ic), kdmx )
!
IF ( overlap ) &
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
spsi(1,ir), kdmx, vtmp, nx, beta, sevc(1,ic), kdmx )
!
END IF
!
beta = ( 1.D0, 0.D0 )
END DO
!
END IF
!
END DO
!
DEALLOCATE( vtmp )
RETURN
END SUBROUTINE refresh_evc
!
END SUBROUTINE protate_xpsi_k

View File

@ -0,0 +1,225 @@
!
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
!
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!----------------------------------------------------------------------------
SUBROUTINE rotate_xpsi_k_gpu( h_psi_gpu, s_psi_gpu, overlap, &
npwx, npw, nstart, nbnd, npol, psi_d, evc_d, hevc_d, sevc_d, e_d )
!----------------------------------------------------------------------------
!
! ... Serial (GPU) version of rotate_xpsi for collinear, k-point calculations
!
#if defined(__CUDA)
USE cudafor
USE cublas
#endif
USE util_param, ONLY : DP
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, nbgrp, my_bgrp_id, &
me_bgrp, root_bgrp
USE mp, ONLY : mp_sum
!
IMPLICIT NONE
!
include 'laxlib.fh'
!
! ... I/O variables
!
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
! dimension of the matrix to be diagonalized
! leading dimension of matrix psi, as declared in the calling pgm unit
! input number of states
! output number of states
! number of spin polarizations
LOGICAL, INTENT(IN) :: overlap
! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart)
COMPLEX(DP), INTENT(OUT) :: evc_d(npwx*npol,nbnd) !(intent inout?)
! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx*npol,nbnd), sevc_d(npwx*npol,nbnd)
! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e_d(nbnd)
! eigenvalues
#if defined(__CUDA)
attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
#endif
!
! ... local variables
!
INTEGER :: kdim, kdmx
INTEGER :: n_start, n_end, my_n, i, j
COMPLEX(DP), ALLOCATABLE :: hc_d(:,:), sc_d(:,:), vc_d(:,:)
COMPLEX(DP), ALLOCATABLE :: tpsi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
REAL(DP), ALLOCATABLE :: en_d(:)
#if defined(__CUDA)
attributes(DEVICE) :: hc_d, sc_d, vc_d, tpsi_d, hpsi_d, spsi_d, en_d
#endif
!
EXTERNAL :: h_psi_gpu, s_psi_gpu
! h_psi(npwx,npw,nvec,psi,hpsi)
! calculates H|psi>
! s_psi(npwx,npw,nvec,spsi)
! calculates S|psi> (if needed)
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
IF ( npol == 1 ) THEN
!
kdim = npw
kdmx = npwx
!
ELSE
!
kdim = npwx*npol
kdmx = npwx*npol
!
END IF
!
!
ALLOCATE( tpsi_d( kdmx, nstart ) )
ALLOCATE( hpsi_d( kdmx, nstart ) )
IF ( overlap ) &
ALLOCATE( spsi_d(kdmx, nstart ) )
ALLOCATE( hc_d( nstart, nstart) )
ALLOCATE( sc_d( nstart, nstart) )
ALLOCATE( vc_d( nstart, nstart) )
ALLOCATE( en_d( nstart ) )
!
CALL start_clock('rotxpsik')
!
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
!
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
!
CALL start_clock('rotxpsik:hpsi')
!
CALL h_psi_gpu( npwx, npw, nstart, psi_d, hpsi_d )
!
CALL stop_clock('rotxpsik:hpsi')
!
IF ( overlap ) THEN
!
CALL start_clock('rotxpsik:spsi')
!
CALL s_psi_gpu( npwx, npw, nstart, psi_d, spsi_d )
!
CALL stop_clock('rotxpsik:spsi')
!
END IF
!
CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
my_n = n_end - n_start + 1
!
CALL start_clock('rotxpsik:hc')
!
hc_d = (0.D0, 0.D0)
!
IF ( n_start .le. n_end ) &
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), psi_d, &
kdmx, hpsi_d(1,n_start), kdmx, (0.D0, 0.D0), hc_d(1,n_start), nstart )
!
CALL mp_sum( hc_d, inter_bgrp_comm )
!
CALL mp_sum( hc_d, intra_bgrp_comm )
!
CALL stop_clock('rotxpsik:hc')
!
CALL start_clock('rotxpsik:sc')
!
sc_d = (0.D0, 0.D0)
!
IF ( overlap ) THEN
!
IF ( n_start .le. n_end ) &
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), psi_d, &
kdmx, spsi_d(1,n_start), kdmx, (0.D0, 0.D0), sc_d(1,n_start), nstart )
!
ELSE
!
IF ( n_start .le. n_end ) &
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), psi_d, &
kdmx, psi_d(1,n_start), kdmx, (0.D0, 0.D0), sc_d(1,n_start), nstart )
!
END IF
!
CALL mp_sum( sc_d, inter_bgrp_comm )
!
CALL mp_sum( sc_d, intra_bgrp_comm )
!
CALL stop_clock('rotxpsik:sc')
!
! ... Diagonalize
!
CALL start_clock('rotxpsik:diag')
!
CALL diaghg( nstart, nbnd, hc_d, sc_d, nstart, en_d, vc_d, me_bgrp, root_bgrp, intra_bgrp_comm )
!
CALL stop_clock('rotxpsik:diag')
!
CALL start_clock('rotxpsik:evc')
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1,nbnd
e_d(i) = en_d(i)
END DO
!
! ... update the basis set
!
!$cuf kernel do(1) <<<*,*>>>
DO i=1, nbnd
DO j=1, kdmx
tpsi_d(j,i) = psi_d(j,i)
END DO
END DO
!
evc_d = (0.D0, 0.D0)
hevc_d = (0.D0, 0.D0)
!
IF ( overlap ) sevc_d = (0.D0, 0.D0)
!
IF ( n_start .le. n_end ) THEN
!
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
tpsi_d(1,n_start), kdmx, vc_d(n_start,1), nstart, (0.D0, 0.D0), evc_d, kdmx )
!
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
hpsi_d(1,n_start), kdmx, vc_d(n_start,1), nstart, (0.D0, 0.D0), hevc_d, kdmx )
!
IF ( overlap ) &
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
spsi_d(1,n_start), kdmx, vc_d(n_start,1), nstart, (0.D0, 0.D0), sevc_d, kdmx )
!
END IF
!
CALL mp_sum( evc_d, inter_bgrp_comm )
CALL mp_sum( hevc_d, inter_bgrp_comm )
IF ( overlap ) &
CALL mp_sum( sevc_d, inter_bgrp_comm )
!
CALL stop_clock('rotxpsik:evc')
!
DEALLOCATE( en_d )
DEALLOCATE( vc_d )
DEALLOCATE( sc_d )
DEALLOCATE( hc_d )
IF ( overlap ) DEALLOCATE( spsi_d )
DEALLOCATE( hpsi_d )
DEALLOCATE( tpsi_d )
!
CALL stop_clock('rotxpsik')
!
!CALL print_clock('rotxpsik')
!CALL print_clock('rotxpsik:hpsi')
!CALL print_clock('rotxpsik:spsi')
!CALL print_clock('rotxpsik:hc')
!CALL print_clock('rotxpsik:sc')
!CALL print_clock('rotxpsik:diag')
!CALL print_clock('rotxpsik:evc')
!
RETURN
!
END SUBROUTINE rotate_xpsi_k_gpu
!
!
!----------------------------------------------------------------------------

View File

@ -1,5 +1,5 @@
# Makefile for KS_Solvers
sinclude ../make.inc
include ../make.inc
# the following directive prevents execution of this makefile in parallel
# Problem is, libks_solvers.a depends upon all libdavid libcg etc, but if
@ -10,7 +10,7 @@ sinclude ../make.inc
default: all
all: libdavid_rci libdavid libcg libppcg libparo libdense libnewsolver libks_solvers.a
all: libdavid_rci libdavid libcg libppcg libparo librmm libdense libnewsolver libks_solvers.a ks_solver_interfaces.fh
ALLOBJS = \
Davidson_RCI/david_rci.o \
@ -18,6 +18,7 @@ Davidson/cegterg.o \
Davidson/regterg.o \
CG/ccgdiagg.o \
CG/rcgdiagg.o \
PPCG/generic_cublas.o \
PPCG/ppcg_gamma.o \
PPCG/ppcg_k.o \
ParO/bpcg_gamma.o \
@ -31,7 +32,21 @@ ParO/paro_k_new.o \
DENSE/rotate_HSpsi_gamma.o \
DENSE/rotate_HSpsi_k.o \
DENSE/rotate_wfc_gamma.o \
DENSE/rotate_wfc_k.o
DENSE/rotate_wfc_k.o \
DENSE/rotate_driver.o \
DENSE/gram_schmidt_gamma.o \
DENSE/gram_schmidt_k.o \
DENSE/gram_schmidt_k_gpu.o \
DENSE/gram_schmidt_gamma_gpu.o \
DENSE/rotate_xpsi_gamma.o \
DENSE/rotate_xpsi_gamma_gpu.o \
DENSE/rotate_xpsi_k_gpu.o \
DENSE/rotate_xpsi_k.o \
RMM/crmmdiagg.o \
RMM/crmmdiagg_gpu.o \
RMM/rrmmdiagg.o \
RMM/rrmmdiagg_gpu.o
# GPU-related objects
ALLOBJS += \
@ -39,6 +54,7 @@ Davidson/cegterg_gpu.o \
Davidson/regterg_gpu.o \
DENSE/rotate_wfc_k_gpu.o \
DENSE/rotate_wfc_gamma_gpu.o \
DENSE/rotate_driver_cuf.o \
CG/rcgdiagg_gpu.o \
CG/ccgdiagg_gpu.o \
PPCG/generic_cublas.o \
@ -53,10 +69,12 @@ DENSE/rotate_HSpsi_k_gpu.o
# add here other objects, e.g. ParO/*.o NewSolver/*.o
libks_solvers.a: $(ALLOBJS)
libks_solvers.a: $(ALLOBJS)
$(AR) $(ARFLAGS) $@ $?
$(RANLIB) $@
libdavid_rci :
if test -d Davidson_RCI ; then \
( cd Davidson_RCI ; $(MAKE) all || exit 1 ) ; fi
@ -81,6 +99,10 @@ libdense :
if test -d DENSE ; then \
( cd DENSE ; $(MAKE) all || exit 1 ) ; fi
librmm :
if test -d RMM ; then \
( cd RMM ; $(MAKE) all || exit 1 ) ; fi
libnewsolver :
if test -d NewSolver ; then \
( cd NewSolver ; $(MAKE) all || exit 1 ) ; fi
@ -92,6 +114,9 @@ clean :
if test -d PPCG ; then ( cd PPCG ; $(MAKE) clean ) ; fi
if test -d ParO ; then ( cd ParO ; $(MAKE) clean ) ; fi
if test -d DENSE ; then ( cd DENSE ; $(MAKE) clean ) ; fi
if test -d RMM ; then ( cd RMM ; $(MAKE) clean ) ; fi
if test -d NewSolver ; then ( cd NewSolver ; $(MAKE) clean ) ; fi
rm -f *.fh
rm -f libks_solvers.a

View File

@ -82,6 +82,172 @@ implicit none
return
end subroutine gpu_DTRSM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
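! The *_gpu wrappers below forward device-resident arguments to the matching
! cuBLAS routines when compiled with __CUDA; without __CUDA their bodies are
! empty stubs (these code paths are only taken in GPU builds).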
DOUBLE COMPLEX function ZDOTC_gpu(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
ZDOTC_gpu = cublasZDOTC(n, zx, incx, zy, incy)
#endif
return
end function ZDOTC_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZSWAP_gpu(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZSWAP(n, zx, incx, zy, incy)
#endif
return
END SUBROUTINE ZSWAP_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZCOPY_gpu(n, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZCOPY(n, zx, incx, zy, incy)
#endif
RETURN
END SUBROUTINE ZCOPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZAXPY_gpu(n, za, zx, incx, zy, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE COMPLEX :: za
DOUBLE COMPLEX, dimension(*) :: zx, zy
#if defined(__CUDA)
attributes(device) :: zx, zy
CALL cublasZAXPY(n, za, zx, incx, zy, incy)
#endif
RETURN
END SUBROUTINE ZAXPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZGEMV_gpu(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
CHARACTER :: trans
INTEGER :: lda, m, n, incx, incy
DOUBLE COMPLEX :: alpha, beta
DOUBLE COMPLEX, dimension(lda, *) :: a
DOUBLE COMPLEX, dimension(*) :: x, y
#if defined(__CUDA)
attributes(device) :: a, x, y
CALL cublasZGEMV(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
#endif
RETURN
END SUBROUTINE ZGEMV_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZDSCAL_gpu(n, da, zx, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx
DOUBLE PRECISION :: da
DOUBLE COMPLEX, dimension(*) :: zx
#if defined(__CUDA)
attributes(device) :: zx
CALL cublasZDSCAL(n, da, zx, incx)
#endif
RETURN
END SUBROUTINE ZDSCAL_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE ZSCAL_gpu(n, za, zx, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx
DOUBLE COMPLEX :: za
DOUBLE COMPLEX, dimension(*) :: zx
#if defined(__CUDA)
attributes(device) :: zx
CALL cublasZSCAL(n, za, zx, incx)
#endif
RETURN
END SUBROUTINE ZSCAL_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DGEMV_gpu(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#if defined(__CUDA)
use cublas
#endif
IMPLICIT NONE
DOUBLE PRECISION :: ALPHA,BETA
INTEGER :: INCX,INCY,LDA,M,N
CHARACTER :: TRANS
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
#if defined(__CUDA)
attributes(device) :: A, X, Y
call cublasDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
#endif
RETURN
END SUBROUTINE DGEMV_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DCOPY_gpu(n, x, incx, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE PRECISION, INTENT(IN) :: x(*)
DOUBLE PRECISION, INTENT(OUT) :: y(*)
#if defined(__CUDA)
attributes(device) :: x, y
call cublasDCOPY(n, x, incx, y, incy)
#endif
RETURN
END SUBROUTINE DCOPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DAXPY_gpu(n, a, x, incx, y, incy)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
INTEGER :: n, incx, incy
DOUBLE PRECISION, INTENT(IN) :: a
DOUBLE PRECISION, INTENT(IN) :: x(*)
DOUBLE PRECISION, INTENT(OUT) :: y(*)
#if defined(__CUDA)
attributes(device) :: x, y
call cublasDAXPY( n, a, x, incx, y, incy)
#endif
RETURN
END SUBROUTINE DAXPY_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DSCAL_gpu(n, a, x, incx)
#if defined(__CUDA)
USE cublas
#endif
IMPLICIT NONE
integer :: n, incx
DOUBLE PRECISION :: a
DOUBLE PRECISION, dimension(*) :: x
#if defined(__CUDA)
attributes(device) :: x
call cublasDSCAL(n, a, x, incx)
#endif
RETURN
END SUBROUTINE DSCAL_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE gpu_threaded_memset(array, val, length)
!
#if defined(__CUDA)
@ -219,3 +385,17 @@ SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2
!
END SUBROUTINE gpu_threaded_backassign
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
SUBROUTINE DSWAP_gpu(n, dx, incx, dy, incy)
#if defined(__CUDA)
USE cublas
#endif
implicit none
integer :: n, incx, incy
REAL(8), dimension(*) :: dx, dy
#if defined(__CUDA)
attributes(device) :: dx, dy
CALL cublasDSWAP(n, dx, incx, dy, incy)
#endif
return
END SUBROUTINE DSWAP_gpu
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

View File

@ -87,7 +87,6 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
INTEGER :: kdim, kdmx, cg_iter, ibnd
!
REAL(DP), EXTERNAL :: DDOT
COMPLEX(DP), EXTERNAL :: ZDOTC
EXTERNAL hs_1psi, g_1psi
! hs_1psi( npwx, npw, psi, hpsi, spsi )
@ -115,7 +114,7 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
CALL stop_clock( 'pcg:ortho' )
!-
g0 = ZDOTC( kdim, z ,1 ,r ,1)
g0 = DDOT( 2*kdim, z ,1 ,r ,1)
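! (the scalar products needed here are real, so they are computed as real dot
! products over 2*kdim components instead of the complex ZDOTC used before)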
CALL mp_sum( g0, intra_bgrp_comm ) ! g0 = < initial z | initial r >
ff = 0.d0 ; ff0 = ff
@ -143,13 +142,13 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
CALL stop_clock( 'pcg:hs_1psi' )
w = w - e* sp
gamma = ZDOTC( kdim, p ,1 ,w ,1)
gamma = DDOT( 2*kdim, p ,1 ,w ,1)
CALL mp_sum( gamma, intra_bgrp_comm )
alpha = g0/gamma
psi(:) = psi(:) + alpha * p(:) ! updated solution
r(:) = r(:) - alpha * w(:) ! updated gradient
g2 = ZDOTC( kdim, z ,1 ,r ,1)
g2 = DDOT ( 2*kdim, z ,1 ,r ,1)
CALL mp_sum( g2, intra_bgrp_comm ) ! g2 = < old z | new r >
z(:) = r(:) ; call g_1psi(npwx,npw,z,e) ! updated preconditioned gradient
!- project on conduction bands
@ -159,10 +158,10 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
CALL ZGEMV( 'N', kdim, nbnd, (-1.D0,0.D0), psi0, kdmx, spsi0vec, 1, ONE, z, 1 )
CALL stop_clock( 'pcg:ortho' )
!-
g1 = ZDOTC( kdim, z, 1, r ,1)
g1 = DDOT ( 2*kdim, z, 1, r ,1)
CALL mp_sum( g1, intra_bgrp_comm ) ! g1 = < new z | new r >
! evaluate the function
ff = - 0.5_DP * (ZDOTC( kdim, psi, 1, r ,1) + ZDOTC( kdim, psi, 1, b ,1) )
ff = - 0.5_DP * (DDOT(2*kdim, psi, 1, r ,1) + DDOT(2*kdim, psi, 1, b ,1) )
CALL mp_sum( ff, intra_bgrp_comm )
!write (6,*) cg_iter, g1, ff, gamma
if ( ff > ff0 .AND. ff0 < 0.d0 ) psi(:) = psi(:) - alpha * p(:) ! fallback solution if last iteration failed to improve the function... exit and hope next time it'll be better

25
KS_Solvers/RMM/Makefile Normal file
View File

@ -0,0 +1,25 @@
# Makefile for RMM
include ../../make.inc
# location of needed modules and included files (if any)
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../PPCG $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG).
RMM = \
crmmdiagg.o \
crmmdiagg_gpu.o \
rrmmdiagg.o \
rrmmdiagg_gpu.o
all : librmm.a
librmm.a: $(RMM)
$(AR) $(ARFLAGS) $@ $?
$(RANLIB) $@
clean :
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
include make.depend

1062
KS_Solvers/RMM/crmmdiagg.f90 Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
crmmdiagg.o : ../../UtilXlib/mp.o
crmmdiagg.o : ../../UtilXlib/mp_bands_util.o
crmmdiagg.o : ../../UtilXlib/util_param.o
crmmdiagg_gpu.o : ../../UtilXlib/mp.o
crmmdiagg_gpu.o : ../../UtilXlib/mp_bands_util.o
crmmdiagg_gpu.o : ../../UtilXlib/util_param.o
rrmmdiagg.o : ../../UtilXlib/mp.o
rrmmdiagg.o : ../../UtilXlib/mp_bands_util.o
rrmmdiagg.o : ../../UtilXlib/util_param.o
rrmmdiagg_gpu.o : ../../UtilXlib/mp.o
rrmmdiagg_gpu.o : ../../UtilXlib/mp_bands_util.o
rrmmdiagg_gpu.o : ../../UtilXlib/util_param.o

1141
KS_Solvers/RMM/rrmmdiagg.f90 Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,72 @@
! Copyright (C) 2013-2021 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
INTERFACE rotate_xpsi
SUBROUTINE rotate_xpsi_driver &
( npwx, npw, nstart, nbnd, psi, npol, overlap, evc, hevc, sevc, e, use_para_diag, gamma_only )
!
IMPORT :: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
!! dimension of the matrix to be diagonalized
!! leading dimension of matrix psi, as declared in the calling pgm unit
!! input number of states
!! output number of states
!! number of spin polarizations
LOGICAL, INTENT(IN) :: overlap
!! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
!! vectors spanning the subspace
COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
!! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
!! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e(nbnd)
!! eigenvalues
LOGICAL,INTENT(IN) :: use_para_diag
!! if true parallel diagonalization will be used
LOGICAL,INTENT(IN) :: gamma_only
!! set to true when H is real
END SUBROUTINE rotate_xpsi_driver
#if defined (__CUDA)
SUBROUTINE rotate_xpsi_driver_cuf &
( npwx, npw, nstart, nbnd, psi_d, npol, overlap, evc_d, hevc_d, sevc_d, e_d, use_para_diag, gamma_only )
!! Driver routine for Hamiltonian diagonalization in the subspace
!! spanned by nstart states psi ( atomic or random wavefunctions ).
!! Interface for the CUDA-Fortran case.
!! Produces on output nbnd eigenvectors ( nbnd <= nstart ) in evc.
!! Calls h_psi, s_psi to calculate H|psi> and S|psi>,
!! which are saved in hevc and sevc.
IMPORT :: DP
IMPLICIT NONE
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
!! dimension of the matrix to be diagonalized
!! leading dimension of matrix psi, as declared in the calling pgm unit
!! input number of states
!! output number of states
!! number of spin polarizations
LOGICAL, INTENT(IN) :: overlap
!! if .FALSE. : S|psi> not needed
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart)
!! vectors spanning the subspace
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx*npol,nbnd)
!! input and output eigenvectors (may overlap)
COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx*npol,nbnd), sevc_d(npwx*npol,nbnd)
!! H|psi> and S|psi>
REAL(DP), INTENT(OUT) :: e_d(nbnd)
!! eigenvalues
LOGICAL, INTENT(IN) :: use_para_diag
!! if true, use parallel diagonalization
LOGICAL, INTENT(IN) :: gamma_only
!! set to true if H matrix is real
attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
END SUBROUTINE rotate_xpsi_driver_cuf
#endif
END INTERFACE
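!
! Usage sketch (arguments as declared above; the generic name resolves to
! rotate_xpsi_driver for host arrays):
!   CALL rotate_xpsi( npwx, npw, nstart, nbnd, psi, npol, overlap, &
!                     evc, hevc, sevc, e, use_para_diag, gamma_only )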

View File

@ -107,4 +107,6 @@ if(QE_ENABLE_TEST)
add_unit_test(test_qe_lax-r1-t3 1 3 $<TARGET_FILE:qe_lax_test>)
add_unit_test(test_qe_lax-r4-t1 4 1 $<TARGET_FILE:qe_lax_test>)
add_unit_test(test_qe_lax-r9-t2 9 2 $<TARGET_FILE:qe_lax_test>)
add_subdirectory(tests)
endif(QE_ENABLE_TEST)

Some files were not shown because too many files have changed in this diff