mirror of https://gitlab.com/QEF/q-e.git
Merge branch 'develop' of gitlab.com:giannozz/q-e into bader
This commit is contained in:
commit
cc6bb68bac
|
@ -48,3 +48,4 @@ tempdir
|
|||
tags
|
||||
EPW/src/tmp
|
||||
LAXlib/*.fh
|
||||
KS_Solvers/*.fh
|
||||
|
|
|
@ -49,6 +49,7 @@ build:cmake-gnu:
|
|||
- mkdir build
|
||||
- cd build
|
||||
- cmake -DBUILD_SHARED_LIBS=ON -DCMAKE_Fortran_COMPILER=gfortran -DCMAKE_C_COMPILER=gcc .. && make
|
||||
- make pw ph hp pwcond neb pp pwall cp tddfpt gwl ld1 upf xspectra couple epw all_currents
|
||||
|
||||
build:pgi:
|
||||
tags: [docker]
|
||||
|
@ -71,7 +72,7 @@ build:cmake-nvhpc:
|
|||
- cmake --version
|
||||
- mkdir build
|
||||
- cd build
|
||||
- cmake -DBUILD_SHARED_LIBS=ON -DCMAKE_Fortran_COMPILER=mpif90 -DCMAKE_C_COMPILER=mpicc
|
||||
- cmake -DBUILD_SHARED_LIBS=OFF -DCMAKE_Fortran_COMPILER=mpif90 -DCMAKE_C_COMPILER=mpicc
|
||||
-DQE_ENABLE_CUDA=ON -DQE_ENABLE_OPENACC=ON .. && make
|
||||
|
||||
#build:centos:
|
||||
|
|
|
@ -13,7 +13,7 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
|
|||
set(CMAKE_POLICY_DEFAULT_CMP0048 NEW)
|
||||
|
||||
project(qe
|
||||
VERSION 6.7.1
|
||||
VERSION 6.8
|
||||
DESCRIPTION "ESPRESSO: opEn-Source Package for Research in Electronic Structure, Simulation, and Optimization"
|
||||
LANGUAGES Fortran C)
|
||||
|
||||
|
@ -32,16 +32,15 @@ endif()
|
|||
# Define the paths for static libraries and executables
|
||||
##########################################################
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${qe_BINARY_DIR}/lib
|
||||
CACHE
|
||||
CACHE
|
||||
PATH "Single output directory for building all libraries.")
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${qe_BINARY_DIR}/bin
|
||||
CACHE
|
||||
CACHE
|
||||
PATH "Single output directory for building all executables.")
|
||||
|
||||
###########################################################
|
||||
# Build helpers
|
||||
###########################################################
|
||||
set(PROJECT_CMAKE ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
|
||||
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
|
||||
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")
|
||||
include(cmake/qeHelpers.cmake)
|
||||
|
@ -111,7 +110,7 @@ option(QE_ENABLE_STATIC_BUILD
|
|||
"enable fully static build of executables" OFF)
|
||||
option(QE_ENABLE_DOC
|
||||
"enable documentation building" OFF)
|
||||
set(QE_FFTW_VENDOR "AUTO" CACHE
|
||||
set(QE_FFTW_VENDOR "AUTO" CACHE
|
||||
STRING "select a specific FFTW library [Intel_DFTI, Intel_FFTW3, ArmPL, IBMESSL, FFTW3, Internal]")
|
||||
set(QE_ENABLE_SANITIZER "none" CACHE STRING "none,asan,ubsan,tsan,msan")
|
||||
|
||||
|
@ -215,14 +214,35 @@ if(CMAKE_Fortran_COMPILER_ID MATCHES "PGI" OR CMAKE_Fortran_COMPILER_ID MATCHES
|
|||
endif()
|
||||
|
||||
############################################################
|
||||
## Compiler vendor specific options
|
||||
# C preprocessor
|
||||
# Note: reply on the compiler preprocessor whenever possible
|
||||
############################################################
|
||||
if(DEFINED ENV{CPP})
|
||||
set(QE_CPP_DEFAULT $ENV{CPP})
|
||||
else()
|
||||
set(QE_CPP_DEFAULT cpp)
|
||||
endif()
|
||||
# QE_CPP_DEFAULT is only effective when cached QE_CPP doesn't exist.
|
||||
set(QE_CPP ${QE_CPP_DEFAULT} CACHE
|
||||
STRING "C preprocessor for qe_preprocess_source in qeHelpers.cmake")
|
||||
find_program(QE_CPP_FULL_PATH NAMES ${QE_CPP} DOC "C preprocessor full path")
|
||||
if(QE_CPP_FULL_PATH)
|
||||
message(STATUS "C preprocessor used by qe_preprocess_source in qeHelpers.cmake: ${QE_CPP_FULL_PATH}")
|
||||
else()
|
||||
set(QE_CPP_SAVED ${QE_CPP})
|
||||
unset(QE_CPP CACHE)
|
||||
message(FATAL_ERROR "C preprocessor ${QE_CPP_SAVED} not found. Pass a working one to CMake via QE_CPP!")
|
||||
endif()
|
||||
|
||||
############################################################
|
||||
# Compiler vendor specific options
|
||||
############################################################
|
||||
if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
|
||||
include(${PROJECT_CMAKE}/GNUFortranCompiler.cmake)
|
||||
include(GNUFortranCompiler)
|
||||
elseif(CMAKE_Fortran_COMPILER_ID MATCHES "PGI" OR CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC")
|
||||
include(${PROJECT_CMAKE}/NVFortranCompiler.cmake)
|
||||
include(NVFortranCompiler)
|
||||
elseif(CMAKE_Fortran_COMPILER_ID MATCHES "XL")
|
||||
include(${PROJECT_CMAKE}/IBMFortranCompiler.cmake)
|
||||
include(IBMFortranCompiler)
|
||||
endif()
|
||||
|
||||
if(QE_ENABLE_STATIC_BUILD)
|
||||
|
@ -248,6 +268,7 @@ if(QE_ENABLE_CUDA OR QE_ENABLE_PROFILE_NVTX)
|
|||
set_target_properties(CUDA::nvToolsExt PROPERTIES INTERFACE_LINK_LIBRARIES "-cuda;libnvToolsExt.so")
|
||||
set(CMAKE_REQUIRED_LIBRARIES "-cuda;libnvToolsExt.so")
|
||||
check_function_exists(nvtxRangePushEx NVTX_FOUND)
|
||||
unset(CMAKE_REQUIRED_LIBRARIES)
|
||||
if(NOT NVTX_FOUND)
|
||||
message(FATAL_ERROR "Check nvtxRangePushEx in libnvToolsExt.so failed")
|
||||
endif()
|
||||
|
@ -348,12 +369,14 @@ if(NOT QE_LAPACK_INTERNAL)
|
|||
set(CMAKE_REQUIRED_LINK_OPTIONS ${OpenMP_Fortran_FLAGS})
|
||||
endif()
|
||||
find_package(LAPACK)
|
||||
unset(CMAKE_REQUIRED_LINK_OPTIONS)
|
||||
endif()
|
||||
else()
|
||||
if(QE_ENABLE_OPENMP)
|
||||
set(CMAKE_REQUIRED_LINK_OPTIONS ${OpenMP_Fortran_FLAGS})
|
||||
endif()
|
||||
find_package(LAPACK)
|
||||
unset(CMAKE_REQUIRED_LINK_OPTIONS)
|
||||
endif()
|
||||
if(LAPACK_FOUND)
|
||||
list(APPEND _lapack_libs
|
||||
|
@ -369,6 +392,7 @@ if(NOT QE_LAPACK_INTERNAL)
|
|||
target_link_libraries(qe_lapack INTERFACE ${_lapack_libs})
|
||||
set(CMAKE_REQUIRED_LIBRARIES ${_lapack_libs})
|
||||
check_fortran_function_exists(zhpev ZHPEV_FOUND)
|
||||
unset(CMAKE_REQUIRED_LIBRARIES)
|
||||
if(NOT ZHPEV_FOUND)
|
||||
unset(ZHPEV_FOUND CACHE)
|
||||
message(FATAL_ERROR "Incomplete LAPACK! function zhpev not found!")
|
||||
|
@ -552,7 +576,6 @@ endif(QE_ENABLE_PROFILE_NVTX)
|
|||
# Components
|
||||
###########################################################
|
||||
add_subdirectory(external)
|
||||
add_subdirectory(clib)
|
||||
add_subdirectory(FFTXlib)
|
||||
add_subdirectory(UtilXlib)
|
||||
add_subdirectory(Modules)
|
||||
|
@ -660,9 +683,6 @@ add_custom_target(ph
|
|||
qe_phonon_fdef_exe
|
||||
qe_phonon_fdifc_exe
|
||||
qe_phonon_postahc_exe
|
||||
qe_plotphon_kforbands_exe
|
||||
qe_plotphon_bandstognuplot_exe
|
||||
qe_plotphon_eminmax_exe
|
||||
COMMENT
|
||||
"phonon code, Gamma-only and third-order derivatives")
|
||||
|
||||
|
|
|
@ -9,11 +9,7 @@ MPILIBS=-lgfortran -lmpi_mpifh -lmpi
|
|||
# location of required libraries
|
||||
PWOBJS = ../src/libqecouple.a ../../PW/src/libpw.a
|
||||
CPOBJS = ../src/libqecouple.a ../../CPV/src/libcp.a
|
||||
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/Davidson/libdavid.a \
|
||||
../../KS_Solvers/CG/libcg.a ../../FFTXlib/libqefft.a \
|
||||
../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
|
||||
../../dft-d3/libdftd3qe.a
|
||||
LIBOBJS = ../../clib/clib.a ../../iotk/src/libiotk.a
|
||||
QEMODS=../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a $(BASEMODS)
|
||||
|
||||
TLDEPS=couple
|
||||
|
||||
|
@ -21,16 +17,16 @@ TLDEPS=couple
|
|||
all : tldeps c2pw.x f2pw.x c2cp.x f2cp.x
|
||||
|
||||
f2pw.x : f2pw.o $(PWOBJS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
|
||||
|
||||
c2pw.x : c2pw.o $(PWOBJS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
|
||||
|
||||
f2cp.x : f2cp.o $(CPOBJS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
|
||||
|
||||
c2cp.x : c2cp.o $(CPOBJS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
|
||||
|
||||
%.o: %.cpp
|
||||
$(MPICXX) -I../include -c $(MPICXXFLAGS) $< -o $@
|
||||
|
|
|
@ -9,11 +9,7 @@ MPILIBS=-Wl,-Bstatic,-lifport,-lifcore,-limf,-Bdynamic -lmpi_mpifh -lmpi
|
|||
# location of required libraries
|
||||
PWOBJS = ../src/libqecouple.a ../../PW/src/libpw.a
|
||||
CPOBJS = ../src/libqecouple.a ../../CPV/src/libcp.a
|
||||
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/Davidson/libdavid.a \
|
||||
../../KS_Solvers/CG/libcg.a ../../FFTXlib/libqefft.a \
|
||||
../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
|
||||
../../dft-d3/libdftd3qe.a
|
||||
LIBOBJS = ../../clib/clib.a ../../iotk/src/libiotk.a
|
||||
QEMODS=../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a $(BASEMODS)
|
||||
|
||||
TLDEPS=couple
|
||||
|
||||
|
@ -21,16 +17,16 @@ TLDEPS=couple
|
|||
all : tldeps c2pw.x f2pw.x c2cp.x f2cp.x
|
||||
|
||||
f2pw.x : f2pw.o $(PWOBJS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
|
||||
|
||||
c2pw.x : c2pw.o $(PWOBJS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
|
||||
|
||||
f2cp.x : f2cp.o $(CPOBJS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(LD) $(LDFLAGS) -o $@ $^ $(QEMODS) $(QELIBS)
|
||||
|
||||
c2cp.x : c2cp.o $(CPOBJS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(LIBOBJS) $(LIBS)
|
||||
$(MPICXX) $(LDFLAGS) -o $@ $^ $(MPILIBS) $(QEMODS) $(QELIBS)
|
||||
|
||||
%.o: %.cpp
|
||||
$(MPICXX) -I../include -c $(MPICXXFLAGS) $< -o $@
|
||||
|
|
|
@ -911,7 +911,7 @@ input_description -distribution {Quantum Espresso} -package CP -program cp.x {
|
|||
|
||||
|
||||
var ortho_max -type INTEGER {
|
||||
default { 20 }
|
||||
default {300 }
|
||||
info {
|
||||
maximum number of iterations for orthonormalization
|
||||
meaningful only if orthogonalization = 'ortho'
|
||||
|
|
|
@ -3,42 +3,36 @@
|
|||
Introduction
|
||||
============
|
||||
|
||||
This guide covers the usage of the `CP` package, version 6.6, a core
|
||||
This guide covers the usage of the `CP` package, version 6.8, a core
|
||||
component of the Quantum ESPRESSO distribution. Further documentation,
|
||||
beyond what is provided in this guide, can be found in the directory
|
||||
`CPV/Doc/`, containing a copy of this guide.
|
||||
|
||||
*Important notice: due to the lack of time and of manpower, this manual
|
||||
is only partially updated and may contain outdated information.*
|
||||
|
||||
This guide assumes that you know the physics that `CP` describes and the
|
||||
This guide assumes that you know the physics that `CP` describes and the
|
||||
methods it implements. It also assumes that you have already installed,
|
||||
or know how to install, Quantum ESPRESSO. If not, please read the
|
||||
general User's Guide for Quantum ESPRESSO, found in directory `Doc/` two
|
||||
levels above the one containing this guide; or consult the web site:\
|
||||
levels above the one containing this guide; or consult the web site:
|
||||
`http://www.quantum-espresso.org`.
|
||||
|
||||
People who want to modify or contribute to `CP` should read the
|
||||
Developer Manual:\
|
||||
`Doc/developer_man.pdf`.
|
||||
People who want to modify or contribute to `CP` should read the
|
||||
Developer Manual: `https://gitlab.com/QEF/q-e/-/wikis/home`.
|
||||
|
||||
`CP` can perform Car-Parrinello molecular dynamics, including
|
||||
variable-cell dynamics, and free-energy surface calculation at fixed
|
||||
cell through meta-dynamics, if patched with PLUMED.
|
||||
|
||||
The `CP` package is based on the original code written by Roberto Car
|
||||
and Michele Parrinello. `CP` was developed by Alfredo Pasquarello (EPF
|
||||
`CP` can perform Car-Parrinello molecular dynamics, including
|
||||
variable-cell dynamics. The `CP` package is based on the original code
|
||||
written by Roberto Car
|
||||
and Michele Parrinello. `CP` was developed by Alfredo Pasquarello (EPF
|
||||
Lausanne), Kari Laasonen (Oulu), Andrea Trave, Roberto Car (Princeton),
|
||||
Nicola Marzari (EPF Lausanne), Paolo Giannozzi, and others. FPMD, later
|
||||
merged with `CP`, was developed by Carlo Cavazzoni, Gerardo Ballabio
|
||||
(CINECA), Sandro Scandolo (ICTP), Guido Chiarotti, Paolo Focher, and
|
||||
others. We quote in particular:
|
||||
merged with `CP`, was developed by Carlo Cavazzoni (Leonardo), Gerardo
|
||||
Ballabio (CINECA), Sandro Scandolo (ICTP), Guido Chiarotti, Paolo Focher,
|
||||
and others. We quote in particular:
|
||||
|
||||
- Federico Grasselli and Riccardo Bertossa (SISSA) for bug fixes,
|
||||
extensions to Autopilot;
|
||||
|
||||
- Biswajit Santra, Hsin-Yu Ko, Marcus Calegari Andrade (Princeton) for
|
||||
SCAN functional;
|
||||
various contribution, notably the SCAN functional;
|
||||
|
||||
- Robert DiStasio (Cornell)), Biswajit Santra, and Hsin-Yu Ko for
|
||||
hybrid functionals with MLWF; (maximally localized Wannier
|
||||
|
@ -50,21 +44,21 @@ others. We quote in particular:
|
|||
- Paolo Umari (Univ. Padua) for finite electric fields and conjugate
|
||||
gradients;
|
||||
|
||||
- Paolo Umari and Ismaila Dabo for ensemble-DFT;
|
||||
- Paolo Umari and Ismaila Dabo (Penn State) for ensemble-DFT;
|
||||
|
||||
- Xiaofei Wang (Princeton) for META-GGA;
|
||||
|
||||
- The Autopilot feature was implemented by Targacept, Inc.
|
||||
|
||||
This guide has been mostly writen by Gerardo Ballabio and Carlo
|
||||
Cavazzoni.
|
||||
The original version of this guide was mostly written by Gerardo Ballabio
|
||||
and Carlo Cavazzoni.
|
||||
|
||||
`CP` is free software, released under the GNU General Public License.\
|
||||
`CP` is free software, released under the GNU General Public License.\
|
||||
See `http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt`, or the file
|
||||
License in the distribution).
|
||||
`License` in the distribution.
|
||||
|
||||
We shall greatly appreciate if scientific work done using the Quantum
|
||||
ESPRESSO distribution will contain an acknowledgment to the following
|
||||
ESPRESSO distribution will contain an acknowledgment to the following
|
||||
references:
|
||||
|
||||
> P. Giannozzi, S. Baroni, N. Bonini, M. Calandra, R. Car, C. Cavazzoni,
|
||||
|
@ -95,22 +89,22 @@ Users of the GPU-enabled version should also cite the following paper:
|
|||
> Ferretti, N. Marzari, I. Timrov, A. Urru, S. Baroni, J. Chem. Phys.
|
||||
> 152, 154105 (2020)
|
||||
|
||||
Note the form Quantum ESPRESSO for textual citations of the code. Please
|
||||
also see package-specific documentation for further recommended
|
||||
citations. Pseudopotentials should be cited as (for instance)
|
||||
Note the form `Quantum ESPRESSO` (in small caps) for textual citations
|
||||
of the code. Please also see other package-specific documentation for
|
||||
further recommended citations. Pseudopotentials should be cited as
|
||||
(for instance)
|
||||
|
||||
> \[ \] We used the pseudopotentials C.pbe-rrjkus.UPF and O.pbe-vbc.UPF
|
||||
> from\
|
||||
> `http://www.quantum-espresso.org`.
|
||||
> from `http://www.quantum-espresso.org`.
|
||||
|
||||
Compilation
|
||||
===========
|
||||
|
||||
`CP` is included in the core Quantum ESPRESSO distribution. Instruction
|
||||
`CP` is included in the core Quantum ESPRESSO distribution. Instruction
|
||||
on how to install it can be found in the general documentation (User's
|
||||
Guide) for Quantum ESPRESSO.
|
||||
|
||||
Typing `make cp` from the main Quantum ESPRESSO directory or `make` from
|
||||
Typing `make cp` from the main Quantum ESPRESSO directory or `make` from
|
||||
the `CPV/` subdirectory produces the following codes in `CPV/src`:
|
||||
|
||||
- `cp.x`: Car-Parrinello Molecular Dynamics code
|
||||
|
@ -125,52 +119,44 @@ Symlinks to executable programs will be placed in the `bin/`
|
|||
subdirectory.
|
||||
|
||||
As a final check that compilation was successful, you may want to run
|
||||
some or all of the tests and examples. Automated tests for `cp.x` are in
|
||||
some or all of the tests and examples. Automated tests for `cp.x` are in
|
||||
directory `test-suite/` and can be run via the `Makefile` found there.
|
||||
Please see the general User's Guide for their setup.
|
||||
|
||||
You may take the tests and examples distributed with `CP` as templates
|
||||
You may take the tests and examples distributed with `CP` as templates
|
||||
for writing your own input files. Input files for tests are contained in
|
||||
subdirectories `test-suite/cp_` with file type `*.in1`, `*.in2`, \... .
|
||||
subdirectories `test-suite/cp_*` with file type `*.in1`, `*.in2`, \... .
|
||||
Input files for examples are produced, if you run the examples, in the
|
||||
`results/` subdirectories, with names ending with `.in`.
|
||||
|
||||
For general information on parallelism and how to run in parallel
|
||||
execution, please see the general User's Guide. `CP` currently can take
|
||||
advantage of both MPI and OpenMP parallelization. The "plane-wave",
|
||||
"linear-algebra" and "task-group" parallelization levels are
|
||||
implemented.
|
||||
execution, please see the general User's Guide. `CP` currently can take
|
||||
advantage of both MPI and OpenMP parallelization and on GPU acceleration.
|
||||
The "plane-wave", "linear-algebra" and "task-group" parallelization levels
|
||||
are implemented.
|
||||
|
||||
Input data
|
||||
==========
|
||||
|
||||
Input data for `cp.x` is organized into several namelists, followed by
|
||||
other fields ("cards") introduced by keywords. The namelists are
|
||||
other fields ("cards") introduced by keywords. The namelists are:
|
||||
|
||||
------------------- ----------------------------------------------------------
|
||||
&CONTROL: general variables controlling the run
|
||||
&SYSTEM: structural information on the system under investigation
|
||||
&ELECTRONS: electronic variables, electron dynamics
|
||||
&IONS : ionic variables, ionic dynamics
|
||||
&CELL (optional): variable-cell dynamics
|
||||
------------------- ----------------------------------------------------------
|
||||
> &CONTROL: general variables controlling the run\
|
||||
> &SYSTEM: structural information on the system under investigation\
|
||||
> &ELECTRONS: electronic variables, electron dynamics\
|
||||
> &IONS : ionic variables, ionic dynamics\
|
||||
> &CELL (optional): variable-cell dynamics\
|
||||
|
||||
\
|
||||
The `&CELL` namelist may be omitted for fixed-cell calculations. This
|
||||
depends on the value of variable `calculation` in namelist &CONTROL.
|
||||
Most variables in namelists have default values. Only the following
|
||||
Most variables in namelists have default values. Only the following
|
||||
variables in &SYSTEM must always be specified:
|
||||
|
||||
----------- --------------------- -----------------------------------------------
|
||||
`ibrav` (integer) Bravais-lattice index
|
||||
`celldm` (real, dimension 6) crystallographic constants
|
||||
`nat` (integer) number of atoms in the unit cell
|
||||
`ntyp` (integer) number of types of atoms in the unit cell
|
||||
`ecutwfc` (real) kinetic energy cutoff (Ry) for wavefunctions.
|
||||
----------- --------------------- -----------------------------------------------
|
||||
|
||||
\
|
||||
).
|
||||
> `ibrav` (integer) Bravais-lattice index\
|
||||
> `celldm` (real, dimension 6) crystallographic constants\
|
||||
> `nat` (integer) number of atoms in the unit cell\
|
||||
> `ntyp` (integer) number of types of atoms in the unit cell\
|
||||
> `ecutwfc` (real) kinetic energy cutoff (Ry) for wavefunctions
|
||||
|
||||
Explanations for the meaning of variables `ibrav` and `celldm`, as well
|
||||
as on alternative ways to input structural data, are contained in files
|
||||
|
@ -178,34 +164,31 @@ as on alternative ways to input structural data, are contained in files
|
|||
describe a large number of other variables as well. Almost all variables
|
||||
have default values, which may or may not fit your needs.
|
||||
|
||||
Comment lines in namelists can be introduced by a \"!\", exactly as in
|
||||
fortran code.
|
||||
|
||||
After the namelists, you have several fields ("cards") introduced by
|
||||
keywords with self-explanatory names:
|
||||
|
||||
> ATOMIC\_SPECIES\
|
||||
> ATOMIC\_POSITIONS\
|
||||
> CELL\_PARAMETERS (optional)\
|
||||
> OCCUPATIONS (optional)\
|
||||
> OCCUPATIONS (optional)
|
||||
|
||||
The keywords may be followed on the same line by an option. Unknown
|
||||
fields are ignored. See the files mentioned above for details on the
|
||||
available "cards".
|
||||
|
||||
Comments lines in "cards" can be introduced by either a "!" or a "\#"
|
||||
character in the first position of a line.
|
||||
Comment lines in namelists can be introduced by a \"!\", exactly as in
|
||||
fortran code. Comments lines in "cards" can be introduced by either a "!"
|
||||
or a "\#" character in the first position of a line.
|
||||
|
||||
Data files
|
||||
----------
|
||||
|
||||
The output data files are written in the directory specified by variable
|
||||
`outdir`, with names specified by variable `prefix` (a string that is
|
||||
prepended to all file names, whose default value is: `prefix=’pwscf’`).
|
||||
The `iotk` toolkit is used to write the file in a XML format, whose
|
||||
definition can be found in the Developer Manual. In order to use the
|
||||
data directory on a different machine, you need to convert the binary
|
||||
files to formatted and back, using the `bin/iotk` script.
|
||||
prepended to all file names, whose default value is `prefix=’cp_$ndw’`,
|
||||
where `ndw` is an integer specified in input).
|
||||
In order to use the data on a different machine, you may need to
|
||||
compile `CP` with HDF5 enabled.
|
||||
|
||||
The execution stops if you create a file `prefix.EXIT` either in the
|
||||
working directory (i.e. where the program is executed), or in the
|
||||
|
@ -215,58 +198,13 @@ this procedure is that all files are properly closed, whereas just
|
|||
killing the process may leave data and output files in an unusable
|
||||
state.
|
||||
|
||||
Format of arrays containing charge density, potential, etc.
|
||||
-----------------------------------------------------------
|
||||
|
||||
The index of arrays used to store functions defined on 3D meshes is
|
||||
actually a shorthand for three indices, following the FORTRAN convention
|
||||
(\"leftmost index runs faster\"). An example will explain this better.
|
||||
Suppose you have a 3D array `psi(nr1x,nr2x,nr3x)`. FORTRAN compilers
|
||||
store this array sequentially in the computer RAM in the following way:
|
||||
|
||||
psi( 1, 1, 1)
|
||||
psi( 2, 1, 1)
|
||||
...
|
||||
psi(nr1x, 1, 1)
|
||||
psi( 1, 2, 1)
|
||||
psi( 2, 2, 1)
|
||||
...
|
||||
psi(nr1x, 2, 1)
|
||||
...
|
||||
...
|
||||
psi(nr1x,nr2x, 1)
|
||||
...
|
||||
psi(nr1x,nr2x,nr3x)
|
||||
etc
|
||||
|
||||
Let `ind` be the position of the `(i,j,k)` element in the above list:
|
||||
the following relation
|
||||
|
||||
ind = i + (j - 1) * nr1x + (k - 1) * nr2x * nr1x
|
||||
|
||||
holds. This should clarify the relation between 1D and 3D indexing. In
|
||||
real space, the `(i,j,k)` point of the FFT grid with dimensions `nr1`
|
||||
( $`\le`$ `nr1x`), `nr2` ( $`\le`$ `nr2x`), , `nr3` ( $`\le`$ `nr3x`), is
|
||||
|
||||
```math
|
||||
r_{ijk}=\frac{i-1}{nr1} \tau_1 + \frac{j-1}{nr2} \tau_2 + \frac{k-1}{nr3} \tau_3
|
||||
```
|
||||
|
||||
where the $`\tau_i`$ are the basis vectors of the
|
||||
Bravais lattice. The latter are stored row-wise in the `at` array:
|
||||
$`\tau_1 =`$ `at(:, 1)`, $`\tau_2 =`$ `at(:, 2)`, $`\tau_3 =`$ `at(:, 3)`.
|
||||
|
||||
The distinction between the dimensions of the FFT grid, `(nr1,nr2,nr3)`
|
||||
and the physical dimensions of the array, `(nr1x,nr2x,nr3x)` is done
|
||||
only because it is computationally convenient in some cases that the two
|
||||
sets are not the same. In particular, it is often convenient to have
|
||||
`nrx1`=`nr1`+1 to reduce memory conflicts.
|
||||
|
||||
The format of arrays containing charge density, potential, etc.
|
||||
is described in the developer manual.
|
||||
|
||||
Output files
|
||||
==========
|
||||
|
||||
The `cp.x` code produces many output file, that together build up the trajectory.
|
||||
The `cp.x` code produces many output files, that together build up the trajectory.
|
||||
|
||||
You have a file for the positions, called `prefix.pos`, where `prefix` is defined in
|
||||
the input file, that is formatted like:
|
||||
|
@ -280,35 +218,40 @@ the input file, that is formatted like:
|
|||
0.42395189282719E+01 0.55766875434652E+01 0.31291744042209E+01
|
||||
0.45445534106843E+01 0.36049553522533E+01 0.55864387532281E+01
|
||||
|
||||
where in the first line there is an header with, in order, the number of the step and
|
||||
the time in ps of this step. Later you found the positions of all the atoms, in the
|
||||
same order of the input file (note that this behaviour emerged in v6.6 -- previously
|
||||
atoms were sorted by type). In this example we have 3 atoms.
|
||||
The type must be deduced from the input file. After the first 4 lines
|
||||
you find the same structure for the second step. The units of the position are Bohr's
|
||||
radius. Note that the atoms coordinates are unwrapped, so it is possible that they go
|
||||
outside the simulation cell.
|
||||
where the first line contains the step number and elapsed time, in ps, at this
|
||||
step; the following lines contain the positions, in Bohr radii, of all the
|
||||
atoms (3 in this example), in the same order as in the input file (since v6.6
|
||||
-- previously, atoms were sorted by type; the type must be deduced from the
|
||||
input file). The same structure is repeated for the second step and so on.
|
||||
The printout is made every `iprint` steps (10 in this case, so at step 10, 20,
|
||||
etc.). Note that the atomic coordinates are not wrapped into the simulation
|
||||
cell, so it is possible that they lie outside it.
|
||||
|
||||
The velocities are written in a similar file named `prefix.vel`, where `prefix` is defined in
|
||||
the input file, that is formatted like the `.pos` file. The units are the usual Hartree
|
||||
atomic units (note again that the velocity in the pw code differs by a factor of 2).
|
||||
The velocities are written in a similar file named `prefix.vel`, where `prefix`
|
||||
is defined in the input file, that is formatted like the `.pos` file. The units
|
||||
are the usual Hartree atomic units (note that the velocities in the `pw.x` code
|
||||
are in _Rydberg_ a.u. and differ by a factor 2).
|
||||
|
||||
The `prefix.for` file is formatted like the previous two. Contains the computed forces
|
||||
and has Hartree atomic units too.
|
||||
It is written only if `tprnfor = .true.` is set in the input file.
|
||||
The `prefix.for` file, formatted like the previous two, contains the computed
|
||||
forces, in Hartree atomic units as well. It is written only if a molecular
|
||||
dynamics calculation is performed, or if `tprnfor = .true.` is set in input.
|
||||
|
||||
The file `prefix.evp` has one line per printed step and contains some thermodynamic data.
|
||||
The file `prefix.evp` has one line per printed step and contains some
|
||||
thermodynamic data.
|
||||
The first line of the file names the columns:
|
||||
```
|
||||
# nfi time(ps) ekinc T\_cell(K) Tion(K) etot enthal econs econt Volume Pressure(GPa
|
||||
# nfi time(ps) ekinc Tcell(K) Tion(K) etot enthal econs econt Volume Pressure(GPa)
|
||||
```
|
||||
where:
|
||||
- `ekinc` $`K_{ELECTRONS}`$, the electron's fake kinetic energy
|
||||
- `enthal` $`E_{DFT}+PV`$
|
||||
- `etot` $`E_{DFT}`$ potential energy of the system, the DFT energy
|
||||
- `econs` $`E_{DFT} + K_{NUCLEI}`$ this is something that is a constant of motion in the limit where the electronic fictitious mass is zero. It has a physical meaning.
|
||||
- `econt` $`E_{DFT} + K_{IONS} + K_{ELECTRONS}`$ this is a constant of motion of the lagrangian. If the dt is small enough this will be up to a very good precision a constant. It is not a physical quantity, since $`K_{ELECTRONS}`$ has _nothing_ to do with the quantum kinetic energy of the electrons.
|
||||
|
||||
- `ekinc` is the electrons fictitious kinetic energy, $`K_{ELECTRONS}`$
|
||||
- `enthal` is the enthalpy, $`E_{DFT}+PV`$
|
||||
- `etot` is the DFT (potential) energy of the system, $`E_{DFT}`$
|
||||
- `econs` is a physically meaningful constant of motion, $`E_{DFT} + K_{NUCLEI}`$,
|
||||
in the limit of zero electronic fictitious mass
|
||||
- `econt` is the constant of motion of the Lagrangian, $`E_{DFT} + K_{IONS} + K_{ELECTRONS}`$.
|
||||
If the time step `dt` is small enough this will be up to a very good precision a constant.
|
||||
It is not a physical quantity, since $`K_{ELECTRONS}`$ has _nothing_ to do with the quantum
|
||||
kinetic energy of the electrons.
|
||||
|
||||
|
||||
Using `CP`
|
||||
|
@ -317,7 +260,7 @@ Using `CP`
|
|||
It is important to understand that a CP simulation is a sequence of
|
||||
different runs, some of them used to \"prepare\" the initial state of
|
||||
the system, and other performed to collect statistics, or to modify the
|
||||
state of the system itself, i.e. modify the temperature or the pressure.
|
||||
state of the system itself, i.e. to modify the temperature or the pressure.
|
||||
|
||||
To prepare and run a CP simulation you should first of all define the
|
||||
system:
|
||||
|
@ -393,8 +336,7 @@ An example of input file (Benzene Molecule):
|
|||
H -2.2 2.2 0.0
|
||||
H 2.2 2.2 0.0
|
||||
|
||||
You can find the description of the input variables in file
|
||||
`Doc/INPUT_CP.*`.
|
||||
You can find the description of the input variables in file `Doc/INPUT_CP.*`.
|
||||
|
||||
Reaching the electronic ground state
|
||||
------------------------------------
|
||||
|
@ -403,7 +345,7 @@ The first run, when starting from scratch, is always an electronic
|
|||
minimization, with fixed ions and cell, to bring the electronic system
|
||||
on the ground state (GS) relative to the starting atomic configuration.
|
||||
This step is conceptually very similar to self-consistency in a
|
||||
`pw.x` run.
|
||||
`pw.x` run.
|
||||
|
||||
Sometimes a single run is not enough to reach the GS. In this case, you
|
||||
need to re-run the electronic minimization stage. Use the input of the
|
||||
|
@ -428,14 +370,12 @@ $`< 10^{-5}`$. You could check the value of the fictitious kinetic energy
|
|||
on the standard output (column EKINC).
|
||||
|
||||
Different strategies are available to minimize electrons, but the most
|
||||
used ones are:
|
||||
|
||||
- steepest descent: `electron_dynamics = ’sd’`
|
||||
|
||||
- damped dynamics: `electron_dynamics = ’damp’`, `electron_damping` =
|
||||
a number typically ranging from 0.1 and 0.5
|
||||
|
||||
frequently used is _damped dynamics_: `electron_dynamics = ’damp’` and
|
||||
`electron_damping` = a number typically ranging from 0.1 and 0.5.
|
||||
See the input description to compute the optimal damping factor.
|
||||
Steepest descent: `electron_dynamics = ’sd’`, is also available but it
|
||||
is typically slower than damped dynamics and should be used only to
|
||||
start the minimization.
|
||||
|
||||
Relax the system
|
||||
----------------
|
||||
|
@ -860,14 +800,6 @@ ranges between 4 and 7.
|
|||
All the other parameters have the same meaning in the usual `CP` input,
|
||||
and they are discussed above.
|
||||
|
||||
### Free-energy surface calculations
|
||||
|
||||
Once `CP` is patched with `PLUMED` plug-in, it becomes possible to
|
||||
turn-on most of the PLUMED functionalities running `CP` as:
|
||||
`./cp.x -plumed` plus the other usual `CP` arguments. The PLUMED input
|
||||
file has to be located in the specified `outdir` with the fixed name
|
||||
`plumed.dat`.
|
||||
|
||||
### Treatment of USPPs
|
||||
|
||||
The cutoff `ecutrho` defines the resolution on the real space FFT mesh
|
||||
|
@ -1030,99 +962,62 @@ An example input is listed as following:
|
|||
O 16.0D0 O_HSCV_PBE-1.0.UPF
|
||||
H 2.0D0 H_HSCV_PBE-1.0.UPF
|
||||
|
||||
Performances
|
||||
============
|
||||
Parallel Performances
|
||||
=====================
|
||||
|
||||
`cp.x` can run in principle on any number of processors. The
|
||||
effectiveness of parallelization is ultimately judged by the "scaling",
|
||||
i.e. how the time needed to perform a job scales with the number of
|
||||
processors, and depends upon:
|
||||
processors. Ideally one would like to have linear scaling, i.e.
|
||||
$`T \sim T_0/N_p`$ for $`N_p`$ processors, where $`T_0`$ is the estimated
|
||||
time for serial execution. In addition, one would like to have linear
|
||||
scaling of the RAM per processor: $`O_N \sim O_0/N_p`$, so that large-memory
|
||||
systems fit into the RAM of each processor.
|
||||
|
||||
- the size and type of the system under study;
|
||||
We refer to the "Parallelization" section of the general User's Guide for
|
||||
a description of MPI and OpenMP parallelization paradigms, of the various
|
||||
MPI parallelization levels, and on how to activate them.
|
||||
|
||||
- the judicious choice of the various levels of parallelization
|
||||
(detailed in
|
||||
Sec.[\[SubSec:para\]](#SubSec:para){reference-type="ref"
|
||||
reference="SubSec:para"});
|
||||
A judicious choice of the various levels of parallelization, together
|
||||
with the availability of suitable hardware (e.g. fast communications)
|
||||
is fundamental to reach good performances._VERY IMPORTANT_: For each
|
||||
system there is an optimal range of number of processors on which to
|
||||
run the job. A too large number of processors or a bad parallelization
|
||||
style will yield performance degradation.
|
||||
|
||||
- the availability of fast interprocess communications (or lack of
|
||||
it).
|
||||
For `CP` with hybrid functionals, see the related section above this one.
|
||||
For all other cases, the relevant MPI parallelization levels are:
|
||||
|
||||
Ideally one would like to have linear scaling, i.e. $`T \sim T_0/N_p`$ for
|
||||
$`N_p`$ processors, where $`T_0`$ is the estimated time for serial
|
||||
execution. In addition, one would like to have linear scaling of the RAM
|
||||
per processor: $`O_N \sim O_0/N_p`$, so that large-memory systems fit into
|
||||
the RAM of each processor.
|
||||
- "plane waves" (PW);
|
||||
- "tasks" (activated by command-line option `-nt N`);
|
||||
- "linear algebra" (`-nd N`);
|
||||
- "bands" parallelization (`-nb N`), to be used only in
|
||||
special cases;
|
||||
- "images" parallelization (`-ni N`), used only in code `manycp.x`
|
||||
(see the header of `CPV/src/manycp.f90` for documentation).
|
||||
|
||||
As a general rule, image parallelization:
|
||||
As a rule of thumb:
|
||||
- start with PW parallelization only (e.g. `mpirun -np N cp.x ...` with
|
||||
no other parallelization options); the code will scale well unless `N`
|
||||
exceeds the third FFT dimensions `nr3` and/or `nr3s`.
|
||||
- To further increase the number of processors, use "task groups",
|
||||
typically 4 to 8 (e.g. `mpirun -np N cp.x -nt 8 ...`).
|
||||
- Alternatively, or in addition, you may compile with OpenMP:
|
||||
`./configure --enable-openmp ...`, then `export OMP_NUM_THREADS=n`
|
||||
and run on `n` threads (4 to 8 typically).
|
||||
_Beware conflicts between MPI and OpenMP threads_!
|
||||
don't do this unless you know what you are doing.
|
||||
- Finally, the optimal number of processors for \"linear-algebra\"
|
||||
parallelization can be found by observing the performances of `ortho`
|
||||
in the final time report for different numbers of processors in the
|
||||
linear-algebra group (must be a square integer, not larger than the
|
||||
number of processoris for plane-wave parallelization). Linear-algebra
|
||||
parallelization distributes `M\times M`$ matrices, with `M` number of
|
||||
bands, so it may be useful if memory-constrained.
|
||||
|
||||
- may give good scaling, but the slowest image will determine the
|
||||
overall performances ("load balancing" may be a problem);
|
||||
Note: optimal serial performances are achieved when the data are as much
|
||||
as possible kept into the cache. As a side effect, PW parallelization may
|
||||
yield superlinear (better than linear) scaling, thanks to the increase in
|
||||
serial speed coming from the reduction of data size (making it easier for
|
||||
the machine to keep data in the cache).
|
||||
|
||||
- requires very little communications (suitable for ethernet
|
||||
communications);
|
||||
|
||||
- does not reduce the required memory per processor (unsuitable for
|
||||
large-memory jobs).
|
||||
|
||||
Parallelization on k-points:
|
||||
|
||||
- guarantees (almost) linear scaling if the number of k-points is a
|
||||
multiple of the number of pools;
|
||||
|
||||
- requires little communications (suitable for ethernet
|
||||
communications);
|
||||
|
||||
- does not reduce the required memory per processor (unsuitable for
|
||||
large-memory jobs).
|
||||
|
||||
Parallelization on PWs:
|
||||
|
||||
- yields good to very good scaling, especially if the number of
|
||||
processors in a pool is a divisor of $`N_3`$ and $`N_{r3}`$ (the
|
||||
dimensions along the z-axis of the FFT grids, `nr3` and `nr3s`,
|
||||
which coincide for NCPPs);
|
||||
|
||||
- requires heavy communications (suitable for Gigabit ethernet up to
|
||||
4, 8 CPUs at most, specialized communication hardware needed for 8
|
||||
or more processors );
|
||||
|
||||
- yields almost linear reduction of memory per processor with the
|
||||
number of processors in the pool.
|
||||
|
||||
A note on scaling: optimal serial performances are achieved when the
|
||||
data are as much as possible kept into the cache. As a side effect, PW
|
||||
parallelization may yield superlinear (better than linear) scaling,
|
||||
thanks to the increase in serial speed coming from the reduction of data
|
||||
size (making it easier for the machine to keep data in the cache).
|
||||
|
||||
VERY IMPORTANT: For each system there is an optimal range of number of
|
||||
processors on which to run the job. A too large number of processors
|
||||
will yield performance degradation. If the size of pools is especially
|
||||
delicate: $`N_p`$ should not exceed $`N_3`$ and $`N_{r3}`$, and should ideally
|
||||
be no larger than $`1/2\div1/4 N_3`$ and/or $`N_{r3}`$. In order to increase
|
||||
scalability, it is often convenient to further subdivide a pool of
|
||||
processors into "task groups". When the number of processors exceeds the
|
||||
number of FFT planes, data can be redistributed to \"task groups\" so
|
||||
that each group can process several wavefunctions at the same time.
|
||||
|
||||
The optimal number of processors for \"linear-algebra\" parallelization,
|
||||
taking care of multiplication and diagonalization of $`M\times M`$
|
||||
matrices, should be determined by observing the performances of
|
||||
`cdiagh/rdiagh` (`pw.x`) or `ortho` (`cp.x`) for different numbers of
|
||||
processors in the linear-algebra group (must be a square integer).
|
||||
|
||||
Actual parallel performances will also depend on the available software
|
||||
(MPI libraries) and on the available communication hardware. For PC
|
||||
clusters, OpenMPI (`http://www.openmpi.org/`) seems to yield better
|
||||
performances than other implementations (info by Kostantin Kudin). Note
|
||||
however that you need a decent communication hardware (at least Gigabit
|
||||
ethernet) in order to have acceptable performances with PW
|
||||
parallelization. Do not expect good scaling with cheap hardware: PW
|
||||
calculations are by no means an \"embarrassing parallel\" problem.
|
||||
|
||||
Also note that multiprocessor motherboards for Intel Pentium CPUs
|
||||
typically have just one memory bus for all processors. This dramatically
|
||||
slows down any code doing massive access to memory (as most codes in the
|
||||
Quantum ESPRESSO distribution do) that runs on processors of the same
|
||||
motherboard.
|
||||
|
|
|
@ -114,33 +114,33 @@ makov_payne.o
|
|||
LOBJS = \
|
||||
entropy.o
|
||||
|
||||
QEMODS=../../Modules/libqemod.a ../../upflib/libupf.a ../../FFTXlib/libqefft.a ../../LAXlib/libqela.a ../../UtilXlib/libutil.a ../../XClib/xc_lib.a
|
||||
QEMODS=$(BASEMODS)
|
||||
|
||||
TLDEPS= bindir libs mods
|
||||
|
||||
all : tldeps libcp.a manycp.x cp.x wfdd.x cppp.x
|
||||
|
||||
manycp.x : manycp.o libcp.a $(QEMODS) $(LIBOBJS)
|
||||
manycp.x : manycp.o libcp.a $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o manycp.x manycp.o \
|
||||
libcp.a $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
libcp.a $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../bin ; ln -fs ../CPV/src/manycp.x . )
|
||||
|
||||
cp.x : cprstart.o libcp.a $(QEMODS) $(LIBOBJS)
|
||||
cp.x : cprstart.o libcp.a $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o cp.x cprstart.o \
|
||||
libcp.a $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
libcp.a $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../bin ; ln -fs ../CPV/src/cp.x . )
|
||||
|
||||
libcp.a : $(FOBJS) $(LOBJS)
|
||||
$(AR) $(ARFLAGS) $@ $?
|
||||
$(RANLIB) $@
|
||||
|
||||
cppp.x : cppp.o $(QEMODS) $(LIBOBJS)
|
||||
cppp.x : cppp.o $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o cppp.x cppp.o \
|
||||
$(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
$(QEMODS) $(QELIBS)
|
||||
- (cd ../../bin ; ln -fs ../CPV/src/cppp.x . )
|
||||
|
||||
wfdd.x : wfdd.o
|
||||
$(LD) $(LDFLAGS) -o $@ wfdd.o $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
$(LD) $(LDFLAGS) -o $@ wfdd.o $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../bin ; ln -fs ../CPV/src/$@ . )
|
||||
|
||||
tldeps :
|
||||
|
|
|
@ -12,7 +12,8 @@
|
|||
rhor, rhog, rhos, rhoc, ei1, ei2, ei3, sfac, fion, ema0bg, becdr, &
|
||||
lambdap, lambda, nlam, vpot, c0, cm, phi, dbec,l_cprestart )
|
||||
|
||||
!! please see https://journals.aps.org/rmp/abstract/10.1103/RevModPhys.64.1045
|
||||
!! please see https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.79.1337 (ensemble DFT)
|
||||
!! and https://journals.aps.org/rmp/abstract/10.1103/RevModPhys.64.1045 (conjugate gradient)
|
||||
|
||||
use kinds, only: dp
|
||||
use control_flags, only: tpre, iverbosity, tfor, tprnfor
|
||||
|
@ -52,11 +53,11 @@
|
|||
use cp_electronic_mass, ONLY : emass_cutoff
|
||||
use orthogonalize_base, ONLY : calphi_bgrp
|
||||
use cp_interfaces, ONLY : rhoofr, dforce, compute_stress, vofrho, nlfl_bgrp, prefor
|
||||
use cp_interfaces, ONLY : nlsm2_bgrp, calbec, caldbec_bgrp, nlfq_bgrp
|
||||
use cp_interfaces, ONLY : nlsm2_bgrp, calbec, caldbec_bgrp, nlfq_bgrp, runcp_uspp
|
||||
USE cp_main_variables, ONLY : idesc, drhor, drhog
|
||||
USE mp_global, ONLY: me_image, my_image_id, nbgrp
|
||||
USE fft_base, ONLY: dffts, dfftp
|
||||
|
||||
use wave_gauge, only: project_parallel_gauge_2
|
||||
|
||||
!
|
||||
implicit none
|
||||
|
@ -100,6 +101,11 @@
|
|||
complex(dp), allocatable :: c3(:)
|
||||
real(dp) :: gamma, entmp, sta
|
||||
complex(dp),allocatable :: hpsi(:,:), hpsi0(:,:), gi(:,:), hi(:,:)
|
||||
#if defined(__CUDA)
|
||||
complex(dp), allocatable, DEVICE :: hpsi_dummy(:,:), c0_dummy(:,:), gi_dummy(:,:)
|
||||
#else
|
||||
complex(dp), allocatable :: hpsi_dummy(:,:), c0_dummy(:,:), gi_dummy(:,:)
|
||||
#endif
|
||||
real(DP), allocatable:: s_minus1(:,:)!factors for inverting US S matrix
|
||||
real(DP), allocatable:: k_minus1(:,:)!factors for inverting US preconditioning matrix
|
||||
real(DP), allocatable :: lambda_repl(:,:) ! replicated copy of lambda
|
||||
|
@ -308,34 +314,10 @@
|
|||
call newd(vpot,rhovan,fion,.true.)
|
||||
|
||||
|
||||
call prefor(eigr,betae)!ATTENZIONE
|
||||
|
||||
do i=1,nbsp,2
|
||||
call dforce( i, bec, betae, c0,c2,c3,rhos, dffts%nnr, ispin,f,nbsp,nspin)
|
||||
if(tefield .and. (evalue.ne.0.d0)) then
|
||||
call dforceb(c0, i, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
|
||||
c2(1:ngw)=c2(1:ngw)+evalue*df(1:ngw)
|
||||
call dforceb(c0, i+1, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
|
||||
c3(1:ngw)=c3(1:ngw)+evalue*df(1:ngw)
|
||||
endif
|
||||
if(tefield2 .and. (evalue2.ne.0.d0)) then
|
||||
call dforceb(c0, i, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
|
||||
c2(1:ngw)=c2(1:ngw)+evalue2*df(1:ngw)
|
||||
call dforceb(c0, i+1, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
|
||||
c3(1:ngw)=c3(1:ngw)+evalue2*df(1:ngw)
|
||||
endif
|
||||
|
||||
hpsi(1:ngw, i)=c2(1:ngw)
|
||||
if(i+1 <= nbsp) then
|
||||
hpsi(1:ngw,i+1)=c3(1:ngw)
|
||||
endif
|
||||
if (gstart==2) then
|
||||
hpsi(1, i)=CMPLX(DBLE(hpsi(1, i)), 0.d0,kind=DP)
|
||||
if(i+1 <= nbsp) then
|
||||
hpsi(1,i+1)=CMPLX(DBLE(hpsi(1,i+1)), 0.d0,kind=DP)
|
||||
endif
|
||||
end if
|
||||
enddo
|
||||
call prefor(eigr,betae)
|
||||
! this puts the gradient inside the array hpsi
|
||||
call runcp_uspp(0,0.d0,0.d0,ema0bg, 0.d0, rhos, bec, &
|
||||
c0, c0_dummy, hpsi, hpsi_dummy, .false., .false., .true.)
|
||||
|
||||
if(pre_state) call ave_kin(c0,SIZE(c0,1),nbsp,ave_ene)
|
||||
|
||||
|
@ -864,47 +846,9 @@
|
|||
#endif
|
||||
|
||||
call prefor(eigr,betae)
|
||||
do i=1,nbsp,2
|
||||
call dforce(i,bec,betae,c0,c2,c3,rhos,dffts%nnr,ispin,f,nbsp,nspin)
|
||||
if(tefield.and.(evalue .ne. 0.d0)) then
|
||||
call dforceb &
|
||||
(c0, i, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
|
||||
do ig=1,ngw
|
||||
c2(ig)=c2(ig)+evalue*df(ig)
|
||||
enddo
|
||||
call dforceb &
|
||||
(c0, i+1, betae, ipolp, bec ,ctabin(1,1,ipolp), gqq, gqqm, qmat, deeq, df)
|
||||
do ig=1,ngw
|
||||
c3(ig)=c3(ig)+evalue*df(ig)
|
||||
enddo
|
||||
endif
|
||||
if(tefield2.and.(evalue2 .ne. 0.d0)) then
|
||||
call dforceb &
|
||||
(c0, i, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
|
||||
do ig=1,ngw
|
||||
c2(ig)=c2(ig)+evalue2*df(ig)
|
||||
enddo
|
||||
call dforceb &
|
||||
(c0, i+1, betae, ipolp2, bec ,ctabin2(1,1,ipolp2), gqq2, gqqm2, qmat2, deeq, df)
|
||||
do ig=1,ngw
|
||||
c3(ig)=c3(ig)+evalue2*df(ig)
|
||||
enddo
|
||||
endif
|
||||
|
||||
do ig=1,ngw
|
||||
gi(ig, i)=c2(ig)
|
||||
if(i+1 <= nbsp) then
|
||||
gi(ig,i+1)=c3(ig)
|
||||
endif
|
||||
end do
|
||||
if (gstart==2) then
|
||||
gi(1, i)=CMPLX(DBLE(gi(1, i)),0.d0,kind=DP)
|
||||
if(i+1 <= nbsp) then
|
||||
gi(1,i+1)=CMPLX(DBLE(gi(1,i+1)),0.d0,kind=DP)
|
||||
endif
|
||||
end if
|
||||
enddo
|
||||
|
||||
! this puts the gradient inside the array gi
|
||||
call runcp_uspp(0,0.d0,0.d0,ema0bg, 0.d0, rhos, bec, &
|
||||
c0, c0_dummy, gi, gi_dummy, .false., .false., .true.)
|
||||
ALLOCATE( lambda_repl( nudx, nudx ) )
|
||||
!
|
||||
do is = 1, nspin
|
||||
|
@ -940,55 +884,19 @@
|
|||
!if required project c0 on previous manifold of occupied states
|
||||
!NOT IMPLEMENTED YET FOR ENSEMBLE DFT AND NSPIN==2
|
||||
!NOT IMPLEMENTED FOR US PSEUDOPOTENTIALS
|
||||
|
||||
lambda_repl=0.d0
|
||||
do i = 1, nss
|
||||
do j = 1, nss
|
||||
ii = i + istart - 1
|
||||
jj = j + istart - 1
|
||||
do ig = 1, ngw
|
||||
lambda_repl( i, j ) = lambda_repl( i, j ) + &
|
||||
2.d0 * DBLE( CONJG( c0old( ig, ii ) ) * c0( ig, jj) )
|
||||
enddo
|
||||
if( gstart == 2 ) then
|
||||
lambda_repl( i, j ) = lambda_repl( i, j ) - &
|
||||
DBLE( CONJG( c0old( 1, ii ) ) * c0( 1, jj ) )
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
CALL mp_sum( lambda_repl, intra_bgrp_comm )
|
||||
|
||||
|
||||
cm(:,:)=c0(:,:)
|
||||
c0=(0.d0,0.d0)
|
||||
do i=1,nss
|
||||
do j=1,nss
|
||||
c0(1:ngw,i)=c0(1:ngw,i)+lambda_repl(i,j)*cm(1:ngw,j)
|
||||
enddo
|
||||
enddo
|
||||
|
||||
call project_parallel_gauge_2(c0old, cm, c0, &
|
||||
nss, ngw, ngw,gstart)
|
||||
|
||||
|
||||
call calbec (nbsp,betae,c0,bec)
|
||||
CALL gram_bgrp( betae, bec, nkb, c0, ngw )
|
||||
call calbec(nbsp, betae,c0,bec)
|
||||
|
||||
|
||||
call runcp_uspp(0,0.d0,0.d0,ema0bg, 0.d0, rhos, bec, &
|
||||
c0, c0_dummy, gi, gi_dummy, .false., .false., .true.)
|
||||
|
||||
do i=1,nbsp,2
|
||||
call dforce(i,bec,betae,c0,c2,c3,rhos,dffts%nnr,ispin,f,nbsp,nspin)
|
||||
|
||||
do ig=1,ngw
|
||||
gi(ig, i)=c2(ig)
|
||||
if(i+1 <= nbsp) then
|
||||
gi(ig,i+1)=c3(ig)
|
||||
endif
|
||||
end do
|
||||
if (gstart==2) then
|
||||
gi(1, i)=CMPLX(DBLE(gi(1, i)),0.d0,kind=DP)
|
||||
if(i+1 <= nbsp) then
|
||||
gi(1,i+1)=CMPLX(DBLE(gi(1,i+1)),0.d0,kind=DP)
|
||||
endif
|
||||
end if
|
||||
enddo
|
||||
|
||||
lambda_repl = 0.d0
|
||||
do i = 1, nss
|
||||
|
|
|
@ -1099,7 +1099,7 @@ SUBROUTINE rhov(rhovan,rhog,rhor)
|
|||
CALL fftx_add_threed2oned_gamma( dfftp, v, rhog(:,isup), rhog(:,isdw) )
|
||||
!
|
||||
IF( iverbosity > 1 ) THEN
|
||||
WRITE( stdout,'(a,2f12.8,/,a,2f12.8)') &
|
||||
WRITE( stdout,'(a,f12.8,/,a,f12.8)') &
|
||||
& ' rhov: n_v(g=0) up = ',omega*DBLE (rhog(1,isup)), &
|
||||
& ' rhov: n_v(g=0) down = ',omega*DBLE(rhog(1,isdw))
|
||||
END IF
|
||||
|
|
|
@ -336,10 +336,10 @@
|
|||
END SUBROUTINE writefile_x
|
||||
END INTERFACE
|
||||
|
||||
|
||||
!
|
||||
INTERFACE runcp_uspp
|
||||
SUBROUTINE runcp_uspp_x &
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart, compute_only_gradient )
|
||||
USE kinds, ONLY: DP
|
||||
IMPLICIT NONE
|
||||
integer, intent(in) :: nfi
|
||||
|
@ -350,7 +350,7 @@
|
|||
complex(DP) :: c0_bgrp(:,:), cm_bgrp(:,:)
|
||||
complex(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
|
||||
logical, optional, intent(in) :: fromscra
|
||||
logical, optional, intent(in) :: restart
|
||||
logical, optional, intent(in) :: restart, compute_only_gradient
|
||||
END SUBROUTINE
|
||||
END INTERFACE
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ cg_sub.o : ../../Modules/ions_base.o
|
|||
cg_sub.o : ../../Modules/kind.o
|
||||
cg_sub.o : ../../Modules/mp_global.o
|
||||
cg_sub.o : ../../Modules/recvec.o
|
||||
cg_sub.o : ../../Modules/wave_gauge.o
|
||||
cg_sub.o : ../../UtilXlib/mp.o
|
||||
cg_sub.o : ../../upflib/uspp.o
|
||||
cg_sub.o : cg.o
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
IMPLICIT NONE
|
||||
! input
|
||||
REAL(DP), INTENT(IN) :: rhovan(nhm*(nhm+1)/2,nat,nspin)
|
||||
REAL(DP) vr(dfftp%nnr,nspin)
|
||||
REAL(DP), INTENT(IN) :: vr(dfftp%nnr,nspin)
|
||||
LOGICAL, INTENT(IN) :: tprint
|
||||
! output
|
||||
REAL(DP) fion(3,nat)
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
|
||||
|
||||
SUBROUTINE runcp_uspp_x &
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart )
|
||||
( nfi, fccc, ccc, ema0bg, dt2bye, rhos, bec_bgrp, c0_bgrp, c0_d, cm_bgrp, cm_d, fromscra, restart, compute_only_gradient )
|
||||
!
|
||||
! This subroutine performs a Car-Parrinello or Steepest-Descent step
|
||||
! on the electronic variables, computing forces on electrons
|
||||
|
@ -30,6 +30,8 @@
|
|||
! on output:
|
||||
! cm_bgrp wave functions at time t + dt, not yet othogonalized
|
||||
!
|
||||
! if compute_only_gradient is true, this routine only puts the gradient
|
||||
! in the array cm_*
|
||||
USE parallel_include
|
||||
USE kinds, ONLY : DP
|
||||
USE mp_global, ONLY : me_bgrp, &
|
||||
|
@ -40,7 +42,7 @@
|
|||
use control_flags, only : lwf, tsde, many_fft
|
||||
use uspp, only : deeq, vkb, vkb_d
|
||||
use gvect, only : gstart
|
||||
use electrons_base, only : nbsp_bgrp, ispin_bgrp, f_bgrp, nspin, nupdwn_bgrp, iupdwn_bgrp
|
||||
use electrons_base, only : nbsp_bgrp, ispin_bgrp, f_bgrp , nspin, nupdwn_bgrp, iupdwn_bgrp
|
||||
use wannier_subroutines, only : ef_potential
|
||||
use efield_module, only : dforce_efield, tefield, dforce_efield2, tefield2
|
||||
use gvecw, only : ngw, ngwx
|
||||
|
@ -59,6 +61,7 @@
|
|||
COMPLEX(DP) DEVICEATTR :: c0_d(:,:), cm_d(:,:)
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: fromscra
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: restart
|
||||
LOGICAL, OPTIONAL, INTENT(IN) :: compute_only_gradient
|
||||
!
|
||||
!
|
||||
real(DP) :: verl1, verl2, verl3
|
||||
|
@ -78,7 +81,7 @@
|
|||
integer :: i, nsiz, incr, idx, idx_in, ierr
|
||||
integer :: iwfc, nwfc, is, ii, tg_rhos_siz, c2_siz
|
||||
integer :: iflag
|
||||
logical :: ttsde
|
||||
logical :: ttsde, only_gradient
|
||||
INTEGER :: omp_get_num_threads
|
||||
|
||||
#if defined (__CUDA)
|
||||
|
@ -96,6 +99,12 @@
|
|||
IF( PRESENT( restart ) ) THEN
|
||||
IF( restart ) iflag = 2
|
||||
END IF
|
||||
|
||||
IF(PRESENT( compute_only_gradient) ) then
|
||||
only_gradient = compute_only_gradient
|
||||
ELSE
|
||||
only_gradient = .false.
|
||||
END IF
|
||||
|
||||
IF( dffts%has_task_groups ) THEN
|
||||
tg_rhos_siz = dffts%nnr_tg
|
||||
|
@ -115,12 +124,14 @@
|
|||
verl2 = 1.0d0 - verl1
|
||||
verl3 = 1.0d0 * fccc
|
||||
|
||||
ALLOCATE( emadt2( ngw ) )
|
||||
ALLOCATE( emaver( ngw ) )
|
||||
|
||||
ccc = fccc * dt2bye
|
||||
emadt2 = dt2bye * ema0bg
|
||||
emaver = emadt2 * verl3
|
||||
IF( .not. only_gradient) then
|
||||
ALLOCATE( emadt2( ngw ) )
|
||||
ALLOCATE( emaver( ngw ) )
|
||||
ccc = fccc * dt2bye
|
||||
emadt2 = dt2bye * ema0bg
|
||||
emaver = emadt2 * verl3
|
||||
END IF
|
||||
|
||||
IF( iflag == 0 ) THEN
|
||||
ttsde = tsde
|
||||
|
@ -279,7 +290,7 @@
|
|||
CALL dforce_efield2 ( bec_bgrp, i, c0_bgrp, c2, c3, rhos)
|
||||
END IF
|
||||
|
||||
IF( iflag == 2 ) THEN
|
||||
IF( iflag == 2 .and. .not. only_gradient ) THEN
|
||||
DO idx = 1, incr, 2
|
||||
IF( i + idx - 1 <= nbsp_bgrp ) THEN
|
||||
cm_bgrp( :, i+idx-1) = c0_bgrp(:,i+idx-1)
|
||||
|
@ -293,16 +304,25 @@
|
|||
DO idx = 1, incr, 2
|
||||
idx_in = idx/2+1
|
||||
IF( i + idx - 1 <= nbsp_bgrp ) THEN
|
||||
IF (tsde) THEN
|
||||
CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2(:), ngw, idx_in )
|
||||
CALL wave_steepest( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), emaver, c3(:), ngw, idx_in )
|
||||
IF( .not. only_gradient) then
|
||||
IF (tsde) THEN
|
||||
CALL wave_steepest( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), emaver, c2(:), ngw, idx_in )
|
||||
CALL wave_steepest( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), emaver, c3(:), ngw, idx_in )
|
||||
ELSE
|
||||
CALL wave_verlet( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), verl1, verl2, emaver, c2(:), ngw, idx_in )
|
||||
CALL wave_verlet( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), verl1, verl2, emaver, c3(:), ngw, idx_in )
|
||||
ENDIF
|
||||
IF ( gstart == 2 ) THEN
|
||||
cm_bgrp(1,i+idx-1) = CMPLX(real(cm_bgrp(1,i+idx-1)),0.0d0,kind=dp)
|
||||
cm_bgrp(1,i+idx ) = CMPLX(real(cm_bgrp(1,i+idx )),0.0d0,kind=dp)
|
||||
END IF
|
||||
ELSE
|
||||
CALL wave_verlet( cm_bgrp(:, i+idx-1 ), c0_bgrp(:, i+idx-1 ), verl1, verl2, emaver, c2(:), ngw, idx_in )
|
||||
CALL wave_verlet( cm_bgrp(:, i+idx ), c0_bgrp(:, i+idx ), verl1, verl2, emaver, c3(:), ngw, idx_in )
|
||||
ENDIF
|
||||
IF ( gstart == 2 ) THEN
|
||||
cm_bgrp(1,i+idx-1) = CMPLX(real(cm_bgrp(1,i+idx-1)),0.0d0,kind=dp)
|
||||
cm_bgrp(1,i+idx ) = CMPLX(real(cm_bgrp(1,i+idx )),0.0d0,kind=dp)
|
||||
cm_bgrp(:, i+idx-1) = c2(:)
|
||||
cm_bgrp(:, i+idx) = c3(:)
|
||||
IF ( gstart == 2 ) THEN
|
||||
cm_bgrp(1, i+idx-1) = CMPLX(dble(cm_bgrp(1, i+idx-1)), 0.0d0, kind=dp)
|
||||
cm_bgrp(1, i+idx) = CMPLX(dble(cm_bgrp(1, i+idx)), 0.0d0, kind=dp)
|
||||
END IF
|
||||
END IF
|
||||
END IF
|
||||
END DO
|
||||
|
@ -319,9 +339,10 @@
|
|||
END IF
|
||||
|
||||
END IF
|
||||
|
||||
DEALLOCATE( emadt2 )
|
||||
DEALLOCATE( emaver )
|
||||
IF (.not. only_gradient) then
|
||||
DEALLOCATE( emadt2 )
|
||||
DEALLOCATE( emaver )
|
||||
END IF
|
||||
#if defined (__CUDA)
|
||||
DEALLOCATE( rhos_d )
|
||||
#endif
|
||||
|
|
|
@ -1,3 +1,28 @@
|
|||
New in development version:
|
||||
* RMM-DIIS for CPU (S. Nisihara) and GPU (E. de Paoli, P. Delugas)
|
||||
* DFT-D3: MPI parallelization and GPU acceleration with OPenACC
|
||||
* projwfc.x can be used to compute the PDOS in a local basis (I. Timrov)
|
||||
|
||||
Fixed in development version:
|
||||
* Possible out-of-bound error (gfortran only) could crash DFT+U
|
||||
* incorrect exx factor multiplication in the gga term of polarized cx0
|
||||
functional (v.6.8 only)
|
||||
* Some build problems occurring under special circumstances
|
||||
* Some PP files were not correctly read since v.6.7
|
||||
* DFT-D3 with dftd3_version=4 or 6 could produce NaN's in parallel runs
|
||||
due to missing zero initialization of some work arrays
|
||||
* Ensemble-DFT in CP ("cg") wasn't working any longer for norm-conserving PPs
|
||||
* In DFT+U (lda_plus_u_kind = 0 and 1) the pw.x code was printing the squared
|
||||
eigenvectors instead of simply eigenvectors. Now it prints the
|
||||
eigenvectors (consistent with lda_plus_u_kind = 2).
|
||||
* plotband.x wasn't correctly plotting the bands, under some not-so-special
|
||||
circumstances
|
||||
* CP with DFT+U could crash when writing the xml file (since v.6.6)
|
||||
|
||||
Incompatible changes in develoment version:
|
||||
* Changes to Makefiles and to file "make.inc"
|
||||
* clib/ deleted, files clib/*.c moved to UtilXlib/ or to Modules/
|
||||
|
||||
Known problems in 6.8 version:
|
||||
* electron-phonon calculation in the non-colinear/spinorbit case is broken
|
||||
* some obscure problem still affects variable-cell with hybrid functionals
|
||||
|
|
|
@ -524,7 +524,7 @@ Some environment variables that are relevant to \configure\ are:
|
|||
\texttt{LIBDIRS}& extra directories where to search for libraries\\
|
||||
\end{tabular}\\
|
||||
(note that \texttt{F90} is an ``historical'' name -- we actually use
|
||||
Fortran 2003 -- and that it should be used only together with option
|
||||
Fortran 2008 -- and that it should be used only together with option
|
||||
\texttt{--disable-parallel}. In fact, the value of F90 must be
|
||||
consistent with the parallel Fortran compiler which is determined by
|
||||
\configure\ and stored in the \texttt{MPIF90} variable).
|
||||
|
@ -865,13 +865,8 @@ A few \libxc\ functional routines provides the energy and some others the potent
|
|||
%
|
||||
\subsubsection{XC test}
|
||||
\label{SubSec:XCtest}
|
||||
A testing program, \texttt{xclib\_test.x}, for the \texttt{XClib} library of \qe\ is available. Three options:
|
||||
\begin{itemize}
|
||||
\item \texttt{dft-info}: infos on the input dft are provided. If the functionals are from \libxc\ the external parameters, when present, are listed with a brief description and their default value.
|
||||
\item \texttt{exe-benchmark} (\texttt{gen-benchmark}): the program gets a data file generated previously as input (by running the same program with the \texttt{gen-benchmark} option) and compares the output data, namely energy and potential on a selected number of grid points. It also compares total energy and potential summed up on a large grid in order to better span the input domain. This option shoud be used to test modifications in the \texttt{XClib} library or to check the output matching among different parallelization schemes.
|
||||
\item \texttt{dft-comparison}: comparison between two different dfts on a large grid of points. Max, min and average percentage difference between the two dft outputs (energy and potential) are provided and the points of the grid where the two output differ are shown. This option can be used, for example, to find differences between internal \qe\ functionals and the \libxc\ ones.
|
||||
\end{itemize}
|
||||
The testing program is available for LDA, GGA and MGGA functionals. It also tests the potential derivatives for LDA (\texttt{dmxc}) and GGA (\texttt{dgcxc}).
|
||||
A testing program, \texttt{xclib\_test.x}, for the \texttt{XClib} library of \qe\ is available. The program is available for LDA, GGA and MGGA functionals (both QE and Libxc). It also tests the potential derivatives for LDA (\texttt{dmxc}) and GGA (\texttt{dgcxc}).\\
|
||||
See XClib/README.TEST file for further details.
|
||||
|
||||
\subsection{Compilation}
|
||||
\label{SubSec:Compilation}
|
||||
|
|
|
@ -10,42 +10,40 @@ MODFLAGS= $(BASEMOD_FLAGS) \
|
|||
|
||||
PHAUXOBJS = ../../../PHonon/PH/libphaux.a
|
||||
PHOBJS = ../../../PHonon/PH/libph.a
|
||||
PWOBJS = ../../../PW/src/libpw.a
|
||||
PWOBJS = ../../../PW/src/libpw.a ../../../KS_Solvers/libks_solvers.a ../../../dft-d3/libdftd3qe.a
|
||||
LRMODS = ../../../LR_Modules/liblrmod.a
|
||||
QEMODS = ../../../Modules/libqemod.a ../../../KS_Solvers/libks_solvers.a ../../../upflib/libupf.a ../../../XClib/xc_lib.a \
|
||||
../../../FFTXlib/libqefft.a ../../../LAXlib/libqela.a ../../../UtilXlib/libutil.a ../../../dft-d3/libdftd3qe.a
|
||||
LIBOBJS = ../../../clib/clib.a
|
||||
QEMODS = $(BASEMODS)
|
||||
|
||||
all : ZG.x disca.x pp_disca.x pp_spctrlfn.x bands_unfold.x epsilon_Gaus.x
|
||||
|
||||
ZG.x : ZG.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS) $(LIBOBJS)
|
||||
ZG.x : ZG.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
ZG.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
ZG.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
|
||||
|
||||
disca.x : disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS) $(LIBOBJS)
|
||||
disca.x : disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(LRMODS) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
disca.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
disca.o $(PHAUXOBJS) $(PHOBJS) $(LRMODS) $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
|
||||
|
||||
pp_disca.x : pp_disca.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
|
||||
pp_disca.x : pp_disca.o $(PWOBJS) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
pp_disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
pp_disca.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
|
||||
|
||||
bands_unfold.x : bands_unfold.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
|
||||
bands_unfold.x : bands_unfold.o $(PWOBJS) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
bands_unfold.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
bands_unfold.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
|
||||
|
||||
pp_spctrlfn.x : pp_spctrlfn.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
|
||||
pp_spctrlfn.x : pp_spctrlfn.o $(PWOBJS) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
pp_spctrlfn.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
pp_spctrlfn.o $(PHAUXOBJS) $(PHOBJS) $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
|
||||
|
||||
epsilon_Gaus.x : epsilon_Gaus.o $(PWOBJS) $(QEMODS) $(LIBOBJS)
|
||||
epsilon_Gaus.x : epsilon_Gaus.o $(PWOBJS) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
epsilon_Gaus.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
epsilon_Gaus.o ../../../PP/src/libpp.a $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../../bin ; ln -fs ../EPW/ZG/src/$@ . )
|
||||
|
||||
clean :
|
||||
|
|
|
@ -78,13 +78,10 @@ EPWOBJS += ephblochkq.o wfc_elec.o test_tools.o
|
|||
|
||||
|
||||
PHOBJS = ../../PHonon/PH/libph.a
|
||||
PWOBJS = ../../PW/src/libpw.a
|
||||
W90LIB = ../../wannier90-3.1.0/libwannier.a
|
||||
LRMODS = ../../LR_Modules/liblrmod.a
|
||||
PWOBJS = ../../PW/src/libpw.a
|
||||
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
|
||||
../../upflib/libupf.a ../../FFTXlib/libqefft.a ../../dft-d3/libdftd3qe.a
|
||||
LIBOBJS =../../LAXlib/libqela.a ../../UtilXlib/libutil.a ../../clib/clib.a
|
||||
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
|
||||
QEMODS = $(BASEMODS)
|
||||
|
||||
TLDEPS= pw ph pp
|
||||
|
||||
|
@ -94,9 +91,9 @@ libepw.a : $(EPWOBJS)
|
|||
$(AR) $(ARFLAGS) $@ $?
|
||||
$(RANLIB) $@
|
||||
|
||||
epw.x : epw.o libepw.a $(PHOBJS) $(LRMODS) $(PWOBJS) $(W90LIB) $(QEMODS) $(LIBOBJS)
|
||||
epw.x : epw.o libepw.a $(PHOBJS) $(LRMODS) $(PWOBJS) $(W90LIB) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
epw.o libepw.a $(PHOBJS) $(LRMODS) $(W90LIB) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
epw.o libepw.a $(PHOBJS) $(LRMODS) $(W90LIB) $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../bin ; ln -fs ../src/epw.x . )
|
||||
|
||||
pw :
|
||||
|
|
|
@ -130,8 +130,8 @@
|
|||
SUBROUTINE close_final
|
||||
!------------------------------------------------------------------
|
||||
!
|
||||
USE units_lr, ONLY : iuwfc
|
||||
USE units_ph, ONLY : iudwf, iudrho
|
||||
USE units_lr, ONLY : iuwfc, iudwf
|
||||
USE units_ph, ONLY : iudrho
|
||||
USE phcom, ONLY : fildrho
|
||||
USE mp_global, ONLY : me_pool,root_pool
|
||||
USE io_var, ONLY : iunepmatwe
|
||||
|
|
|
@ -606,7 +606,6 @@
|
|||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat, ityp, ntyp => nsp
|
||||
USE spin_orb, ONLY : lspinorb
|
||||
USE cell_base, ONLY : tpiba2, omega, tpiba
|
||||
USE gvect, ONLY : ngm, gg, g, eigts1, eigts2, eigts3, mill
|
||||
USE scf, ONLY : v, vltot
|
||||
|
|
|
@ -198,4 +198,6 @@ if(QE_ENABLE_TEST)
|
|||
add_unit_test(test_qe_fftx-r1-t3 1 3 $<TARGET_FILE:qe_fftx_test>)
|
||||
add_unit_test(test_qe_fftx-r3-t1 3 1 $<TARGET_FILE:qe_fftx_test>)
|
||||
add_unit_test(test_qe_fftx-r3-t2 3 2 $<TARGET_FILE:qe_fftx_test>)
|
||||
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
|
|
@ -160,16 +160,22 @@
|
|||
|
||||
SUBROUTINE init_plan()
|
||||
implicit none
|
||||
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: c_test(:)
|
||||
!
|
||||
ALLOCATE(c_test, mold=c)
|
||||
!
|
||||
IF( C_ASSOCIATED(fw_planz( icurrent)) ) CALL fftw_destroy_plan( fw_planz( icurrent) )
|
||||
IF( C_ASSOCIATED(bw_planz( icurrent)) ) CALL fftw_destroy_plan( bw_planz( icurrent) )
|
||||
idir = -1
|
||||
fw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c, &
|
||||
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_ESTIMATE)
|
||||
fw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c_test, &
|
||||
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c, &
|
||||
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_ESTIMATE)
|
||||
|
||||
bw_planz(icurrent) = fftw_plan_many_dft(1, (/nz/), nsl, c_test, &
|
||||
(/SIZE(c)/), 1, ldz, cout, (/SIZE(cout)/), 1, ldz, idir, FFTW_MEASURE)
|
||||
!
|
||||
DEALLOCATE(c_test)
|
||||
!
|
||||
zdims(1,icurrent) = nz; zdims(2,icurrent) = nsl; zdims(3,icurrent) = ldz;
|
||||
ip = icurrent
|
||||
icurrent = MOD( icurrent, ndims ) + 1
|
||||
|
@ -309,42 +315,47 @@
|
|||
|
||||
SUBROUTINE init_plan()
|
||||
implicit none
|
||||
|
||||
COMPLEX(DP), ALLOCATABLE :: f_test(:)
|
||||
!
|
||||
ALLOCATE(f_test,mold=r)
|
||||
!
|
||||
IF ( ldx /= nx .OR. ldy /= ny ) THEN
|
||||
IF( C_ASSOCIATED(fw_plan(2,icurrent)) ) CALL fftw_destroy_plan( fw_plan(2,icurrent) )
|
||||
IF( C_ASSOCIATED(bw_plan(2,icurrent)) ) CALL fftw_destroy_plan( bw_plan(2,icurrent) )
|
||||
idir = -1
|
||||
fw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, r(1:), &
|
||||
(/ldx*ldy/), ldx, 1, r(1:), (/ldx*ldy/), ldx, 1, idir, &
|
||||
FFTW_ESTIMATE)
|
||||
fw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, f_test(1:), &
|
||||
(/ldx*ldy/), ldx, 1, f_test(1:), (/ldx*ldy/), ldx, 1, idir, &
|
||||
FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, r(1:), &
|
||||
(/ldx*ldy/), ldx, 1, r(1:), (/ldx*ldy/), ldx, 1, idir, &
|
||||
FFTW_ESTIMATE)
|
||||
bw_plan(2,icurrent) = fftw_plan_many_dft(1, (/ny/), 1, f_test(1:), &
|
||||
(/ldx*ldy/), ldx, 1, f_test(1:), (/ldx*ldy/), ldx, 1, idir, &
|
||||
FFTW_MEASURE)
|
||||
|
||||
IF( C_ASSOCIATED(fw_plan(1,icurrent)) ) CALL fftw_destroy_plan( fw_plan(1,icurrent) )
|
||||
IF( C_ASSOCIATED(bw_plan(1,icurrent)) ) CALL fftw_destroy_plan( bw_plan(1,icurrent) )
|
||||
idir = -1
|
||||
fw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, r(1:), &
|
||||
(/ldx*ldy/), 1, ldx, r(1:), (/ldx*ldy/), 1, ldx, idir, &
|
||||
FFTW_ESTIMATE)
|
||||
fw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, f_test(1:), &
|
||||
(/ldx*ldy/), 1, ldx, f_test(1:), (/ldx*ldy/), 1, ldx, idir, &
|
||||
FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, r(1:), &
|
||||
(/ldx*ldy/), 1, ldx, r(1:), (/ldx*ldy/), 1, ldx, idir, &
|
||||
FFTW_ESTIMATE)
|
||||
bw_plan(1,icurrent) = fftw_plan_many_dft(1, (/nx/), ny, f_test(1:), &
|
||||
(/ldx*ldy/), 1, ldx, f_test(1:), (/ldx*ldy/), 1, ldx, idir, &
|
||||
FFTW_MEASURE)
|
||||
ELSE
|
||||
IF( C_ASSOCIATED(fw_plan( 1, icurrent)) ) CALL fftw_destroy_plan( fw_plan( 1, icurrent) )
|
||||
IF( C_ASSOCIATED(bw_plan( 1, icurrent)) ) CALL fftw_destroy_plan( bw_plan( 1, icurrent) )
|
||||
idir = -1
|
||||
fw_plan(1, icurrent) = fftw_plan_many_dft(2, (/ny, nx/), nzl,&
|
||||
r(1:), (/ny, nx/), 1, nx*ny, r(1:), (/ny, nx/), 1, nx*ny, idir,&
|
||||
FFTW_ESTIMATE)
|
||||
f_test(1:), (/ldy, ldx/), 1, ldx*ldy, f_test(1:), (/ldy, ldx/), 1, ldx*ldy, idir,&
|
||||
FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_plan(1, icurrent) = fftw_plan_many_dft(2, (/ny, nx/), nzl,&
|
||||
r(1:), (/ny, nx/), 1, nx*ny, r(1:), (/ny, nx/), 1, nx*ny, idir,&
|
||||
FFTW_ESTIMATE)
|
||||
f_test(1:), (/ldy, ldx/), 1, ldx*ldy, f_test(1:), (/ldy, ldx/), 1, ldx*ldy, idir,&
|
||||
FFTW_MEASURE)
|
||||
END IF
|
||||
|
||||
!
|
||||
DEALLOCATE(f_test)
|
||||
!
|
||||
dims(1,icurrent) = ny; dims(2,icurrent) = ldx;
|
||||
dims(3,icurrent) = nx; dims(4,icurrent) = nzl;
|
||||
ip = icurrent
|
||||
|
@ -450,15 +461,21 @@
|
|||
|
||||
SUBROUTINE init_plan()
|
||||
implicit none
|
||||
COMPLEX(DP), ALLOCATABLE :: f_test(:)
|
||||
IF ( nx /= ldx .or. ny /= ldy .or. nz /= ldz ) &
|
||||
call fftx_error__('cfft3','not implemented',3)
|
||||
IF( C_ASSOCIATED(fw_plan(icurrent)) ) CALL fftw_destroy_plan( fw_plan(icurrent) )
|
||||
IF( C_ASSOCIATED(bw_plan(icurrent)) ) CALL fftw_destroy_plan( bw_plan(icurrent) )
|
||||
!
|
||||
ALLOCATE(f_test,mold=f)
|
||||
!
|
||||
idir = -1
|
||||
fw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f(1:), f(1:), idir, FFTW_ESTIMATE)
|
||||
fw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f_test(1:), f_test(1:), idir, FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f(1:), f(1:), idir, FFTW_ESTIMATE)
|
||||
|
||||
bw_plan(icurrent) = fftw_plan_dft_3d(nz, ny, nx, f_test(1:), f_test(1:), idir, FFTW_MEASURE)
|
||||
!
|
||||
DEALLOCATE(f_test)
|
||||
!
|
||||
dims(1,icurrent) = nx; dims(2,icurrent) = ny; dims(3,icurrent) = nz
|
||||
ip = icurrent
|
||||
icurrent = MOD( icurrent, ndims ) + 1
|
||||
|
@ -630,7 +647,9 @@ SUBROUTINE cfft3ds (f, nx, ny, nz, ldx, ldy, ldz, howmany, isign, &
|
|||
|
||||
SUBROUTINE init_plan()
|
||||
implicit none
|
||||
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: f_test(:)
|
||||
!
|
||||
IF( C_ASSOCIATED(fw_plan( 1, icurrent)) ) &
|
||||
CALL fftw_destroy_plan( fw_plan( 1, icurrent) )
|
||||
IF( C_ASSOCIATED(bw_plan( 1, icurrent)) ) &
|
||||
|
@ -643,25 +662,30 @@ SUBROUTINE cfft3ds (f, nx, ny, nz, ldx, ldy, ldz, howmany, isign, &
|
|||
CALL fftw_destroy_plan( fw_plan( 3, icurrent) )
|
||||
IF( C_ASSOCIATED(bw_plan( 3, icurrent)) ) &
|
||||
CALL fftw_destroy_plan( bw_plan( 3, icurrent) )
|
||||
!
|
||||
ALLOCATE(f_test, mold=f)
|
||||
!
|
||||
idir = -1
|
||||
fw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f(1:), (/ldz, ldy, ldx/), 1, ldx, &
|
||||
f(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_ESTIMATE)
|
||||
fw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f_test(1:), (/ldz, ldy, ldx/), 1, ldx, &
|
||||
f_test(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f(1:), (/ldz, ldy, ldx/), 1, ldx, &
|
||||
f(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_ESTIMATE)
|
||||
bw_plan(1, icurrent) = fftw_plan_many_dft(1, (/nx/), ny*nz, f_test(1:), (/ldz, ldy, ldx/), 1, ldx, &
|
||||
f_test(1:), (/ldz, ldy, ldx/), 1, ldx, idir, FFTW_MEASURE)
|
||||
idir = -1
|
||||
fw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
|
||||
f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_ESTIMATE)
|
||||
fw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
|
||||
f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
|
||||
f(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_ESTIMATE)
|
||||
bw_plan(2, icurrent) = fftw_plan_many_dft(1, (/ny/), nz, f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, &
|
||||
f_test(1:), (/ldz, ldy, ldx/), ldx, ldx*ldy, idir, FFTW_MEASURE)
|
||||
idir = -1
|
||||
fw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
|
||||
f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_ESTIMATE)
|
||||
fw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
|
||||
f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_MEASURE)
|
||||
idir = 1
|
||||
bw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
|
||||
f(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_ESTIMATE)
|
||||
|
||||
bw_plan(3, icurrent) = fftw_plan_many_dft(1, (/nz/), 1, f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, &
|
||||
f_test(1:), (/ldz, ldy, ldx/), ldx*ldy, 1, idir, FFTW_MEASURE)
|
||||
!
|
||||
DEALLOCATE(f_test)
|
||||
!
|
||||
dims(1,icurrent) = nx; dims(2,icurrent) = ny; dims(3,icurrent) = nz
|
||||
ip = icurrent
|
||||
icurrent = MOD( icurrent, ndims ) + 1
|
||||
|
|
|
@ -67,7 +67,9 @@ program test
|
|||
!!
|
||||
!!-ntg Number of task groups
|
||||
!!
|
||||
!!-gamma Enables gamma point trick. Should be about 2 times faster.
|
||||
!!-gamma Enables gamma point trick. Should be about 2 times faster
|
||||
!!
|
||||
!!-pd If .true. uses pencil decomposition, otherwise uses slab decomposition
|
||||
!!
|
||||
!!-av1 x y z First lattice vector, in atomic units. N.B.: when using -av1, -alat is ignored!
|
||||
!!
|
||||
|
@ -90,8 +92,19 @@ program test
|
|||
USE fft_helper_subroutines
|
||||
USE fft_interfaces, ONLY:fwfft, invfft
|
||||
USE timers
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
ENUM, BIND(C)
|
||||
ENUMERATOR :: STAMP_BEGIN = 1
|
||||
ENUMERATOR :: STAMP_PSI = 2
|
||||
ENUMERATOR :: STAMP_INVFFT = 3
|
||||
ENUMERATOR :: STAMP_VLOC = 4
|
||||
ENUMERATOR :: STAMP_FWFFT = 5
|
||||
ENUMERATOR :: STAMP_HPSI = 6
|
||||
END ENUM
|
||||
INTEGER, PARAMETER :: NUM_STAMPS = 6
|
||||
!
|
||||
TYPE(fft_type_descriptor) :: dfftp, dffts, dfft3d
|
||||
!
|
||||
TYPE(sticks_map) :: smap
|
||||
|
@ -121,17 +134,21 @@ program test
|
|||
!! cut-off for the wave-function
|
||||
REAL*8 :: tpiba, alat, alat_in
|
||||
!! lattice parameters
|
||||
REAL*8 :: time(100)
|
||||
REAL*8 :: my_time(100)
|
||||
REAL*8 :: time_min(100)
|
||||
REAL*8 :: time_max(100)
|
||||
REAL*8 :: time_avg(100)
|
||||
REAL*8 :: time(NUM_STAMPS)
|
||||
REAL*8 :: my_time(NUM_STAMPS)
|
||||
REAL*8 :: time_min(NUM_STAMPS)
|
||||
REAL*8 :: time_max(NUM_STAMPS)
|
||||
REAL*8 :: time_avg(NUM_STAMPS)
|
||||
REAL*8 :: wall
|
||||
REAL*8 :: wall_avg
|
||||
!
|
||||
LOGICAL :: gamma_only = .false.
|
||||
LOGICAL :: use_tg
|
||||
!! if calculations require only gamma point
|
||||
LOGICAL :: use_tg
|
||||
!! if calculations use task group
|
||||
LOGICAL :: use_pd = .false.
|
||||
!! if calculations use pencil decomposition
|
||||
LOGICAL :: lpara
|
||||
REAL*8 :: at(3, 3), bg(3, 3)
|
||||
REAL(DP), PARAMETER :: pi = 4.0_DP * atan(1.0_DP)
|
||||
!
|
||||
|
@ -247,6 +264,10 @@ program test
|
|||
CALL get_command_argument(i + 1, arg)
|
||||
READ (arg, *) gamma_only
|
||||
END IF
|
||||
IF (TRIM(arg) == '-pd') THEN
|
||||
CALL get_command_argument(i + 1, arg)
|
||||
READ (arg, *) use_pd
|
||||
END IF
|
||||
IF ((TRIM(arg) == '-howmany').or.(TRIM(arg) == '-nh')) THEN
|
||||
CALL get_command_argument(i + 1, arg)
|
||||
READ (arg, *) many_fft
|
||||
|
@ -357,6 +378,7 @@ program test
|
|||
write (*, *) 'Num Task Group = ', ntgs
|
||||
write (*, *) 'Num Many FFTs = ', many_fft
|
||||
write (*, *) 'Gamma trick = ', gamma_only
|
||||
write (*, *) 'Pencil decomp = ', use_pd
|
||||
end if
|
||||
!
|
||||
nx = 2*int(sqrt(gcutm)*sqrt(at(1, 1)**2 + at(2, 1)**2 + at(3, 1)**2)) + 1
|
||||
|
@ -371,12 +393,16 @@ program test
|
|||
IF (gamma_only) incr = 2
|
||||
dffts%has_task_groups = (ntgs > 1)
|
||||
use_tg = dffts%has_task_groups
|
||||
lpara = (npes > 1)
|
||||
!
|
||||
dffts%rho_clock_label='ffts' ; dffts%wave_clock_label='fftw'
|
||||
|
||||
CALL fft_type_init(dffts, smap, "wave", gamma_only, .true., comm, at, bg, gkcut, gcutms/gkcut, nyfft=ntgs, nmany=many_fft)
|
||||
dffts%rho_clock_label='ffts'
|
||||
dffts%wave_clock_label='fftw'
|
||||
!
|
||||
CALL fft_type_init(dffts, smap, "wave", gamma_only, lpara, comm, at, bg, gkcut, gcutms/gkcut, &
|
||||
nyfft=ntgs, nmany=many_fft, use_pd=use_pd)
|
||||
dfftp%rho_clock_label='fft'
|
||||
CALL fft_type_init(dfftp, smap, "rho", gamma_only, .true., comm, at, bg, gcutm, 4.d0, nyfft=ntgs, nmany=many_fft)
|
||||
CALL fft_type_init(dfftp, smap, "rho", gamma_only, lpara, comm, at, bg, gcutm, 4.d0, nyfft=ntgs, &
|
||||
nmany=many_fft, use_pd=use_pd)
|
||||
!
|
||||
CALL fft_base_info(mype == 0, dffts, dfftp)
|
||||
if (mype == 0) then
|
||||
|
@ -431,7 +457,7 @@ program test
|
|||
g, gg, mill, ig_l2g, gstart, .TRUE. )
|
||||
ELSE
|
||||
CALL ggen( dfftp, gamma_only, at, bg, gcutm, ngm_g, ngm, &
|
||||
g, gg, mill, ig_l2g, gstart, .FALSE. )
|
||||
g, gg, mill, ig_l2g, gstart, .FALSE. )
|
||||
END IF
|
||||
CALL ggens( dffts, gamma_only, at, g, gg, mill, gcutms, ngms )
|
||||
!
|
||||
|
@ -500,13 +526,13 @@ program test
|
|||
IF (use_tg) THEN
|
||||
DO ib = 1, nbnd, incr
|
||||
!
|
||||
time(1) = mpi_wall_time()
|
||||
time(STAMP_BEGIN) = mpi_wall_time()
|
||||
!
|
||||
call prepare_psi_tg(ib, nbnd, ngms, psi, tg_psic, dffts, gamma_only)
|
||||
time(2) = mpi_wall_time()
|
||||
time(STAMP_PSI) = mpi_wall_time()
|
||||
!
|
||||
CALL invfft('tgWave', tg_psic, dffts);
|
||||
time(3) = mpi_wall_time()
|
||||
CALL invfft('tgWave', tg_psic, dffts)
|
||||
time(STAMP_INVFFT) = mpi_wall_time()
|
||||
!
|
||||
CALL tg_get_group_nr3(dffts, right_nr3)
|
||||
!
|
||||
|
@ -514,13 +540,13 @@ program test
|
|||
tg_psic(j) = tg_psic(j)*tg_v(j)
|
||||
ENDDO
|
||||
!
|
||||
time(4) = mpi_wall_time()
|
||||
time(STAMP_VLOC) = mpi_wall_time()
|
||||
!
|
||||
CALL fwfft('tgWave', tg_psic, dffts);
|
||||
time(5) = mpi_wall_time()
|
||||
CALL fwfft('tgWave', tg_psic, dffts)
|
||||
time(STAMP_FWFFT) = mpi_wall_time()
|
||||
!
|
||||
CALL accumulate_hpsi_tg(ib, nbnd, ngms, hpsi, tg_psic, dffts, gamma_only)
|
||||
time(6) = mpi_wall_time()
|
||||
time(STAMP_HPSI) = mpi_wall_time()
|
||||
!
|
||||
DO i = 2, 6
|
||||
my_time(i) = my_time(i) + (time(i) - time(i - 1))
|
||||
|
@ -531,6 +557,8 @@ program test
|
|||
ENDDO
|
||||
ELSEIF (many_fft > 1) THEN
|
||||
DO ib = 1, nbnd, many_fft
|
||||
!
|
||||
time(STAMP_BEGIN) = mpi_wall_time()
|
||||
!
|
||||
group_size = MIN(many_fft, nbnd - (ib -1))
|
||||
!
|
||||
|
@ -538,24 +566,24 @@ program test
|
|||
!call prepare_psi(ib, nbnd, ngms, psi, psic(1+(k-1)*dffts%nnr:), dffts, gamma_only)
|
||||
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
|
||||
ENDDO
|
||||
time(2) = mpi_wall_time()
|
||||
time(STAMP_PSI) = mpi_wall_time()
|
||||
!
|
||||
CALL invfft('Wave', psic, dffts, howmany=group_size)
|
||||
time(3) = mpi_wall_time()
|
||||
time(STAMP_INVFFT) = mpi_wall_time()
|
||||
!
|
||||
DO j = 1, dffts%nnr
|
||||
psic(j) = psic(j)*v(j)
|
||||
ENDDO
|
||||
time(4) = mpi_wall_time()
|
||||
time(STAMP_VLOC) = mpi_wall_time()
|
||||
!
|
||||
CALL fwfft('Wave', psic, dffts, howmany=group_size)
|
||||
time(5) = mpi_wall_time()
|
||||
time(STAMP_FWFFT) = mpi_wall_time()
|
||||
!
|
||||
DO k=0, group_size - 1
|
||||
!CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic(1+(k-1)*dffts%nnr:), dffts, gamma_only)
|
||||
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, dffts, gamma_only)
|
||||
ENDDO
|
||||
time(6) = mpi_wall_time()
|
||||
time(STAMP_HPSI) = mpi_wall_time()
|
||||
!
|
||||
DO i = 2, 6
|
||||
my_time(i) = my_time(i) + (time(i) - time(i - 1))
|
||||
|
@ -567,21 +595,24 @@ program test
|
|||
ELSE
|
||||
DO ib = 1, nbnd, incr
|
||||
!
|
||||
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
|
||||
time(2) = mpi_wall_time()
|
||||
time(STAMP_BEGIN) = mpi_wall_time()
|
||||
!
|
||||
CALL invfft('Wave', psic, dffts); time(3) = mpi_wall_time()
|
||||
call prepare_psi(ib, nbnd, ngms, psi, psic, dffts, gamma_only)
|
||||
time(STAMP_PSI) = mpi_wall_time()
|
||||
!
|
||||
CALL invfft('Wave', psic, dffts)
|
||||
time(STAMP_INVFFT) = mpi_wall_time()
|
||||
!
|
||||
DO j = 1, dffts%nnr
|
||||
psic(j) = psic(j)*v(j)
|
||||
ENDDO
|
||||
time(4) = mpi_wall_time()
|
||||
time(STAMP_VLOC) = mpi_wall_time()
|
||||
!
|
||||
CALL fwfft('Wave', psic, dffts);
|
||||
time(5) = mpi_wall_time()
|
||||
CALL fwfft('Wave', psic, dffts)
|
||||
time(STAMP_FWFFT) = mpi_wall_time()
|
||||
!
|
||||
CALL accumulate_hpsi(ib, nbnd, ngms, hpsi, psic, dffts, gamma_only)
|
||||
time(6) = mpi_wall_time()
|
||||
time(STAMP_HPSI) = mpi_wall_time()
|
||||
!
|
||||
DO i = 2, 6
|
||||
my_time(i) = my_time(i) + (time(i) - time(i - 1))
|
||||
|
@ -639,21 +670,21 @@ program test
|
|||
write(*,100)
|
||||
write(*,1)
|
||||
write(*,100)
|
||||
write(*,2) time_min(2), time_max(2), time_avg(2)
|
||||
write(*,3) time_min(3), time_max(3), time_avg(3)
|
||||
write(*,4) time_min(4), time_max(4), time_avg(4)
|
||||
write(*,5) time_min(5), time_max(5), time_avg(5)
|
||||
write(*,6) time_min(6), time_max(6), time_avg(6)
|
||||
write(*,2) time_min(STAMP_PSI), time_max(STAMP_PSI), time_avg(STAMP_PSI)
|
||||
write(*,3) time_min(STAMP_INVFFT), time_max(STAMP_INVFFT), time_avg(STAMP_INVFFT)
|
||||
write(*,4) time_min(STAMP_VLOC), time_max(STAMP_VLOC), time_avg(STAMP_VLOC)
|
||||
write(*,5) time_min(STAMP_FWFFT), time_max(STAMP_FWFFT), time_avg(STAMP_FWFFT)
|
||||
write(*,6) time_min(STAMP_HPSI), time_max(STAMP_HPSI), time_avg(STAMP_HPSI)
|
||||
write(*,7) wall
|
||||
write(*,100)
|
||||
|
||||
100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
|
||||
1 FORMAT(' |FFT TEST subroutine | sec. min | sec. max | sec. avg |' )
|
||||
2 FORMAT(' |prepare_psi | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
||||
3 FORMAT(' |invfft | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
||||
4 FORMAT(' |workload | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
5 FORMAT(' |fwfft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
6 FORMAT(' |accumulate_hpsi | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
2 FORMAT(' |prepare_psi | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
||||
3 FORMAT(' |invfft | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
||||
4 FORMAT(' |workload | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
5 FORMAT(' |fwfft | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
6 FORMAT(' |accumulate_hpsi | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
7 FORMAT(' |wall time | ', D14.5, ' |')
|
||||
|
||||
end if
|
||||
|
@ -1241,27 +1272,27 @@ subroutine print_clock(mype, npes, ncount)
|
|||
end if
|
||||
10100 FORMAT(' +--------------------+----------------+-----------------+----------------+' )
|
||||
101 FORMAT(' |FFT subroutine | sec. min | sec. max | sec. avg |' )
|
||||
102 FORMAT(' |cft_1z | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
||||
103 FORMAT(' |cft_2xy | ', D14.5, ' | ', D14.3, ' | ', D14.3, ' |' )
|
||||
104 FORMAT(' |cgather | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
105 FORMAT(' |cgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
106 FORMAT(' |cscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
107 FORMAT(' |cscatter_sym | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
108 FORMAT(' |fft | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
109 FORMAT(' |fft_scatt_tg | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1010 FORMAT(' |fft_scatt_xy | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1011 FORMAT(' |fft_scatt_yz | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1012 FORMAT(' |fftb | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1013 FORMAT(' |fftc | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1014 FORMAT(' |fftcw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1015 FORMAT(' |ffts | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1016 FORMAT(' |fftw | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1017 FORMAT(' |rgather_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1018 FORMAT(' |rscatter_grid | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1019 FORMAT(' |fft_scatter | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1020 FORMAT(' |ALLTOALL | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1021 FORMAT(' |fft_scatt_many_yz | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
1022 FORMAT(' |fft_scatt_many_xy | ', D14.5, ' | ', D14.3, ' | ', D14.3 , ' |')
|
||||
102 FORMAT(' |cft_1z | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
||||
103 FORMAT(' |cft_2xy | ', D14.5, ' | ', D14.5, ' | ', D14.5, ' |' )
|
||||
104 FORMAT(' |cgather | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
105 FORMAT(' |cgather_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
106 FORMAT(' |cscatter_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
107 FORMAT(' |cscatter_sym | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
108 FORMAT(' |fft | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
109 FORMAT(' |fft_scatt_tg | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1010 FORMAT(' |fft_scatt_xy | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1011 FORMAT(' |fft_scatt_yz | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1012 FORMAT(' |fftb | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1013 FORMAT(' |fftc | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1014 FORMAT(' |fftcw | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1015 FORMAT(' |ffts | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1016 FORMAT(' |fftw | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1017 FORMAT(' |rgather_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1018 FORMAT(' |rscatter_grid | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1019 FORMAT(' |fft_scatter | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1020 FORMAT(' |ALLTOALL | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1021 FORMAT(' |fft_scatt_many_yz | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
1022 FORMAT(' |fft_scatt_many_xy | ', D14.5, ' | ', D14.5, ' | ', D14.5 , ' |')
|
||||
|
||||
end subroutine
|
||||
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
set(test_common_src tester.f90 utils.f90)
|
||||
qe_add_library(qe_fftx_test_common ${test_common_src})
|
||||
target_link_libraries(qe_fftx_test_common PRIVATE qe_mpi_fortran)
|
||||
|
||||
set(source_names fft_scalar_gpu fft_scatter_mod_gpu fwinv_gpu)
|
||||
foreach(NAME ${source_names})
|
||||
set(TEST_SOURCE_FILE test_${NAME}.f90)
|
||||
qe_enable_cuda_fortran("${TEST_SOURCE_FILE}")
|
||||
qe_add_executable(test_qe_fftx_${NAME} ${TEST_SOURCE_FILE})
|
||||
set_target_properties(test_qe_fftx_${NAME}
|
||||
PROPERTIES
|
||||
OUTPUT_NAME test_qe_fftx_${NAME}.x
|
||||
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../bin)
|
||||
|
||||
target_link_libraries(test_qe_fftx_${NAME}
|
||||
PRIVATE
|
||||
qe_openmp_fortran
|
||||
qe_mpi_fortran
|
||||
qe_fftx_test_common
|
||||
qe_fftx)
|
||||
|
||||
add_unit_test(test_qe_fftx_${NAME}-r1-t1 1 1 $<TARGET_FILE:test_qe_fftx_${NAME}>)
|
||||
add_unit_test(test_qe_fftx_${NAME}-r1-t3 1 3 $<TARGET_FILE:test_qe_fftx_${NAME}>)
|
||||
add_unit_test(test_qe_fftx_${NAME}-r3-t1 3 1 $<TARGET_FILE:test_qe_fftx_${NAME}>)
|
||||
add_unit_test(test_qe_fftx_${NAME}-r3-t2 3 2 $<TARGET_FILE:test_qe_fftx_${NAME}>)
|
||||
endforeach()
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
include ../../make.inc
|
||||
|
||||
MODFLAGS= $(MOD_FLAG).. $(MOD_FLAG).
|
||||
MODFLAGS = $(MOD_FLAG).. $(MOD_FLAG).
|
||||
|
||||
SRCS = test_fft_scalar_gpu.f90 \
|
||||
test_fft_scatter_mod_gpu.f90 \
|
||||
test_fwinv_gpu.f90
|
||||
test_fft_scatter_mod_gpu.f90 \
|
||||
test_fwinv_gpu.f90
|
||||
|
||||
EXECS = $(SRCS:.f90=.x)
|
||||
|
||||
|
@ -15,16 +15,10 @@ all: common $(EXECS)
|
|||
common: tester.o utils.o
|
||||
|
||||
%.x: %.o
|
||||
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(QELIBS)
|
||||
$(LD) $(LDFLAGS) $< utils.o tester.o -o $@ ../libqefft.a $(FFT_LIBS) $(BLAS_LIBS) $(MPI_LIBS) $(LD_LIBS)
|
||||
|
||||
clean :
|
||||
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x
|
||||
|
||||
# .PHONY forces execution of a rule irrespective of the presence of an
|
||||
# updated file with the same name of the rule. In this way, the script
|
||||
# that generates version.f90 always runs, updating the version if you
|
||||
# execute "svn update". The update_version script takes care of not
|
||||
# changing the file if the svn version did not change
|
||||
clean:
|
||||
- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x rnd_seed*
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
|
|
|
@ -271,3 +271,20 @@ end program test_fft_scalar_gpu
|
|||
program test_fft_scalar_gpu
|
||||
end program test_fft_scalar_gpu
|
||||
#endif
|
||||
!
|
||||
! Dummy
|
||||
SUBROUTINE stop_clock(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE stop_clock
|
||||
!
|
||||
SUBROUTINE start_clock(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE start_clock
|
||||
!
|
||||
SUBROUTINE stop_clock_gpu(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE stop_clock_gpu
|
||||
!
|
||||
SUBROUTINE start_clock_gpu(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE start_clock_gpu
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
#if defined(__CUDA)
|
||||
program test_fft_scatter_mod_gpu
|
||||
#if defined(__MPI)
|
||||
#if defined(__MPI) && defined(__MPI_MODULE)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
#if defined(__MPI) && ! defined(__MPI_MODULE)
|
||||
INCLUDE 'mpif.h'
|
||||
#endif
|
||||
! MPI type
|
||||
type mpi_t
|
||||
integer :: me, n, root, comm
|
||||
|
@ -87,7 +90,8 @@ program test_fft_scatter_mod_gpu
|
|||
bg = RESHAPE((/1.d0, 0.d0, 0.d0, 0.d0, 1.d0, 0.d0, 0.d0, 0.d0, 1.d0/), shape(bg))
|
||||
bg = 2.d0*pi
|
||||
!
|
||||
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, nyfft=nyfft)
|
||||
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 4.d0, &
|
||||
& nyfft=nyfft, nmany=1)
|
||||
!
|
||||
END SUBROUTINE fft_desc_init
|
||||
|
||||
|
@ -124,8 +128,8 @@ program test_fft_scatter_mod_gpu
|
|||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE scatter_mod, ONLY : fft_scatter_xy
|
||||
USE scatter_mod_gpu, ONLY : fft_scatter_xy_gpu
|
||||
USE fft_scatter, ONLY : fft_scatter_xy
|
||||
USE fft_scatter_gpu, ONLY : fft_scatter_xy_gpu
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
|
@ -196,8 +200,8 @@ program test_fft_scatter_mod_gpu
|
|||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE scatter_mod, ONLY : fft_scatter_yz
|
||||
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu
|
||||
USE fft_scatter, ONLY : fft_scatter_yz
|
||||
USE fft_scatter_gpu, ONLY : fft_scatter_yz_gpu
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
|
@ -265,8 +269,8 @@ program test_fft_scatter_mod_gpu
|
|||
USE fft_param, ONLY : DP
|
||||
USE fft_types, ONLY : fft_type_descriptor
|
||||
USE stick_base, ONLY : sticks_map
|
||||
USE scatter_mod, ONLY : fft_scatter_yz
|
||||
USE scatter_mod_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
|
||||
USE fft_scatter, ONLY : fft_scatter_yz
|
||||
USE fft_scatter_gpu, ONLY : fft_scatter_yz_gpu, fft_scatter_many_yz_gpu
|
||||
implicit none
|
||||
TYPE(mpi_t) :: mp
|
||||
TYPE(tester_t) :: test
|
||||
|
@ -392,15 +396,24 @@ program test_fft_scatter_mod_gpu
|
|||
END SUBROUTINE test_fft_scatter_many_yz_gpu_1
|
||||
|
||||
end program test_fft_scatter_mod_gpu
|
||||
! dummy subroutines
|
||||
subroutine stop_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine stop_clock
|
||||
subroutine start_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine start_clock
|
||||
!
|
||||
#else
|
||||
program test_fft_scatter_mod_gpu
|
||||
end program test_fft_scatter_mod_gpu
|
||||
#endif
|
||||
!
|
||||
! Dummy
|
||||
SUBROUTINE stop_clock(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE stop_clock
|
||||
!
|
||||
SUBROUTINE start_clock(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE start_clock
|
||||
!
|
||||
SUBROUTINE stop_clock_gpu(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE stop_clock_gpu
|
||||
!
|
||||
SUBROUTINE start_clock_gpu(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE start_clock_gpu
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
#if defined(__CUDA)
|
||||
program test_fwinv_gpu
|
||||
#if defined(__MPI)
|
||||
#if defined(__MPI) && defined(__MPI_MODULE)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
#if defined(__MPI) && ! defined(__MPI_MODULE)
|
||||
INCLUDE 'mpif.h'
|
||||
#endif
|
||||
! MPI type
|
||||
type mpi_t
|
||||
integer :: me, n, root, comm
|
||||
|
@ -40,6 +43,7 @@ program test_fwinv_gpu
|
|||
CALL test_invfft_gpu_1(mp, test, .false., i)
|
||||
END IF
|
||||
END DO
|
||||
!
|
||||
CALL test_fwfft_many_gpu_1(mp, test, .true., 1)
|
||||
CALL test_fwfft_many_gpu_1(mp, test, .false., 1)
|
||||
!
|
||||
|
@ -108,7 +112,8 @@ program test_fwinv_gpu
|
|||
at = RESHAPE((/10.d0, 0.d0, 0.d0, 0.d0, 10.d0, 0.d0, 0.d0, 0.d0, 10.d0/), shape(at))
|
||||
CALL calc_bg(at, bg)
|
||||
!
|
||||
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, nyfft=nyfft)
|
||||
CALL fft_type_init(dfft, smap, flavor, gamma_only, parallel, comm, at, bg, 12.d0, 6.d0, &
|
||||
& nyfft=nyfft, nmany=1)
|
||||
!
|
||||
END SUBROUTINE fft_desc_init
|
||||
|
||||
|
@ -410,15 +415,24 @@ program test_fwinv_gpu
|
|||
END SUBROUTINE test_invfft_many_gpu_1
|
||||
|
||||
end program test_fwinv_gpu
|
||||
! dummy subroutines
|
||||
subroutine stop_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine stop_clock
|
||||
subroutine start_clock( label )
|
||||
character(len=*) :: label
|
||||
end subroutine start_clock
|
||||
!
|
||||
#else
|
||||
program test_fwinv_gpu
|
||||
end program test_fwinv_gpu
|
||||
#endif
|
||||
!
|
||||
! Dummy
|
||||
SUBROUTINE stop_clock(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE stop_clock
|
||||
!
|
||||
SUBROUTINE start_clock(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE start_clock
|
||||
!
|
||||
SUBROUTINE stop_clock_gpu(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE stop_clock_gpu
|
||||
!
|
||||
SUBROUTINE start_clock_gpu(label)
|
||||
CHARACTER(*) :: label
|
||||
END SUBROUTINE start_clock_gpu
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
SUBROUTINE collect_results(test)
|
||||
#if defined(__MPI)
|
||||
#if defined(__MPI) && defined(__MPI_MODULE)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
#if defined(__MPI) && ! defined(__MPI_MODULE)
|
||||
INCLUDE 'mpif.h'
|
||||
#endif
|
||||
!
|
||||
TYPE(tester_t) :: test
|
||||
INTEGER :: itottests, itoterr, ierr, me
|
||||
|
@ -75,11 +78,14 @@ END SUBROUTINE save_random_seed
|
|||
|
||||
|
||||
SUBROUTINE no_test
|
||||
#if defined(__MPI)
|
||||
#if defined(__MPI) && defined(__MPI_MODULE)
|
||||
USE mpi
|
||||
#endif
|
||||
USE tester
|
||||
IMPLICIT NONE
|
||||
#if defined(__MPI) && ! defined(__MPI_MODULE)
|
||||
INCLUDE 'mpif.h'
|
||||
#endif
|
||||
!TYPE(tester_t) :: test
|
||||
INTEGER :: ierr
|
||||
!
|
||||
|
|
|
@ -71,7 +71,10 @@ set(src_pw4gww
|
|||
pw4gww/hpsi_pw4gww.f90
|
||||
pw4gww/cgsolve_all_gamma.f90
|
||||
pw4gww/realus.f90
|
||||
pw4gww/operator_1_vp.f90)
|
||||
pw4gww/operator_1_vp.f90
|
||||
pw4gww/lanczos_chains.f90
|
||||
pw4gww/convergence.f90
|
||||
pw4gww/easy_gw.f90)
|
||||
qe_enable_cuda_fortran("${src_pw4gww}")
|
||||
|
||||
set(src_bse
|
||||
|
@ -243,11 +246,11 @@ target_link_libraries(qe_gww_simpleip
|
|||
###########################################################
|
||||
set(src_graph_x util/graph.f90)
|
||||
qe_add_executable(qe_gww_util_grap_exe ${src_graph_x})
|
||||
set_target_properties(qe_gww_util_grap_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_util_grap_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME graph.x)
|
||||
set_target_properties(qe_gww_util_grap_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_util_grap_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME graph.x)
|
||||
|
||||
###########################################################
|
||||
|
@ -255,11 +258,11 @@ set_target_properties(qe_gww_util_grap_exe
|
|||
###########################################################
|
||||
set(src_abcoeff_to_eps_x util/abcoeff_to_eps.f90)
|
||||
qe_add_executable(qe_gww_util_abcoefftoeps_exe ${src_abcoeff_to_eps_x})
|
||||
set_target_properties(qe_gww_util_abcoefftoeps_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_util_abcoefftoeps_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME abcoeff_to_eps.x)
|
||||
set_target_properties(qe_gww_util_abcoefftoeps_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_util_abcoefftoeps_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME abcoeff_to_eps.x)
|
||||
|
||||
###########################################################
|
||||
|
@ -267,11 +270,11 @@ set_target_properties(qe_gww_util_abcoefftoeps_exe
|
|||
###########################################################
|
||||
set(src_memory_pw4gww_x util/memory_pw4gww.f90)
|
||||
qe_add_executable(qe_gww_util_memorypw4gww_exe ${src_memory_pw4gww_x})
|
||||
set_target_properties(qe_gww_util_memorypw4gww_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_util_memorypw4gww_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME memory_pw4gww.x)
|
||||
set_target_properties(qe_gww_util_memorypw4gww_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_util_memorypw4gww_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME memory_pw4gww.x)
|
||||
|
||||
#############################################################
|
||||
|
@ -279,8 +282,8 @@ set_target_properties(qe_gww_util_memorypw4gww_exe
|
|||
############################################################
|
||||
set(src_bse_main_x bse/bse_main.f90)
|
||||
qe_add_executable(qe_gww_bse_bse_main_exe ${src_bse_main_x})
|
||||
set_target_properties(qe_gww_bse_bse_main_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_bse_bse_main_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME bse_main.x)
|
||||
target_link_libraries(qe_gww_bse_bse_main_exe
|
||||
PRIVATE
|
||||
|
@ -298,8 +301,8 @@ target_link_libraries(qe_gww_bse_bse_main_exe
|
|||
################################################################
|
||||
set(src_gww_x gww/gww.f90)
|
||||
qe_add_executable(qe_gww_gww_exe ${src_gww_x})
|
||||
set_target_properties(qe_gww_gww_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_gww_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME gww.x)
|
||||
target_link_libraries(qe_gww_gww_exe
|
||||
PRIVATE
|
||||
|
@ -309,42 +312,59 @@ target_link_libraries(qe_gww_gww_exe
|
|||
qe_modules)
|
||||
|
||||
################################################################
|
||||
# gww_fit.x
|
||||
# gww_fit.x
|
||||
################################################################
|
||||
set(src_gww_fit_x gww/gww_fit.f90)
|
||||
qe_add_executable(qe_gww_gww_fit_exe ${src_gww_fit_x})
|
||||
set_target_properties(qe_gww_gww_fit_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME gww.x)
|
||||
set_target_properties(qe_gww_gww_fit_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME gww_fit.x)
|
||||
target_link_libraries(qe_gww_gww_fit_exe
|
||||
PRIVATE
|
||||
PRIVATE
|
||||
qe_gww
|
||||
qe_gww_minpack
|
||||
qe_modules)
|
||||
|
||||
################################################################
|
||||
# head.x
|
||||
# head.x
|
||||
################################################################
|
||||
set(src_head_x head/head.f90)
|
||||
qe_add_executable(qe_gww_head_exe ${src_head_x})
|
||||
set_target_properties(qe_gww_head_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_head_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME head.x)
|
||||
target_link_libraries(qe_gww_head_exe
|
||||
PRIVATE
|
||||
PRIVATE
|
||||
qe_gww_head
|
||||
qe_gww_minpack
|
||||
qe_phonon_ph
|
||||
qe_modules
|
||||
qe_pw)
|
||||
|
||||
###########################################################
|
||||
# simple.x
|
||||
###########################################################
|
||||
set(src_simple_x simple/simple.f90)
|
||||
qe_add_executable(qe_gww_simple_exe ${src_simple_x})
|
||||
set_target_properties(qe_gww_simple_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME simple.x)
|
||||
target_link_libraries(qe_gww_simple_exe
|
||||
PRIVATE
|
||||
qe_gww_simple
|
||||
qe_fftx
|
||||
qe_xclib
|
||||
qe_pw
|
||||
qe_upflib
|
||||
qe_modules)
|
||||
|
||||
###########################################################
|
||||
# simple_bse.x
|
||||
###########################################################
|
||||
set(src_simple_bse_x simple_bse/simple_bse.f90)
|
||||
qe_add_executable(qe_gww_simple_bse_exe ${src_simple_bse_x})
|
||||
set_target_properties(qe_gww_simple_bse_exe
|
||||
PROPERTIES
|
||||
set_target_properties(qe_gww_simple_bse_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME simple_bse.x)
|
||||
target_link_libraries(qe_gww_simple_bse_exe
|
||||
PRIVATE
|
||||
|
@ -356,15 +376,32 @@ target_link_libraries(qe_gww_simple_bse_exe
|
|||
# simple_ip.x
|
||||
################################################################
|
||||
set(src_simple_ip_x simple_ip/simple_ip.f90)
|
||||
qe_add_executable(qe_gww_simple_ip_exe ${src_simple_ip_x})
|
||||
set_target_properties(qe_gww_simple_ip_exe
|
||||
PROPERTIES
|
||||
qe_add_executable(qe_gww_simple_ip_exe ${src_simple_ip_x})
|
||||
set_target_properties(qe_gww_simple_ip_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME simple_ip.x)
|
||||
target_link_libraries(qe_gww_simple_ip_exe
|
||||
PRIVATE
|
||||
qe_gww_simpleip
|
||||
qe_modules)
|
||||
|
||||
################################################################
|
||||
# pw4gww.x
|
||||
################################################################
|
||||
set(src_pw4gww_pw4gww_x pw4gww/pw4gww.f90)
|
||||
qe_add_executable(qe_gww_pw4gww_exe ${src_pw4gww_pw4gww_x})
|
||||
set_target_properties(qe_gww_pw4gww_exe
|
||||
PROPERTIES
|
||||
OUTPUT_NAME pw4gww.x)
|
||||
target_link_libraries(qe_gww_pw4gww_exe
|
||||
PRIVATE
|
||||
qe_gww_pw4gww
|
||||
qe_phonon_gamma
|
||||
qe_fftx
|
||||
qe_xclib
|
||||
qe_pw
|
||||
qe_modules)
|
||||
|
||||
###########################################################
|
||||
|
||||
qe_install_targets(
|
||||
|
@ -378,6 +415,7 @@ qe_install_targets(
|
|||
qe_gww_simplebse
|
||||
qe_gww_simpleip
|
||||
# Executables
|
||||
qe_gww_pw4gww_exe
|
||||
qe_gww_util_grap_exe
|
||||
qe_gww_util_abcoefftoeps_exe
|
||||
qe_gww_util_memorypw4gww_exe
|
||||
|
@ -385,5 +423,6 @@ qe_install_targets(
|
|||
qe_gww_gww_exe
|
||||
qe_gww_gww_fit_exe
|
||||
qe_gww_head_exe
|
||||
qe_gww_simple_exe
|
||||
qe_gww_simple_bse_exe
|
||||
qe_gww_simple_ip_exe)
|
||||
|
|
|
@ -45,10 +45,8 @@ qpcorrections.o
|
|||
#zvscal_test.o\
|
||||
|
||||
|
||||
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
|
||||
../../upflib/libupf.a ../../KS_Solvers/libks_solvers.a \
|
||||
../../LAXlib/libqela.a ../../UtilXlib/libutil.a ../../XClib/xc_lib.a
|
||||
PWOBJS = ../../PW/src/libpw.a
|
||||
QEMODS = $(BASEMODS)
|
||||
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a
|
||||
PW4GWWOBJ = ../pw4gww/fft_custom.o ../pw4gww/stop_pp.o ../pw4gww/mp_wave_parallel.o
|
||||
GWWOBJ = ../gww/libgww.a ../minpack/minpacklib.a
|
||||
|
||||
|
@ -58,7 +56,7 @@ all : tldeps bse_main.x
|
|||
|
||||
bse_main.x : bse_main.o libbse.a $(BSEOBJS) $(PWOBJS) $(QEMODS) $(GWWOBJ)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
bse_main.o libbse.a $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS) $(PW4GWWOBJ) $(GWWOBJ)
|
||||
bse_main.o libbse.a $(PWOBJS) $(QEMODS) $(QELIBS) $(PW4GWWOBJ) $(GWWOBJ)
|
||||
- ( cd ../../bin ; ln -fs ../GWW/bse/$@ . )
|
||||
|
||||
tldeps:
|
||||
|
|
|
@ -43,9 +43,7 @@ times_gw.o \
|
|||
vcprim.o
|
||||
|
||||
|
||||
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
|
||||
../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
|
||||
../../LAXlib/libqela.a ../../UtilXlib/libutil.a
|
||||
QEMODS = ../../KS_Solvers/libks_solvers.a $(BASEMODS)
|
||||
|
||||
LIBMIN= ../minpack/minpacklib.a
|
||||
|
||||
|
@ -53,14 +51,14 @@ TLDEPS=phlibs
|
|||
|
||||
all : tldeps gww.x gww_fit.x libgww.a
|
||||
|
||||
gww.x : gww.o $(GWWOBJS) $(LIBOBJS) $(QEMODS) $(LIBMIN)
|
||||
gww.x : gww.o $(GWWOBJS) $(QEMODS) $(LIBMIN)
|
||||
$(MPIF90) $(LDFLAGS) -o $@ \
|
||||
gww.o $(GWWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
|
||||
gww.o $(GWWOBJS) $(QEMODS) $(LIBMIN) $(QELIBS)
|
||||
- ( cd ../../bin; ln -fs ../GWW/gww/$@ . )
|
||||
|
||||
gww_fit.x : gww_fit.o $(GWWOBJS) $(LIBOBJS) $(QEMODS) $(LIBMIN)
|
||||
gww_fit.x : gww_fit.o $(GWWOBJS) $(QEMODS) $(LIBMIN)
|
||||
$(MPIF90) $(LDFLAGS) -o $@ \
|
||||
gww_fit.o $(GWWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
|
||||
gww_fit.o $(GWWOBJS) $(QEMODS) $(LIBMIN) $(QELIBS)
|
||||
- ( cd ../../bin; ln -fs ../GWW/gww/$@ . )
|
||||
|
||||
libgww.a : $(GWWOBJS)
|
||||
|
|
|
@ -376,8 +376,8 @@ subroutine do_polarization_lanczos(tf,options,ispin)
|
|||
call read_compact_q_lanczos(cql, 1)!just for obtaining numpw,numt...poor man solution
|
||||
allocate(cql_save(1,1,1))
|
||||
endif
|
||||
|
||||
|
||||
allocate(vtl_save(1,1,1))
|
||||
allocate(ttl_save(1,1,1))
|
||||
|
||||
else
|
||||
!put all matrices vtl and ttl in memory, distributed according to valence state
|
||||
|
@ -581,9 +581,9 @@ subroutine do_polarization_lanczos(tf,options,ispin)
|
|||
if(options%l_t_wannier) then
|
||||
deallocate(cql_save)
|
||||
call free_memory_compact_q_lanczos(cql)
|
||||
else
|
||||
deallocate(vtl_save,ttl_save)
|
||||
endif
|
||||
deallocate(vtl_save,ttl_save)
|
||||
|
||||
|
||||
! deallocate(e_mat)
|
||||
deallocate(af,occ)
|
||||
|
|
|
@ -19,11 +19,10 @@ openfilq.o \
|
|||
phq_readin.o \
|
||||
solve_head.o
|
||||
|
||||
QEMODS = ../../Modules/libqemod.a ../../upflib/libupf.a \
|
||||
../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
|
||||
../../FFTXlib/libqefft.a ../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
|
||||
../../dft-d3/libdftd3qe.a
|
||||
LIBPWPH = ../../PHonon/PH/libph.a ../..//LR_Modules/liblrmod.a ../../PW/src/libpw.a
|
||||
QEMODS = $(BASEMODS)
|
||||
LIBPWPH = ../../PHonon/PH/libph.a ../../LR_Modules/liblrmod.a \
|
||||
../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
|
||||
|
||||
|
||||
LIBMIN= ../minpack/minpacklib.a
|
||||
|
||||
|
@ -31,9 +30,9 @@ TLDEPS= phlibs
|
|||
|
||||
all : tldeps head.x
|
||||
|
||||
head.x : head.o $(LIBOBJS) $(HEADOBJS) $(LIBPWPH) $(LIBMIN)
|
||||
head.x : head.o $(HEADOBJS) $(LIBPWPH) $(LIBMIN)
|
||||
$(LD) $(LDFLAGS) -o head.x head.o \
|
||||
$(HEADOBJS) $(LIBPWPH) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
|
||||
$(HEADOBJS) $(LIBPWPH) $(QEMODS) $(LIBMIN) $(QELIBS)
|
||||
- ( cd ../../bin ; ln -fs ../GWW/head/head.x . )
|
||||
|
||||
tldeps :
|
||||
|
|
|
@ -36,7 +36,8 @@ subroutine bcast_ph_input ( )
|
|||
USE io_global, ONLY : ionode_id
|
||||
USE run_info, ONLY : title
|
||||
USE wannier_gw, ONLY : l_head, omega_gauss, n_gauss, grid_type, nsteps_lanczos,&
|
||||
&second_grid_n,second_grid_i,l_scissor,scissor,len_head_block_freq,len_head_block_wfc
|
||||
&second_grid_n,second_grid_i,l_scissor,scissor,len_head_block_freq,&
|
||||
l_easy,len_head_block_wfc
|
||||
|
||||
USE control_lr, ONLY : lgamma, lrpa
|
||||
|
||||
|
@ -106,6 +107,7 @@ subroutine bcast_ph_input ( )
|
|||
call mp_bcast(scissor, ionode_id, world_comm)
|
||||
call mp_bcast(len_head_block_freq, ionode_id, world_comm)
|
||||
call mp_bcast(len_head_block_wfc, ionode_id,world_comm)
|
||||
call mp_bcast(l_easy , ionode_id,world_comm)
|
||||
|
||||
#endif
|
||||
return
|
||||
|
|
|
@ -18,9 +18,9 @@ SUBROUTINE close_phq( flag )
|
|||
!
|
||||
USE io_global, ONLY : ionode, stdout
|
||||
USE uspp, ONLY : okvan
|
||||
USE units_ph, ONLY : iudwf, iubar, iudrhous, iuebar, iudrho, &
|
||||
USE units_ph, ONLY : iubar, iudrhous, iuebar, iudrho, &
|
||||
iudvscf, iucom, iudvkb3
|
||||
USE units_lr, ONLY : iuwfc
|
||||
USE units_lr, ONLY : iuwfc, iudwf
|
||||
USE control_ph, ONLY : zue, epsil
|
||||
USE recover_mod, ONLY : clean_recover
|
||||
USE output, ONLY : fildrho, fildvscf
|
||||
|
|
|
@ -149,13 +149,19 @@ PROGRAM head
|
|||
!
|
||||
! ... cleanup of the variables for the next q point
|
||||
!
|
||||
write(stdout,*) 'DEBUG 1'
|
||||
CALL clean_pw_ph(iq)
|
||||
write(stdout,*) 'DEBUG 2'
|
||||
!
|
||||
END DO
|
||||
|
||||
CALL ph_writefile('init',0,0,ierr)
|
||||
write(stdout,*) 'DEBUG 3'
|
||||
CALL ph_writefile('init',0,0,ierr)
|
||||
write(stdout,*) 'DEBUG 4'
|
||||
CALL collect_grid_files()
|
||||
write(stdout,*) 'DEBUG 5'
|
||||
CALL destroy_status_run()
|
||||
write(stdout,*) 'DEBUG 6'
|
||||
!
|
||||
IF (bands_computed) CALL print_clock_pw()
|
||||
!
|
||||
|
|
|
@ -28,6 +28,7 @@ subroutine lanczos_state_k(ik,nstates, nsteps,in_states,d,f,omat,dpsi_ipol, t_ou
|
|||
USE uspp, ONLY : vkb, nkb, okvan
|
||||
USE klist, ONLY : xk,igk_k, ngk
|
||||
USE noncollin_module, ONLY : noncolin, npol
|
||||
USE wvfct, ONLY : current_k
|
||||
|
||||
implicit none
|
||||
|
||||
|
@ -59,7 +60,9 @@ subroutine lanczos_state_k(ik,nstates, nsteps,in_states,d,f,omat,dpsi_ipol, t_ou
|
|||
allocate(alpha(nstates),beta(nstates),gamma(nstates),n_1(nstates),delta(nstates))
|
||||
allocate(c(nstates))
|
||||
allocate(spsi(npwx,nstates))
|
||||
|
||||
|
||||
current_k=ik
|
||||
|
||||
npw = ngk(ik)
|
||||
t_out(:,:,:)=(0.d0,0.d0)
|
||||
|
||||
|
|
|
@ -93,20 +93,28 @@ solve_head.o : ../../LR_Modules/lrcom.o
|
|||
solve_head.o : ../../Modules/becmod.o
|
||||
solve_head.o : ../../Modules/cell_base.o
|
||||
solve_head.o : ../../Modules/constants.o
|
||||
solve_head.o : ../../Modules/control_flags.o
|
||||
solve_head.o : ../../Modules/fft_base.o
|
||||
solve_head.o : ../../Modules/io_base.o
|
||||
solve_head.o : ../../Modules/io_files.o
|
||||
solve_head.o : ../../Modules/io_global.o
|
||||
solve_head.o : ../../Modules/ions_base.o
|
||||
solve_head.o : ../../Modules/kind.o
|
||||
solve_head.o : ../../Modules/mp_bands.o
|
||||
solve_head.o : ../../Modules/mp_images.o
|
||||
solve_head.o : ../../Modules/mp_pools.o
|
||||
solve_head.o : ../../Modules/mp_wave.o
|
||||
solve_head.o : ../../Modules/mp_world.o
|
||||
solve_head.o : ../../Modules/noncol.o
|
||||
solve_head.o : ../../Modules/recvec.o
|
||||
solve_head.o : ../../Modules/wannier_gw.o
|
||||
solve_head.o : ../../Modules/wavefunctions.o
|
||||
solve_head.o : ../../PHonon/PH/phcom.o
|
||||
solve_head.o : ../../PW/src/buffers.o
|
||||
solve_head.o : ../../PW/src/ldaU.o
|
||||
solve_head.o : ../../PW/src/pwcom.o
|
||||
solve_head.o : ../../PW/src/scf_mod.o
|
||||
solve_head.o : ../../PW/src/symme.o
|
||||
solve_head.o : ../../UtilXlib/mp.o
|
||||
solve_head.o : ../../upflib/paw_variables.o
|
||||
solve_head.o : ../../upflib/uspp.o
|
||||
|
|
|
@ -17,11 +17,11 @@ SUBROUTINE openfilq()
|
|||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE control_flags, ONLY : io_level, modenum
|
||||
USE units_ph, ONLY : iudwf, iubar, iucom, iudvkb3, &
|
||||
USE units_ph, ONLY : iubar, iucom, iudvkb3, &
|
||||
iudrhous, iuebar, iudrho, iudyn, iudvscf, &
|
||||
lrdwf, lrbar, lrcom, lrdvkb3, &
|
||||
lrbar, lrcom, lrdvkb3, &
|
||||
lrdrhous, lrebar, lrdrho
|
||||
USE units_lr, ONLY : iuwfc, lrwfc
|
||||
USE units_lr, ONLY : iuwfc, lrwfc, iudwf, lrdwf
|
||||
! USE io_files, ONLY : tmp_dir
|
||||
USE control_ph, ONLY : epsil, zue, ext_recover, trans, &
|
||||
tmp_dir_ph, start_irr, last_irr
|
||||
|
|
|
@ -79,7 +79,7 @@ SUBROUTINE phq_readin()
|
|||
USE ahc, ONLY : elph_ahc, ahc_dir, ahc_nbnd, ahc_nbndskip, &
|
||||
skip_upperfan
|
||||
USE wannier_gw, ONLY : l_head, omega_gauss, n_gauss, grid_type, nsteps_lanczos,second_grid_n,second_grid_i,&
|
||||
&l_scissor,scissor, len_head_block_freq, len_head_block_wfc
|
||||
&l_scissor,scissor, len_head_block_freq, len_head_block_wfc, l_easy
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -128,7 +128,7 @@ SUBROUTINE phq_readin()
|
|||
wpot_dir, ahc_dir, ahc_nbnd, ahc_nbndskip, &
|
||||
skip_upperfan, &
|
||||
l_head, omega_gauss, n_gauss, grid_type,nsteps_lanczos,l_scissor,scissor,&
|
||||
second_grid_n,second_grid_i,len_head_block_wfc,len_head_block_freq
|
||||
second_grid_n,second_grid_i,len_head_block_wfc,len_head_block_freq, l_easy
|
||||
|
||||
! tr2_ph : convergence threshold
|
||||
! amass : atomic masses
|
||||
|
@ -348,6 +348,8 @@ SUBROUTINE phq_readin()
|
|||
d2ns_type = 'full'
|
||||
len_head_block_freq=0
|
||||
len_head_block_wfc=0
|
||||
|
||||
l_easy=.false.
|
||||
!
|
||||
! ... reading the namelist inputph
|
||||
!
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
!
|
||||
! Copyright (C) 2001-2013 Quantum ESPRESSO group
|
||||
! Copyright (C) 2001-2021 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
|
@ -27,7 +27,7 @@ subroutine solve_head
|
|||
USE wannier_gw, ONLY : n_gauss, omega_gauss, grid_type,&
|
||||
nsteps_lanczos,second_grid_n,second_grid_i,&
|
||||
l_scissor,scissor,len_head_block_freq, &
|
||||
len_head_block_wfc
|
||||
len_head_block_wfc, l_easy
|
||||
USE control_ph, ONLY : tr2_ph
|
||||
USE gvect, ONLY : ngm, ngm_g, ig_l2g, gstart, g
|
||||
USE gvecs, ONLY : doublegrid
|
||||
|
@ -45,6 +45,7 @@ subroutine solve_head
|
|||
|
||||
use qpoint, ONLY : npwq, nksq
|
||||
use control_lr, ONLY : nbnd_occ, lgamma
|
||||
use SCF
|
||||
|
||||
implicit none
|
||||
|
||||
|
@ -92,6 +93,7 @@ subroutine solve_head
|
|||
COMPLEX(kind=DP), ALLOCATABLE :: z_dl(:),z_d(:),z_du(:),z_b(:)
|
||||
COMPLEX(kind=DP) :: csca, csca1
|
||||
COMPLEX(kind=DP), ALLOCATABLE :: t_out(:,:,:), psi_tmp(:)
|
||||
TYPE(scf_type) :: wing
|
||||
INTEGER :: n
|
||||
INTEGER :: npwx_g
|
||||
|
||||
|
@ -144,6 +146,9 @@ subroutine solve_head
|
|||
do i=1,n_gauss
|
||||
freqs(1+i)=omega_gauss*dble(i)/dble(n_gauss)
|
||||
enddo
|
||||
if(l_easy) then
|
||||
freqs(1)=omega_gauss/dble(n_gauss)/4.d0
|
||||
endif
|
||||
else if(grid_type==4) then!equally spaced grid shifted of 1/2
|
||||
freqs(1) = 0.d0
|
||||
do i=1,n_gauss
|
||||
|
@ -471,18 +476,19 @@ subroutine solve_head
|
|||
#else
|
||||
call syme (pola_charge(:,:,:,i))
|
||||
#endif
|
||||
|
||||
call create_scf_type ( wing, .true. )
|
||||
do ipol=1,3
|
||||
CALL fwfft ('Rho', pola_charge(1:dfftp%nnr,1,ipol,i), dfftp)
|
||||
tmp_g(:)=(0.d0,0.d0)
|
||||
tmp_g(gstart:ngm)=pola_charge(dfftp%nl(gstart:ngm),1,ipol,i)
|
||||
|
||||
wing%of_g(1:ngm,1)=-4.d0*tmp_g(1:ngm)
|
||||
call write_wing ( wing, nspin,ipol,i)
|
||||
!loop on frequency
|
||||
do ig=gstart,ngm
|
||||
e_head_pol(ig,i,ipol)=-4.d0*tmp_g(ig)
|
||||
enddo
|
||||
enddo
|
||||
|
||||
call destroy_scf_type (wing )
|
||||
|
||||
enddo
|
||||
|
||||
|
@ -568,9 +574,74 @@ subroutine solve_head
|
|||
|
||||
|
||||
call mp_barrier( world_comm )
|
||||
write(stdout,*) 'ATT2'
|
||||
write(stdout,*) 'THIS IS THE END'
|
||||
|
||||
call stop_clock ('solve_head')
|
||||
return
|
||||
end subroutine solve_head
|
||||
|
||||
|
||||
|
||||
SUBROUTINE write_wing ( rho, nspin,ipol,iw)
|
||||
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
USE io_files, ONLY : create_directory
|
||||
USE io_base, ONLY : write_rhog, read_rhog
|
||||
!
|
||||
USE paw_variables, ONLY : okpaw
|
||||
USE ldaU, ONLY : lda_plus_u
|
||||
USE noncollin_module, ONLY : noncolin
|
||||
USE spin_orb, ONLY : domag
|
||||
USE scf, ONLY : scf_type
|
||||
!
|
||||
USE cell_base, ONLY : bg, tpiba
|
||||
USE gvect, ONLY : ig_l2g, mill
|
||||
USE control_flags, ONLY : gamma_only
|
||||
USE io_files, ONLY : seqopn, tmp_dir, prefix, postfix
|
||||
USE io_global, ONLY : ionode, ionode_id, stdout
|
||||
USE mp_pools, ONLY : my_pool_id
|
||||
USE mp_bands, ONLY : my_bgrp_id, root_bgrp_id, &
|
||||
root_bgrp, intra_bgrp_comm
|
||||
USE mp_images, ONLY : intra_image_comm
|
||||
USE mp, ONLY : mp_bcast
|
||||
|
||||
!
|
||||
IMPLICIT NONE
|
||||
TYPE(scf_type), INTENT(IN) :: rho
|
||||
INTEGER, INTENT(IN) :: nspin
|
||||
INTEGER, INTENT(IN) :: ipol!direction
|
||||
INTEGER, INTENT(IN) :: iw !frequency
|
||||
!
|
||||
CHARACTER (LEN=256) :: dirname
|
||||
LOGICAL :: lexist
|
||||
INTEGER :: nspin_, iunocc, iunpaw, ierr
|
||||
INTEGER, EXTERNAL :: find_free_unit
|
||||
|
||||
CHARACTER(5) :: nfile
|
||||
CHARACTER :: npol
|
||||
|
||||
write(nfile,'(5i1)') &
|
||||
& iw/10000,mod(iw,10000)/1000,mod(iw,1000)/100,mod(iw,100)/10,mod(iw,10)
|
||||
write(npol,'(1i1)') ipol
|
||||
|
||||
dirname = TRIM(tmp_dir) // TRIM(prefix) // postfix
|
||||
CALL create_directory( dirname )
|
||||
! in the following case do not read or write polarization
|
||||
IF ( noncolin .AND. .NOT.domag ) THEN
|
||||
nspin_ = 1
|
||||
ELSE
|
||||
nspin_ = nspin
|
||||
ENDIF
|
||||
! Write G-space density
|
||||
IF ( my_pool_id == 0 .AND. my_bgrp_id == root_bgrp_id ) &
|
||||
CALL write_rhog( TRIM(dirname) // "wing_" // npol // "_" //nfile, &
|
||||
root_bgrp, intra_bgrp_comm, &
|
||||
bg(:,1)*tpiba, bg(:,2)*tpiba, bg(:,3)*tpiba, &
|
||||
gamma_only, mill, ig_l2g, rho%of_g(:,1:nspin_) )
|
||||
|
||||
|
||||
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE write_wing
|
||||
|
|
|
@ -47,25 +47,22 @@ hpsi_pw4gww.o \
|
|||
cgsolve_all_gamma.o \
|
||||
realus.o \
|
||||
operator_1_vp.o \
|
||||
operator_debug.o
|
||||
operator_debug.o \
|
||||
lanczos_chains.o \
|
||||
convergence.o \
|
||||
easy_gw.o
|
||||
|
||||
QEMODS = ../../Modules/libqemod.a ../../upflib/libupf.a \
|
||||
../../KS_Solvers/libks_solvers.a ../../XClib/xc_lib.a \
|
||||
../../FFTXlib/libqefft.a ../../LAXlib/libqela.a \
|
||||
../../UtilXlib/libutil.a ../../dft-d3/libdftd3qe.a
|
||||
# dft-d3 required by xlf for obscure reasons
|
||||
PWOBJS = ../../PW/src/libpw.a
|
||||
PHOBJS = ../../PHonon/Gamma/libphcg.a
|
||||
|
||||
LIBMIN=
|
||||
QEMODS = $(BASEMODS)
|
||||
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
|
||||
PHOBJS = ../../PHonon/Gamma/libphcg.a ../../LR_Modules/liblrmod.a
|
||||
|
||||
TLDEPS= phlibs
|
||||
|
||||
all : tldeps pw4gww.x
|
||||
|
||||
pw4gww.x : pw4gww.o libpw4gww.a $(PW4GWWOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN)
|
||||
pw4gww.x : pw4gww.o libpw4gww.a $(PW4GWWOBJS) $(PWOBJS) $(QEMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
pw4gww.o libpw4gww.a $(PWOBJS) $(PHOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
|
||||
pw4gww.o libpw4gww.a $(PHOBJS) $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../bin ; ln -fs ../GWW/pw4gww/$@ . )
|
||||
|
||||
tldeps :
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,270 @@
|
|||
!this subroutine performes a GW calculation with EASY strategy
|
||||
!for states from s_first_state to s_last_state
|
||||
|
||||
|
||||
SUBROUTINE easy_gw
|
||||
|
||||
|
||||
USE wannier_gw
|
||||
USE wvfct, ONLY : npw
|
||||
USE io_global, ONLY : ionode, stdout
|
||||
USE mp, ONLY : mp_barrier, mp_bcast,mp_sum
|
||||
USE mp_world, ONLY : world_comm, mpime
|
||||
USE io_files, ONLY : prefix, tmp_dir, nwordwfc,iunwfc
|
||||
USE fft_base, ONLY : dffts
|
||||
USE convergence_gw
|
||||
USE wvfct, ONLY : nbnd, et, npwx
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE constants, ONLY : rytoev
|
||||
USE lsda_mod, ONLY : lsda, nspin,current_spin,isk
|
||||
USE io_files, ONLY : create_directory
|
||||
|
||||
IMPLICIT NONE
|
||||
|
||||
REAL(kind=DP), ALLOCATABLE :: v_states(:,:,:)!valence states in real space
|
||||
INTEGER :: ii,iw,jj,kk
|
||||
TYPE(convergence_tests) :: ct!all data for testing convergence of basis sets
|
||||
TYPE(self_energy) :: se!for putting all the calculated data
|
||||
COMPLEX(kind=DP), ALLOCATABLE :: freq(:)
|
||||
REAL(kind=DP), ALLOCATABLE :: e_xc(:,:),e_h(:,:),e_x(:,:)
|
||||
COMPLEX(kind=DP), ALLOCATABLE :: ks_wfcs(:,:,:)
|
||||
INTEGER, EXTERNAL :: find_free_unit
|
||||
INTEGER :: iun
|
||||
INTEGER :: ix,iy,iz,ir,im
|
||||
INTEGER :: passed
|
||||
INTEGER :: nr_counter
|
||||
INTEGER :: ix_start,iy_start,iz_start
|
||||
REAL(kind=DP) :: disterr,mindisterr,disterrmin
|
||||
INTEGER :: is
|
||||
INTEGER :: i_restart,i_res
|
||||
INTEGER :: n_total, i_first, i_last, n_local, i_stop
|
||||
CHARACTER(4) :: nfilex,nfiley,nfilez,nfile_orb
|
||||
CHARACTER(5) :: nfilel2
|
||||
LOGICAL :: lex, l_old_restart
|
||||
|
||||
INTERFACE
|
||||
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin, v_states )
|
||||
USE kinds, ONLY : DP
|
||||
USE fft_base, ONLY : dffts
|
||||
USE lsda_mod, ONLY : nspin
|
||||
INTEGER :: lda, n, m
|
||||
COMPLEX(kind=DP) :: psi(lda,m)
|
||||
REAL(kind=DP) :: e_xc(m), e_h(m)
|
||||
INTEGER, INTENT(in) :: ispin !spin 1,2
|
||||
REAL(kind=DP), OPTIONAL :: v_states(dffts%nnr,m, nspin)
|
||||
END SUBROUTINE energies_xc
|
||||
END INTERFACE
|
||||
|
||||
|
||||
call start_clock('easy_gw')
|
||||
|
||||
|
||||
allocate(v_states(dffts%nnr,num_nbnds,nspin))
|
||||
if(.not.l_truncated_coulomb) call calculate_vg0()
|
||||
if(nspin==2) then
|
||||
CALL davcio(evc,2*nwordwfc,iunwfc,2,-1)
|
||||
call evc_to_real(num_nbnds, v_states(1,1,2))
|
||||
CALL davcio(evc,2*nwordwfc,iunwfc,1,-1)
|
||||
endif
|
||||
call evc_to_real(num_nbnds, v_states(1,1,1))
|
||||
|
||||
allocate(e_xc(nbnd,nspin),e_h(nbnd,nspin),e_x(nbnd,nspin))
|
||||
allocate(ks_wfcs(npw,nbnd,nspin))
|
||||
do is=1,nspin
|
||||
IF (lsda) current_spin = isk(is)
|
||||
if(nspin/=1) CALL davcio(evc,2*nwordwfc,iunwfc,is,-1)!read wfcs for
|
||||
call energies_xc( npwx, npw, nbnd, evc, e_xc(:,is),e_h(:,is),is ,v_states)
|
||||
ks_wfcs(1:npw,1:nbnd,is)=evc(1:npw,1:nbnd)
|
||||
enddo
|
||||
CALL dft_exchange(num_nbndv,nbnd,nset,e_x,ks_wfcs)
|
||||
|
||||
|
||||
|
||||
allocate(freq(n_gauss))
|
||||
do iw=1,n_gauss
|
||||
freq(iw)=(0.d0,1.d0)*(omega_gauss/dble(n_gauss)*(dble(iw)) -omega_gauss)
|
||||
if(abs(aimag(freq(iw))) < 1d-10) freq(iw)=0.d0
|
||||
enddo
|
||||
|
||||
call initialize_memory(se)
|
||||
call set_se_energies(se, et,e_xc,e_x)
|
||||
nr_counter=1
|
||||
if(easy_grid_type==0) then
|
||||
do is=s_first_spin,s_last_spin
|
||||
do ii=s_first_state,s_last_state
|
||||
|
||||
call start_convergence(ct,ii,is,v_states,.true.,0,0,0,n_gauss,freq,ks_wfcs)
|
||||
call calculate_convergence(ct,v_states,se,nr_counter)
|
||||
call free_memory(ct)
|
||||
|
||||
enddo
|
||||
enddo
|
||||
elseif(easy_grid_type==1) then
|
||||
!create directories
|
||||
do is=1,nspin
|
||||
do ii=s_first_state,s_last_state
|
||||
write(nfile_orb,'(4i1)') &
|
||||
& ii/1000,mod(ii,1000)/100,mod(ii,100)/10, mod(ii,10)
|
||||
if(is==1) then
|
||||
call create_directory(trim(prefix)//'-gwl_orbital_1_'//nfile_orb)
|
||||
else
|
||||
call create_directory(trim(prefix)//'-gwl_orbital_2_'//nfile_orb)
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
!calculate total number of points
|
||||
ix_start=1+easy_grid_param(1)
|
||||
iy_start=1+easy_grid_param(2)
|
||||
iz_start=1+easy_grid_param(3)
|
||||
|
||||
n_total=0
|
||||
do iz=iz_start,dffts%nr3,easy_grid_param(4)
|
||||
do iy=iy_start,dffts%nr2,easy_grid_param(4)
|
||||
do ix=ix_start,dffts%nr1,easy_grid_param(4)
|
||||
passed=0
|
||||
if(ix<easy_grid_param(5) .and. iy<easy_grid_param(5) .and. iz<easy_grid_param(5)) then
|
||||
if( iz>dffts%nr3p_offset(mpime+1) .and. iz <=( dffts%nr3p_offset(mpime+1)+dffts%my_nr3p)) then
|
||||
ii=(iz-dffts%nr3p_offset(mpime+1)-1)*dffts%nr2*dffts%nr1+&
|
||||
(iy-1)*dffts%nr1+ix
|
||||
do is=s_first_spin,s_last_spin
|
||||
do jj=s_first_state,s_last_state
|
||||
if(abs(v_states(ii,jj,is))> easy_psi_thrs) passed=1
|
||||
enddo
|
||||
enddo
|
||||
endif
|
||||
endif
|
||||
call mp_sum(passed,world_comm)
|
||||
if(passed>0) then
|
||||
n_total=n_total+1
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
write(stdout,*) 'TOTAL NUMBER OF POINTS:', n_total
|
||||
n_local=n_total/easy_split_calc_n
|
||||
if(n_local*easy_split_calc_n< n_total) n_local=n_local+1
|
||||
i_first=(easy_split_calc_i-1)*n_local+1
|
||||
i_last=i_first+n_local-1
|
||||
if(i_last>n_total) i_last=n_total
|
||||
|
||||
if(restart_gww>10000) then
|
||||
l_old_restart=.false.
|
||||
restart_gww=0
|
||||
else
|
||||
l_old_restart=.true.
|
||||
endif
|
||||
|
||||
if(restart_gww>1) then
|
||||
i_restart=restart_gww+i_first
|
||||
write(stdout,*) 'RESTARTING FROM POINT:', i_restart
|
||||
else
|
||||
i_restart=i_first
|
||||
endif
|
||||
i_res=0
|
||||
i_stop=i_last
|
||||
|
||||
write(stdout,*) 'DOING POINTS RANGE', i_restart,i_stop
|
||||
ix_start=1+easy_grid_param(1)
|
||||
iy_start=1+easy_grid_param(2)
|
||||
iz_start=1+easy_grid_param(3)
|
||||
|
||||
do iz=iz_start,dffts%nr3,easy_grid_param(4)
|
||||
do iy=iy_start,dffts%nr2,easy_grid_param(4)
|
||||
do ix=ix_start,dffts%nr1,easy_grid_param(4)
|
||||
write(stdout,*) 'COORDINATES:',ix,iy,iz
|
||||
passed=0
|
||||
!write(stdout,*) 'OFFSET', dffts%nr3p_offset(mpime+1),mpime
|
||||
!NOTE OFFSET and MPIME start from 0
|
||||
if(ix<easy_grid_param(5) .and. iy<easy_grid_param(5) .and. iz<easy_grid_param(5)) then
|
||||
if( iz>dffts%nr3p_offset(mpime+1) .and. iz <=( dffts%nr3p_offset(mpime+1)+dffts%my_nr3p)) then
|
||||
ii=(iz-dffts%nr3p_offset(mpime+1)-1)*dffts%nr2*dffts%nr1+&
|
||||
(iy-1)*dffts%nr1+ix
|
||||
do is=s_first_spin,s_last_spin
|
||||
do jj=s_first_state,s_last_state
|
||||
if(abs(v_states(ii,jj,is))> easy_psi_thrs) passed=1
|
||||
enddo
|
||||
enddo
|
||||
end if
|
||||
endif
|
||||
call mp_sum(passed,world_comm)
|
||||
write(stdout,*) 'PASSED', passed
|
||||
if(passed>0) then
|
||||
i_res=i_res+1
|
||||
if(l_old_restart) then
|
||||
if(i_res>=i_restart) then
|
||||
call start_convergence(ct,0,0,v_states,.false.,ix,iy,iz,n_gauss,freq,ks_wfcs)
|
||||
call calculate_convergence(ct,v_states,se,nr_counter)
|
||||
call free_memory(ct)
|
||||
endif
|
||||
else
|
||||
kk=s_first_state
|
||||
write(nfilel2,'(5i1)') &
|
||||
& kk/10000,mod(kk,10000)/1000,mod(kk,1000)/100,mod(kk,100)/10,mod(kk,10)
|
||||
|
||||
write(nfilex,'(4i1)') &
|
||||
& ix/1000,mod(ix,1000)/100,mod(ix,100)/10, mod(ix,10)
|
||||
write(nfiley,'(4i1)') &
|
||||
& iy/1000,mod(iy,1000)/100,mod(iy,100)/10, mod(iy,10)
|
||||
write(nfilez,'(4i1)') &
|
||||
& iz/1000,mod(iz,1000)/100,mod(iz,100)/10, mod(iz,10)
|
||||
!call create_directory(trim(prefix)//'-gwl_orbital_1_'//nfile_orb)
|
||||
write(nfile_orb,'(4i1)') &
|
||||
& kk/1000,mod(kk,1000)/100,mod(kk,100)/10, mod(kk,10)
|
||||
|
||||
inquire( file=trim(prefix)//'-gwl_orbital_1_'//nfile_orb//'/im_on_im_r_'//nfilex//'_'//nfiley&
|
||||
&//'_'//nfilez//'__'//nfilel2,exist=lex)
|
||||
! inquire( file=trim(prefix)//'-'//'im_on_im_r_'//nfilex//'_'//nfiley&
|
||||
! &//'_'//nfilez//'__'//nfilel2,exist=lex)
|
||||
if(.not.lex) then
|
||||
call start_convergence(ct,0,0,v_states,.false.,ix,iy,iz,n_gauss,freq,ks_wfcs)
|
||||
call calculate_convergence(ct,v_states,se,nr_counter)
|
||||
call free_memory(ct)
|
||||
endif
|
||||
endif
|
||||
if(i_res==i_stop) then
|
||||
exit
|
||||
end if
|
||||
endif
|
||||
|
||||
enddo
|
||||
if(i_res==i_stop) then
|
||||
exit
|
||||
end if
|
||||
enddo
|
||||
if(i_res==i_stop) then
|
||||
exit
|
||||
end if
|
||||
enddo
|
||||
end if
|
||||
|
||||
|
||||
!if(easy_grid_type==1.or.easy_grid_type==2) call average_self_energy(se)
|
||||
!if(easy_grid_type==0 .and. l_whole_s) call solve_off_diagonal(se)
|
||||
if(ionode .and. (easy_grid_type==0 .or. i_restart==1)) then
|
||||
iun = find_free_unit()
|
||||
open( unit=iun, file=trim(prefix)//'-'//'bands.dat', status='unknown')
|
||||
do is=1,nspin
|
||||
write(iun,*) num_nbndv(is)
|
||||
do ii=1,num_nbnds
|
||||
write(iun,*) ii,et(ii,is)*rytoev,0.d0,0.d0,(et(ii,is)-e_xc(ii,is)+e_x(ii,is))*rytoev
|
||||
enddo
|
||||
enddo
|
||||
close(iun)
|
||||
endif
|
||||
|
||||
call stop_clock('easy_gw')
|
||||
|
||||
call print_clock('easy_gw')
|
||||
call print_clock('gzero_complex')
|
||||
call print_clock('vpv_complex')
|
||||
call print_clock('vpv_lanczos')
|
||||
call print_clock('fft')
|
||||
call print_clock('ffts')
|
||||
call print_clock('fftw')
|
||||
|
||||
|
||||
call free_memory(se)
|
||||
deallocate(e_xc,e_h,e_x)
|
||||
deallocate(v_states,ks_wfcs)
|
||||
return
|
||||
END SUBROUTINE easy_gw
|
|
@ -9,7 +9,7 @@
|
|||
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
|
||||
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin, v_states )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! computes the expectation values of the exchange and correlation potential
|
||||
|
@ -45,6 +45,7 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
|
|||
USE exx, ONLY : vexx !Suriano
|
||||
USE xc_lib, ONLY : exx_is_active, xclib_dft_is
|
||||
USE klist, ONLY : igk_k
|
||||
|
||||
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -56,6 +57,7 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
|
|||
COMPLEX(DP) :: psi(lda,m)
|
||||
REAL(kind=DP) :: e_xc(m), e_h(m)
|
||||
INTEGER, INTENT(in) :: ispin !spin 1,2
|
||||
REAL(kind=DP), OPTIONAL :: v_states(dffts%nnr,m, nspin)
|
||||
|
||||
REAL(kind=DP), ALLOCATABLE :: vr(:,:)
|
||||
!
|
||||
|
@ -347,8 +349,9 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
|
|||
allocate(psi_r(dfftp%nnr),psi_rs(dfftp%nnr))
|
||||
|
||||
iunwfcreal=find_free_unit()
|
||||
CALL diropn( iunwfcreal, 'real_whole', dffts%nnr, exst )
|
||||
|
||||
if(.not. present(v_states)) then
|
||||
CALL diropn( iunwfcreal, 'real_whole', dffts%nnr, exst )
|
||||
endif
|
||||
|
||||
!calculate xc potential on fine grid
|
||||
|
||||
|
@ -404,16 +407,19 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
|
|||
|
||||
do ibnd=1,m!loop on states
|
||||
!read from disk wfc on coarse grid
|
||||
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
|
||||
if(doublegrid) then
|
||||
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
|
||||
else
|
||||
psi_r(:)=psi_rs(:)
|
||||
endif
|
||||
|
||||
do ir=1,dfftp%nnr
|
||||
psi_r(ir)=psi_r(ir)**2.d0
|
||||
enddo
|
||||
if(.not. present(v_states)) then
|
||||
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
|
||||
if(doublegrid) then
|
||||
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
|
||||
else
|
||||
psi_r(:)=psi_rs(:)
|
||||
endif
|
||||
else
|
||||
psi_r(1:dffts%nnr)=v_states(1:dffts%nnr,ibnd,ispin)
|
||||
endif
|
||||
do ir=1,dfftp%nnr
|
||||
psi_r(ir)=psi_r(ir)**2.d0
|
||||
enddo
|
||||
|
||||
!if(okvan) call adduspos_gamma_r(ibnd,ibnd,psi_r,1,becp_gw(:,ibnd),becp_gw(:,ibnd))
|
||||
|
||||
|
@ -454,11 +460,15 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
|
|||
|
||||
do ibnd=1,m!loop on states
|
||||
!read from disk wfc on coarse grid
|
||||
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
|
||||
if(doublegrid) then
|
||||
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
|
||||
if(.not. present(v_states)) then
|
||||
CALL davcio( psi_rs,dffts%nnr,iunwfcreal,ibnd+(ispin-1)*nbnd,-1)
|
||||
if(doublegrid) then
|
||||
call fft_interpolate(dffts, psi_rs, dfftp, psi_r) ! interpolate from smooth to dense
|
||||
else
|
||||
psi_r(:)=psi_rs(:)
|
||||
endif
|
||||
else
|
||||
psi_r(:)=psi_rs(:)
|
||||
psi_r(1:dffts%nnr)=v_states(1:dffts%nnr,ibnd,ispin)
|
||||
endif
|
||||
|
||||
do ir=1,dfftp%nnr
|
||||
|
@ -487,7 +497,7 @@ SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin )
|
|||
|
||||
deallocate(psi_r,psi_rs)
|
||||
deallocate(exact_x)
|
||||
close(iunwfcreal)
|
||||
if(.not. present(v_states) ) close(iunwfcreal)
|
||||
deallocate(e_hub)
|
||||
if(l_whole_s) then
|
||||
!NOT_TO_BE_INCLUDED_START
|
||||
|
|
|
@ -75,10 +75,141 @@ subroutine cg_psi_pw4gww (lda, n, m, psi, h_diag)
|
|||
!
|
||||
do k = 1, m
|
||||
do i = 1, n
|
||||
psi (i, k) = psi (i, k) * 1.d0/(1.d0+h_diag (i, k))!-et(k,1))
|
||||
!psi (i, k) = psi (i, k) * 1.d0/(h_diag (i, k)-100.d0)
|
||||
psi (i, k) = psi (i, k) * 1.d0/(1.d0+h_diag (i, k))
|
||||
|
||||
enddo
|
||||
enddo
|
||||
|
||||
return
|
||||
end subroutine cg_psi_pw4gww
|
||||
|
||||
subroutine hpsi_pw4gww2( ndim,psi,ppsi,et,ik,numv)
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
USE wvfct, ONLY : npwx, npw, nbnd
|
||||
USE mp, ONLY : mp_sum, mp_barrier, mp_bcast
|
||||
USE mp_world, ONLY : mpime, nproc
|
||||
|
||||
|
||||
implicit none
|
||||
|
||||
INTEGER, INTENT(in) :: ndim !leading dimension of psi and psip
|
||||
INTEGER, INTENT(in) :: numv!number of bands
|
||||
INTEGER, INTENT(in) ::ik!dumm integer
|
||||
COMPLEX(kind=DP), INTENT(in) :: psi(ndim,numv)
|
||||
COMPLEX(kind=DP), INTENT(out) :: ppsi(ndim,numv)
|
||||
REAL(kind=DP) :: et(numv)
|
||||
|
||||
INTEGER :: iv
|
||||
|
||||
!apply h_psi
|
||||
|
||||
call h_psi( ndim, npw, numv, psi, ppsi )
|
||||
do iv=1,numv
|
||||
ppsi(1:npw,iv)=ppsi(1:npw,iv)-et(iv)*psi(1:npw,iv)
|
||||
enddo
|
||||
|
||||
return
|
||||
|
||||
end subroutine hpsi_pw4gww2
|
||||
|
||||
subroutine hpsi_square( ndim,psi,ppsi,et,ik,numv)
|
||||
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
USE wvfct, ONLY : npwx, npw, nbnd
|
||||
USE mp, ONLY : mp_sum, mp_barrier, mp_bcast
|
||||
USE mp_world, ONLY : mpime, nproc
|
||||
|
||||
|
||||
implicit none
|
||||
|
||||
INTEGER, INTENT(in) :: ndim !leading dimension of psi and psip
|
||||
INTEGER, INTENT(in) :: numv!number of bands
|
||||
INTEGER, INTENT(in) ::ik!dumm integer
|
||||
REAL(kind=DP),SAVE :: freq_real=0.d0,freq_im=0.d0
|
||||
LOGICAL, SAVE :: lproj=.false. !if true project over empty manyfold
|
||||
COMPLEX(kind=DP), INTENT(in) :: psi(ndim,numv)
|
||||
COMPLEX(kind=DP), INTENT(out) :: ppsi(ndim,numv)
|
||||
REAL(kind=DP) :: et(numv)
|
||||
|
||||
|
||||
INTEGER :: iv
|
||||
COMPLEX(kind=DP),ALLOCATABLE :: psi1(:,:)
|
||||
|
||||
!apply h_psi
|
||||
|
||||
if(ik==-1) then
|
||||
freq_real=et(1)
|
||||
freq_im=et(2)
|
||||
elseif(ik==-2) then
|
||||
lproj=.true.
|
||||
elseif(ik==-3) then
|
||||
lproj=.false.
|
||||
else
|
||||
|
||||
if(lproj) then
|
||||
do iv=1,numv
|
||||
call pc_operator(psi(1,iv),1,.false.)
|
||||
enddo
|
||||
endif
|
||||
|
||||
allocate(psi1(ndim,numv))
|
||||
|
||||
call h_psi( ndim, npw, numv, psi, psi1 )
|
||||
do iv=1,numv
|
||||
psi1(1:npw,iv)=psi1(1:npw,iv)-(freq_real+et(iv))*psi(1:npw,iv)
|
||||
enddo
|
||||
|
||||
call h_psi( ndim, npw, numv, psi1, ppsi )
|
||||
do iv=1,numv
|
||||
ppsi(1:npw,iv)=ppsi(1:npw,iv)-(freq_real+et(iv))*psi1(1:npw,iv)
|
||||
enddo
|
||||
|
||||
|
||||
do iv=1,numv
|
||||
ppsi(1:npw,iv)=ppsi(1:npw,iv)+(freq_im**2.d0)*psi(1:npw,iv)
|
||||
enddo
|
||||
|
||||
if(lproj)then
|
||||
do iv=1,numv
|
||||
call pc_operator(ppsi(1,iv),1,.false.)
|
||||
enddo
|
||||
endif
|
||||
deallocate(psi1)
|
||||
endif
|
||||
return
|
||||
|
||||
end subroutine hpsi_square
|
||||
|
||||
!-----------------------------------------------------------------
|
||||
subroutine cg_psi_pw4gww_square (lda, n, m, psi, h_diag)
|
||||
!-----------------------------------------------------------------
|
||||
!
|
||||
! This routine gives a preconditioning to the linear system solver.
|
||||
! The preconditioning is diagonal in reciprocal space
|
||||
!
|
||||
USE kinds, only : DP
|
||||
USE wvfct, ONLY : et
|
||||
|
||||
implicit none
|
||||
|
||||
integer :: lda, n, m
|
||||
! input: the leading dimension of the psi vector
|
||||
! input: the real dimension of the vector
|
||||
! input: the number of vectors
|
||||
complex(DP) :: psi (lda, m)
|
||||
! inp/out: the vector to be preconditioned
|
||||
real(DP) :: h_diag (lda, m)
|
||||
! input: the preconditioning vector
|
||||
integer :: k, i
|
||||
! counter on bands
|
||||
! counter on the elements of the vector
|
||||
do k = 1, m
|
||||
do i = 1, n
|
||||
psi (i, k) = psi (i, k)* 1.d0/(h_diag (i, k))
|
||||
enddo
|
||||
enddo
|
||||
|
||||
return
|
||||
end subroutine cg_psi_pw4gww_square
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -28,6 +28,34 @@ contour_terms.o : ../../Modules/wannier_gw.o
|
|||
contour_terms.o : ../../Modules/wavefunctions.o
|
||||
contour_terms.o : ../../PW/src/pwcom.o
|
||||
contour_terms.o : ../../UtilXlib/mp.o
|
||||
convergence.o : ../../FFTXlib/fft_interfaces.o
|
||||
convergence.o : ../../Modules/becmod.o
|
||||
convergence.o : ../../Modules/cell_base.o
|
||||
convergence.o : ../../Modules/constants.o
|
||||
convergence.o : ../../Modules/fft_base.o
|
||||
convergence.o : ../../Modules/gvecw.o
|
||||
convergence.o : ../../Modules/io_base.o
|
||||
convergence.o : ../../Modules/io_files.o
|
||||
convergence.o : ../../Modules/io_global.o
|
||||
convergence.o : ../../Modules/kind.o
|
||||
convergence.o : ../../Modules/mp_bands.o
|
||||
convergence.o : ../../Modules/mp_global.o
|
||||
convergence.o : ../../Modules/mp_images.o
|
||||
convergence.o : ../../Modules/mp_pools.o
|
||||
convergence.o : ../../Modules/mp_wave.o
|
||||
convergence.o : ../../Modules/mp_world.o
|
||||
convergence.o : ../../Modules/noncol.o
|
||||
convergence.o : ../../Modules/recvec.o
|
||||
convergence.o : ../../Modules/wannier_gw.o
|
||||
convergence.o : ../../Modules/wavefunctions.o
|
||||
convergence.o : ../../PW/src/g_psi_mod.o
|
||||
convergence.o : ../../PW/src/ldaU.o
|
||||
convergence.o : ../../PW/src/pwcom.o
|
||||
convergence.o : ../../PW/src/scf_mod.o
|
||||
convergence.o : ../../UtilXlib/mp.o
|
||||
convergence.o : ../../upflib/paw_variables.o
|
||||
convergence.o : ../../upflib/uspp.o
|
||||
convergence.o : lanczos_chains.o
|
||||
dft_exchange.o : ../../FFTXlib/fft_interfaces.o
|
||||
dft_exchange.o : ../../Modules/cell_base.o
|
||||
dft_exchange.o : ../../Modules/constants.o
|
||||
|
@ -61,6 +89,17 @@ diago_cg_g.o : ../../Modules/mp_bands.o
|
|||
diago_cg_g.o : ../../Modules/mp_world.o
|
||||
diago_cg_g.o : ../../Modules/random_numbers.o
|
||||
diago_cg_g.o : ../../UtilXlib/mp.o
|
||||
easy_gw.o : ../../Modules/constants.o
|
||||
easy_gw.o : ../../Modules/fft_base.o
|
||||
easy_gw.o : ../../Modules/io_files.o
|
||||
easy_gw.o : ../../Modules/io_global.o
|
||||
easy_gw.o : ../../Modules/kind.o
|
||||
easy_gw.o : ../../Modules/mp_world.o
|
||||
easy_gw.o : ../../Modules/wannier_gw.o
|
||||
easy_gw.o : ../../Modules/wavefunctions.o
|
||||
easy_gw.o : ../../PW/src/pwcom.o
|
||||
easy_gw.o : ../../UtilXlib/mp.o
|
||||
easy_gw.o : convergence.o
|
||||
energies_xc.o : ../../FFTXlib/fft_interfaces.o
|
||||
energies_xc.o : ../../Modules/becmod.o
|
||||
energies_xc.o : ../../Modules/cell_base.o
|
||||
|
@ -152,6 +191,16 @@ hpsi_pw4gww.o : ../../Modules/kind.o
|
|||
hpsi_pw4gww.o : ../../Modules/mp_world.o
|
||||
hpsi_pw4gww.o : ../../PW/src/pwcom.o
|
||||
hpsi_pw4gww.o : ../../UtilXlib/mp.o
|
||||
lanczos_chains.o : ../../FFTXlib/fft_interfaces.o
|
||||
lanczos_chains.o : ../../Modules/fft_base.o
|
||||
lanczos_chains.o : ../../Modules/io_global.o
|
||||
lanczos_chains.o : ../../Modules/kind.o
|
||||
lanczos_chains.o : ../../Modules/mp_world.o
|
||||
lanczos_chains.o : ../../Modules/recvec.o
|
||||
lanczos_chains.o : ../../Modules/wannier_gw.o
|
||||
lanczos_chains.o : ../../Modules/wavefunctions.o
|
||||
lanczos_chains.o : ../../PW/src/pwcom.o
|
||||
lanczos_chains.o : ../../UtilXlib/mp.o
|
||||
matrix_wannier_gamma.o : ../../Modules/cell_base.o
|
||||
matrix_wannier_gamma.o : ../../Modules/constants.o
|
||||
matrix_wannier_gamma.o : ../../Modules/fft_base.o
|
||||
|
@ -295,6 +344,7 @@ produce_wannier_gamma.o : ../../Modules/fft_base.o
|
|||
produce_wannier_gamma.o : ../../Modules/gvecw.o
|
||||
produce_wannier_gamma.o : ../../Modules/io_files.o
|
||||
produce_wannier_gamma.o : ../../Modules/io_global.o
|
||||
produce_wannier_gamma.o : ../../Modules/kind.o
|
||||
produce_wannier_gamma.o : ../../Modules/mp_world.o
|
||||
produce_wannier_gamma.o : ../../Modules/recvec.o
|
||||
produce_wannier_gamma.o : ../../Modules/wannier_gw.o
|
||||
|
|
|
@ -74,6 +74,19 @@
|
|||
|
||||
TYPE(exchange_cus) :: exx_cus
|
||||
|
||||
INTERFACE
|
||||
SUBROUTINE energies_xc( lda, n, m, psi, e_xc, e_h,ispin, v_states )
|
||||
USE kinds, ONLY : DP
|
||||
USE fft_base, ONLY : dffts
|
||||
USE lsda_mod, ONLY : nspin
|
||||
INTEGER :: lda, n, m
|
||||
COMPLEX(kind=DP) :: psi(lda,m)
|
||||
REAL(kind=DP) :: e_xc(m), e_h(m)
|
||||
INTEGER, INTENT(in) :: ispin !spin 1,2
|
||||
REAL(kind=DP), OPTIONAL :: v_states(dffts%nnr,m, nspin)
|
||||
END SUBROUTINE energies_xc
|
||||
END INTERFACE
|
||||
|
||||
! interface
|
||||
! subroutine fake_conduction_wannier(fcw_n,fcw_s,fcw_m,cut,s_cut)
|
||||
|
||||
|
|
|
@ -154,7 +154,24 @@ program gwl_punch
|
|||
l_no_GW_just_screening,& ! JDE
|
||||
l_no_GW_bare_Coulomb,& ! JDE
|
||||
no_GW_cg_maxit,& ! JDE
|
||||
no_GW_cg_threshold ! JDE
|
||||
no_GW_cg_threshold,& ! JDE
|
||||
l_easy, &
|
||||
l_easy_lanczos_g, &
|
||||
easy_grid_type, &
|
||||
easy_grid_param, &
|
||||
easy_average_type, &
|
||||
easy_psi_thrs,&
|
||||
l_easy_update_basis_w, &
|
||||
l_easy_dielectric_constant, &
|
||||
easy_w_update_alpha, &
|
||||
easy_w_update_lanczos, &
|
||||
easy_w_thrs, &
|
||||
s_first_spin,&
|
||||
s_last_spin, &
|
||||
easy_split_calc_n, &
|
||||
easy_split_calc_i, &
|
||||
l_easy_w_disk
|
||||
|
||||
|
||||
|
||||
USE exchange_custom, ONLY : exchange_fast_dual
|
||||
|
@ -192,7 +209,13 @@ program gwl_punch
|
|||
l_cond_pol_base,l_semicore,n_semicore,l_semicore_read, l_verbose, l_contour,&
|
||||
l_real,exchange_fast_dual,l_bse,s_bse,dual_bse,l_big_system,extra_pw_cutoff,&
|
||||
l_list,l_scissor,scissor,l_full,n_full,l_simple,&
|
||||
l_no_GW_just_screening, l_no_GW_bare_Coulomb, no_GW_cg_maxit, no_GW_cg_threshold ! JDE
|
||||
l_no_GW_just_screening, l_no_GW_bare_Coulomb, no_GW_cg_maxit, no_GW_cg_threshold,&
|
||||
l_easy, easy_grid_type, easy_grid_param,easy_average_type, easy_psi_thrs,&
|
||||
l_easy_lanczos_g, &
|
||||
l_easy_update_basis_w, l_easy_dielectric_constant, easy_w_update_alpha, &
|
||||
easy_w_update_lanczos, easy_w_thrs, s_first_spin, s_last_spin, easy_split_calc_n,&
|
||||
easy_split_calc_i, l_easy_w_disk
|
||||
|
||||
|
||||
|
||||
!
|
||||
|
@ -294,6 +317,28 @@ program gwl_punch
|
|||
l_no_GW_bare_coulomb=.false. ! JDE
|
||||
no_GW_cg_maxit=30 ! JDE
|
||||
no_GW_cg_threshold=1.d-10 ! JDE
|
||||
l_easy=.false.
|
||||
l_easy_lanczos_g=.false.
|
||||
easy_grid_type=0
|
||||
easy_grid_param(1)=0
|
||||
easy_grid_param(2)=0
|
||||
easy_grid_param(3)=0
|
||||
easy_grid_param(4)=5
|
||||
easy_grid_param(5)=100000
|
||||
easy_average_type=0
|
||||
easy_psi_thrs=0.d0
|
||||
l_easy_update_basis_w=.false.
|
||||
l_easy_dielectric_constant=.false.
|
||||
easy_w_update_alpha=0.1d0
|
||||
easy_w_update_lanczos=0.5
|
||||
easy_w_thrs=1d-20
|
||||
s_first_spin=0
|
||||
s_last_spin=0
|
||||
easy_split_calc_n=1
|
||||
easy_split_calc_i=1
|
||||
l_easy_w_disk=.false.
|
||||
|
||||
|
||||
!
|
||||
! Reading input file
|
||||
!
|
||||
|
@ -418,7 +463,26 @@ program gwl_punch
|
|||
CALL mp_bcast(l_no_GW_bare_coulomb, ionode_id, world_comm) ! JDE
|
||||
CALL mp_bcast(no_GW_cg_maxit, ionode_id, world_comm) ! JDE
|
||||
CALL mp_bcast(no_GW_cg_threshold, ionode_id, world_comm) ! JDE
|
||||
CALL mp_bcast(l_easy, ionode_id, world_comm)
|
||||
CALL mp_bcast(l_easy_lanczos_g, ionode_id, world_comm)
|
||||
CALL mp_bcast(easy_grid_type, ionode_id, world_comm)
|
||||
CALL mp_bcast(easy_grid_param, ionode_id, world_comm)
|
||||
CALL mp_bcast(easy_average_type, ionode_id, world_comm)
|
||||
CALL mp_bcast(easy_psi_thrs, ionode_id, world_comm)
|
||||
CALL mp_bcast(l_easy_update_basis_w, ionode_id,world_comm)
|
||||
CALL mp_bcast(l_easy_dielectric_constant, ionode_id,world_comm)
|
||||
CALL mp_bcast(easy_w_update_alpha, ionode_id,world_comm)
|
||||
CALL mp_bcast(easy_w_update_lanczos , ionode_id,world_comm)
|
||||
CALL mp_bcast(easy_w_thrs, ionode_id,world_comm)
|
||||
CALL mp_bcast(s_first_spin, ionode_id, world_comm)
|
||||
CALL mp_bcast(s_last_spin, ionode_id, world_comm)
|
||||
CALL mp_bcast(easy_split_calc_n, ionode_id, world_comm)
|
||||
CALL mp_bcast(easy_split_calc_i, ionode_id, world_comm)
|
||||
CALL mp_bcast( l_easy_w_disk, ionode_id, world_comm)
|
||||
if(s_first_spin==0) s_first_spin=1
|
||||
if(s_last_spin==0) s_first_spin=nspin
|
||||
|
||||
|
||||
call read_file
|
||||
|
||||
|
||||
|
@ -443,9 +507,11 @@ program gwl_punch
|
|||
|
||||
call summary()
|
||||
|
||||
|
||||
!
|
||||
! init some quantities ...
|
||||
!
|
||||
CALL hinit0()
|
||||
if(lda_plus_u) then
|
||||
CALL init_ns()
|
||||
endif
|
||||
|
@ -498,14 +564,12 @@ program gwl_punch
|
|||
ENDIF
|
||||
|
||||
|
||||
|
||||
if(l_easy) then
|
||||
CALL easy_gw()
|
||||
else
|
||||
CALL produce_wannier_gamma
|
||||
endif
|
||||
|
||||
if(l_verbose) write(stdout,*) 'BEFORE produce_wannier_gamma'
|
||||
FLUSH( stdout )
|
||||
CALL produce_wannier_gamma
|
||||
if(l_verbose) write(stdout,*) 'AFTER produce_wannier_gamma'
|
||||
FLUSH( stdout )
|
||||
! ENDIF
|
||||
|
||||
!
|
||||
!
|
||||
|
@ -749,6 +813,10 @@ subroutine read_export (pp_file,kunit,uspp_spsi, ascii, single_file, raw)
|
|||
ALLOCATE( sevc(npwx,nbnd), STAT=ierr )
|
||||
IF (ierr/=0) CALL errore( ' read_export ',' Unable to allocate SEVC ', ABS(ierr) )
|
||||
|
||||
|
||||
CALL init_us_1
|
||||
!CALL init_at_1
|
||||
|
||||
CALL allocate_bec_type (nkb,nbnd,becp)
|
||||
|
||||
do ik = 1, nkstot
|
||||
|
|
|
@ -22,13 +22,8 @@ SIMPLEOBJS = \
|
|||
commutator.o
|
||||
|
||||
|
||||
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
|
||||
../../upflib/libupf.a ../../KS_Solvers/libks_solvers.a \
|
||||
../../LAXlib/libqela.a ../../UtilXlib/libutil.a \
|
||||
../../dft-d3/libdftd3qe.a ../../XClib/xc_lib.a
|
||||
# dft-d3 required by xlf for obscure reasons
|
||||
|
||||
PWOBJS = ../../PW/src/libpw.a
|
||||
QEMODS = $(BASEMODS)
|
||||
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
|
||||
GWWOBJ = ../gww/libgww.a
|
||||
|
||||
TLDEPS= pwlibs gwwlib
|
||||
|
@ -37,7 +32,7 @@ all : tldeps simple.x
|
|||
|
||||
simple.x : simple.o libsimple.a $(SIMPLEOBJS) $(PWOBJS) $(QEMODS) $(GWWOBJ)
|
||||
$(LD) $(LDFLAGS) -o $@ \
|
||||
simple.o libsimple.a $(PWOBJS) $(GWWOBJ) $(QEMODS) $(LIBOBJS) $(QELIBS) $(LIBMIN)
|
||||
simple.o libsimple.a $(PWOBJS) $(GWWOBJ) $(QEMODS) $(QELIBS) $(LIBMIN)
|
||||
- ( cd ../../bin ; ln -fs ../GWW/simple/$@ . )
|
||||
|
||||
tldeps :
|
||||
|
|
|
@ -14,7 +14,6 @@ subroutine product_basis
|
|||
USE uspp_param, ONLY : upf, nh
|
||||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE mp_world, ONLY : world_comm
|
||||
USE spin_orb, ONLY: lspinorb
|
||||
USE ions_base, ONLY : nat, nsp, ityp
|
||||
USE io_global, ONLY : stdout, ionode
|
||||
USE input_simple
|
||||
|
@ -290,7 +289,6 @@ SUBROUTINE optimal_gram_schmidt_nc(num_in,wfcs,thres,num_out)
|
|||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE input_simple, ONLY : npw_max,vkb_max
|
||||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE spin_orb, ONLY: lspinorb
|
||||
USE ions_base, ONLY : nat, nsp, ityp
|
||||
|
||||
|
||||
|
|
|
@ -9,7 +9,6 @@ subroutine v_product
|
|||
USE klist, ONLY : nks,ngk,xk
|
||||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE mp_world, ONLY : world_comm,mpime
|
||||
USE spin_orb, ONLY: lspinorb
|
||||
USE io_global, ONLY : stdout, ionode
|
||||
USE input_simple
|
||||
USE gvect, ONLY : ngm, gstart,gg, g
|
||||
|
|
|
@ -12,7 +12,6 @@ subroutine wfc_basis
|
|||
USE uspp_param, ONLY : upf, nh
|
||||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE mp_world, ONLY : world_comm
|
||||
USE spin_orb, ONLY: lspinorb
|
||||
USE ions_base, ONLY : nat, nsp, ityp
|
||||
USE io_global, ONLY : stdout, ionode
|
||||
USE input_simple
|
||||
|
@ -541,7 +540,6 @@ SUBROUTINE optimal_gram_schmidt_z(num_in,wfcs,ithres,thres,num_out)
|
|||
USE uspp, ONLY : nkb, vkb, becsum, nhtol, nhtoj, indv, okvan
|
||||
USE uspp_param, ONLY : upf, nh
|
||||
USE noncollin_module, ONLY: npol, noncolin
|
||||
USE spin_orb, ONLY: lspinorb
|
||||
USE ions_base, ONLY : nat, nsp, ityp
|
||||
|
||||
|
||||
|
|
|
@ -19,17 +19,15 @@ spectrum.o \
|
|||
lanczos.o \
|
||||
build_eemat.o
|
||||
|
||||
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
|
||||
../../KS_Solvers/libks_solvers.a \
|
||||
../../LAXlib/libqela.a ../../UtilXlib/libutil.a
|
||||
QEMODS = $(BASEMODS)
|
||||
|
||||
TLDEPS=bindir libs mods gwwlib
|
||||
|
||||
all : tldeps simple_bse.x libsimple_exc.a
|
||||
|
||||
simple_bse.x : simple_bse.o $(EXCOBJS) $(LIBOBJS) $(QEMODS)
|
||||
simple_bse.x : simple_bse.o $(EXCOBJS) $(QEMODS)
|
||||
$(MPIF90) $(LDFLAGS) -o $@ \
|
||||
simple_bse.o $(EXCOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
simple_bse.o $(EXCOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../bin; ln -fs ../GWW/simple_bse/$@ . )
|
||||
|
||||
|
||||
|
|
|
@ -16,10 +16,8 @@ diagonalization.o \
|
|||
dielectric.o
|
||||
|
||||
|
||||
QEMODS = ../../Modules/libqemod.a ../../FFTXlib/libqefft.a \
|
||||
../../KS_Solvers/libks_solvers.a \
|
||||
../../LAXlib/libqela.a ../../UtilXlib/libutil.a
|
||||
PWOBJS = ../../PW/src/libpw.a
|
||||
QEMODS = $(BASEMODS)
|
||||
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a
|
||||
|
||||
LIBMIN=
|
||||
|
||||
|
@ -27,9 +25,9 @@ TLDEPS=pwlibs
|
|||
|
||||
all : tldeps simple_ip.x libsimple_ip.a
|
||||
|
||||
simple_ip.x : simple_ip.o $(SIMPLEIPOBJS) $(LIBOBJS) $(PWOBJS) $(QEMODS) $(LIBMIN)
|
||||
simple_ip.x : simple_ip.o $(SIMPLEIPOBJS) $(PWOBJS) $(QEMODS) $(LIBMIN)
|
||||
$(MPIF90) $(LDFLAGS) -o $@ \
|
||||
simple_ip.o $(SIMPLEIPOBJS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(LIBMIN) $(QELIBS)
|
||||
simple_ip.o $(SIMPLEIPOBJS) $(PWOBJS) $(QEMODS) $(LIBMIN) $(QELIBS)
|
||||
- ( cd ../../bin; ln -fs ../GWW/simple_ip/$@ . )
|
||||
|
||||
libsimple_ip.a : $(SIMPLEIPOBJS)
|
||||
|
|
|
@ -13,7 +13,6 @@ set(src_hp
|
|||
src/hp_dealloc_1.f90
|
||||
src/hp_dealloc_2.f90
|
||||
src/hp_dvpsi_pert.f90
|
||||
src/hp_efermi_shift.f90
|
||||
src/hp_find_inequiv_sites.f90
|
||||
src/hp_generate_grids.f90
|
||||
src/hp_init.f90
|
||||
|
|
|
@ -22,7 +22,6 @@ hp_dealloc_q.o \
|
|||
hp_dealloc_1.o \
|
||||
hp_dealloc_2.o \
|
||||
hp_dvpsi_pert.o \
|
||||
hp_efermi_shift.o \
|
||||
hp_find_inequiv_sites.o \
|
||||
hp_generate_grids.o \
|
||||
hp_init.o \
|
||||
|
@ -53,10 +52,8 @@ hp_write_chi_full.o \
|
|||
hp_write_dnsq.o
|
||||
|
||||
LRMODS = ../../LR_Modules/liblrmod.a
|
||||
PWOBJS = ../../PW/src/libpw.a
|
||||
QEMODS = ../../Modules/libqemod.a ../../KS_Solvers/libks_solvers.a \
|
||||
../../upflib/libupf.a ../../FFTXlib/libqefft.a ../../LAXlib/libqela.a\
|
||||
../../UtilXlib/libutil.a ../../dft-d3/libdftd3qe.a ../../XClib/xc_lib.a
|
||||
PWOBJS = ../../PW/src/libpw.a ../../KS_Solvers/libks_solvers.a ../../dft-d3/libdftd3qe.a
|
||||
QEMODS = $(BASEMODS)
|
||||
|
||||
TLDEPS= hplibs
|
||||
|
||||
|
@ -64,9 +61,9 @@ all : tldeps libs-hp hp.x
|
|||
|
||||
libs-hp : libhp.a
|
||||
|
||||
hp.x : hp_main.o libhp.a $(PWOBJS) $(LRMODS) $(LIBOBJS)
|
||||
hp.x : hp_main.o libhp.a $(PWOBJS) $(LRMODS)
|
||||
$(LD) $(LDFLAGS) -o $@ hp_main.o libhp.a \
|
||||
$(LRMODS) $(PWOBJS) $(QEMODS) $(LIBOBJS) $(QELIBS)
|
||||
$(LRMODS) $(PWOBJS) $(QEMODS) $(QELIBS)
|
||||
- ( cd ../../bin ; ln -fs ../HP/src/hp.x . )
|
||||
|
||||
tldeps :
|
||||
|
|
|
@ -26,8 +26,7 @@ subroutine hp_allocate_q
|
|||
USE eqv, ONLY : dpsi, evq, dmuxc, dvpsi
|
||||
USE control_lr, ONLY : lgamma
|
||||
USE ldaU, ONLY : Hubbard_lmax, nwfcU
|
||||
USE ldaU_hp, ONLY : this_pert_is_on_file, &
|
||||
swfcatomk, swfcatomkpq
|
||||
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
|
||||
!
|
||||
IMPLICIT NONE
|
||||
INTEGER :: ik
|
||||
|
@ -44,9 +43,6 @@ subroutine hp_allocate_q
|
|||
ALLOCATE (dpsi(npwx*npol,nbnd))
|
||||
ALLOCATE (dmuxc(dfftp%nnr,nspin_mag,nspin_mag))
|
||||
!
|
||||
ALLOCATE (this_pert_is_on_file(nksq))
|
||||
this_pert_is_on_file(:) = .FALSE.
|
||||
!
|
||||
IF (okvan) THEN
|
||||
ALLOCATE (eigqts(nat))
|
||||
ALLOCATE (becp1(nksq))
|
||||
|
|
|
@ -16,8 +16,8 @@ SUBROUTINE hp_close_q ( flag )
|
|||
!
|
||||
USE buffers, ONLY : close_buffer
|
||||
USE io_files, ONLY : iunhub
|
||||
USE units_lr, ONLY : iuwfc, iuatswfc
|
||||
USE ldaU_hp, ONLY : iudwfc, iudvwfc
|
||||
USE units_lr, ONLY : iuwfc, iuatswfc, iudwf
|
||||
USE ldaU_hp, ONLY : iudvwfc
|
||||
USE control_lr, ONLY : lgamma
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
@ -28,10 +28,10 @@ SUBROUTINE hp_close_q ( flag )
|
|||
CALL close_buffer(iuwfc,'delete')
|
||||
!
|
||||
IF (flag) THEN
|
||||
CALL close_buffer(iudwfc,'delete')
|
||||
CALL close_buffer(iudwf,'delete')
|
||||
CALL close_buffer(iudvwfc,'delete')
|
||||
ELSE
|
||||
CALL close_buffer(iudwfc,'keep')
|
||||
CALL close_buffer(iudwf,'keep')
|
||||
CALL close_buffer(iudvwfc,'keep')
|
||||
ENDIF
|
||||
!
|
||||
|
|
|
@ -14,7 +14,8 @@ SUBROUTINE hp_dealloc_1()
|
|||
!
|
||||
USE lr_symm_base, ONLY : rtau
|
||||
USE start_k, ONLY : xk_start, wk_start
|
||||
USE ldaU_hp, ONLY : Rvect, dnsscf, dns0, dnsscf_tot, dns0_tot, &
|
||||
USE ldaU_lr, ONLY : dnsscf
|
||||
USE ldaU_hp, ONLY : Rvect, dns0, dnsscf_tot, dns0_tot, &
|
||||
x_q, comp_iq
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
|
|
@ -22,8 +22,7 @@ SUBROUTINE hp_dealloc_q()
|
|||
& dvxc_s, vsgga, segni
|
||||
USE eqv, ONLY : dmuxc, dpsi, dvpsi, evq
|
||||
USE control_lr, ONLY : lgamma, nbnd_occ
|
||||
USE ldaU_hp, ONLY : this_pert_is_on_file, &
|
||||
swfcatomk, swfcatomkpq
|
||||
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
|
||||
!
|
||||
IMPLICIT NONE
|
||||
INTEGER :: ik
|
||||
|
@ -42,9 +41,6 @@ SUBROUTINE hp_dealloc_q()
|
|||
if (allocated(ikqs)) deallocate (ikqs)
|
||||
if (allocated(m_loc)) deallocate (m_loc)
|
||||
!
|
||||
if (allocated(this_pert_is_on_file)) &
|
||||
& deallocate (this_pert_is_on_file)
|
||||
!
|
||||
IF (okvan) THEN
|
||||
if (allocated(eigqts)) deallocate (eigqts)
|
||||
if (allocated(becp1)) then
|
||||
|
|
|
@ -38,13 +38,13 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
|
|||
USE control_flags, ONLY : iverbosity
|
||||
USE qpoint, ONLY : nksq, ikks, ikqs
|
||||
USE control_lr, ONLY : lgamma, nbnd_occ
|
||||
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc
|
||||
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc, iudwf, lrdwf
|
||||
USE lr_symm_base, ONLY : nsymq
|
||||
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard, offsetU, nwfcU
|
||||
USE ldaU_hp, ONLY : conv_thr_chi, trace_dns_tot_old, &
|
||||
conv_thr_chi_best, iter_best, iudwfc, lrdwfc, &
|
||||
swfcatomk, swfcatomkpq
|
||||
USE hp_efermi_shift, ONLY : def
|
||||
conv_thr_chi_best, iter_best
|
||||
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
|
||||
USE efermi_shift, ONLY : def
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -120,7 +120,7 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
|
|||
!
|
||||
! At each SCF iteration for each ik read dpsi from file
|
||||
!
|
||||
CALL get_buffer (dpsi, lrdwfc, iudwfc, ik)
|
||||
CALL get_buffer (dpsi, lrdwf, iudwf, ik)
|
||||
!
|
||||
! Loop on Hubbard atoms
|
||||
!
|
||||
|
@ -149,10 +149,8 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
|
|||
!
|
||||
ENDDO
|
||||
!
|
||||
#if defined (__MPI)
|
||||
CALL mp_sum(proj1, intra_pool_comm)
|
||||
CALL mp_sum(proj2, intra_pool_comm)
|
||||
#endif
|
||||
!
|
||||
DO na = 1, nat
|
||||
!
|
||||
|
@ -185,7 +183,7 @@ SUBROUTINE hp_dnsq (lmetq0, iter, conv_root, dnsq)
|
|||
w1 = weight * wdelta
|
||||
!
|
||||
dnsq(m1, m2, current_spin, na) = dnsq(m1, m2, current_spin, na) + &
|
||||
w1 * def * CONJG(proj1(ibnd,ihubst1)) * proj1(ibnd,ihubst2)
|
||||
w1 * def(1) * CONJG(proj1(ibnd,ihubst1)) * proj1(ibnd,ihubst2)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
|
|
|
@ -29,7 +29,8 @@ SUBROUTINE hp_dnstot_sum_q
|
|||
USE control_flags, ONLY : iverbosity
|
||||
USE lr_symm_base, ONLY : nsymq, invsymq, minus_q, rtau
|
||||
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard
|
||||
USE ldaU_hp, ONLY : nqsh, Rvect, dnsscf, dns0, dnsscf_tot, dns0_tot, &
|
||||
USE ldaU_lr, ONLY : dnsscf
|
||||
USE ldaU_hp, ONLY : nqsh, Rvect, dns0, dnsscf_tot, dns0_tot, &
|
||||
skip_equivalence_q, nq1, nq2, nq3, x_q, nqs
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
|
|
@ -20,8 +20,6 @@ subroutine hp_dvpsi_pert (ik)
|
|||
!
|
||||
! dvpsi is for a given "k", "q" and "J"
|
||||
!
|
||||
! dvpsi is READ from file if this_pert_is_on_file(ik) = .TRUE.
|
||||
! otherwise dvpsi is COMPUTED and WRITTEN on file
|
||||
! (evc, swfcatomk, swfcatomkpq must be set)
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
|
@ -38,8 +36,8 @@ subroutine hp_dvpsi_pert (ik)
|
|||
USE units_lr, ONLY : iuatswfc
|
||||
USE control_lr, ONLY : lgamma
|
||||
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, offsetU, nwfcU
|
||||
USE ldaU_hp, ONLY : nqsh, perturbed_atom, this_pert_is_on_file, &
|
||||
iudvwfc, lrdvwfc, swfcatomk, swfcatomkpq
|
||||
USE ldaU_hp, ONLY : nqsh, perturbed_atom, iudvwfc, lrdvwfc
|
||||
USE ldaU_lr, ONLY : swfcatomk, swfcatomkpq
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -59,23 +57,11 @@ subroutine hp_dvpsi_pert (ik)
|
|||
DO na = 1, nat
|
||||
IF (perturbed_atom(na)) counter = counter + 1
|
||||
ENDDO
|
||||
IF (counter.NE.1) CALL errore( 'hp_dvpsi_pert', "One perturbed atom must be specified", 1)
|
||||
IF (counter /= 1) CALL errore( 'hp_dvpsi_pert', "One perturbed atom must be specified", 1)
|
||||
!
|
||||
dvpsi(:,:) = (0.0d0, 0.0d0)
|
||||
!
|
||||
! If this is not the first iteration, hence dvpsi was already
|
||||
! computed before. So read it from file and exit.
|
||||
!
|
||||
IF (this_pert_is_on_file(ik)) THEN
|
||||
!
|
||||
CALL get_buffer(dvpsi, lrdvwfc, iudvwfc, ik)
|
||||
CALL stop_clock ('hp_dvpsi_pert')
|
||||
RETURN
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! If this is a first iteration, then dvpsi must be computed
|
||||
! and written on file.
|
||||
! Compute dvpsi for ik and write on buffer iudvwfc
|
||||
!
|
||||
ALLOCATE (proj(nbnd,nwfcU))
|
||||
!
|
||||
|
@ -126,7 +112,6 @@ subroutine hp_dvpsi_pert (ik)
|
|||
! Write dvpsi on file.
|
||||
!
|
||||
CALL save_buffer(dvpsi, lrdvwfc, iudvwfc, ik)
|
||||
this_pert_is_on_file(ik) = .true.
|
||||
!
|
||||
DEALLOCATE (proj)
|
||||
!
|
||||
|
|
|
@ -1,103 +0,0 @@
|
|||
!
|
||||
! Copyright (C) 2001-2018 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
|
||||
MODULE hp_efermi_shift
|
||||
|
||||
USE kinds, ONLY : DP
|
||||
|
||||
COMPLEX(DP), SAVE, PUBLIC :: def
|
||||
|
||||
CONTAINS
|
||||
|
||||
!-----------------------------------------------------------------------
|
||||
SUBROUTINE hp_ef_shift (drhoscf, ldos, ldoss, dos_ef, dbecsum, becsum1)
|
||||
!-----------------------------------------------------------------------
|
||||
!
|
||||
! This routine takes care of the effects of a shift of Ef, due to the
|
||||
! perturbation, that can take place in a metal at q=0.
|
||||
! The pertubation is neutral.
|
||||
!
|
||||
USE kinds, ONLY : DP
|
||||
USE ions_base, ONLY : nat
|
||||
USE io_global, ONLY : stdout
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE cell_base, ONLY : omega
|
||||
USE fft_base, ONLY : dfftp, dffts
|
||||
USE fft_interfaces, ONLY : fwfft, invfft
|
||||
USE gvect, ONLY : gg
|
||||
USE buffers, ONLY : get_buffer, save_buffer
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE klist, ONLY : degauss, ngauss, ngk
|
||||
USE noncollin_module, ONLY : nspin_mag, nspin_lsda
|
||||
USE mp_bands, ONLY : intra_bgrp_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE uspp_param, ONLY : nhm
|
||||
|
||||
IMPLICIT NONE
|
||||
!
|
||||
COMPLEX(DP) :: drhoscf(dfftp%nnr,nspin_mag), &
|
||||
ldos(dfftp%nnr,nspin_mag), &
|
||||
ldoss(dffts%nnr,nspin_mag)
|
||||
! input/output: the change of the charge (with augmentation)
|
||||
! input: local DOS at Ef (with augmentation)
|
||||
! input: local DOS at Ef
|
||||
REAL(DP) :: dos_ef
|
||||
! input: density of states at Ef
|
||||
!
|
||||
! The PAW case
|
||||
!
|
||||
COMPLEX(DP), OPTIONAL :: dbecsum ((nhm*(nhm+1))/2, nat, nspin_mag, 1)
|
||||
! input: dbecsum = 2 <psi|beta> <beta|dpsi>
|
||||
! output: dbecsum = 2 <psi|beta> <beta|dpsi> + def * becsum1
|
||||
REAL(DP), OPTIONAL :: becsum1 ((nhm*(nhm+1))/2, nat, nspin_mag)
|
||||
! input: becsum1 = wdelta * <psi|beta> <beta|psi>
|
||||
! (where wdelta is a Dirac-delta-like function)
|
||||
!
|
||||
COMPLEX(DP) :: delta_n ! the change in electron number
|
||||
INTEGER :: is ! counter on spin polarizations
|
||||
!
|
||||
CALL start_clock ('hp_ef_shift')
|
||||
!
|
||||
delta_n = (0.d0, 0.d0)
|
||||
!
|
||||
DO is = 1, nspin_lsda
|
||||
!
|
||||
! FFT to G-space
|
||||
CALL fwfft ('Rho', drhoscf(:,is), dfftp)
|
||||
!
|
||||
IF (gg(1).lt.1.0d-8) delta_n = delta_n + omega * drhoscf(dfftp%nl(1),is)
|
||||
!
|
||||
! FFT to R-space
|
||||
CALL invfft ('Rho', drhoscf(:,is), dfftp)
|
||||
!
|
||||
ENDDO
|
||||
!
|
||||
CALL mp_sum ( delta_n, intra_bgrp_comm )
|
||||
!
|
||||
def = - delta_n / dos_ef
|
||||
!
|
||||
WRITE( stdout, '(6x,"Fermi energy shift (Ry) =", 2e12.4)') def
|
||||
!
|
||||
! Corrects the density response accordingly:
|
||||
! drhoscfh = drhoscfh + ldos * def
|
||||
! See Eq.(75) in Rev. Mod. Phys. 73, 515 (2001).
|
||||
!
|
||||
CALL zaxpy (dfftp%nnr*nspin_mag, def, ldos, 1, drhoscf, 1)
|
||||
!
|
||||
! In the PAW case there is also a metallic term
|
||||
!
|
||||
IF (PRESENT(dbecsum) .AND. PRESENT(becsum1)) &
|
||||
dbecsum(:,:,:,1) = dbecsum(:,:,:,1) + &
|
||||
def * CMPLX(becsum1(:,:,:), 0.0_DP, kind=DP)
|
||||
!
|
||||
CALL stop_clock ('hp_ef_shift')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE hp_ef_shift
|
||||
|
||||
END MODULE hp_efermi_shift
|
|
@ -20,7 +20,8 @@ SUBROUTINE hp_generate_grids()
|
|||
USE qpoint, ONLY : xq
|
||||
USE lr_symm_base, ONLY : rtau
|
||||
USE ldaU, ONLY : Hubbard_lmax
|
||||
USE ldaU_hp, ONLY : dnsscf, dns0, dnsscf_tot, dns0_tot, nqsh, &
|
||||
USE ldaU_lr, ONLY : dnsscf
|
||||
USE ldaU_hp, ONLY : dns0, dnsscf_tot, dns0_tot, nqsh, &
|
||||
start_q, last_q, tmp_dir_hp, nqs, comp_iq
|
||||
!
|
||||
IMPLICIT NONE
|
||||
|
|
|
@ -19,10 +19,10 @@ SUBROUTINE hp_openfil_q()
|
|||
USE buffers, ONLY : open_buffer
|
||||
USE qpoint, ONLY : nksq
|
||||
USE control_lr, ONLY : lgamma
|
||||
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc
|
||||
USE units_lr, ONLY : iuwfc, lrwfc, iuatswfc, iudwf, lrdwf
|
||||
USE ldaU, ONLY : nwfcU
|
||||
USE ldaU_hp, ONLY : recalc_sym, tmp_dir_save, tmp_dir_hp, &
|
||||
iudwfc, lrdwfc, iudvwfc, lrdvwfc
|
||||
iudvwfc, lrdvwfc
|
||||
!
|
||||
IMPLICIT NONE
|
||||
LOGICAL :: exst, exst_mem
|
||||
|
@ -58,9 +58,9 @@ SUBROUTINE hp_openfil_q()
|
|||
!
|
||||
! Open a file to write/read a solution of the linear system (dpsi)
|
||||
!
|
||||
iudwfc = 22
|
||||
lrdwfc = nbnd * npwx * npol
|
||||
CALL open_buffer (iudwfc, 'dwfc', lrdwfc, io_level, exst_mem, exst, tmp_dir)
|
||||
iudwf = 22
|
||||
lrdwf = nbnd * npwx * npol
|
||||
CALL open_buffer (iudwf, 'dwfc', lrdwf, io_level, exst_mem, exst, tmp_dir)
|
||||
!
|
||||
! Open a file to write/read S*phi at k and k+q (atomic wfct's)
|
||||
!
|
||||
|
|
|
@ -37,7 +37,7 @@ SUBROUTINE hp_print_clock
|
|||
CALL print_clock ('hp_calc_chi')
|
||||
CALL print_clock ('hp_postproc')
|
||||
CALL print_clock ('hp_vpsifft')
|
||||
CALL print_clock ('hp_ef_shift')
|
||||
CALL print_clock ('ef_shift')
|
||||
CALL print_clock ('hp_run_nscf')
|
||||
CALL print_clock ('hp_postproc')
|
||||
!
|
||||
|
@ -51,6 +51,8 @@ SUBROUTINE hp_print_clock
|
|||
WRITE( stdout, * ) ' PRINTING TIMING FROM LR MODULE: '
|
||||
WRITE( stdout, * )
|
||||
!
|
||||
CALL print_clock ('sth_kernel')
|
||||
CALL print_clock ('apply_dpot_b')
|
||||
CALL print_clock ('ortho')
|
||||
CALL print_clock ('cgsolve')
|
||||
CALL print_clock ('ch_psi')
|
||||
|
|
|
@ -18,7 +18,8 @@ SUBROUTINE hp_read_dnsq()
|
|||
USE io_files, ONLY : prefix, tmp_dir
|
||||
USE lsda_mod, ONLY : nspin
|
||||
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard
|
||||
USE ldaU_hp, ONLY : nah_pert, dns0, dnsscf, nqs, tmp_dir_hp
|
||||
USE ldaU_lr, ONLY : dnsscf
|
||||
USE ldaU_hp, ONLY : nah_pert, dns0, nqs, tmp_dir_hp
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
|
|
@ -20,15 +20,13 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
USE io_global, ONLY : stdout
|
||||
USE check_stop, ONLY : check_stop_now
|
||||
USE wavefunctions, ONLY : evc
|
||||
USE cell_base, ONLY : tpiba2
|
||||
USE klist, ONLY : lgauss, ltetra, xk, wk, nelec, ngk, igk_k
|
||||
USE gvect, ONLY : g
|
||||
USE klist, ONLY : lgauss, ltetra, nelec, ngk
|
||||
USE gvecs, ONLY : doublegrid
|
||||
USE scf, ONLY : rho
|
||||
USE fft_base, ONLY : dfftp, dffts
|
||||
USE lsda_mod, ONLY : lsda, current_spin, isk
|
||||
USE wvfct, ONLY : nbnd, npwx, g2kin, et
|
||||
USE uspp, ONLY : okvan, vkb, nkb
|
||||
USE wvfct, ONLY : nbnd, npwx
|
||||
USE uspp, ONLY : okvan, nkb
|
||||
USE uspp_param, ONLY : nhm
|
||||
USE becmod, ONLY : allocate_bec_type, deallocate_bec_type, becp
|
||||
USE buffers, ONLY : save_buffer, get_buffer
|
||||
|
@ -38,21 +36,21 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
USE paw_symmetry, ONLY : paw_dusymmetrize, paw_dumqsymmetrize
|
||||
USE mp_pools, ONLY : inter_pool_comm, intra_pool_comm
|
||||
USE mp, ONLY : mp_sum
|
||||
USE hp_efermi_shift, ONLY : hp_ef_shift, def
|
||||
USE eqv, ONLY : dvpsi, dpsi, evq
|
||||
USE qpoint, ONLY : nksq, ikks, ikqs, xq
|
||||
USE control_lr, ONLY : lgamma, nbnd_occ
|
||||
USE qpoint, ONLY : nksq, ikks, xq
|
||||
USE control_lr, ONLY : lgamma
|
||||
USE units_lr, ONLY : iuwfc, lrwfc
|
||||
USE lrus, ONLY : int3, int3_paw
|
||||
USE dv_of_drho_lr, ONLY : dv_of_drho
|
||||
USE fft_helper_subroutines
|
||||
USE fft_interfaces, ONLY : fft_interpolate
|
||||
USE lr_symm_base, ONLY : irotmq, minus_q, nsymq, rtau
|
||||
USE ldaU_hp, ONLY : thresh_init, dnsscf, dns0, trace_dns_tot_old, &
|
||||
USE ldaU_lr, ONLY : dnsscf
|
||||
USE ldaU_hp, ONLY : thresh_init, dns0, trace_dns_tot_old, &
|
||||
conv_thr_chi_best, iter_best, niter_max, nmix, &
|
||||
alpha_mix, iudwfc, lrdwfc, code
|
||||
USE apply_dpot_mod, ONLY : apply_dpot_allocate, apply_dpot_deallocate, &
|
||||
apply_dpot_bands
|
||||
alpha_mix, code, lrdvwfc, iudvwfc
|
||||
USE apply_dpot_mod, ONLY : apply_dpot_allocate, apply_dpot_deallocate
|
||||
USE efermi_shift, ONLY : ef_shift, def
|
||||
USE response_kernels, ONLY : sternheimer_kernel
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
@ -62,20 +60,18 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
REAL(DP), ALLOCATABLE :: h_diag (:,:) ! diagonal part of the Hamiltonian
|
||||
!
|
||||
REAL(DP) :: thresh, & ! convergence threshold
|
||||
anorm, & ! the norm of the error
|
||||
averlt, & ! average number of iterations
|
||||
dr2 ! self-consistency error
|
||||
!
|
||||
REAL(DP) :: dos_ef, & ! density of states at the Fermi level
|
||||
weight, & ! Misc variables for metals
|
||||
aux_avg(2) ! Misc variables for metals
|
||||
REAL(DP) :: dos_ef
|
||||
!! density of states at the Fermi level
|
||||
!
|
||||
REAL(DP), ALLOCATABLE :: becsum1(:,:,:)
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE, TARGET :: dvscfin(:,:)
|
||||
! change of the scf potential (input)
|
||||
!
|
||||
COMPLEX(DP), POINTER :: dvscfins(:,:)
|
||||
COMPLEX(DP), POINTER :: dvscfins(:,:,:)
|
||||
! change of the scf potential (smooth part only)
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: drhoscf (:,:), &
|
||||
|
@ -88,42 +84,27 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
ldoss (:,:), & ! as above, without augmentation charges
|
||||
dbecsum (:,:,:,:), & ! the derivative of becsum
|
||||
aux2 (:,:), & ! auxiliary arrays
|
||||
mixin(:), mixout(:), & ! auxiliary arrays for mixing of the response potential
|
||||
tg_dv(:,:), & ! Task groups: auxiliary array for potential * wfct
|
||||
tg_psic(:,:) ! Task groups: auxiliary array for wavefunctions
|
||||
mixin(:), mixout(:) ! auxiliary arrays for mixing of the response potential
|
||||
|
||||
COMPLEX(DP), ALLOCATABLE :: t(:,:,:,:), tmq(:,:,:)
|
||||
! PAW: auxiliary arrays
|
||||
|
||||
LOGICAL :: conv_root, & ! true if linear system is converged
|
||||
exst, & ! used to open the recover file
|
||||
lmetq0, & ! true if xq=(0,0,0) in a metal
|
||||
LOGICAL :: all_conv
|
||||
!! True if sternheimer_kernel is converged at all k points
|
||||
LOGICAL :: lmetq0, & ! true if xq=(0,0,0) in a metal
|
||||
convt, & ! not needed for HP
|
||||
convt_chi ! used instead of convt to control the convergence
|
||||
|
||||
REAL(DP), PARAMETER :: tr2 = 1.D-30 ! threshold parameter
|
||||
|
||||
INTEGER :: ibnd, & ! counter on bands
|
||||
iter, & ! counter on iterations
|
||||
lter, & ! counter on iterations of linear system
|
||||
ltaver, & ! average counter
|
||||
lintercall, & ! average number of calls to cgsolve_all
|
||||
INTEGER :: iter, & ! counter on iterations
|
||||
ik, ikk, & ! counter on k points
|
||||
ikq, & ! counter on k+q points
|
||||
ig, & ! counter on G vectors
|
||||
ndim, &
|
||||
is, & ! counter on spin polarizations
|
||||
nt, & ! counter on types
|
||||
ios, & ! integer variable for I/O control
|
||||
incr, & ! used for task groups
|
||||
v_siz, & ! size of the potential
|
||||
npw, & ! number of plane waves at k
|
||||
npwq ! number of plane waves at k+q
|
||||
npw ! number of plane waves at k
|
||||
|
||||
REAL(DP) :: tcpu, get_clock ! timing variables
|
||||
CHARACTER(LEN=256) :: filename, &
|
||||
flmixdpot = 'mixd'
|
||||
EXTERNAL ch_psi_all, cg_psi
|
||||
CHARACTER(LEN=256) :: flmixdpot = 'mixd'
|
||||
!
|
||||
CALL start_clock ('hp_solve_linear_system')
|
||||
!
|
||||
|
@ -133,15 +114,16 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
!
|
||||
! Allocate arrays for the SCF density/potential
|
||||
!
|
||||
ALLOCATE (drhoscf (dffts%nnr, nspin_mag))
|
||||
ALLOCATE (drhoscf (dfftp%nnr, nspin_mag))
|
||||
ALLOCATE (drhoscfh(dfftp%nnr, nspin_mag))
|
||||
ALLOCATE (dvscfin (dfftp%nnr, nspin_mag))
|
||||
ALLOCATE (dvscfout(dfftp%nnr, nspin_mag))
|
||||
!
|
||||
dvscfin = (0.0_DP, 0.0_DP)
|
||||
IF (doublegrid) THEN
|
||||
ALLOCATE (dvscfins(dffts%nnr, nspin_mag))
|
||||
ALLOCATE (dvscfins(dffts%nnr, nspin_mag, 1))
|
||||
ELSE
|
||||
dvscfins => dvscfin
|
||||
dvscfins(1:dffts%nnr, 1:nspin_mag, 1:1) => dvscfin
|
||||
ENDIF
|
||||
!
|
||||
! USPP-specific allocations
|
||||
|
@ -175,16 +157,6 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
convt = .FALSE.
|
||||
convt_chi = .FALSE.
|
||||
!
|
||||
incr = 1
|
||||
IF ( dffts%has_task_groups ) THEN
|
||||
!
|
||||
v_siz = dffts%nnr_tg
|
||||
ALLOCATE( tg_dv ( v_siz, nspin_mag ) )
|
||||
ALLOCATE( tg_psic( v_siz, npol ) )
|
||||
incr = fftx_ntgrp(dffts)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! If q=0 for a metal: allocate and compute local DOS and DOS at Ef
|
||||
!
|
||||
lmetq0 = (lgauss .OR. ltetra) .AND. lgamma
|
||||
|
@ -197,158 +169,65 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
IF (.NOT.okpaw) DEALLOCATE (becsum1)
|
||||
ENDIF
|
||||
!
|
||||
! Compute dV_bare * psi and write to buffer iubar
|
||||
!
|
||||
DO ik = 1, nksq
|
||||
!
|
||||
ikk = ikks(ik)
|
||||
npw = ngk(ikk)
|
||||
!
|
||||
IF (lsda) current_spin = isk(ikk)
|
||||
!
|
||||
! Read unperturbed KS wavefuctions psi(k) and psi(k+q)
|
||||
!
|
||||
IF (nksq > 1) THEN
|
||||
CALL get_buffer(evc, lrwfc, iuwfc, ikk)
|
||||
ENDIF
|
||||
!
|
||||
! Computes (iter=1) or reads (iter>1) the action of the perturbing
|
||||
! potential on the unperturbed KS wavefunctions: |dvpsi> = dV_pert * |evc>
|
||||
! See Eq. (46) in Ref. [1]
|
||||
!
|
||||
CALL hp_dvpsi_pert(ik)
|
||||
!
|
||||
ENDDO ! ik
|
||||
!
|
||||
! The loop of the linear-response calculation
|
||||
!
|
||||
DO iter = 1, niter_max
|
||||
!
|
||||
WRITE(stdout,'(/6x,"atom #",i3,3x,"q point #",i4,3x,"iter # ",i3)') na, iq, iter
|
||||
!
|
||||
ltaver = 0
|
||||
lintercall = 0
|
||||
!
|
||||
drhoscf(:,:) = (0.d0, 0.d0)
|
||||
dvscfout(:,:) = (0.d0, 0.d0)
|
||||
dbecsum(:,:,:,:) = (0.d0, 0.d0)
|
||||
!
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
!!!!!!!!!!!!!!!!!!! START OF THE K LOOP !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
IF ( iter == 1 ) THEN
|
||||
! Starting threshold for iterative solution of the linear system.
|
||||
! A strickt threshold for the first iteration is needed,
|
||||
! because we need dns0 to very high precision.
|
||||
thresh = thresh_init * nelec
|
||||
ELSE
|
||||
! Threshold for iterative solution of the linear system.
|
||||
! We start with not a strict threshold for iter=2, and then
|
||||
! it decreases with iterations.
|
||||
thresh = MIN (1.D-1 * SQRT(dr2), 1.D-2)
|
||||
ENDIF
|
||||
!
|
||||
DO ik = 1, nksq
|
||||
!
|
||||
ikk = ikks(ik)
|
||||
ikq = ikqs(ik)
|
||||
npw = ngk(ikk)
|
||||
npwq = ngk(ikq)
|
||||
!
|
||||
IF (lsda) current_spin = isk(ikk)
|
||||
!
|
||||
! Read unperturbed KS wavefuctions psi(k) and psi(k+q)
|
||||
!
|
||||
IF (nksq.gt.1) THEN
|
||||
IF (lgamma) THEN
|
||||
CALL get_buffer (evc, lrwfc, iuwfc, ikk)
|
||||
ELSE
|
||||
CALL get_buffer (evc, lrwfc, iuwfc, ikk)
|
||||
CALL get_buffer (evq, lrwfc, iuwfc, ikq)
|
||||
ENDIF
|
||||
ENDIF
|
||||
!
|
||||
! USPP: Compute the projectors vkb at k+q
|
||||
!
|
||||
CALL init_us_2 (npwq, igk_k(1,ikq), xk(1,ikq), vkb)
|
||||
!
|
||||
! Compute the kinetic energy at k+q
|
||||
!
|
||||
CALL g2_kin (ikq)
|
||||
!
|
||||
! Compute preconditioning matrix h_diag used by cgsolve_all
|
||||
!
|
||||
CALL h_prec (ik, evq, h_diag)
|
||||
!
|
||||
! Computes (iter=1) or reads (iter>1) the action of the perturbing
|
||||
! potential on the unperturbed KS wavefunctions: |dvpsi> = dV_pert * |evc>
|
||||
! See Eq. (46) in Ref. [1]
|
||||
!
|
||||
CALL hp_dvpsi_pert(ik)
|
||||
!
|
||||
IF ( iter > 1 ) THEN
|
||||
!
|
||||
! Add the contribution of the self consistent term.
|
||||
! Calculates dvscf_q*psi(k) in G-space, for all bands, k=ik
|
||||
! dvscf_q from previous iteration (mix_potential)
|
||||
!
|
||||
CALL start_clock ('hp_vpsifft')
|
||||
CALL apply_dpot_bands(ik, nbnd_occ(ikk), dvscfins, evc, aux2)
|
||||
dvpsi = dvpsi + aux2
|
||||
CALL stop_clock ('hp_vpsifft')
|
||||
!
|
||||
! USPP: there is an additional self-consistent term proportional to int3
|
||||
! |dvpsi> = |dvpsi> + dV_HXC*|evc> + int3 * |beta><beta|evc>
|
||||
!
|
||||
IF (okvan) CALL adddvscf(1, ik)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! Ortogonalize dvpsi to valence states: ps = <evq|dvpsi>
|
||||
! Apply -P_c^+. See Eq. (A21) in Ref. [1]
|
||||
!
|
||||
CALL orthogonalize(dvpsi, evq, ikk, ikq, dpsi, npwq, .FALSE.)
|
||||
!
|
||||
IF ( iter == 1 ) THEN
|
||||
!
|
||||
! At the first iteration dpsi and dvscfin are set to zero
|
||||
!
|
||||
dpsi(:,:) = (0.d0, 0.d0)
|
||||
dvscfin(:,:) = (0.d0, 0.d0)
|
||||
!
|
||||
! Starting threshold for iterative solution of the linear system.
|
||||
! A strickt threshold for the first iteration is needed,
|
||||
! because we need dns0 to very high precision.
|
||||
!
|
||||
thresh = thresh_init * nelec
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! Starting value for dpsi is read from file
|
||||
!
|
||||
CALL get_buffer( dpsi, lrdwfc, iudwfc, ik)
|
||||
!
|
||||
! Threshold for iterative solution of the linear system.
|
||||
! We start with not a strict threshold for iter=2, and then
|
||||
! it decreases with iterations.
|
||||
!
|
||||
thresh = MIN (1.D-1 * SQRT(dr2), 1.D-2)
|
||||
!
|
||||
ENDIF
|
||||
!
|
||||
! Iterative solution of the linear system:
|
||||
! (H + Q - eS) * |dpsi> = |dvpsi>,
|
||||
! where |dvpsi> = - P_c^+ (dV_HXC + dV_pert) * |evc>
|
||||
! See Eq. (43) in Ref. [1]
|
||||
!
|
||||
CALL cgsolve_all (ch_psi_all, cg_psi, et(1,ikk), dvpsi, dpsi, h_diag, &
|
||||
& npwx, npwq, thresh, ik, lter, conv_root, anorm, nbnd_occ(ikk), npol )
|
||||
!
|
||||
ltaver = ltaver + lter
|
||||
!
|
||||
lintercall = lintercall + 1
|
||||
!
|
||||
IF (.NOT.conv_root) THEN
|
||||
WRITE( stdout, '(6x,"kpoint",i4, &
|
||||
& " hp_solve_linear_system: root not converged, thresh < ",e10.3)') ik , anorm
|
||||
IF (iter == 1) WRITE( stdout, '(6x,"Try to increase thresh_init...")')
|
||||
ENDIF
|
||||
!
|
||||
! Writes dpsi on file for a given k
|
||||
!
|
||||
CALL save_buffer (dpsi, lrdwfc, iudwfc, ik)
|
||||
!
|
||||
! Setup the weight at point k (normalized by the number of k points)
|
||||
!
|
||||
weight = wk(ikk)
|
||||
!
|
||||
! Calculates the response charge density (sum over k)
|
||||
! See Eq. (48) in Ref. [1]
|
||||
!
|
||||
CALL incdrhoscf (drhoscf(:,current_spin), weight, ik, &
|
||||
& dbecsum(:,:,current_spin,1), dpsi)
|
||||
!
|
||||
ENDDO ! k points
|
||||
! Compute drhoscf, the charge density response to the total potential
|
||||
!
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
!!!!!!!!!!!!!!!!!!!!! END OF THE K LOOP !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
CALL sternheimer_kernel(iter==1, .FALSE., 1, lrdvwfc, iudvwfc, &
|
||||
thresh, dvscfins, all_conv, averlt, drhoscf, dbecsum, exclude_hubbard=.TRUE.)
|
||||
!
|
||||
#if defined (__MPI)
|
||||
IF ((.NOT. all_conv) .AND. (iter == 1)) THEN
|
||||
WRITE(stdout, '(6x, "sternheimer_kernel not converged. Try to increase thresh_init.")')
|
||||
ENDIF
|
||||
!
|
||||
! USPP: The calculation of dbecsum is distributed across processors (see addusdbec)
|
||||
! Sum over processors the contributions coming from each slice of bands
|
||||
!
|
||||
CALL mp_sum ( dbecsum, intra_pool_comm )
|
||||
!
|
||||
#endif
|
||||
!
|
||||
! Copy/interpolate the response density drhoscf -> drhoscfh
|
||||
!
|
||||
IF (doublegrid) THEN
|
||||
|
@ -363,10 +242,9 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
!
|
||||
IF (okvan) CALL lr_addusddens (drhoscfh, dbecsum)
|
||||
!
|
||||
#if defined (__MPI)
|
||||
call mp_sum ( drhoscf, inter_pool_comm )
|
||||
CALL mp_sum ( drhoscfh, inter_pool_comm )
|
||||
IF (okpaw) CALL mp_sum ( dbecsum, inter_pool_comm )
|
||||
#endif
|
||||
!
|
||||
! PAW: the factor of 2 is due to the presence of the CC term
|
||||
! (see first two terms in Eq.(9) in PRB 81, 075123 (2010))
|
||||
|
@ -381,14 +259,14 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
IF (lmetq0) THEN
|
||||
!
|
||||
IF (okpaw) THEN
|
||||
CALL hp_ef_shift (drhoscfh, ldos, ldoss, dos_ef, dbecsum, becsum1)
|
||||
CALL ef_shift(1, dos_ef, ldos, drhoscfh, dbecsum=dbecsum, becsum1=becsum1)
|
||||
ELSE
|
||||
CALL hp_ef_shift (drhoscfh, ldos, ldoss, dos_ef)
|
||||
CALL ef_shift(1, dos_ef, ldos, drhoscfh)
|
||||
ENDIF
|
||||
!
|
||||
! Check that def is not too large (it is in Ry).
|
||||
!
|
||||
IF ( ABS(DBLE(def)) > 5.0d0 ) THEN
|
||||
IF ( ABS(DBLE(def(1))) > 5.0d0 ) THEN
|
||||
!
|
||||
WRITE( stdout, '(/6x,"WARNING: The Fermi energy shift is too big!")')
|
||||
WRITE( stdout, '(6x, "This may happen in two cases:")')
|
||||
|
@ -451,7 +329,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
!
|
||||
IF (doublegrid) THEN
|
||||
DO is = 1, nspin_mag
|
||||
CALL fft_interpolate (dfftp, dvscfin(:,is), dffts, dvscfins(:,is))
|
||||
CALL fft_interpolate (dfftp, dvscfin(:,is), dffts, dvscfins(:,is,1))
|
||||
ENDDO
|
||||
ENDIF
|
||||
!
|
||||
|
@ -478,16 +356,7 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
!
|
||||
IF ( iter == 1 ) dns0(:,:,:,:,iq) = dnsscf(:,:,:,:,iq)
|
||||
!
|
||||
! Compute the average number of iterations
|
||||
!
|
||||
#if defined (__MPI)
|
||||
aux_avg(1) = DBLE(ltaver)
|
||||
aux_avg(2) = DBLE(lintercall)
|
||||
CALL mp_sum ( aux_avg, inter_pool_comm )
|
||||
averlt = aux_avg(1) / aux_avg(2)
|
||||
#else
|
||||
averlt = DBLE(ltaver) / DBLE(lintercall)
|
||||
#endif
|
||||
! Print the average number of iterations
|
||||
!
|
||||
tcpu = get_clock(code)
|
||||
!
|
||||
|
@ -506,12 +375,10 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
CALL hp_stop_smoothly (.TRUE.)
|
||||
ENDIF
|
||||
!
|
||||
IF (convt_chi) goto 155
|
||||
IF (convt_chi) EXIT
|
||||
!
|
||||
ENDDO ! loop over the iterations iter
|
||||
!
|
||||
155 CONTINUE
|
||||
!
|
||||
CALL apply_dpot_deallocate()
|
||||
DEALLOCATE (h_diag)
|
||||
DEALLOCATE (aux2)
|
||||
|
@ -533,10 +400,6 @@ SUBROUTINE hp_solve_linear_system (na, iq)
|
|||
DEALLOCATE (tmq)
|
||||
ENDIF
|
||||
CALL deallocate_bec_type (becp)
|
||||
IF ( dffts%has_task_groups ) THEN
|
||||
DEALLOCATE( tg_dv )
|
||||
DEALLOCATE( tg_psic )
|
||||
ENDIF
|
||||
!
|
||||
WRITE( stdout,*) " "
|
||||
WRITE( stdout,*) " =--------------------------------------------="
|
||||
|
|
|
@ -18,7 +18,8 @@ SUBROUTINE hp_write_dnsq(iq)
|
|||
USE lsda_mod, ONLY : nspin
|
||||
USE io_files, ONLY : prefix, tmp_dir
|
||||
USE ldaU, ONLY : Hubbard_lmax, Hubbard_l, is_hubbard
|
||||
USE ldaU_hp, ONLY : nah_pert, dns0, dnsscf, x_q
|
||||
USE ldaU_lr, ONLY : dnsscf
|
||||
USE ldaU_hp, ONLY : nah_pert, dns0, x_q
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
|
|
@ -49,7 +49,6 @@ MODULE ldaU_hp
|
|||
LOGICAL, ALLOCATABLE :: todo_atom(:), & ! Which atoms must be perturbed
|
||||
perturbed_atom(:), & ! Controls which atom is perturbed in the HP
|
||||
! calculation
|
||||
this_pert_is_on_file(:), & ! The perturbation is written on file or not
|
||||
comp_iq(:) ! If .true. this q point has to be calculated
|
||||
!
|
||||
INTEGER :: nath, & ! Number of (real) atoms in the primitive cell
|
||||
|
@ -77,8 +76,6 @@ MODULE ldaU_hp
|
|||
nq1, nq2, nq3, & ! Number of q points in each direction
|
||||
nqs, & ! Number of q points to be calculated
|
||||
start_q, last_q, & ! Initial and final q in the list
|
||||
iudwfc, & ! Unit for response wavefunctions
|
||||
lrdwfc, & ! Length of the record for response wavefunctions
|
||||
iudvwfc, & ! Unit for the perturbing potential * wavefunctions
|
||||
lrdvwfc ! Length of the record for the perturbing potential * wavefunctions
|
||||
!
|
||||
|
@ -116,7 +113,6 @@ MODULE ldaU_hp
|
|||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: dns0(:,:,:,:,:), & ! Bare response occupation matrix
|
||||
! (from 1st iteration)
|
||||
dnsscf(:,:,:,:,:), & ! SCF response occupation matrix
|
||||
dns0_tot(:,:,:,:,:), & ! Total bare response occupation matrix
|
||||
! (summed over q)
|
||||
dnsscf_tot(:,:,:,:,:), & ! Total SCF response occupation matrix
|
||||
|
@ -126,7 +122,4 @@ MODULE ldaU_hp
|
|||
!
|
||||
INTEGER, ALLOCATABLE :: ityp_new(:) ! Types of atoms
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE, TARGET :: swfcatomk(:,:) ! S * atomic wfc at k
|
||||
COMPLEX(DP), POINTER :: swfcatomkpq(:,:) ! S * atomic wfc at k+q
|
||||
!
|
||||
END MODULE ldaU_hp
|
||||
|
|
|
@ -9,7 +9,6 @@ hp_allocate_q.o : ../../Modules/wavefunctions.o
|
|||
hp_allocate_q.o : ../../PW/src/ldaU.o
|
||||
hp_allocate_q.o : ../../PW/src/pwcom.o
|
||||
hp_allocate_q.o : ../../upflib/uspp.o
|
||||
hp_allocate_q.o : hpcom.o
|
||||
hp_bcast_input.o : ../../LR_Modules/lrcom.o
|
||||
hp_bcast_input.o : ../../Modules/check_stop.o
|
||||
hp_bcast_input.o : ../../Modules/control_flags.o
|
||||
|
@ -49,7 +48,7 @@ hp_dealloc_q.o : ../../LR_Modules/lrcom.o
|
|||
hp_dealloc_q.o : ../../Modules/becmod.o
|
||||
hp_dealloc_q.o : ../../Modules/noncol.o
|
||||
hp_dealloc_q.o : ../../upflib/uspp.o
|
||||
hp_dealloc_q.o : hpcom.o
|
||||
hp_dnsq.o : ../../LR_Modules/efermi_shift.o
|
||||
hp_dnsq.o : ../../LR_Modules/lrcom.o
|
||||
hp_dnsq.o : ../../Modules/constants.o
|
||||
hp_dnsq.o : ../../Modules/control_flags.o
|
||||
|
@ -64,7 +63,6 @@ hp_dnsq.o : ../../PW/src/ldaU.o
|
|||
hp_dnsq.o : ../../PW/src/pwcom.o
|
||||
hp_dnsq.o : ../../UtilXlib/mp.o
|
||||
hp_dnsq.o : ../../upflib/uspp.o
|
||||
hp_dnsq.o : hp_efermi_shift.o
|
||||
hp_dnsq.o : hpcom.o
|
||||
hp_dnstot_sum_q.o : ../../LR_Modules/lrcom.o
|
||||
hp_dnstot_sum_q.o : ../../Modules/cell_base.o
|
||||
|
@ -88,20 +86,6 @@ hp_dvpsi_pert.o : ../../PW/src/ldaU.o
|
|||
hp_dvpsi_pert.o : ../../PW/src/pwcom.o
|
||||
hp_dvpsi_pert.o : ../../UtilXlib/mp.o
|
||||
hp_dvpsi_pert.o : hpcom.o
|
||||
hp_efermi_shift.o : ../../FFTXlib/fft_interfaces.o
|
||||
hp_efermi_shift.o : ../../Modules/cell_base.o
|
||||
hp_efermi_shift.o : ../../Modules/fft_base.o
|
||||
hp_efermi_shift.o : ../../Modules/io_global.o
|
||||
hp_efermi_shift.o : ../../Modules/ions_base.o
|
||||
hp_efermi_shift.o : ../../Modules/kind.o
|
||||
hp_efermi_shift.o : ../../Modules/mp_bands.o
|
||||
hp_efermi_shift.o : ../../Modules/noncol.o
|
||||
hp_efermi_shift.o : ../../Modules/recvec.o
|
||||
hp_efermi_shift.o : ../../Modules/wavefunctions.o
|
||||
hp_efermi_shift.o : ../../PW/src/buffers.o
|
||||
hp_efermi_shift.o : ../../PW/src/pwcom.o
|
||||
hp_efermi_shift.o : ../../UtilXlib/mp.o
|
||||
hp_efermi_shift.o : ../../upflib/uspp.o
|
||||
hp_find_inequiv_sites.o : ../../Modules/io_global.o
|
||||
hp_find_inequiv_sites.o : ../../Modules/ions_base.o
|
||||
hp_find_inequiv_sites.o : ../../Modules/kind.o
|
||||
|
@ -195,6 +179,7 @@ hp_read_chi.o : ../../Modules/io_global.o
|
|||
hp_read_chi.o : ../../Modules/ions_base.o
|
||||
hp_read_chi.o : ../../Modules/kind.o
|
||||
hp_read_chi.o : hpcom.o
|
||||
hp_read_dnsq.o : ../../LR_Modules/lrcom.o
|
||||
hp_read_dnsq.o : ../../Modules/io_files.o
|
||||
hp_read_dnsq.o : ../../Modules/io_global.o
|
||||
hp_read_dnsq.o : ../../Modules/ions_base.o
|
||||
|
@ -259,9 +244,10 @@ hp_solve_linear_system.o : ../../FFTXlib/fft_helper_subroutines.o
|
|||
hp_solve_linear_system.o : ../../FFTXlib/fft_interfaces.o
|
||||
hp_solve_linear_system.o : ../../LR_Modules/apply_dpot_mod.o
|
||||
hp_solve_linear_system.o : ../../LR_Modules/dv_of_drho.o
|
||||
hp_solve_linear_system.o : ../../LR_Modules/efermi_shift.o
|
||||
hp_solve_linear_system.o : ../../LR_Modules/lrcom.o
|
||||
hp_solve_linear_system.o : ../../LR_Modules/response_kernels.o
|
||||
hp_solve_linear_system.o : ../../Modules/becmod.o
|
||||
hp_solve_linear_system.o : ../../Modules/cell_base.o
|
||||
hp_solve_linear_system.o : ../../Modules/check_stop.o
|
||||
hp_solve_linear_system.o : ../../Modules/fft_base.o
|
||||
hp_solve_linear_system.o : ../../Modules/io_global.o
|
||||
|
@ -279,7 +265,6 @@ hp_solve_linear_system.o : ../../PW/src/scf_mod.o
|
|||
hp_solve_linear_system.o : ../../UtilXlib/mp.o
|
||||
hp_solve_linear_system.o : ../../upflib/paw_variables.o
|
||||
hp_solve_linear_system.o : ../../upflib/uspp.o
|
||||
hp_solve_linear_system.o : hp_efermi_shift.o
|
||||
hp_solve_linear_system.o : hpcom.o
|
||||
hp_stop_smoothly.o : ../../Modules/environment.o
|
||||
hp_stop_smoothly.o : ../../Modules/mp_global.o
|
||||
|
@ -333,6 +318,7 @@ hp_write_chi.o : hpcom.o
|
|||
hp_write_chi_full.o : ../../Modules/io_files.o
|
||||
hp_write_chi_full.o : ../../Modules/ions_base.o
|
||||
hp_write_chi_full.o : hpcom.o
|
||||
hp_write_dnsq.o : ../../LR_Modules/lrcom.o
|
||||
hp_write_dnsq.o : ../../Modules/io_files.o
|
||||
hp_write_dnsq.o : ../../Modules/io_global.o
|
||||
hp_write_dnsq.o : ../../Modules/ions_base.o
|
||||
|
|
|
@ -1,3 +1,17 @@
|
|||
set(ks_headers
|
||||
ks_solver_interfaces.h)
|
||||
set (ks_include_dir ${CMAKE_CURRENT_BINARY_DIR}/include)
|
||||
#FIXME maybe better to move this part to an helper function
|
||||
foreach (in_h ${ks_headers})
|
||||
get_filename_component(in_h_basename ${in_h} NAME_WE)
|
||||
set (out_h "${ks_include_dir}/${in_h_basename}.fh")
|
||||
qe_preprocess_source("${CMAKE_CURRENT_SOURCE_DIR}/${in_h}" ${out_h})
|
||||
list (APPEND ks_out_headers ${out_h})
|
||||
endforeach()
|
||||
add_custom_target(qe_ks_headers
|
||||
DEPENDS ${ks_out_headers}
|
||||
VERBATIM )
|
||||
|
||||
set(src_davidson
|
||||
Davidson_RCI/david_rci.f90
|
||||
Davidson/cegterg.f90
|
||||
|
@ -31,12 +45,22 @@ set(src_dense
|
|||
DENSE/rotate_HSpsi_gamma.f90
|
||||
DENSE/rotate_HSpsi_k.f90
|
||||
DENSE/rotate_wfc_gamma.f90
|
||||
DENSE/rotate_wfc_k.f90
|
||||
DENSE/rotate_wfc_k.f90
|
||||
DENSE/rotate_xpsi_k.f90
|
||||
DENSE/rotate_xpsi_gamma.f90
|
||||
DENSE/gram_schmidt_k.f90
|
||||
DENSE/gram_schmidt_gamma.f90
|
||||
DENSE/rotate_driver.f90
|
||||
# GPU
|
||||
DENSE/rotate_HSpsi_gamma_gpu.f90
|
||||
DENSE/rotate_HSpsi_k_gpu.f90
|
||||
DENSE/rotate_wfc_gamma_gpu.f90
|
||||
DENSE/rotate_wfc_k_gpu.f90)
|
||||
DENSE/rotate_xpsi_k_gpu.f90
|
||||
DENSE/rotate_xpsi_gamma_gpu.f90
|
||||
DENSE/gram_schmidt_k_gpu.f90
|
||||
DENSE/gram_schmidt_gamma_gpu.f90
|
||||
DENSE/rotate_wfc_k_gpu.f90
|
||||
DENSE/rotate_driver_cuf.f90)
|
||||
qe_enable_cuda_fortran("${src_dense}")
|
||||
|
||||
set(src_paro
|
||||
|
@ -55,6 +79,14 @@ set(src_paro
|
|||
ParO/paro_k_new_gpu.f90)
|
||||
qe_enable_cuda_fortran("${src_paro}")
|
||||
|
||||
set(src_rmmdiis
|
||||
RMM/crmmdiagg.f90
|
||||
RMM/rrmmdiagg.f90
|
||||
# GPU
|
||||
RMM/crmmdiagg_gpu.f90
|
||||
RMM/rrmmdiagg_gpu.f90)
|
||||
qe_enable_cuda_fortran("${src_rmmdiis}")
|
||||
|
||||
qe_add_library(qe_kssolver_davidson ${src_davidson})
|
||||
target_link_libraries(qe_kssolver_davidson
|
||||
PRIVATE
|
||||
|
@ -95,6 +127,15 @@ target_link_libraries(qe_kssolver_dense
|
|||
qe_utilx
|
||||
qe_mpi_fortran
|
||||
qe_devxlib)
|
||||
target_include_directories(qe_kssolver_dense
|
||||
PUBLIC
|
||||
$<BUILD_INTERFACE:${ks_include_dir}>
|
||||
$<INSTALL_INTERFACE:include/qe>
|
||||
)
|
||||
set_target_properties(qe_kssolver_dense
|
||||
PROPERTIES PUBLIC_HEADER ${ks_out_headers}
|
||||
)
|
||||
add_dependencies(qe_kssolver_dense qe_ks_headers)
|
||||
|
||||
qe_add_library(qe_kssolver_paro ${src_paro})
|
||||
target_link_libraries(qe_kssolver_paro
|
||||
|
@ -104,6 +145,14 @@ target_link_libraries(qe_kssolver_paro
|
|||
qe_utilx
|
||||
qe_mpi_fortran)
|
||||
|
||||
qe_add_library(qe_kssolver_rmmdiis ${src_rmmdiis})
|
||||
target_link_libraries(qe_kssolver_rmmdiis
|
||||
PRIVATE
|
||||
qe_lax
|
||||
qe_utilx
|
||||
qe_devxlib
|
||||
qe_mpi_fortran )
|
||||
|
||||
###########################################################
|
||||
|
||||
qe_install_targets(
|
||||
|
@ -112,4 +161,5 @@ qe_install_targets(
|
|||
qe_kssolver_cg
|
||||
qe_kssolver_ppcg
|
||||
qe_kssolver_paro
|
||||
qe_kssolver_dense)
|
||||
qe_kssolver_dense
|
||||
qe_kssolver_rmmdiis)
|
||||
|
|
|
@ -3,17 +3,27 @@
|
|||
include ../../make.inc
|
||||
|
||||
# location of needed modules and included files (if any)
|
||||
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG).
|
||||
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG) ../../Modules $(MOD_FLAG).
|
||||
|
||||
DENSE = \
|
||||
rotate_HSpsi_gamma.o \
|
||||
rotate_HSpsi_k.o \
|
||||
rotate_wfc_gamma.o \
|
||||
rotate_wfc_k.o \
|
||||
rotate_driver.o \
|
||||
rotate_HSpsi_gamma_gpu.o \
|
||||
rotate_HSpsi_k_gpu.o \
|
||||
rotate_wfc_gamma_gpu.o \
|
||||
rotate_wfc_k_gpu.o
|
||||
rotate_wfc_k_gpu.o \
|
||||
rotate_driver_cuf.o \
|
||||
rotate_xpsi_k.o \
|
||||
rotate_xpsi_k_gpu.o \
|
||||
rotate_xpsi_gamma.o \
|
||||
rotate_xpsi_gamma_gpu.o \
|
||||
gram_schmidt_gamma.o \
|
||||
gram_schmidt_k.o \
|
||||
gram_schmidt_gamma_gpu.o \
|
||||
gram_schmidt_k_gpu.o
|
||||
|
||||
all : libdense.a
|
||||
|
||||
|
|
|
@ -0,0 +1,462 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
#define ZERO ( 0._DP, 0._DP )
|
||||
!
|
||||
!--------------------------------------------------------------------------
|
||||
SUBROUTINE gram_schmidt_gamma( npwx, npw, nbnd, psi, hpsi, spsi, e, &
|
||||
uspp, eigen, reorder, nbsize )
|
||||
!--------------------------------------------------------------------------
|
||||
!
|
||||
! ... Gram-Schmidt orthogonalization, for Gamma-only calculations.
|
||||
! ... blocking algorithm is used.
|
||||
!
|
||||
USE util_param, ONLY : DP, eps16
|
||||
USE mp, ONLY : mp_sum, mp_max, mp_bcast
|
||||
USE mp_bands_util, ONLY : gstart, inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nbnd
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi (npwx,nbnd)
|
||||
COMPLEX(DP), INTENT(INOUT) :: hpsi(npwx,nbnd)
|
||||
COMPLEX(DP), INTENT(INOUT) :: spsi(npwx,nbnd)
|
||||
REAL(DP), INTENT(OUT) :: e(nbnd)
|
||||
LOGICAL, INTENT(IN) :: uspp
|
||||
LOGICAL, INTENT(IN) :: eigen
|
||||
LOGICAL, INTENT(IN) :: reorder
|
||||
INTEGER, INTENT(IN) :: nbsize
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
LOGICAL :: eigen_
|
||||
INTEGER :: npw2, npwx2
|
||||
INTEGER :: iblock, nblock
|
||||
INTEGER :: iblock_start, iblock_end
|
||||
INTEGER :: jblock_start, jblock_end
|
||||
INTEGER :: ibnd_start, ibnd_end
|
||||
INTEGER :: jbnd_start, jbnd_end
|
||||
COMPLEX(DP), ALLOCATABLE :: phi(:,:), hphi(:,:), sphi(:,:)
|
||||
INTEGER, ALLOCATABLE :: owner_bgrp_id(:)
|
||||
!
|
||||
eigen_ = eigen
|
||||
!
|
||||
IF ( reorder ) THEN
|
||||
!
|
||||
eigen_ = .TRUE.
|
||||
!
|
||||
END IF
|
||||
!
|
||||
npw2 = 2 * npw
|
||||
npwx2 = 2 * npwx
|
||||
!
|
||||
nblock = nbnd / nbsize
|
||||
IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
|
||||
!
|
||||
CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
|
||||
!
|
||||
IF ( my_bgrp_id >= nblock ) THEN
|
||||
!
|
||||
iblock_start = nblock + 1
|
||||
iblock_end = nblock
|
||||
!
|
||||
END IF
|
||||
!
|
||||
ALLOCATE( phi ( npwx, nbnd ) )
|
||||
IF ( eigen_ ) ALLOCATE( hphi( npwx, nbnd ) )
|
||||
IF ( uspp ) ALLOCATE( sphi( npwx, nbnd ) )
|
||||
ALLOCATE( owner_bgrp_id( nblock ) )
|
||||
!
|
||||
phi = ZERO
|
||||
!
|
||||
IF ( eigen_ ) hphi = ZERO
|
||||
!
|
||||
IF ( uspp ) sphi = ZERO
|
||||
!
|
||||
! ... Set owers of blocks
|
||||
!
|
||||
owner_bgrp_id = 0
|
||||
!
|
||||
DO iblock = 1, nblock
|
||||
!
|
||||
IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
|
||||
owner_bgrp_id(iblock) = my_bgrp_id
|
||||
!
|
||||
END DO
|
||||
!
|
||||
CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
|
||||
!
|
||||
! ... Set Im[ psi(G=0) ] - needed for numerical stability
|
||||
!
|
||||
IF ( gstart == 2 ) psi(1,1:nbnd) = CMPLX( DBLE( psi(1,1:nbnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
! ... Set initial : |phi_j> = |psi_j>
|
||||
!
|
||||
CALL DCOPY( npwx2 * nbnd, psi(1,1), 1, phi(1,1), 1 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) phi(1,1:nbnd) = CMPLX( DBLE( phi(1,1:nbnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL DCOPY( npwx2 * nbnd, hpsi(1,1), 1, hphi(1,1), 1 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) hphi(1,1:nbnd) = CMPLX( DBLE( hphi(1,1:nbnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DCOPY( npwx2 * nbnd, spsi(1,1), 1, sphi(1,1), 1 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) sphi(1,1:nbnd) = CMPLX( DBLE( sphi(1,1:nbnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... Blocking loop
|
||||
!
|
||||
DO iblock = 1, nblock
|
||||
!
|
||||
! ... Orthogonalize diagonal block by standard Gram-Schmidt
|
||||
!
|
||||
ibnd_start = ( iblock - 1 ) * nbsize + 1
|
||||
ibnd_end = MIN( iblock * nbsize, nbnd )
|
||||
!
|
||||
IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
|
||||
CALL gram_schmidt_diag( ibnd_start, ibnd_end )
|
||||
!
|
||||
! ... Bcast diagonal block
|
||||
!
|
||||
CALL mp_bcast( phi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
|
||||
!
|
||||
IF ( eigen_ ) &
|
||||
CALL mp_bcast( hphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
|
||||
!
|
||||
IF ( uspp ) &
|
||||
CALL mp_bcast( sphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
|
||||
!
|
||||
! ... Project off-diagonal block outside of diagonal block
|
||||
!
|
||||
jblock_start = MAX( iblock_start, iblock + 1 )
|
||||
jblock_end = iblock_end
|
||||
!
|
||||
jbnd_start = ( jblock_start - 1 ) * nbsize + 1
|
||||
jbnd_end = MIN( jblock_end * nbsize, nbnd )
|
||||
!
|
||||
IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
|
||||
CALL project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
|
||||
!
|
||||
END DO
|
||||
!
|
||||
! ... Copy psi <- phi
|
||||
!
|
||||
CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 )
|
||||
!
|
||||
IF ( eigen_ ) &
|
||||
CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
|
||||
!
|
||||
IF ( uspp ) &
|
||||
CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
|
||||
!
|
||||
! ... Calculate energy eigenvalues
|
||||
!
|
||||
IF ( eigen_ ) CALL energyeigen( )
|
||||
!
|
||||
! ... Sort wave functions
|
||||
!
|
||||
IF ( reorder ) CALL sort_vectors( )
|
||||
!
|
||||
DEALLOCATE( phi )
|
||||
IF ( eigen_ ) DEALLOCATE( hphi )
|
||||
IF ( uspp ) DEALLOCATE( sphi )
|
||||
DEALLOCATE( owner_bgrp_id )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!
|
||||
SUBROUTINE gram_schmidt_diag( ibnd_start, ibnd_end )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
|
||||
!
|
||||
INTEGER :: ibnd
|
||||
REAL(DP), ALLOCATABLE :: sr(:)
|
||||
REAL(DP) :: norm
|
||||
REAL(DP), EXTERNAL :: DDOT
|
||||
!
|
||||
ALLOCATE( sr( ibnd_start:ibnd_end ) )
|
||||
!
|
||||
DO ibnd = ibnd_start, ibnd_end
|
||||
!
|
||||
IF ( ibnd > ibnd_start ) THEN
|
||||
!
|
||||
! ... <phi_j| S |psi_i>
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi(1,ibnd_start), npwx2, &
|
||||
spsi(1,ibnd), 1, 0._DP, sr(ibnd_start), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) &
|
||||
CALL DAXPY( ibnd - ibnd_start, -spsi(1,ibnd), phi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start), 1 )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL DGEMV( 'T', npw2, ibnd - ibnd_start, 2._DP, phi(1,ibnd_start), npwx2, &
|
||||
psi(1,ibnd), 1, 0._DP, sr(ibnd_start), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) &
|
||||
CALL DAXPY( ibnd - ibnd_start, -psi(1,ibnd), phi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start), 1 )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( sr, intra_bgrp_comm )
|
||||
!
|
||||
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
|
||||
!
|
||||
CALL DGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, phi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start), 1, 1._DP, phi(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) phi(1,ibnd) = CMPLX( DBLE( phi(1,ibnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL DGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start), 1, 1._DP, hphi(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) hphi(1,ibnd) = CMPLX( DBLE( hphi(1,ibnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DGEMV( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start), 1, 1._DP, sphi(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) sphi(1,ibnd) = CMPLX( DBLE( sphi(1,ibnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
norm = 2._DP * DDOT( npw2, phi(1,ibnd), 1, sphi(1,ibnd), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) norm = norm - DBLE( phi(1,ibnd) ) * DBLE ( sphi(1,ibnd) )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
norm = 2._DP * DDOT( npw2, phi(1,ibnd), 1, phi(1,ibnd), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) norm = norm - DBLE( phi(1,ibnd) ) * DBLE ( phi(1,ibnd) )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( norm, intra_bgrp_comm )
|
||||
!
|
||||
norm = SQRT( MAX( norm, 0._DP ) )
|
||||
!
|
||||
IF ( norm < eps16 ) &
|
||||
CALL errore( ' gram_schmidt_gamma ', ' vectors are linear dependent ', 1 )
|
||||
!
|
||||
CALL DSCAL( npw2, 1._DP / norm, phi(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) phi(1,ibnd) = CMPLX( DBLE( phi(1,ibnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL DSCAL( npw2, 1._DP / norm, hphi(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) hphi(1,ibnd) = CMPLX( DBLE( hphi(1,ibnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DSCAL( npw2, 1._DP / norm, sphi(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) sphi(1,ibnd) = CMPLX( DBLE( sphi(1,ibnd) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
DEALLOCATE( sr )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE gram_schmidt_diag
|
||||
!
|
||||
!
|
||||
SUBROUTINE project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
|
||||
INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
|
||||
!
|
||||
INTEGER :: ibnd_size
|
||||
INTEGER :: jbnd_size
|
||||
REAL(DP), ALLOCATABLE :: sr(:,:)
|
||||
!
|
||||
ibnd_size = ibnd_end - ibnd_start + 1
|
||||
jbnd_size = jbnd_end - jbnd_start + 1
|
||||
!
|
||||
ALLOCATE( sr( ibnd_start:ibnd_end, jbnd_start:jbnd_end ) )
|
||||
!
|
||||
! ... <phi_i| S |psi_j>
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi(1,ibnd_start), npwx2, &
|
||||
spsi(1,jbnd_start), npwx2, 0._DP, sr(ibnd_start,jbnd_start), ibnd_size )
|
||||
!
|
||||
IF ( gstart == 2 ) &
|
||||
CALL DGER( ibnd_size, jbnd_size, -1._DP, psi(1,ibnd_start), npwx2, &
|
||||
spsi(1,jbnd_start), npwx2, sr(ibnd_start,jbnd_start), ibnd_size )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi(1,ibnd_start), npwx2, &
|
||||
psi(1,jbnd_start), npwx2, 0._DP, sr(ibnd_start,jbnd_start), ibnd_size )
|
||||
!
|
||||
IF ( gstart == 2 ) &
|
||||
CALL DGER( ibnd_size, jbnd_size, -1._DP, psi(1,ibnd_start), npwx2, &
|
||||
psi(1,jbnd_start), npwx2, sr(ibnd_start,jbnd_start), ibnd_size )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( sr, intra_bgrp_comm )
|
||||
!
|
||||
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
|
||||
!
|
||||
CALL DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start,jbnd_start), ibnd_size, 1._DP, phi(1,jbnd_start), npwx2 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) phi(1,jbnd_start:jbnd_end) = &
|
||||
CMPLX( DBLE( phi(1,jbnd_start:jbnd_end) ), 0._DP, kind=DP )
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start,jbnd_start), ibnd_size, 1._DP, hphi(1,jbnd_start), npwx2 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) hphi(1,jbnd_start:jbnd_end) = &
|
||||
CMPLX( DBLE( hphi(1,jbnd_start:jbnd_end) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi(1,ibnd_start), npwx2, &
|
||||
sr(ibnd_start,jbnd_start), ibnd_size, 1._DP, sphi(1,jbnd_start), npwx2 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) sphi(1,jbnd_start:jbnd_end) = &
|
||||
CMPLX( DBLE( sphi(1,jbnd_start:jbnd_end) ), 0._DP, kind=DP )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
DEALLOCATE( sr )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE project_offdiag
|
||||
!
|
||||
!
|
||||
SUBROUTINE energyeigen( )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ibnd, ibnd_start, ibnd_end
|
||||
!
|
||||
REAL(DP), EXTERNAL :: DDOT
|
||||
!
|
||||
! ... <psi_i| H |psi_i>
|
||||
!
|
||||
e(:) = 0._DP
|
||||
!
|
||||
CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
|
||||
!
|
||||
DO ibnd = ibnd_start, ibnd_end
|
||||
!
|
||||
e(ibnd) = 2._DP * DDOT( npw2, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) e(ibnd) = e(ibnd) - DBLE( psi(1,ibnd) ) * DBLE ( hpsi(1,ibnd) )
|
||||
!
|
||||
END DO
|
||||
!
|
||||
CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
|
||||
CALL mp_sum( e, inter_bgrp_comm )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE energyeigen
|
||||
!
|
||||
!
|
||||
SUBROUTINE sort_vectors( )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ibnd
|
||||
INTEGER :: nswap
|
||||
REAL(DP) :: e0
|
||||
!
|
||||
10 nswap = 0
|
||||
!
|
||||
DO ibnd = 2, nbnd
|
||||
!
|
||||
IF ( e(ibnd) < e(ibnd-1) ) THEN
|
||||
!
|
||||
nswap = nswap + 1
|
||||
!
|
||||
e0 = e(ibnd)
|
||||
e(ibnd) = e(ibnd-1)
|
||||
e(ibnd-1) = e0
|
||||
!
|
||||
CALL DSWAP( npw2, psi(1,ibnd), 1, psi(1,ibnd-1), 1 )
|
||||
!
|
||||
IF ( eigen_ ) &
|
||||
CALL DSWAP( npw2, hpsi(1,ibnd), 1, hpsi(1,ibnd-1), 1 )
|
||||
!
|
||||
IF ( uspp ) &
|
||||
CALL DSWAP( npw2, spsi(1,ibnd), 1, spsi(1,ibnd-1), 1 )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
IF ( nswap > 0 ) GOTO 10
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE sort_vectors
|
||||
!
|
||||
!
|
||||
END SUBROUTINE gram_schmidt_gamma
|
|
@ -0,0 +1,584 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
#define ZERO ( 0._DP, 0._DP )
|
||||
!
|
||||
!--------------------------------------------------------------------------
|
||||
SUBROUTINE gram_schmidt_gamma_gpu( npwx, npw, nbnd, psi_d, hpsi_d, spsi_d, e, &
|
||||
uspp, eigen, reorder, nbsize )
|
||||
!--------------------------------------------------------------------------
|
||||
!
|
||||
! ... Gram-Schmidt orthogonalization, for Gamma-only calculations.
|
||||
! ... blocking algorithm is used.
|
||||
!
|
||||
USE util_param, ONLY : DP, eps16
|
||||
USE mp, ONLY : mp_sum, mp_max, mp_bcast
|
||||
USE mp_bands_util, ONLY : gstart, inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
|
||||
USE device_memcpy_m, ONLY : dev_memcpy, dev_memset
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nbnd
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d (npwx,nbnd)
|
||||
COMPLEX(DP), INTENT(INOUT) :: hpsi_d(npwx,nbnd)
|
||||
COMPLEX(DP), INTENT(INOUT) :: spsi_d(npwx,nbnd)
|
||||
REAL(DP), INTENT(OUT) :: e(nbnd)
|
||||
LOGICAL, INTENT(IN) :: uspp
|
||||
LOGICAL, INTENT(IN) :: eigen
|
||||
LOGICAL, INTENT(IN) :: reorder
|
||||
INTEGER, INTENT(IN) :: nbsize
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
LOGICAL :: eigen_
|
||||
INTEGER :: npw2, npwx2
|
||||
INTEGER :: iblock, nblock
|
||||
INTEGER :: iblock_start, iblock_end
|
||||
INTEGER :: jblock_start, jblock_end
|
||||
INTEGER :: ibnd_start, ibnd_end
|
||||
INTEGER :: jbnd_start, jbnd_end
|
||||
COMPLEX(DP), ALLOCATABLE :: phi(:,:), hphi(:,:), sphi(:,:)
|
||||
INTEGER, ALLOCATABLE :: owner_bgrp_id(:)
|
||||
!
|
||||
! ... device variables
|
||||
!
|
||||
INTEGER :: ii, buf_start, buf_end, buf_size, info
|
||||
COMPLEX(DP),ALLOCATABLE :: phi_d (:,:)
|
||||
COMPLEX(DP),ALLOCATABLE :: hphi_d(:,:)
|
||||
COMPLEX(DP),ALLOCATABLE :: sphi_d(:,:)
|
||||
!
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: psi_d, hpsi_d, spsi_d
|
||||
attributes(device) :: phi_d, hphi_d, sphi_d
|
||||
#endif
|
||||
!
|
||||
COMPLEX(DP), ALLOCATABLE :: sr_d(:), sr2_d(:,:)
|
||||
#if defined (__CUDA)
|
||||
attributes(device) :: sr_d, sr2_d
|
||||
#endif
|
||||
!
|
||||
!
|
||||
CALL start_clock( 'gsorth' )
|
||||
!
|
||||
eigen_ = eigen
|
||||
!
|
||||
IF ( reorder ) THEN
|
||||
!
|
||||
eigen_ = .TRUE.
|
||||
!
|
||||
END IF
|
||||
!
|
||||
npw2 = 2 * npw
|
||||
npwx2 = 2 * npwx
|
||||
!
|
||||
nblock = nbnd / nbsize
|
||||
IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
|
||||
!
|
||||
CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
|
||||
!
|
||||
IF ( my_bgrp_id >= nblock ) THEN
|
||||
!
|
||||
iblock_start = nblock + 1
|
||||
iblock_end = nblock
|
||||
!
|
||||
END IF
|
||||
!
|
||||
ALLOCATE( phi_d ( npwx, nbnd ) )
|
||||
IF ( eigen_ ) ALLOCATE( hphi_d( npwx, nbnd ) )
|
||||
IF ( uspp ) ALLOCATE( sphi_d( npwx, nbnd ) )
|
||||
!
|
||||
phi_d = ZERO
|
||||
!
|
||||
IF ( eigen_ ) hphi_d = ZERO
|
||||
!
|
||||
IF ( uspp ) sphi_d = ZERO
|
||||
!
|
||||
!
|
||||
! ... Set owers of blocks
|
||||
!
|
||||
owner_bgrp_id = 0
|
||||
!
|
||||
DO iblock = 1, nblock
|
||||
!
|
||||
IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
|
||||
owner_bgrp_id(iblock) = my_bgrp_id
|
||||
!
|
||||
END DO
|
||||
!
|
||||
CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
|
||||
!
|
||||
! ... Set Im[ psi(G=0) ] - needed for numerical stability
|
||||
!
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii =1,nbnd
|
||||
psi_d(1,ii) = CMPLX( DBLE( psi_d(1,ii) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
! ... Set initial : |phi_j> = |psi_j>
|
||||
!
|
||||
CALL DCOPY_gpu( npwx2 * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii =1,nbnd
|
||||
phi_d(1,ii) = CMPLX( DBLE( phi_d(1,ii) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL DCOPY_gpu( npwx2 * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii =1,nbnd
|
||||
hphi_d(1,ii) = CMPLX( DBLE( hphi_d(1,ii) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DCOPY_gpu( npwx2 * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii =1,nbnd
|
||||
sphi_d(1,ii) = CMPLX( DBLE( sphi_d(1,ii) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... Buffer allocation
|
||||
!
|
||||
buf_size = nbsize
|
||||
!
|
||||
ALLOCATE( sr_d(buf_size))
|
||||
ALLOCATE( sr2_d(buf_size, buf_size))
|
||||
!
|
||||
! ... Blocking loop
|
||||
!
|
||||
DO iblock = 1, nblock
|
||||
!
|
||||
! ... Orthogonalize diagonal block by standard Gram-Schmidt
|
||||
!
|
||||
ibnd_start = ( iblock - 1 ) * nbsize + 1
|
||||
ibnd_end = MIN( iblock * nbsize, nbnd )
|
||||
!
|
||||
IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
|
||||
CALL gram_schmidt_diag_gpu( ibnd_start, ibnd_end )
|
||||
!
|
||||
! ... Bcast diagonal block
|
||||
!
|
||||
CALL mp_bcast( phi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
|
||||
!
|
||||
IF ( eigen_ ) &
|
||||
CALL mp_bcast( hphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
|
||||
!
|
||||
IF ( uspp ) &
|
||||
CALL mp_bcast( sphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
|
||||
!
|
||||
! ... Project off-diagonal block outside of diagonal block
|
||||
!
|
||||
jblock_start = MAX( iblock_start, iblock + 1 )
|
||||
jblock_end = iblock_end
|
||||
!
|
||||
jbnd_start = ( jblock_start - 1 ) * nbsize + 1
|
||||
jbnd_end = MIN( jblock_end * nbsize, nbnd )
|
||||
!
|
||||
IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
|
||||
CALL project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
|
||||
!
|
||||
END DO
|
||||
!
|
||||
! ... Buffer Realese
|
||||
!
|
||||
DEALLOCATE (sr_d, sr2_d)
|
||||
!
|
||||
!
|
||||
! ... Copy psi <- phi
|
||||
!
|
||||
!CALL DCOPY( npwx2 * nbnd, phi(1,1), 1, psi(1,1), 1 )
|
||||
CALL DCOPY_gpu( npwx2 * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
|
||||
!
|
||||
IF ( eigen_ ) &
|
||||
CALL DCOPY_gpu( npwx2 * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
|
||||
!CALL DCOPY( npwx2 * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
|
||||
!
|
||||
IF ( uspp ) &
|
||||
CALL DCOPY_gpu( npwx2 * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
|
||||
!CALL DCOPY( npwx2 * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
|
||||
!
|
||||
! ... Calculate energy eigenvalues
|
||||
!
|
||||
IF ( eigen_ ) CALL energyeigen_gpu( )
|
||||
!
|
||||
! ... Sort wave functions
|
||||
!
|
||||
IF ( reorder ) CALL sort_vectors_gpu( )
|
||||
!
|
||||
DEALLOCATE( owner_bgrp_id )
|
||||
!
|
||||
DEALLOCATE( phi_d )
|
||||
IF ( eigen_ ) DEALLOCATE( hphi_d )
|
||||
IF ( uspp ) DEALLOCATE( sphi_d )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
!
|
||||
CALL stop_clock( 'gsorth' )
|
||||
!
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
!
|
||||
SUBROUTINE gram_schmidt_diag_gpu( ibnd_start, ibnd_end )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
|
||||
!
|
||||
INTEGER :: ibnd
|
||||
REAL(DP) :: norm
|
||||
REAL(DP) :: psi_ibnd
|
||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
||||
!
|
||||
DO ibnd = ibnd_start, ibnd_end
|
||||
!
|
||||
IF ( ibnd > ibnd_start ) THEN
|
||||
!
|
||||
! ... <phi_j| S |psi_i>
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||
spsi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
psi_ibnd = -spsi_d(1,ibnd)
|
||||
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd , phi_d(1,ibnd_start), npwx2, &
|
||||
sr_d(1), 1 )
|
||||
END IF
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL DGEMV_gpu( 'T', npw2, ibnd - ibnd_start, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||
psi_d(1,ibnd), 1, 0._DP, sr_d(1), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
|
||||
psi_ibnd = -psi_d(1,ibnd)
|
||||
CALL DAXPY_gpu( ibnd - ibnd_start, psi_ibnd, phi_d(1,ibnd_start), npwx2, &
|
||||
sr_d(1), 1 )
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( sr_d, intra_bgrp_comm )
|
||||
!
|
||||
! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
|
||||
!
|
||||
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, phi_d(1,ibnd_start), npwx2, &
|
||||
sr_d(1), 1, 1._DP, phi_d(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
phi_d(1,ibnd) = CMPLX( DBLE( phi_d(1,ibnd) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, hphi_d(1,ibnd_start), npwx2, &
|
||||
sr_d(1), 1, 1._DP, hphi_d(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
hphi_d(1,ibnd) = CMPLX( DBLE( hphi_d(1,ibnd) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DGEMV_gpu( 'N', npw2, ibnd - ibnd_start, -1._DP, sphi_d(1,ibnd_start), npwx2, &
|
||||
sr_d(1), 1, 1._DP, sphi_d(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
sphi_d(1,ibnd) = CMPLX( DBLE( sphi_d(1,ibnd) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
norm = norm - DBLE( phi_d(1,ibnd) ) * DBLE ( sphi_d(1,ibnd) )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
norm = 2._DP * gpu_DDOT( npw2, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
norm = norm - DBLE( phi_d(1,ibnd) ) * DBLE ( phi_d(1,ibnd) )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( norm, intra_bgrp_comm )
|
||||
!
|
||||
norm = SQRT( MAX( norm, 0._DP ) )
|
||||
!
|
||||
IF ( norm < eps16 ) &
|
||||
CALL errore( ' gram_schmidt_gamma ', ' vectors are linear dependent ', 1 )
|
||||
!
|
||||
CALL DSCAL_gpu( npw2, 1._DP / norm, phi_d(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
phi_d(1,ibnd) = CMPLX( DBLE( phi_d(1,ibnd) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL DSCAL_gpu( npw2, 1._DP / norm, hphi_d(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
hphi_d(1,ibnd) = CMPLX( DBLE( hphi_d(1,ibnd) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL DSCAL_gpu( npw2, 1._DP / norm, sphi_d(1,ibnd), 1 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
sphi_d(1,ibnd) = CMPLX( DBLE( sphi_d(1,ibnd) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE gram_schmidt_diag_gpu
|
||||
!
|
||||
!
|
||||
SUBROUTINE project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
|
||||
INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
|
||||
!
|
||||
INTEGER :: ibnd_size
|
||||
INTEGER :: jbnd_size
|
||||
!
|
||||
!
|
||||
ibnd_size = ibnd_end - ibnd_start + 1
|
||||
jbnd_size = jbnd_end - jbnd_start + 1
|
||||
!
|
||||
! ... <phi_i| S |psi_j>
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||
spsi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), ibnd_size )
|
||||
!
|
||||
IF ( gstart == 2 ) &
|
||||
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
|
||||
spsi_d(1,jbnd_start), npwx2, sr2_d(1,1), ibnd_size )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL gpu_DGEMM( 'T', 'N', ibnd_size, jbnd_size, npw2, 2._DP, phi_d(1,ibnd_start), npwx2, &
|
||||
psi_d(1,jbnd_start), npwx2, 0._DP, sr2_d(1,1), ibnd_size )
|
||||
!
|
||||
IF ( gstart == 2 ) &
|
||||
CALL gpu_DGER( ibnd_size, jbnd_size, -1._DP, psi_d(1,ibnd_start), npwx2, &
|
||||
psi_d(1,jbnd_start), npwx2, sr2_d(1,1), ibnd_size )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( sr2_d, intra_bgrp_comm )
|
||||
!
|
||||
! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
|
||||
!
|
||||
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, phi_d(1,ibnd_start), npwx2, &
|
||||
sr2_d(1,1), ibnd_size, 1._DP, phi_d(1,jbnd_start), npwx2 )
|
||||
!
|
||||
! NOTE: set Im[ phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=jbnd_start, jbnd_end
|
||||
phi_d(1, ii) = &
|
||||
CMPLX( DBLE( phi_d(1, ii) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
IF ( eigen_ ) THEN
|
||||
!
|
||||
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, hphi_d(1,ibnd_start), npwx2, &
|
||||
sr2_d(1,1), ibnd_size, 1._DP, hphi_d(1,jbnd_start), npwx2 )
|
||||
!
|
||||
! NOTE: set Im[ H*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii= jbnd_start, jbnd_end
|
||||
hphi_d(1, ii) = &
|
||||
CMPLX( DBLE( hphi_d(1, ii) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
IF ( uspp ) THEN
|
||||
!
|
||||
CALL gpu_DGEMM( 'N', 'N', npw2, jbnd_size, ibnd_size, -1._DP, sphi_d(1,ibnd_start), npwx2, &
|
||||
sr2_d(1,1), ibnd_size, 1._DP, sphi_d(1,jbnd_start), npwx2 )
|
||||
!
|
||||
! NOTE: set Im[ S*phi(G=0) ] - needed for numerical stability
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=jbnd_start, jbnd_end
|
||||
sphi_d(1, ii ) = &
|
||||
CMPLX( DBLE( sphi_d(1, ii) ), 0._DP, kind=DP )
|
||||
END DO
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE project_offdiag_gpu
|
||||
!
|
||||
!
|
||||
SUBROUTINE energyeigen_gpu( )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ibnd, ibnd_start, ibnd_end
|
||||
REAL(DP) :: e_ibnd
|
||||
!
|
||||
REAL(DP), EXTERNAL :: gpu_DDOT
|
||||
!
|
||||
! ... <psi_i| H |psi_i>
|
||||
!
|
||||
e(:) = 0._DP
|
||||
!
|
||||
CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
|
||||
!
|
||||
DO ibnd = ibnd_start, ibnd_end
|
||||
!
|
||||
e(ibnd) = 2._DP * gpu_DDOT( npw2, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 )
|
||||
!
|
||||
IF ( gstart == 2 ) THEN
|
||||
!$cuf kernel do(1)
|
||||
DO ii=1,1
|
||||
e_ibnd = DBLE( psi_d(1,ibnd) ) * DBLE ( hpsi_d(1,ibnd) )
|
||||
END DO
|
||||
e(ibnd) = e(ibnd) - e_ibnd
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
|
||||
CALL mp_sum( e, inter_bgrp_comm )
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE energyeigen_gpu
|
||||
!
|
||||
!
|
||||
SUBROUTINE sort_vectors_gpu( )
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
INTEGER :: ibnd
|
||||
INTEGER :: nswap
|
||||
REAL(DP) :: e0
|
||||
!
|
||||
10 nswap = 0
|
||||
!
|
||||
DO ibnd = 2, nbnd
|
||||
!
|
||||
IF ( e(ibnd) < e(ibnd-1) ) THEN
|
||||
!
|
||||
nswap = nswap + 1
|
||||
!
|
||||
e0 = e(ibnd)
|
||||
e(ibnd) = e(ibnd-1)
|
||||
e(ibnd-1) = e0
|
||||
!
|
||||
CALL DSWAP_gpu( npw2, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
|
||||
!
|
||||
IF ( eigen_ ) &
|
||||
CALL DSWAP_gpu( npw2, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
|
||||
!
|
||||
IF ( uspp ) &
|
||||
CALL DSWAP_gpu( npw2, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
IF ( nswap > 0 ) GOTO 10
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE sort_vectors_gpu
|
||||
!
|
||||
!
|
||||
END SUBROUTINE gram_schmidt_gamma_gpu
|
|
@ -0,0 +1,384 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
#define ZERO ( 0._DP, 0._DP )
|
||||
#define ONE ( 1._DP, 0._DP )
|
||||
#define MONE (-1._DP, 0._DP )
|
||||
!
|
||||
!--------------------------------------------------------------------------
|
||||
SUBROUTINE gram_schmidt_k( npwx, npw, nbnd, npol, psi, hpsi, spsi, e, &
                           uspp, eigen, reorder, nbsize )
  !--------------------------------------------------------------------------
  !
  ! ... Gram-Schmidt orthogonalization, for k-point calculations.
  ! ... blocking algorithm is used.
  !
  ! ... Bands are grouped in blocks of nbsize.  Each diagonal block is
  ! ... orthonormalized with classical Gram-Schmidt by the band group that
  ! ... owns it, broadcast to the other band groups, and then projected out
  ! ... of the remaining (off-diagonal) blocks with ZGEMM.  Block ownership
  ! ... is distributed over inter_bgrp_comm; inner products are reduced over
  ! ... intra_bgrp_comm.
  !
  USE util_param,    ONLY : DP, eps16
  USE mp,            ONLY : mp_sum, mp_max, mp_bcast
  USE mp_bands_util, ONLY : inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
  !
  IMPLICIT NONE
  !
  ! ... I/O variables
  !
  INTEGER,     INTENT(IN)    :: npw, npwx, nbnd, npol
  ! npw  : number of plane waves          npwx : leading dimension of psi
  ! nbnd : number of bands                npol : number of spin polarizations
  COMPLEX(DP), INTENT(INOUT) :: psi (npwx*npol,nbnd)
  ! wavefunctions; S-orthonormalized on output
  COMPLEX(DP), INTENT(INOUT) :: hpsi(npwx*npol,nbnd)
  ! H|psi>; transformed consistently with psi (referenced only when eigen
  ! or reorder is set)
  COMPLEX(DP), INTENT(INOUT) :: spsi(npwx*npol,nbnd)
  ! S|psi>; transformed consistently with psi (referenced only when uspp)
  REAL(DP),    INTENT(INOUT) :: e(nbnd)
  ! energy expectation values <psi_i|H|psi_i> (filled when eigen/reorder)
  LOGICAL,     INTENT(IN)    :: uspp
  ! .TRUE. : generalized overlap S is used in all inner products
  LOGICAL,     INTENT(IN)    :: eigen
  ! .TRUE. : also transform hpsi and compute e(:)
  LOGICAL,     INTENT(IN)    :: reorder
  ! .TRUE. : sort bands by ascending energy (forces eigen_ internally)
  INTEGER,     INTENT(IN)    :: nbsize
  ! blocking size of the algorithm
  !
  ! ... local variables
  !
  LOGICAL                  :: eigen_
  INTEGER                  :: kdim, kdmx
  INTEGER                  :: iblock, nblock
  INTEGER                  :: iblock_start, iblock_end
  INTEGER                  :: jblock_start, jblock_end
  INTEGER                  :: ibnd_start, ibnd_end
  INTEGER                  :: jbnd_start, jbnd_end
  COMPLEX(DP), ALLOCATABLE :: phi(:,:), hphi(:,:), sphi(:,:)
  INTEGER,     ALLOCATABLE :: owner_bgrp_id(:)
  !
  ! ... kdim: effective vector length, kdmx: effective leading dimension.
  ! ... In the noncollinear case (npol>1) the full npwx*npol length is used.
  !
  IF ( npol == 1 ) THEN
     !
     kdim = npw
     kdmx = npwx
     !
  ELSE
     !
     kdim = npwx*npol
     kdmx = npwx*npol
     !
  END IF
  !
  eigen_ = eigen
  !
  ! ... sorting requires the eigenvalue estimates, so force them on
  !
  IF ( reorder ) THEN
     !
     eigen_ = .TRUE.
     !
  END IF
  !
  ! ... number of blocks (round up to cover a partial last block)
  !
  nblock = nbnd / nbsize
  IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
  !
  CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
  !
  ! ... band groups beyond the number of blocks get an empty range
  !
  IF ( my_bgrp_id >= nblock ) THEN
     !
     iblock_start = nblock + 1
     iblock_end   = nblock
     !
  END IF
  !
  ALLOCATE( phi ( kdmx, nbnd ) )
  IF ( eigen_ ) ALLOCATE( hphi( kdmx, nbnd ) )
  IF ( uspp )   ALLOCATE( sphi( kdmx, nbnd ) )
  ALLOCATE( owner_bgrp_id( nblock ) )
  !
  phi = ZERO
  !
  IF ( eigen_ ) hphi = ZERO
  !
  IF ( uspp )   sphi = ZERO
  !
  ! ... Set owners of blocks (mp_max makes the assignment globally known)
  !
  owner_bgrp_id = 0
  !
  DO iblock = 1, nblock
     !
     IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
     owner_bgrp_id(iblock) = my_bgrp_id
     !
  END DO
  !
  CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
  !
  ! ... Set initial : |phi_j> = |psi_j>
  !
  CALL ZCOPY( kdmx * nbnd, psi(1,1), 1, phi(1,1), 1 )
  !
  IF ( eigen_ ) &
  CALL ZCOPY( kdmx * nbnd, hpsi(1,1), 1, hphi(1,1), 1 )
  !
  IF ( uspp ) &
  CALL ZCOPY( kdmx * nbnd, spsi(1,1), 1, sphi(1,1), 1 )
  !
  ! ... Blocking loop
  !
  DO iblock = 1, nblock
     !
     ! ... Orthogonalize diagonal block by standard Gram-Schmidt
     !
     ibnd_start = ( iblock - 1 ) * nbsize + 1
     ibnd_end   = MIN( iblock * nbsize, nbnd )
     !
     IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
     CALL gram_schmidt_diag( ibnd_start, ibnd_end )
     !
     ! ... Bcast diagonal block
     !
     CALL mp_bcast( phi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
     !
     IF ( eigen_ ) &
     CALL mp_bcast( hphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
     !
     IF ( uspp ) &
     CALL mp_bcast( sphi(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
     !
     ! ... Project off-diagonal block outside of diagonal block;
     ! ... only the blocks owned by this band group (j > i) are updated
     !
     jblock_start = MAX( iblock_start, iblock + 1 )
     jblock_end   = iblock_end
     !
     jbnd_start = ( jblock_start - 1 ) * nbsize + 1
     jbnd_end   = MIN( jblock_end * nbsize, nbnd )
     !
     IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
     CALL project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
     !
  END DO
  !
  ! ... Copy psi <- phi
  !
  CALL ZCOPY( kdmx * nbnd, phi(1,1), 1, psi(1,1), 1 )
  !
  IF ( eigen_ ) &
  CALL ZCOPY( kdmx * nbnd, hphi(1,1), 1, hpsi(1,1), 1 )
  !
  IF ( uspp ) &
  CALL ZCOPY( kdmx * nbnd, sphi(1,1), 1, spsi(1,1), 1 )
  !
  ! ... Calculate energy eigenvalues
  !
  IF ( eigen_ ) CALL energyeigen( )
  !
  ! ... Sort wave functions
  !
  IF ( reorder ) CALL sort_vectors( )
  !
  DEALLOCATE( phi )
  IF ( eigen_ ) DEALLOCATE( hphi )
  IF ( uspp )   DEALLOCATE( sphi )
  DEALLOCATE( owner_bgrp_id )
  !
  RETURN
  !
  !
CONTAINS
  !
  !
  SUBROUTINE gram_schmidt_diag( ibnd_start, ibnd_end )
    !
    ! ... Classical Gram-Schmidt on the bands of one diagonal block:
    ! ... each |phi_i> is orthogonalized against the already-processed
    ! ... bands of the same block and then S-normalized.  hphi/sphi
    ! ... receive the same linear transformation so they remain
    ! ... H|phi>/S|phi>.
    !
    IMPLICIT NONE
    !
    INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
    !
    INTEGER                  :: ibnd
    COMPLEX(DP), ALLOCATABLE :: sc(:)   ! projection coefficients <phi_j|S|psi_i>
    REAL(DP)                 :: norm
    REAL(DP),    EXTERNAL    :: DDOT
    !
    ALLOCATE( sc( ibnd_start:ibnd_end ) )
    !
    DO ibnd = ibnd_start, ibnd_end
       !
       IF ( ibnd > ibnd_start ) THEN
          !
          ! ... <phi_j| S |psi_i>
          !
          IF ( uspp ) THEN
             !
             CALL ZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi(1,ibnd_start), kdmx, &
                         spsi(1,ibnd), 1, ZERO, sc(ibnd_start), 1 )
             !
          ELSE
             !
             CALL ZGEMV( 'C', kdim, ibnd - ibnd_start, ONE, phi(1,ibnd_start), kdmx, &
                         psi(1,ibnd), 1, ZERO, sc(ibnd_start), 1 )
             !
          END IF
          !
          ! ... plane waves are distributed: reduce the partial dot products
          !
          CALL mp_sum( sc, intra_bgrp_comm )
          !
          ! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
          !
          CALL ZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, phi(1,ibnd_start), kdmx, &
                      sc(ibnd_start), 1, ONE, phi(1,ibnd), 1 )
          !
          IF ( eigen_ ) &
          CALL ZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, hphi(1,ibnd_start), kdmx, &
                      sc(ibnd_start), 1, ONE, hphi(1,ibnd), 1 )
          !
          IF ( uspp ) &
          CALL ZGEMV( 'N', kdim, ibnd - ibnd_start, MONE, sphi(1,ibnd_start), kdmx, &
                      sc(ibnd_start), 1, ONE, sphi(1,ibnd), 1 )
          !
       END IF
       !
       ! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
       ! ... (the norm is real; DDOT over 2*kdim reals gives Re<phi|phi>)
       !
       IF ( uspp ) THEN
          !
          norm = DDOT( 2*kdim, phi(1,ibnd), 1, sphi(1,ibnd), 1 )
          !
       ELSE
          !
          norm = DDOT ( 2*kdim, phi(1,ibnd), 1, phi(1,ibnd), 1 )
          !
       END IF
       !
       CALL mp_sum( norm, intra_bgrp_comm )
       !
       ! ... clip tiny negative values coming from round-off before SQRT
       !
       norm = SQRT( MAX( norm, 0._DP ) )
       !
       IF ( norm < eps16 ) &
       CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 )
       !
       CALL ZDSCAL( kdim, 1._DP / norm, phi(1,ibnd), 1 )
       !
       IF ( eigen_ ) &
       CALL ZDSCAL( kdim, 1._DP / norm, hphi(1,ibnd), 1 )
       !
       IF ( uspp ) &
       CALL ZDSCAL( kdim, 1._DP / norm, sphi(1,ibnd), 1 )
       !
    END DO
    !
    DEALLOCATE( sc )
    !
    RETURN
    !
  END SUBROUTINE gram_schmidt_diag
  !
  !
  SUBROUTINE project_offdiag( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
    !
    ! ... Project the (already orthonormal) bands ibnd_start..ibnd_end out
    ! ... of the later bands jbnd_start..jbnd_end in a single ZGEMM pair.
    !
    IMPLICIT NONE
    !
    INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
    INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
    !
    INTEGER                  :: ibnd_size
    INTEGER                  :: jbnd_size
    COMPLEX(DP), ALLOCATABLE :: sc(:,:)   ! overlap matrix <phi_i|S|psi_j>
    !
    ibnd_size = ibnd_end - ibnd_start + 1
    jbnd_size = jbnd_end - jbnd_start + 1
    !
    ALLOCATE( sc( ibnd_start:ibnd_end, jbnd_start:jbnd_end ) )
    !
    ! ... <phi_i| S |psi_j>
    !
    IF ( uspp ) THEN
       !
       CALL ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi(1,ibnd_start), kdmx, &
                   spsi(1,jbnd_start), kdmx, ZERO, sc(ibnd_start,jbnd_start), ibnd_size )
       !
    ELSE
       !
       CALL ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi(1,ibnd_start), kdmx, &
                   psi(1,jbnd_start), kdmx, ZERO, sc(ibnd_start,jbnd_start), ibnd_size )
       !
    END IF
    !
    CALL mp_sum( sc, intra_bgrp_comm )
    !
    ! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
    !
    CALL ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi(1,ibnd_start), kdmx, &
                sc(ibnd_start,jbnd_start), ibnd_size, ONE, phi(1,jbnd_start), kdmx )
    !
    IF ( eigen_ ) &
    CALL ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi(1,ibnd_start), kdmx, &
                sc(ibnd_start,jbnd_start), ibnd_size, ONE, hphi(1,jbnd_start), kdmx )
    !
    IF ( uspp ) &
    CALL ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi(1,ibnd_start), kdmx, &
                sc(ibnd_start,jbnd_start), ibnd_size, ONE, sphi(1,jbnd_start), kdmx )
    !
    DEALLOCATE( sc )
    !
    RETURN
    !
  END SUBROUTINE project_offdiag
  !
  !
  SUBROUTINE energyeigen( )
    !
    ! ... Fill e(ibnd) = <psi_i| H |psi_i>.  Bands are split over the band
    ! ... groups; the two mp_sum calls first reduce over plane waves, then
    ! ... combine the disjoint band ranges over the band groups.
    !
    IMPLICIT NONE
    !
    INTEGER :: ibnd, ibnd_start, ibnd_end
    !
    REAL(DP), EXTERNAL :: DDOT
    !
    ! ... <psi_i| H |psi_i>
    !
    e(:) = 0._DP
    !
    CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
    !
    DO ibnd = ibnd_start, ibnd_end
       !
       ! ... the diagonal element is real: DDOT over 2*kdim reals = Re<psi|hpsi>
       !
       e(ibnd) = DDOT( 2*kdim, psi(1,ibnd), 1, hpsi(1,ibnd), 1 )
       !
    END DO
    !
    CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
    CALL mp_sum( e, inter_bgrp_comm )
    !
    RETURN
    !
  END SUBROUTINE energyeigen
  !
  !
  SUBROUTINE sort_vectors( )
    !
    ! ... Exchange (bubble) sort of e(:) in ascending order, swapping
    ! ... psi / hpsi / spsi consistently.  Repeats passes until no swap.
    !
    IMPLICIT NONE
    !
    INTEGER  :: ibnd
    INTEGER  :: nswap
    REAL(DP) :: e0
    !
10  nswap = 0
    !
    DO ibnd = 2, nbnd
       !
       IF ( e(ibnd) < e(ibnd-1) ) THEN
          !
          nswap = nswap + 1
          !
          e0        = e(ibnd)
          e(ibnd)   = e(ibnd-1)
          e(ibnd-1) = e0
          !
          CALL ZSWAP( kdim, psi(1,ibnd), 1, psi(1,ibnd-1), 1 )
          !
          IF ( eigen_ ) &
          CALL ZSWAP( kdim, hpsi(1,ibnd), 1, hpsi(1,ibnd-1), 1 )
          !
          IF ( uspp ) &
          CALL ZSWAP( kdim, spsi(1,ibnd), 1, spsi(1,ibnd-1), 1 )
          !
       END IF
       !
    END DO
    !
    IF ( nswap > 0 ) GOTO 10
    !
    RETURN
    !
  END SUBROUTINE sort_vectors
  !
  !
END SUBROUTINE gram_schmidt_k
|
|
@ -0,0 +1,433 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
#define ZERO ( 0._DP, 0._DP )
|
||||
#define ONE ( 1._DP, 0._DP )
|
||||
#define MONE (-1._DP, 0._DP )
|
||||
!
|
||||
!--------------------------------------------------------------------------
|
||||
SUBROUTINE gram_schmidt_k_gpu( npwx, npw, nbnd, npol, psi_d, hpsi_d, spsi_d, e, &
                               uspp, eigen, reorder, nbsize )
  !--------------------------------------------------------------------------
  !
  ! ... Gram-Schmidt orthogonalization, for k-point calculations.
  ! ... blocking algorithm is used.
  !
  ! ... GPU version of gram_schmidt_k: wavefunctions live in device memory
  ! ... (psi_d/hpsi_d/spsi_d with attributes(device)); BLAS work is done
  ! ... with the *_gpu wrappers and CUF kernels, while the eigenvalue
  ! ... array e(:) stays on the host.
  !
  USE util_param,      ONLY : DP, eps16
  USE mp,              ONLY : mp_sum, mp_max, mp_bcast
  USE mp_bands_util,   ONLY : inter_bgrp_comm, intra_bgrp_comm, my_bgrp_id
  ! NOTE(review): the two imports below appear unused in this routine
  ! (buffers are obtained with plain ALLOCATE) — confirm before removing.
  USE device_fbuff_m,  ONLY : buffer => dev_buf
  USE device_memcpy_m, ONLY : dev_memcpy, dev_memset
  !
  IMPLICIT NONE
  !
  ! ... I/O variables
  !
  INTEGER,     INTENT(IN)    :: npw, npwx, nbnd, npol
  ! npw  : number of plane waves          npwx : leading dimension of psi_d
  ! nbnd : number of bands                npol : number of spin polarizations
  COMPLEX(DP), INTENT(INOUT) :: psi_d (npwx*npol,nbnd)
  ! device-resident wavefunctions; S-orthonormalized on output
  COMPLEX(DP), INTENT(INOUT) :: hpsi_d(npwx*npol,nbnd)
  ! H|psi> on device; transformed consistently (used when eigen/reorder)
  COMPLEX(DP), INTENT(INOUT) :: spsi_d(npwx*npol,nbnd)
  ! S|psi> on device; transformed consistently (used when uspp)
  REAL(DP),    INTENT(INOUT) :: e(nbnd)
  ! host array of energy expectation values <psi_i|H|psi_i>
  LOGICAL,     INTENT(IN)    :: uspp     ! use overlap S in inner products
  LOGICAL,     INTENT(IN)    :: eigen    ! also transform hpsi_d / compute e
  LOGICAL,     INTENT(IN)    :: reorder  ! sort bands by energy (forces eigen_)
  INTEGER,     INTENT(IN)    :: nbsize   ! blocking size
  !
  ! ... local variables
  !
  LOGICAL              :: eigen_
  INTEGER              :: kdim, kdmx
  INTEGER              :: iblock, nblock
  INTEGER              :: iblock_start, iblock_end
  INTEGER              :: jblock_start, jblock_end
  INTEGER              :: ibnd_start, ibnd_end
  INTEGER              :: jbnd_start, jbnd_end
  INTEGER, ALLOCATABLE :: owner_bgrp_id(:)
  ! NOTE(review): buf_start/buf_end (and kk, info below) are declared but
  ! never referenced here — confirm before removing.
  INTEGER              :: buf_start, buf_end, buf_size
  !
  ! ... device variables
  !
  INTEGER :: ii, jj, kk, info
  COMPLEX(DP), ALLOCATABLE :: phi_d(:,:), hphi_d(:,:), sphi_d(:,:)
#if defined (__CUDA)
  attributes(device) :: psi_d, hpsi_d, spsi_d
  attributes(device) :: phi_d, hphi_d, sphi_d
#endif
  !
  ! ... sc_d : projection coefficients for the diagonal blocks,
  ! ... sc2_d: overlap matrix for the off-diagonal projections
  !
  COMPLEX(DP), ALLOCATABLE :: sc_d(:), sc2_d(:,:)
#if defined (__CUDA)
  attributes(device) :: sc_d, sc2_d
#endif
  !
  ! ... kdim: effective vector length, kdmx: effective leading dimension
  !
  IF ( npol == 1 ) THEN
     !
     kdim = npw
     kdmx = npwx
     !
  ELSE
     !
     kdim = npwx*npol
     kdmx = npwx*npol
     !
  END IF
  !
  eigen_ = eigen
  !
  ! ... sorting requires the eigenvalue estimates, so force them on
  !
  IF ( reorder ) THEN
     !
     eigen_ = .TRUE.
     !
  END IF
  !
  ! ... number of blocks (round up to cover a partial last block)
  !
  nblock = nbnd / nbsize
  IF ( MOD( nbnd, nbsize ) /= 0 ) nblock = nblock + 1
  !
  CALL divide( inter_bgrp_comm, nblock, iblock_start, iblock_end )
  !
  ! ... band groups beyond the number of blocks get an empty range
  !
  IF ( my_bgrp_id >= nblock ) THEN
     !
     iblock_start = nblock + 1
     iblock_end   = nblock
     !
  END IF
  !
  ALLOCATE( phi_d ( kdmx, nbnd ) )
  IF ( eigen_ ) ALLOCATE( hphi_d( kdmx, nbnd ) )
  IF ( uspp )   ALLOCATE( sphi_d( kdmx, nbnd ) )
  !
  ALLOCATE( owner_bgrp_id( nblock ) )
  !
  ! ... zero the device work arrays with CUF kernels
  !
  !$cuf kernel do(2)
  DO ii = 1, kdmx
     DO jj = 1, nbnd
        phi_d(ii, jj) = ZERO
     END DO
  END DO
  !
  IF ( eigen_ ) THEN
     !$cuf kernel do(2)
     DO ii = 1, kdmx
        DO jj = 1, nbnd
           hphi_d(ii, jj) = ZERO
        END DO
     END DO
  END IF
  !
  IF ( uspp ) THEN
     !$cuf kernel do(2)
     DO ii = 1, kdmx
        DO jj = 1, nbnd
           sphi_d(ii, jj) = ZERO
        END DO
     END DO
  END IF
  !
  ! ... Set owners of blocks (mp_max makes the assignment globally known)
  !
  owner_bgrp_id = 0
  !
  DO iblock = 1, nblock
     !
     IF ( iblock_start <= iblock .AND. iblock <= iblock_end ) &
     owner_bgrp_id(iblock) = my_bgrp_id
     !
  END DO
  !
  CALL mp_max( owner_bgrp_id, inter_bgrp_comm )
  !
  ! ... Set initial : |phi_j> = |psi_j>
  !
  CALL ZCOPY_gpu( kdmx * nbnd, psi_d(1,1), 1, phi_d(1,1), 1 )
  !
  IF ( eigen_ ) &
  CALL ZCOPY_gpu( kdmx * nbnd, hpsi_d(1,1), 1, hphi_d(1,1), 1 )
  !
  IF ( uspp ) &
  CALL ZCOPY_gpu( kdmx * nbnd, spsi_d(1,1), 1, sphi_d(1,1), 1 )
  !
  ! ... Allocate buffers (reused across all blocks; sized by nbsize)
  !
  buf_size = nbsize
  ALLOCATE (sc_d(buf_size), sc2_d(buf_size, buf_size))
  !
  ! ... Blocking loop
  !
  DO iblock = 1, nblock
     !
     ! ... Orthogonalize diagonal block by standard Gram-Schmidt
     !
     ibnd_start = ( iblock - 1 ) * nbsize + 1
     ibnd_end   = MIN( iblock * nbsize, nbnd )
     !
     IF ( owner_bgrp_id(iblock) == my_bgrp_id ) &
     CALL gram_schmidt_diag( ibnd_start, ibnd_end )
     !
     ! ... Bcast diagonal block
     !
     CALL mp_bcast( phi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
     !
     IF ( eigen_ ) &
     CALL mp_bcast( hphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
     !
     IF ( uspp ) &
     CALL mp_bcast( sphi_d(:,ibnd_start:ibnd_end), owner_bgrp_id(iblock), inter_bgrp_comm )
     !
     ! ... Project off-diagonal block outside of diagonal block;
     ! ... only the blocks owned by this band group (j > i) are updated
     !
     jblock_start = MAX( iblock_start, iblock + 1 )
     jblock_end   = iblock_end
     !
     jbnd_start = ( jblock_start - 1 ) * nbsize + 1
     jbnd_end   = MIN( jblock_end * nbsize, nbnd )
     !
     IF ( jblock_start <= jblock_end .AND. jbnd_start <= jbnd_end ) &
     CALL project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
     !
  END DO
  !
  ! ... Buffer release
  !
  DEALLOCATE (sc_d, sc2_d)
  !
  ! ... Copy psi <- phi
  !
  CALL ZCOPY_gpu( kdmx * nbnd, phi_d(1,1), 1, psi_d(1,1), 1 )
  !
  IF ( eigen_ ) &
  CALL ZCOPY_gpu( kdmx * nbnd, hphi_d(1,1), 1, hpsi_d(1,1), 1 )
  !
  IF ( uspp ) &
  CALL ZCOPY_gpu( kdmx * nbnd, sphi_d(1,1), 1, spsi_d(1,1), 1 )
  !
  ! ... Calculate energy eigenvalues
  !
  IF ( eigen_ ) CALL energyeigen_gpu( )
  !
  ! ... Sort wave functions
  !
  IF ( reorder ) CALL sort_vectors_gpu( )
  !
  DEALLOCATE( phi_d )
  IF ( eigen_ ) DEALLOCATE( hphi_d )
  IF ( uspp )   DEALLOCATE( sphi_d )
  !
  DEALLOCATE( owner_bgrp_id )
  !
  RETURN
  !
  !
CONTAINS
  !
  !
  SUBROUTINE gram_schmidt_diag( ibnd_start, ibnd_end )
    !
    ! ... Classical Gram-Schmidt on one diagonal block, device version.
    ! ... Uses the shared device buffer sc_d for the projection
    ! ... coefficients (capacity nbsize >= block size).
    !
    IMPLICIT NONE
    !
    INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
    !
    INTEGER  :: ibnd
    REAL(DP) :: norm
    COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
    !
    DO ibnd = ibnd_start, ibnd_end
       !
       IF ( ibnd > ibnd_start ) THEN
          !
          ! ... <phi_j| S |psi_i>
          !
          IF ( uspp ) THEN
             !
             CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
                             spsi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
             !
          ELSE
             !
             CALL ZGEMV_gpu( 'C', kdim, ibnd - ibnd_start, ONE, phi_d(1,ibnd_start), kdmx, &
                             psi_d(1,ibnd), 1, ZERO, sc_d(1), 1 )
             !
          END IF
          !
          ! ... plane waves are distributed: reduce the partial dot products
          !
          CALL mp_sum( sc_d, intra_bgrp_comm )
          !
          ! ... phi_i = phi_i - phi_j * <phi_j| S |psi_i>
          !
          CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, phi_d(1,ibnd_start), kdmx, &
                          sc_d(1), 1, ONE, phi_d(1,ibnd), 1 )
          !
          IF ( eigen_ ) &
          CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, hphi_d(1,ibnd_start), kdmx, &
                          sc_d(1), 1, ONE, hphi_d(1,ibnd), 1 )
          !
          IF ( uspp ) &
          CALL ZGEMV_gpu( 'N', kdim, ibnd - ibnd_start, MONE, sphi_d(1,ibnd_start), kdmx, &
                          sc_d(1), 1, ONE, sphi_d(1,ibnd), 1 )
          !
       END IF
       !
       ! ... Normalize : phi_i = phi_i / SQRT(<phi_i| S |phi_i>)
       !
       IF ( uspp ) THEN
          !
          norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, sphi_d(1,ibnd), 1 ) )
          !
       ELSE
          !
          norm = DBLE( ZDOTC_gpu( kdim, phi_d(1,ibnd), 1, phi_d(1,ibnd), 1 ) )
          !
       END IF
       !
       CALL mp_sum( norm, intra_bgrp_comm )
       !
       ! ... clip tiny negative values coming from round-off before SQRT
       !
       norm = SQRT( MAX( norm, 0._DP ) )
       !
       IF ( norm < eps16 ) &
       CALL errore( ' gram_schmidt_k ', ' vectors are linear dependent ', 1 )
       !
       CALL ZDSCAL_gpu( kdim, 1._DP / norm, phi_d(1,ibnd), 1 )
       !
       IF ( eigen_ ) &
       CALL ZDSCAL_gpu( kdim, 1._DP / norm, hphi_d(1,ibnd), 1 )
       !
       IF ( uspp ) &
       CALL ZDSCAL_gpu( kdim, 1._DP / norm, sphi_d(1,ibnd), 1 )
       !
    END DO
    !
    RETURN
    !
  END SUBROUTINE gram_schmidt_diag
  !
  !
  SUBROUTINE project_offdiag_gpu( ibnd_start, ibnd_end, jbnd_start, jbnd_end )
    !
    ! ... Project bands ibnd_start..ibnd_end out of the later bands
    ! ... jbnd_start..jbnd_end with device ZGEMMs, using the shared
    ! ... buffer sc2_d (capacity nbsize x nbsize) for the overlap matrix.
    !
    IMPLICIT NONE
    !
    INTEGER, INTENT(IN) :: ibnd_start, ibnd_end
    INTEGER, INTENT(IN) :: jbnd_start, jbnd_end
    !
    INTEGER :: ibnd_size
    INTEGER :: jbnd_size
    !
    ibnd_size = ibnd_end - ibnd_start + 1
    jbnd_size = jbnd_end - jbnd_start + 1
    !
    ! ... <phi_i| S |psi_j>
    !
    IF ( uspp ) THEN
       !
       CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
                       spsi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), ibnd_size )
       !
    ELSE
       !
       CALL gpu_ZGEMM( 'C', 'N', ibnd_size, jbnd_size, kdim, ONE, phi_d(1,ibnd_start), kdmx, &
                       psi_d(1,jbnd_start), kdmx, ZERO, sc2_d(1,1), ibnd_size )
       !
    END IF
    !
    ! ... NOTE(review): this reduces the whole sc2_d buffer, including the
    ! ... unused tail when the block is partial — harmless but confirm.
    !
    CALL mp_sum( sc2_d, intra_bgrp_comm )
    !
    ! ... phi_j = phi_j - phi_i * <phi_i| S |psi_j>
    !
    CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, phi_d(1,ibnd_start), kdmx, &
                    sc2_d(1,1), ibnd_size, ONE, phi_d(1,jbnd_start), kdmx )
    !
    IF ( eigen_ ) &
    CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, hphi_d(1,ibnd_start), kdmx, &
                    sc2_d(1,1), ibnd_size, ONE, hphi_d(1,jbnd_start), kdmx )
    !
    IF ( uspp ) &
    CALL gpu_ZGEMM( 'N', 'N', kdim, jbnd_size, ibnd_size, MONE, sphi_d(1,ibnd_start), kdmx, &
                    sc2_d(1,1), ibnd_size, ONE, sphi_d(1,jbnd_start), kdmx )
    !
    RETURN
    !
  END SUBROUTINE project_offdiag_gpu
  !
  !
  SUBROUTINE energyeigen_gpu( )
    !
    ! ... Fill host array e(ibnd) = <psi_i| H |psi_i> from the device
    ! ... wavefunctions.  Bands are split over band groups; the two
    ! ... mp_sum calls reduce over plane waves, then over band groups.
    !
    IMPLICIT NONE
    !
    INTEGER :: ibnd, ibnd_start, ibnd_end
    !
    COMPLEX(DP), EXTERNAL :: ZDOTC_gpu
    !
    ! ... <psi_i| H |psi_i>
    !
    e(:) = 0._DP
    !
    CALL divide( inter_bgrp_comm, nbnd, ibnd_start, ibnd_end )
    !
    DO ibnd = ibnd_start, ibnd_end
       !
       ! ... the diagonal element is real: keep only the real part
       !
       e(ibnd) = DBLE( ZDOTC_gpu( kdim, psi_d(1,ibnd), 1, hpsi_d(1,ibnd), 1 ) )
       !
    END DO
    !
    CALL mp_sum( e(ibnd_start:ibnd_end), intra_bgrp_comm )
    CALL mp_sum( e, inter_bgrp_comm )
    !
    RETURN
    !
  END SUBROUTINE energyeigen_gpu
  !
  !
  SUBROUTINE sort_vectors_gpu( )
    !
    ! ... Exchange (bubble) sort of e(:) in ascending order, swapping the
    ! ... device arrays psi_d / hpsi_d / spsi_d consistently.
    !
    IMPLICIT NONE
    !
    INTEGER  :: ibnd
    INTEGER  :: nswap
    REAL(DP) :: e0
    !
10  nswap = 0
    !
    DO ibnd = 2, nbnd
       !
       IF ( e(ibnd) < e(ibnd-1) ) THEN
          !
          nswap = nswap + 1
          !
          e0        = e(ibnd)
          e(ibnd)   = e(ibnd-1)
          e(ibnd-1) = e0
          !
          CALL ZSWAP_gpu( kdim, psi_d(1,ibnd), 1, psi_d(1,ibnd-1), 1 )
          !
          IF ( eigen_ ) &
          CALL ZSWAP_gpu( kdim, hpsi_d(1,ibnd), 1, hpsi_d(1,ibnd-1), 1 )
          !
          IF ( uspp ) &
          CALL ZSWAP_gpu( kdim, spsi_d(1,ibnd), 1, spsi_d(1,ibnd-1), 1 )
          !
       END IF
       !
    END DO
    !
    IF ( nswap > 0 ) GOTO 10
    !
    RETURN
    !
  END SUBROUTINE sort_vectors_gpu
  !
  !
END SUBROUTINE gram_schmidt_k_gpu
|
|
@ -1,3 +1,15 @@
|
|||
gram_schmidt_gamma.o : ../../UtilXlib/mp.o
|
||||
gram_schmidt_gamma.o : ../../UtilXlib/mp_bands_util.o
|
||||
gram_schmidt_gamma.o : ../../UtilXlib/util_param.o
|
||||
gram_schmidt_gamma_gpu.o : ../../UtilXlib/mp.o
|
||||
gram_schmidt_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
gram_schmidt_gamma_gpu.o : ../../UtilXlib/util_param.o
|
||||
gram_schmidt_k.o : ../../UtilXlib/mp.o
|
||||
gram_schmidt_k.o : ../../UtilXlib/mp_bands_util.o
|
||||
gram_schmidt_k.o : ../../UtilXlib/util_param.o
|
||||
gram_schmidt_k_gpu.o : ../../UtilXlib/mp.o
|
||||
gram_schmidt_k_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
gram_schmidt_k_gpu.o : ../../UtilXlib/util_param.o
|
||||
rotate_HSpsi_gamma.o : ../../UtilXlib/mp.o
|
||||
rotate_HSpsi_gamma.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_HSpsi_gamma.o : ../../UtilXlib/util_param.o
|
||||
|
@ -10,6 +22,8 @@ rotate_HSpsi_k.o : ../../UtilXlib/util_param.o
|
|||
rotate_HSpsi_k_gpu.o : ../../UtilXlib/mp.o
|
||||
rotate_HSpsi_k_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_HSpsi_k_gpu.o : ../../UtilXlib/util_param.o
|
||||
rotate_driver.o : ../../UtilXlib/util_param.o
|
||||
rotate_driver_cuf.o : ../../UtilXlib/util_param.o
|
||||
rotate_wfc_gamma.o : ../../UtilXlib/mp.o
|
||||
rotate_wfc_gamma.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_wfc_gamma.o : ../../UtilXlib/util_param.o
|
||||
|
@ -22,3 +36,15 @@ rotate_wfc_k.o : ../../UtilXlib/util_param.o
|
|||
rotate_wfc_k_gpu.o : ../../UtilXlib/mp.o
|
||||
rotate_wfc_k_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_wfc_k_gpu.o : ../../UtilXlib/util_param.o
|
||||
rotate_xpsi_gamma.o : ../../UtilXlib/mp.o
|
||||
rotate_xpsi_gamma.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_xpsi_gamma.o : ../../UtilXlib/util_param.o
|
||||
rotate_xpsi_gamma_gpu.o : ../../UtilXlib/mp.o
|
||||
rotate_xpsi_gamma_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_xpsi_gamma_gpu.o : ../../UtilXlib/util_param.o
|
||||
rotate_xpsi_k.o : ../../UtilXlib/mp.o
|
||||
rotate_xpsi_k.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_xpsi_k.o : ../../UtilXlib/util_param.o
|
||||
rotate_xpsi_k_gpu.o : ../../UtilXlib/mp.o
|
||||
rotate_xpsi_k_gpu.o : ../../UtilXlib/mp_bands_util.o
|
||||
rotate_xpsi_k_gpu.o : ../../UtilXlib/util_param.o
|
||||
|
|
|
@ -0,0 +1,95 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_xpsi_driver &
          ( npwx, npw, nstart, nbnd, psi, npol, overlap, evc, hevc, sevc, e, use_para_diag, gamma_only )
  !----------------------------------------------------------------------------
  !
  ! ... Driver routine (maybe it should be an interface) for
  ! ... Hamiltonian diagonalization in the subspace spanned
  ! ... by nstart states psi ( atomic or random wavefunctions ).
  ! ... Produces on output nbnd eigenvectors ( nbnd <= nstart ) in evc.
  ! ... Calls h_psi, s_psi to calculate H|psi> and S|psi>,
  ! ... which are saved in hevc and sevc.
  ! ... Dispatches to one of four workers depending on whether H is real
  ! ... (gamma_only) and whether parallel diagonalization is requested.
  !
  USE util_param, ONLY : DP
  !
  IMPLICIT NONE
  !
  ! ... I/O variables
  !
  INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
  !! dimension of the matrix to be diagonalized
  !! leading dimension of matrix psi, as declared in the calling pgm unit
  !! input number of states
  !! output number of states
  !! number of spin polarizations
  LOGICAL, INTENT(IN) :: overlap
  !! if .FALSE. : S|psi> not needed
  COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
  !! vectors spanning the subspace
  COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
  !! input and output eigenvectors (may overlap)
  COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
  !! H|psi> and S|psi>
  REAL(DP), INTENT(OUT) :: e(nbnd)
  !! eigenvalues
  LOGICAL,INTENT(IN) :: use_para_diag
  !! if true parallel diagonalization will be used
  LOGICAL,INTENT(IN) :: gamma_only
  !! set to true when H is real
  !
  EXTERNAL :: h_psi, s_psi
  ! h_psi(npwx,npw,nbnd,psi,hpsi)
  !     calculates H|psi>
  ! s_psi(npwx,npw,nbnd,spsi)
  !     calculates S|psi> (if needed)
  !     Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)
  !
  CALL start_clock( 'wfcrot' )
  !
  IF ( gamma_only ) THEN
     !
     ! ... real Hamiltonian: gamma-point specialized workers
     !
     IF ( use_para_diag ) THEN
        !
        CALL protate_xpsi_gamma ( h_psi, s_psi, overlap, &
                                  npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
        !
     ELSE
        !
        CALL rotate_xpsi_gamma ( h_psi, s_psi, overlap, &
                                 npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
        !
     END IF
     !
  ELSE
     !
     ! ... general k-point workers (complex H, npol spin components)
     !
     IF ( use_para_diag ) THEN
        !
        CALL protate_xpsi_k ( h_psi, s_psi, overlap, &
                              npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
        !
     ELSE
        !
        CALL rotate_xpsi_k ( h_psi, s_psi, overlap, &
                             npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
        !
     END IF
     !
  END IF
  !
  CALL stop_clock( 'wfcrot' )
  !
END SUBROUTINE rotate_xpsi_driver
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_xpsi_driver_cuf &
|
||||
( npwx, npw, nstart, nbnd, psi_d, npol, overlap, evc_d, hevc_d, sevc_d, e_d, use_para_diag, gamma_only )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
!! Driver routine for Hamiltonian diagonalization in the subspace
|
||||
!! spanned by nstart states psi ( atomic or random wavefunctions ).
|
||||
!! Produces on output nbnd eigenvectors ( nbnd <= nstart ) in evc.
|
||||
!! Calls h_psi, s_psi to calculate H|psi> and S|psi>,
|
||||
!! which are saved in hevc and sevc.
|
||||
!
|
||||
USE util_param, ONLY : DP
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
|
||||
!! dimension of the matrix to be diagonalized
|
||||
!! leading dimension of matrix psi, as declared in the calling pgm unit
|
||||
!! input number of states
|
||||
!! output number of states
|
||||
!! number of spin polarizations
|
||||
LOGICAL, INTENT(IN) :: overlap
|
||||
!! if .FALSE. : S|psi> not needed
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart)
|
||||
!! vectors spanning the subspace
|
||||
COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx*npol,nbnd)
|
||||
!! input and output eigenvectors (may overlap)
|
||||
COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx*npol,nbnd), sevc_d(npwx*npol,nbnd)
|
||||
!! H|psi> and S|psi>
|
||||
REAL(DP), INTENT(OUT) :: e_d(nbnd)
|
||||
!! eigenvalues
|
||||
LOGICAL, INTENT(IN) :: use_para_diag
|
||||
!! if true, use parallel diagonalization
|
||||
LOGICAL, INTENT(IN) :: gamma_only
|
||||
!! set to true if H matrix is real
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
|
||||
#endif
|
||||
|
||||
COMPLEX(DP), ALLOCATABLE :: psi_h(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE, TARGET :: evc_h(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: hevc_h(:,:)
|
||||
COMPLEX(DP), POINTER :: sevc_h(:,:)
|
||||
REAL(DP), ALLOCATABLE :: e_h(:)
|
||||
!
|
||||
EXTERNAL :: h_psi, s_psi, h_psi_gpu, s_psi_gpu
|
||||
! h_psi(npwx,npw,nbnd,psi,hpsi)
|
||||
! calculates H|psi>
|
||||
! s_psi(npwx,npw,nbnd,spsi)
|
||||
! calculates S|psi> (if needed)
|
||||
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)
|
||||
!
|
||||
CALL start_clock_gpu( 'wfcrot' ); !write (*,*) 'start wfcrot' ; FLUSH(6)
|
||||
!write (*,*) 'gamma_only' , gamma_only; FLUSH(6)
|
||||
!
|
||||
IF( use_para_diag ) THEN
|
||||
!
|
||||
!Allocate arrays to workaround parallel case
|
||||
!
|
||||
ALLOCATE(psi_h(npwx*npol,nstart), evc_h(npwx*npol,nbnd), hevc_h(npwx*npol,nbnd), &
|
||||
e_h(nbnd))
|
||||
IF(overlap) THEN
|
||||
ALLOCATE(sevc_h(npwx*npol,nbnd))
|
||||
ELSE
|
||||
sevc_h => evc_h
|
||||
END IF
|
||||
!
|
||||
psi_h(1:npwx*npol,1:nstart) = psi_d(1:npwx*npol,1:nstart)
|
||||
evc_h(1:npwx*npol,1:nbnd) = evc_d(1:npwx*npol,1:nbnd)
|
||||
!
|
||||
! use data distributed subroutine
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!write (*,*) 'inside para gamma'; FLUSH(6)
|
||||
!
|
||||
CALL protate_xpsi_gamma ( h_psi, s_psi, overlap, &
|
||||
npwx, npw, nstart, nbnd, psi_h, evc_h, hevc_h, sevc_h, e_h )
|
||||
!
|
||||
ELSE
|
||||
!write (*,*) 'inside para k'; FLUSH(6)
|
||||
!
|
||||
CALL protate_xpsi_k ( h_psi, s_psi, overlap, &
|
||||
npwx, npw, nstart, nbnd, npol, psi_h, evc_h, hevc_h, sevc_h, e_h )
|
||||
!
|
||||
END IF
|
||||
psi_d(1:npwx*npol,1:nstart) = psi_h(1:npwx*npol,1:nstart)
|
||||
evc_d(1:npwx*npol,1:nbnd) = evc_h(1:npwx*npol,1:nbnd)
|
||||
hevc_d(1:npwx*npol,1:nbnd) = hevc_h(1:npwx*npol,1:nbnd)
|
||||
e_d(1:nbnd) = e_h(1:nbnd)
|
||||
!
|
||||
DEALLOCATE(psi_h, evc_h, hevc_h, e_h)
|
||||
IF(overlap) THEN
|
||||
sevc_d(1:npwx*npol,1:nbnd) = sevc_h(1:npwx*npol,1:nbnd)
|
||||
DEALLOCATE(sevc_h)
|
||||
ELSE
|
||||
NULLIFY(sevc_h)
|
||||
END IF
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! use serial subroutines
|
||||
!
|
||||
IF ( gamma_only ) THEN
|
||||
!write (*,*) 'inside serial gamma'; FLUSH(6)
|
||||
!
|
||||
CALL rotate_xpsi_gamma_gpu ( h_psi_gpu, s_psi_gpu, overlap, &
|
||||
npwx, npw, nstart, nbnd, psi_d, evc_d, hevc_d, sevc_d, e_d )
|
||||
!
|
||||
ELSE
|
||||
!write (*,*) 'inside serial k'; FLUSH(6)
|
||||
!
|
||||
CALL rotate_xpsi_k_gpu ( h_psi_gpu, s_psi_gpu, overlap, &
|
||||
npwx, npw, nstart, nbnd, npol, psi_d, evc_d, hevc_d, sevc_d, e_d )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL stop_clock_gpu( 'wfcrot' )
|
||||
!
|
||||
END SUBROUTINE rotate_xpsi_driver_cuf
|
|
@ -19,7 +19,7 @@ SUBROUTINE rotate_wfc_gamma( h_psi, s_psi, overlap, &
|
|||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
|
||||
me_bgrp, root_bgrp
|
||||
USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
|
||||
USE mp, ONLY : mp_sum
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
|
|
|
@ -0,0 +1,529 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_xpsi_gamma( h_psi, s_psi, overlap, &
                              npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
  !----------------------------------------------------------------------------
  !
  ! ... Serial version of rotate_xpsi for Gamma-only calculations
  ! ... This version assumes real wavefunctions (k=0) with only
  ! ... half plane waves stored: psi(-G)=psi*(G), except G=0
  !
  USE util_param, ONLY : DP
  USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
                            me_bgrp, root_bgrp
  USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
  USE mp, ONLY : mp_sum
  !
  IMPLICIT NONE
  !
  include 'laxlib.fh'
  !
  ! ... I/O variables
  !
  INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd
  ! dimension of the matrix to be diagonalized
  ! leading dimension of matrix psi, as declared in the calling pgm unit
  ! input number of states
  ! output number of states
  ! first G with nonzero norm
  LOGICAL, INTENT(IN) :: overlap
  ! if .FALSE. : S|psi> not needed
  COMPLEX(DP), INTENT(INOUT) :: psi(npwx,nstart)
  COMPLEX(DP), INTENT(OUT) :: evc(npwx,nbnd)
  ! input and output eigenvectors (may overlap)
  COMPLEX(DP), INTENT(OUT) :: hevc(npwx,nbnd), sevc(npwx,nbnd)
  ! H|psi> and S|psi>
  REAL(DP), INTENT(OUT) :: e(nbnd)
  ! eigenvalues
  !
  ! ... local variables:
  !
  ! npw2/npwx2: complex arrays are reinterpreted as real arrays of twice
  ! the length by the real BLAS calls below (gamma-only trick)
  INTEGER :: npw2, npwx2
  INTEGER :: n_start, n_end, my_n
  REAL(DP), ALLOCATABLE :: hr(:,:), sr(:,:), vr(:,:)
  COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
  REAL(DP), ALLOCATABLE :: en(:)
  !
  EXTERNAL :: h_psi, s_psi
  ! h_psi(npwx,npw,nbnd,psi,hpsi)
  ! calculates H|psi>
  ! s_psi(npwx,npw,nbnd,spsi)
  ! calculates S|psi> (if needed)
  ! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)

  npw2 = 2 * npw
  npwx2 = 2 * npwx
  !
  IF ( gstart == -1 ) CALL errore( 'rotxpsig', 'gstart variable not initialized', 1 )
  !
  CALL start_clock('rotxpsig')
  !
  ALLOCATE( tpsi( npwx, nstart ) )
  ALLOCATE( hpsi( npwx, nstart ) )
  ! spsi is needed only when an overlap operator S is present
  IF ( overlap ) &
    ALLOCATE( spsi( npwx, nstart ) )
  ALLOCATE( hr( nstart, nstart ) )
  ALLOCATE( sr( nstart, nstart ) )
  ALLOCATE( vr( nstart, nstart ) )
  ALLOCATE( en( nstart ) )
  !
  ! ... Set up the Hamiltonian and Overlap matrix on the subspace :
  !
  ! ...      H_ij = <psi_i| H |psi_j>     S_ij = <psi_i| S |psi_j>
  !
  ! ... set Im[ psi(G=0) ] -  needed for numerical stability
  !
  IF ( gstart == 2 ) &
    psi(1,1:nstart) = CMPLX( DBLE( psi(1,1:nstart) ), 0.D0 ,kind=DP)
  !
  CALL start_clock('rotxpsig:hpsi')
  !
  CALL h_psi( npwx, npw, nstart, psi, hpsi )
  !
  CALL stop_clock('rotxpsig:hpsi')
  !
  IF ( overlap ) THEN
    !
    CALL start_clock('rotxpsig:spsi')
    !
    CALL s_psi( npwx, npw, nstart, psi, spsi )
    !
    CALL stop_clock('rotxpsig:spsi')
    !
  END IF
  !
  ! ... distribute the nstart columns over the band groups
  !
  CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
  my_n = n_end - n_start + 1
  !
  CALL start_clock('rotxpsig:hc')
  !
  hr = 0.D0
  !
  ! factor 2 in DGEMM accounts for the +G/-G pairs of the half-stored
  ! wavefunctions; when this process holds G=0 (gstart == 2) that term
  ! was counted twice, so DGER subtracts it once
  IF ( n_start <= n_end ) &
    CALL DGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, &
                psi, npwx2, hpsi(1,n_start), npwx2, 0.D0, hr(1,n_start), nstart )
  !
  IF ( gstart == 2 ) &
    CALL DGER( nstart, my_n, -1.D0, psi, npwx2, hpsi(1,n_start), npwx2, hr(1,n_start), nstart )
  !
  CALL mp_sum( hr , inter_bgrp_comm )
  !
  CALL mp_sum( hr , intra_bgrp_comm )
  !
  CALL stop_clock('rotxpsig:hc')
  !
  CALL start_clock('rotxpsig:sc')
  !
  sr = 0.D0
  !
  ! S matrix: use spsi when S is present, otherwise plain <psi|psi>
  IF ( overlap ) THEN
    !
    IF ( n_start <= n_end ) &
      CALL DGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, &
                  psi, npwx2, spsi(1,n_start), npwx2, 0.D0, sr(1,n_start), nstart )
    !
    IF ( gstart == 2 ) &
      CALL DGER( nstart, my_n, -1.D0, psi, npwx2, spsi(1,n_start), npwx2, sr(1,n_start), nstart )
    !
  ELSE
    !
    IF ( n_start <= n_end ) &
      CALL DGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, &
                  psi, npwx2, psi(1,n_start), npwx2, 0.D0, sr(1,n_start), nstart )
    !
    IF ( gstart == 2 ) &
      CALL DGER( nstart, my_n, -1.D0, psi, npwx2, psi(1,n_start), npwx2, sr(1,n_start), nstart )
    !
  END IF
  !
  CALL mp_sum( sr , inter_bgrp_comm )
  !
  CALL mp_sum( sr , intra_bgrp_comm )
  !
  CALL stop_clock('rotxpsig:sc')
  !
  ! ... Diagonalize
  !
  CALL start_clock('rotxpsig:diag')
  !
  ! generalized eigenproblem H v = e S v on the subspace (LAXlib)
  CALL diaghg( nstart, nbnd, hr, sr, nstart, en, vr, me_bgrp, root_bgrp, intra_bgrp_comm )
  !
  e(:) = en(1:nbnd)
  !
  CALL stop_clock('rotxpsig:diag')
  !
  ! ... update the basis set
  !
  CALL start_clock('rotxpsig:evc')
  !
  ! tpsi keeps a copy of psi so evc may safely overlap psi on output
  tpsi = psi
  !
  evc  = (0.D0, 0.D0)
  hevc = (0.D0, 0.D0)
  IF ( overlap ) &
    sevc = (0.D0, 0.D0)
  !
  ! rotate psi, hpsi (and spsi) by the eigenvector matrix vr
  IF ( n_start <= n_end ) THEN
    !
    CALL DGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
                tpsi(1,n_start), npwx2,  vr(n_start,1), nstart, 0.D0,  evc, npwx2 )
    !
    CALL DGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
                hpsi(1,n_start), npwx2, vr(n_start,1), nstart, 0.D0, hevc, npwx2 )
    !
    IF ( overlap ) &
      CALL DGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
                  spsi(1,n_start), npwx2, vr(n_start,1), nstart, 0.D0, sevc, npwx2 )
    !
  END IF
  !
  CALL mp_sum( evc,  inter_bgrp_comm )
  CALL mp_sum( hevc, inter_bgrp_comm )
  IF ( overlap ) &
    CALL mp_sum( sevc, inter_bgrp_comm )
  !
  CALL stop_clock('rotxpsig:evc')
  !
  DEALLOCATE( en )
  DEALLOCATE( vr )
  DEALLOCATE( sr )
  DEALLOCATE( hr )
  IF ( overlap ) &
    DEALLOCATE( spsi )
  DEALLOCATE( hpsi )
  DEALLOCATE( tpsi )
  !
  CALL stop_clock('rotxpsig')
  !
  !CALL print_clock('rotxpsig')
  !CALL print_clock('rotxpsig:hpsi')
  !CALL print_clock('rotxpsig:spsi')
  !CALL print_clock('rotxpsig:hc')
  !CALL print_clock('rotxpsig:sc')
  !CALL print_clock('rotxpsig:diag')
  !CALL print_clock('rotxpsig:evc')
  !
  RETURN
  !
END SUBROUTINE rotate_xpsi_gamma
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE protate_xpsi_gamma( h_psi, s_psi, overlap, &
                               npwx, npw, nstart, nbnd, psi, evc, hevc, sevc, e )
  !----------------------------------------------------------------------------
  !
  ! ... Parallel version of rotate_xpsi for Gamma-only calculations
  ! ... Subroutine with distributed matrices, written by Carlo Cavazzoni
  ! ... This version assumes real wavefunctions (k=0) with only
  ! ... half plane waves stored: psi(-G)=psi*(G), except G=0
  !
  ! ... The nstart x nstart subspace matrices are block-distributed over
  ! ... the ortho group (LAXlib descriptors); diagonalization uses pdiaghg.
  !
  USE util_param, ONLY : DP
  USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, &
                            nbgrp, root_bgrp_id, my_bgrp_id
  USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
  USE mp, ONLY : mp_bcast, mp_root_sum, mp_sum, mp_barrier
  !
  IMPLICIT NONE
  !
  include 'laxlib.fh'
  !
  ! ... I/O variables
  !
  INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd
  ! dimension of the matrix to be diagonalized
  ! leading dimension of matrix psi, as declared in the calling pgm unit
  ! input number of states
  ! output number of states
  ! first G with nonzero norm
  LOGICAL, INTENT(IN) :: overlap
  ! if .FALSE. : S|psi> not needed
  COMPLEX(DP), INTENT(INOUT) :: psi(npwx,nstart)
  COMPLEX(DP), INTENT(OUT) :: evc(npwx,nbnd)
  ! input and output eigenvectors (may overlap)
  COMPLEX(DP), INTENT(OUT) :: hevc(npwx,nbnd), sevc(npwx,nbnd)
  ! H|psi> and S|psi>
  REAL(DP), INTENT(OUT) :: e(nbnd)
  ! eigenvalues
  !
  ! ... local variables
  !
  ! npw2/npwx2: complex arrays are treated as real arrays of twice the
  ! length by the real BLAS calls (gamma-only trick)
  INTEGER :: npw2, npwx2
  REAL(DP), ALLOCATABLE :: hr(:,:), sr(:,:), vr(:,:)
  COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
  REAL(DP), ALLOCATABLE :: en(:)
  !
  INTEGER :: idesc(LAX_DESC_SIZE)
  ! matrix distribution descriptors
  INTEGER :: nx
  ! maximum local block dimension
  LOGICAL :: la_proc
  ! flag to distinguish procs involved in linear algebra
  LOGICAL :: do_distr_diag_inside_bgrp
  INTEGER :: ortho_parent_comm
  ! per-process block descriptors / ranks of the ortho grid
  INTEGER, ALLOCATABLE :: idesc_ip( :, :, : )
  INTEGER, ALLOCATABLE :: rank_ip( :, : )
  !
  EXTERNAL :: h_psi, s_psi
  ! h_psi(npwx,npw,nvec,psi,hpsi)
  ! calculates H|psi>
  ! s_psi(npwx,npw,nvec,spsi)
  ! calculates S|psi> (if needed)
  ! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)

  CALL start_clock('protxpsig')
  !
  ! query LAXlib configuration and build the block-distribution layout
  CALL laxlib_getval( do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp, &
                      ortho_parent_comm = ortho_parent_comm )
  CALL desc_init( nstart, nx, la_proc, idesc, rank_ip, idesc_ip )
  !
  npw2  = 2 * npw
  npwx2 = 2 * npwx
  !
  IF ( gstart == -1 ) CALL errore( 'protxpsig', 'gstart variable not initialized', 1 )
  !
  ALLOCATE( tpsi( npwx, nstart ) )
  ALLOCATE( hpsi( npwx, nstart ) )
  IF ( overlap ) &
    ALLOCATE( spsi( npwx, nstart ) )
  ! distributed matrices: only the local nx-by-nx block is stored
  ALLOCATE( hr( nx, nx ) )
  ALLOCATE( sr( nx, nx ) )
  ALLOCATE( vr( nx, nx ) )
  ALLOCATE( en( nstart ) )
  !
  ! ... Set up the Hamiltonian and Overlap matrix on the subspace :
  !
  ! ...      H_ij = <psi_i| H |psi_j>     S_ij = <psi_i| S |psi_j>
  !
  ! ... set Im[ psi(G=0) ] -  needed for numerical stability
  !
  IF ( gstart == 2 ) &
    psi(1,1:nstart) = CMPLX( DBLE( psi(1,1:nstart) ), 0.D0, kind=DP)
  !
  CALL start_clock('protxpsig:hpsi')
  !
  CALL h_psi( npwx, npw, nstart, psi, hpsi )
  !
  CALL stop_clock('protxpsig:hpsi')
  !
  IF ( overlap ) THEN
    !
    CALL start_clock('protxpsig:spsi')
    !
    CALL s_psi( npwx, npw, nstart, psi, spsi )
    !
    CALL stop_clock('protxpsig:spsi')
    !
  END IF
  !
  CALL start_clock('protxpsig:hc')
  !
  CALL compute_distmat( hr, psi, hpsi )
  !
  CALL stop_clock('protxpsig:hc')
  !
  CALL start_clock('protxpsig:sc')
  !
  ! S matrix: <psi|S|psi> when an overlap operator exists, else <psi|psi>
  IF ( overlap ) THEN
    !
    CALL compute_distmat( sr, psi, spsi )
    !
  ELSE
    !
    CALL compute_distmat( sr, psi, psi )
    !
  END IF
  !
  CALL stop_clock('protxpsig:sc')
  !
  ! ... Diagonalize
  !
  CALL start_clock('protxpsig:diag')
  !
  IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg en and vr are the same across ortho_parent_comm
    ! only the first bgrp performs the diagonalization
    IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nstart, hr, sr, nx, en, vr, idesc )
    IF( nbgrp > 1 ) THEN ! results must be brodcast to the other band groups
      CALL mp_bcast( vr, root_bgrp_id, inter_bgrp_comm )
      CALL mp_bcast( en, root_bgrp_id, inter_bgrp_comm )
    ENDIF
  ELSE
    CALL pdiaghg( nstart, hr, sr, nx, en, vr, idesc )
  END IF
  !
  e(:) = en(1:nbnd)
  !
  CALL stop_clock('protxpsig:diag')
  !
  ! ... update the basis set
  !
  CALL start_clock('protxpsig:evc')
  !
  ! tpsi keeps a copy of psi so evc may safely overlap psi on output
  tpsi = psi
  !
  CALL refresh_evc()
  !
  CALL stop_clock('protxpsig:evc')
  !
  DEALLOCATE( idesc_ip )
  DEALLOCATE( rank_ip )
  DEALLOCATE( en )
  DEALLOCATE( vr )
  DEALLOCATE( sr )
  DEALLOCATE( hr )
  IF ( overlap ) &
    DEALLOCATE( spsi )
  DEALLOCATE( hpsi )
  DEALLOCATE( tpsi )
  !
  CALL stop_clock('protxpsig')
  !
  !CALL print_clock('protxpsig')
  !CALL print_clock('protxpsig:hpsi')
  !CALL print_clock('protxpsig:spsi')
  !CALL print_clock('protxpsig:hc')
  !CALL print_clock('protxpsig:sc')
  !CALL print_clock('protxpsig:diag')
  !CALL print_clock('protxpsig:evc')
  !
  RETURN
  !
CONTAINS
  !
  SUBROUTINE compute_distmat( dm, v, w )
    !
    ! This subroutine compute <vi|wj> and store the
    ! result in distributed matrix dm
    !
    ! Each (ipr,ipc) block of the global nstart x nstart matrix is
    ! computed locally with real BLAS and accumulated on the process
    ! that owns it; only the lower triangle ipr <= ipc is computed and
    ! laxlib_dsqmsym restores the full symmetric matrix at the end.
    !
    INTEGER :: ipc, ipr
    INTEGER :: nr, nc, ir, ic, root
    REAL(DP), INTENT(OUT) :: dm( :, : )
    COMPLEX(DP) :: v(:,:), w(:,:)
    REAL(DP), ALLOCATABLE :: work( :, : )
    !
    ALLOCATE( work( nx, nx ) )
    !
    work = 0.0d0
    !
    DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
      !
      nc = idesc_ip( LAX_DESC_NC, 1, ipc )
      ic = idesc_ip( LAX_DESC_IC, 1, ipc )
      !
      DO ipr = 1, ipc ! use symmetry for the loop on row procs
        !
        nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
        ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
        !
        ! rank of the processor for which this block (ipr,ipc) is destinated
        !
        root = rank_ip( ipr, ipc )

        ! use blas subs. on the matrix block
        ! (factor 2 for +G/-G pairs; DGER removes the double-counted
        ! G=0 term on the process that holds it)

        CALL DGEMM( 'T', 'N', nr, nc, npw2, 2.D0, v(1,ir), npwx2, w(1,ic), npwx2, 0.D0, work, nx )

        IF ( gstart == 2 ) &
          CALL DGER( nr, nc, -1.D0, v(1,ir), npwx2, w(1,ic), npwx2, work, nx )

        ! accumulate result on dm of root proc.

        CALL mp_root_sum( work, dm, root, ortho_parent_comm )

      END DO
      !
    END DO

    ! when the ortho communicator spans several band groups the sum
    ! above counted each contribution nbgrp times; renormalize
    if (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) dm = dm/nbgrp
    !
    CALL laxlib_dsqmsym( nstart, dm, nx, idesc )
    !
    DEALLOCATE( work )
    !
    RETURN
  END SUBROUTINE compute_distmat
  !
  !
  SUBROUTINE refresh_evc( )
    !
    ! Rotate the basis: evc = tpsi*vr, hevc = hpsi*vr (and sevc = spsi*vr
    ! when overlap is used). Each owner broadcasts its block of the
    ! eigenvector matrix vr; every process accumulates the partial
    ! products with beta = 0 on the first row block, beta = 1 afterwards.
    !
    INTEGER :: ipc, ipr
    INTEGER :: nr, nc, ir, ic, root
    REAL(DP), ALLOCATABLE :: vtmp( :, : )
    REAL(DP) :: beta

    ALLOCATE( vtmp( nx, nx ) )
    !
    DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
      !
      nc = idesc_ip( LAX_DESC_NC, 1, ipc )
      ic = idesc_ip( LAX_DESC_IC, 1, ipc )
      !
      ! only the first nbnd columns of the rotation are needed
      IF( ic <= nbnd ) THEN
        !
        nc = min( nc, nbnd - ic + 1 )
        !
        beta = 0.0d0

        DO ipr = 1, idesc(LAX_DESC_NPR)
          !
          nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
          ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
          !
          root = rank_ip( ipr, ipc )

          IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
            !
            ! this proc sends his block
            !
            CALL mp_bcast( vr(:,1:nc), root, ortho_parent_comm )
            !
            CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
                        tpsi(1,ir), npwx2, vr, nx, beta, evc(1,ic), npwx2 )
            !
            CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
                        hpsi(1,ir), npwx2, vr, nx, beta, hevc(1,ic), npwx2 )
            !
            IF ( overlap ) &
              CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
                          spsi(1,ir), npwx2, vr, nx, beta, sevc(1,ic), npwx2 )
            !
          ELSE
            !
            ! all other procs receive
            !
            CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
            !
            CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
                        tpsi(1,ir), npwx2, vtmp, nx, beta, evc(1,ic), npwx2 )
            !
            CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
                        hpsi(1,ir), npwx2, vtmp, nx, beta, hevc(1,ic), npwx2 )
            !
            IF ( overlap ) &
              CALL DGEMM( 'N', 'N', npw2, nc, nr, 1.D0, &
                          spsi(1,ir), npwx2, vtmp, nx, beta, sevc(1,ic), npwx2 )
            !
          END IF
          !
          ! after the first row block, accumulate into the output
          beta = 1.0d0

        END DO
        !
      END IF
      !
    END DO
    !
    DEALLOCATE( vtmp )

    RETURN
  END SUBROUTINE refresh_evc
  !
END SUBROUTINE protate_xpsi_gamma
|
|
@ -0,0 +1,233 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_xpsi_gamma_gpu( h_psi_gpu, s_psi_gpu, overlap, &
                                  npwx, npw, nstart, nbnd, psi_d, evc_d, hevc_d, sevc_d, e_d )
  !----------------------------------------------------------------------------
  !
  ! ... Serial version of rotate_xpsi for Gamma-only calculations
  ! ... This version assumes real wavefunctions (k=0) with only
  ! ... half plane waves stored: psi_d(-G)=psi_d*(G), except G=0
  !
  ! ... GPU counterpart of rotate_xpsi_gamma: all work arrays are device
  ! ... resident and BLAS calls go through cuBLAS when __CUDA is defined
  ! ... (plain dgemm otherwise, via the preprocessor alias below).
  !
#if defined(__CUDA)
  USE cudafor
  USE cublas
#else
#define cublasDGEMM dgemm
#endif
  USE util_param, ONLY : DP
  USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, root_bgrp_id, nbgrp, my_bgrp_id, &
                            me_bgrp, root_bgrp
  USE mp_bands_util, ONLY : gstart ! index of the first nonzero G
  USE mp, ONLY : mp_sum
  !
  IMPLICIT NONE
  !
  include 'laxlib.fh'
  !
  ! ... I/O variables
  !
  INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd
  ! dimension of the matrix to be diagonalized
  ! leading dimension of matrix psi, as declared in the calling pgm unit
  ! input number of states
  ! output number of states
  ! first G with nonzero norm
  LOGICAL, INTENT(IN) :: overlap
  ! if .FALSE. : S|psi> not needed
  COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx,nstart)
  COMPLEX(DP), INTENT(OUT) :: evc_d(npwx,nbnd)
  ! input and output eigenvectors (may overlap)
  COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx,nbnd), sevc_d(npwx,nbnd)
  ! H|psi> and S|psi>
  REAL(DP), INTENT(OUT) :: e_d(nbnd)
  ! eigenvalues
#if defined(__CUDA)
  attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
#endif
  !
  ! ... local variables:
  !
  ! npw2/npwx2: complex arrays are treated as real arrays of twice the
  ! length by the real BLAS calls (gamma-only trick)
  INTEGER :: npw2, npwx2
  INTEGER :: n_start, n_end, my_n, i
  REAL(DP), ALLOCATABLE :: hr_d(:,:), sr_d(:,:), vr_d(:,:)
  COMPLEX(DP), ALLOCATABLE :: tpsi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
  REAL(DP), ALLOCATABLE :: en_d(:)
#if defined(__CUDA)
  attributes(DEVICE) :: hr_d, sr_d, vr_d, tpsi_d, hpsi_d, spsi_d, en_d
#endif
  !
  EXTERNAL :: h_psi_gpu, s_psi_gpu
  ! h_psi(npwx,npw,nbnd,psi,hpsi)
  ! calculates H|psi>
  ! s_psi(npwx,npw,nbnd,spsi)
  ! calculates S|psi> (if needed)
  ! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nbnd)

  npw2  = 2 * npw
  npwx2 = 2 * npwx
  !
  IF ( gstart == -1 ) CALL errore( 'rotxpsig', 'gstart variable not initialized', 1 )
  !
  ALLOCATE( tpsi_d( npwx, nstart ) )
  ALLOCATE( hpsi_d( npwx, nstart ) )
  IF ( overlap ) &
    ALLOCATE( spsi_d( npwx, nstart ) )
  ALLOCATE( hr_d( nstart, nstart ) )
  ALLOCATE( sr_d( nstart, nstart ) )
  ALLOCATE( vr_d( nstart, nstart ) )
  ALLOCATE( en_d( nstart ) )
  !
  ! NOTE(review): the clock starts after the allocations here, while the
  ! CPU version times them too - confirm this asymmetry is intended.
  CALL start_clock('rotxpsig')
  !
  ! ... Set up the Hamiltonian and Overlap matrix on the subspace :
  !
  ! ...      H_ij = <psi_i| H |psi_j>     S_ij = <psi_i| S |psi_j>
  !
  ! ... set Im[ psi_d(G=0) ] -  needed for numerical stability
  !
  IF ( gstart == 2 ) THEN
    !$cuf kernel do(1)
    DO i=1,nstart
      psi_d(1,i) = CMPLX( DBLE( psi_d(1,i) ), 0.D0, kind=DP)
    END DO
  END IF
  !
  CALL start_clock('rotxpsig:hpsi')
  !
  CALL h_psi_gpu( npwx, npw, nstart, psi_d, hpsi_d)
  !
  CALL stop_clock('rotxpsig:hpsi')
  !
  IF ( overlap ) THEN
    !
    CALL start_clock('rotxpsig:spsi')
    !
    CALL s_psi_gpu( npwx, npw, nstart, psi_d, spsi_d )
    !
    CALL stop_clock('rotxpsig:spsi')
    !
  END IF
  !
  CALL start_clock('rotxpsig:hc')
  hr_d = 0.D0
  ! distribute the nstart columns over the band groups
  CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
  my_n = n_end - n_start + 1
  !
  ! factor 2 accounts for the +G/-G pairs of the half-stored wavefunctions;
  ! CGcudaDGER (project-provided DGER wrapper, presumably CUDA-aware -
  ! defined elsewhere) subtracts the double-counted G=0 term
  IF ( n_start .le. n_end ) &
    CALL cublasDGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, psi_d, &
                      npwx2, hpsi_d(1,n_start), npwx2, 0.D0, hr_d(1,n_start), nstart )
  !
  IF ( gstart == 2 ) &
    CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, npwx2, hpsi_d(1,n_start), npwx2, hr_d(1,n_start), nstart )
  !
  ! NOTE(review): mp_sum on device arrays - presumably relies on a
  ! CUDA-aware MPI / GPU mp interface; confirm build requirements.
  CALL mp_sum( hr_d , inter_bgrp_comm )
  !
  CALL mp_sum( hr_d , intra_bgrp_comm )
  !
  CALL stop_clock('rotxpsig:hc')
  !
  CALL start_clock('rotxpsig:sc')
  !
  sr_d = 0.D0
  !
  ! S matrix: use spsi_d when S is present, otherwise plain <psi|psi>
  IF ( overlap ) THEN
    !
    IF ( n_start .le. n_end ) &
      CALL cublasDGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, psi_d, &
                        npwx2, spsi_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
    !
    IF ( gstart == 2 ) &
      CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, npwx2, spsi_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
    !
  ELSE
    !
    IF ( n_start .le. n_end ) &
      CALL cublasDGEMM( 'T', 'N', nstart, my_n, npw2, 2.D0, psi_d, &
                        npwx2, psi_d(1,n_start), npwx2, 0.D0, sr_d(1,n_start), nstart )
    !
    IF ( gstart == 2 ) &
      CALL CGcudaDGER( nstart, my_n, -1.D0, psi_d, npwx2, psi_d(1,n_start), npwx2, sr_d(1,n_start), nstart )
    !
  END IF
  !
  CALL mp_sum( sr_d , inter_bgrp_comm )
  !
  CALL mp_sum( sr_d , intra_bgrp_comm )
  !
  CALL stop_clock('rotxpsig:sc')
  !
  ! ... Diagonalize
  !
  CALL start_clock('rotxpsig:diag')
  !
  ! generalized eigenproblem H v = e S v on the subspace (LAXlib)
  CALL diaghg( nstart, nbnd, hr_d, sr_d, nstart, en_d, vr_d, me_bgrp, root_bgrp, intra_bgrp_comm )
  !
  CALL stop_clock('rotxpsig:diag')
  !
  CALL start_clock('rotxpsig:evc')
  !
  ! copy the first nbnd eigenvalues to the output array on the device
  !$cuf kernel do(1) <<<*,*>>>
  DO i=1, nbnd
    e_d(i) = en_d(i)
  END DO
  !
  ! ... update the basis set
  !
  ! tpsi_d keeps a copy of psi_d so evc_d may safely overlap psi_d
  tpsi_d = psi_d
  !
  evc_d  = (0.D0, 0.D0)
  hevc_d = (0.D0, 0.D0)
  IF ( overlap ) &
    sevc_d = (0.D0, 0.D0)
  !
  ! rotate psi, hpsi (and spsi) by the eigenvector matrix vr_d
  IF ( n_start .le. n_end ) THEN
    !
    CALL cublasDGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
                      tpsi_d(1,n_start), npwx2, vr_d(n_start,1), nstart, 0.D0, evc_d, npwx2 )
    !
    CALL cublasDGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
                      hpsi_d(1,n_start), npwx2, vr_d(n_start,1), nstart, 0.D0, hevc_d, npwx2 )
    !
    IF ( overlap ) &
      CALL cublasDGEMM( 'N', 'N', npw2, nbnd, my_n, 1.D0, &
                        spsi_d(1,n_start), npwx2, vr_d(n_start,1), nstart, 0.D0, sevc_d, npwx2 )
    !
  END IF
  !
  CALL mp_sum( evc_d,  inter_bgrp_comm )
  CALL mp_sum( hevc_d, inter_bgrp_comm )
  IF ( overlap ) &
    CALL mp_sum( sevc_d, inter_bgrp_comm )
  !
  CALL stop_clock('rotxpsig:evc')
  !
  DEALLOCATE( en_d )
  DEALLOCATE( vr_d )
  DEALLOCATE( sr_d )
  DEALLOCATE( hr_d )
  IF ( overlap ) &
    DEALLOCATE( spsi_d )
  DEALLOCATE( hpsi_d )
  DEALLOCATE( tpsi_d )
  !
  CALL stop_clock('rotxpsig')
  !
  !CALL print_clock('rotxpsig')
  !CALL print_clock('rotxpsig:hpsi')
  !CALL print_clock('rotxpsig:spsi')
  !CALL print_clock('rotxpsig:hc')
  !CALL print_clock('rotxpsig:sc')
  !CALL print_clock('rotxpsig:diag')
  !CALL print_clock('rotxpsig:evc')
  !
  RETURN
  !
END SUBROUTINE rotate_xpsi_gamma_gpu
|
|
@ -0,0 +1,515 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_xpsi_k( h_psi, s_psi, overlap, &
|
||||
npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... Serial version of rotate_xpsi for colinear, k-point calculations
|
||||
!
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, nbgrp, my_bgrp_id, &
|
||||
me_bgrp, root_bgrp
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
include 'laxlib.fh'
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
|
||||
! dimension of the matrix to be diagonalized
|
||||
! leading dimension of matrix psi, as declared in the calling pgm unit
|
||||
! input number of states
|
||||
! output number of states
|
||||
! number of spin polarizations
|
||||
LOGICAL, INTENT(IN) :: overlap
|
||||
! if .FALSE. : S|psi> not needed
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
|
||||
COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
|
||||
! input and output eigenvectors (may overlap)
|
||||
COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
|
||||
! H|psi> and S|psi>
|
||||
REAL(DP), INTENT(OUT) :: e(nbnd)
|
||||
! eigenvalues
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: kdim, kdmx
|
||||
INTEGER :: n_start, n_end, my_n
|
||||
COMPLEX(DP), ALLOCATABLE :: hc(:,:), sc(:,:), vc(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
|
||||
REAL(DP), ALLOCATABLE :: en(:)
|
||||
!
|
||||
EXTERNAL :: h_psi, s_psi
|
||||
! h_psi(npwx,npw,nvec,psi,hpsi)
|
||||
! calculates H|psi>
|
||||
! s_psi(npwx,npw,nvec,spsi)
|
||||
! calculates S|psi> (if needed)
|
||||
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
|
||||
|
||||
IF ( npol == 1 ) THEN
|
||||
!
|
||||
kdim = npw
|
||||
kdmx = npwx
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
kdim = npwx*npol
|
||||
kdmx = npwx*npol
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL start_clock('rotxpsik')
|
||||
!
|
||||
ALLOCATE( tpsi( kdmx, nstart ) )
|
||||
ALLOCATE( hpsi( kdmx, nstart ) )
|
||||
IF ( overlap ) &
|
||||
ALLOCATE( spsi(kdmx, nstart ) )
|
||||
ALLOCATE( hc( nstart, nstart) )
|
||||
ALLOCATE( sc( nstart, nstart) )
|
||||
ALLOCATE( vc( nstart, nstart) )
|
||||
ALLOCATE( en( nstart ) )
|
||||
!
|
||||
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
|
||||
!
|
||||
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
|
||||
!
|
||||
CALL start_clock('rotxpsik:hpsi')
|
||||
!
|
||||
CALL h_psi( npwx, npw, nstart, psi, hpsi )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:hpsi')
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
CALL start_clock('rotxpsik:spsi')
|
||||
!
|
||||
CALL s_psi( npwx, npw, nstart, psi, spsi )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:spsi')
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
|
||||
my_n = n_end - n_start + 1
|
||||
!
|
||||
CALL start_clock('rotxpsik:hc')
|
||||
!
|
||||
hc = (0.D0, 0.D0)
|
||||
!
|
||||
IF ( n_start <= n_end ) &
|
||||
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), &
|
||||
psi, kdmx, hpsi(1,n_start), kdmx, (0.D0, 0.D0), hc(1,n_start), nstart )
|
||||
!
|
||||
CALL mp_sum( hc, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( hc, intra_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:hc')
|
||||
!
|
||||
CALL start_clock('rotxpsik:sc')
|
||||
!
|
||||
sc = (0.D0, 0.D0)
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
IF ( n_start <= n_end ) &
|
||||
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), &
|
||||
psi, kdmx, spsi(1,n_start), kdmx, (0.D0, 0.D0), sc(1,n_start), nstart )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF ( n_start <= n_end ) &
|
||||
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), &
|
||||
psi, kdmx, psi(1,n_start), kdmx, (0.D0, 0.D0), sc(1,n_start), nstart )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( sc, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( sc, intra_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:sc')
|
||||
!
|
||||
! ... Diagonalize
|
||||
!
|
||||
CALL start_clock('rotxpsik:diag')
|
||||
!
|
||||
CALL diaghg( nstart, nbnd, hc, sc, nstart, en, vc, me_bgrp, root_bgrp, intra_bgrp_comm )
|
||||
!
|
||||
e(:) = en(1:nbnd)
|
||||
!
|
||||
CALL stop_clock('rotxpsik:diag')
|
||||
!
|
||||
! ... update the basis set
|
||||
!
|
||||
CALL start_clock('rotxpsik:evc')
|
||||
!
|
||||
tpsi = psi
|
||||
!
|
||||
evc = (0.D0, 0.D0)
|
||||
hevc = (0.D0, 0.D0)
|
||||
IF ( overlap ) &
|
||||
sevc = (0.D0, 0.D0)
|
||||
!
|
||||
IF ( n_start <= n_end ) THEN
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
|
||||
tpsi(1,n_start), kdmx, vc(n_start,1), nstart, (0.D0, 0.D0), evc, kdmx )
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
|
||||
hpsi(1,n_start), kdmx, vc(n_start,1), nstart, (0.D0, 0.D0), hevc, kdmx )
|
||||
!
|
||||
IF ( overlap ) &
|
||||
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
|
||||
spsi(1,n_start), kdmx, vc(n_start,1), nstart, (0.D0, 0.D0), sevc, kdmx )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( evc, inter_bgrp_comm )
|
||||
CALL mp_sum( hevc, inter_bgrp_comm )
|
||||
IF ( overlap ) &
|
||||
CALL mp_sum( sevc, inter_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:evc')
|
||||
!
|
||||
DEALLOCATE( en )
|
||||
DEALLOCATE( vc )
|
||||
DEALLOCATE( sc )
|
||||
DEALLOCATE( hc )
|
||||
IF ( overlap ) &
|
||||
DEALLOCATE( spsi )
|
||||
DEALLOCATE( hpsi )
|
||||
DEALLOCATE( tpsi )
|
||||
!
|
||||
CALL stop_clock('rotxpsik')
|
||||
!
|
||||
!CALL print_clock('rotxpsik')
|
||||
!CALL print_clock('rotxpsik:hpsi')
|
||||
!CALL print_clock('rotxpsik:spsi')
|
||||
!CALL print_clock('rotxpsik:hc')
|
||||
!CALL print_clock('rotxpsik:sc')
|
||||
!CALL print_clock('rotxpsik:diag')
|
||||
!CALL print_clock('rotxpsik:evc')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rotate_xpsi_k
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE protate_xpsi_k( h_psi, s_psi, overlap, &
|
||||
npwx, npw, nstart, nbnd, npol, psi, evc, hevc, sevc, e )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... Parallel version of rotate_xpsi for colinear, k-point calculations
|
||||
! ... Subroutine with distributed matrices, written by Carlo Cavazzoni
|
||||
!
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, &
|
||||
nbgrp, root_bgrp_id, my_bgrp_id
|
||||
USE mp, ONLY : mp_bcast, mp_root_sum, mp_sum, mp_barrier
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
include 'laxlib.fh'
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
|
||||
! dimension of the matrix to be diagonalized
|
||||
! leading dimension of matrix psi, as declared in the calling pgm unit
|
||||
! input number of states
|
||||
! output number of states
|
||||
! number of spin polarizations
|
||||
LOGICAL, INTENT(IN) :: overlap
|
||||
! if .FALSE. : S|psi> not needed
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
|
||||
COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
|
||||
! input and output eigenvectors (may overlap)
|
||||
COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
|
||||
! H|psi> and S|psi>
|
||||
REAL(DP), INTENT(OUT) :: e(nbnd)
|
||||
! eigenvalues
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: kdim, kdmx
|
||||
COMPLEX(DP), ALLOCATABLE :: hc(:,:), sc(:,:), vc(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tpsi(:,:), hpsi(:,:), spsi(:,:)
|
||||
REAL(DP), ALLOCATABLE :: en(:)
|
||||
!
|
||||
INTEGER :: idesc(LAX_DESC_SIZE)
|
||||
! matrix distribution descriptors
|
||||
INTEGER :: nx
|
||||
! maximum local block dimension
|
||||
LOGICAL :: la_proc
|
||||
! flag to distinguish procs involved in linear algebra
|
||||
LOGICAL :: do_distr_diag_inside_bgrp
|
||||
INTEGER :: ortho_parent_comm
|
||||
INTEGER, ALLOCATABLE :: idesc_ip( :, :, : )
|
||||
INTEGER, ALLOCATABLE :: rank_ip( :, : )
|
||||
!
|
||||
EXTERNAL :: h_psi, s_psi
|
||||
! h_psi(npwx,npw,nvec,psi,hpsi)
|
||||
! calculates H|psi>
|
||||
! s_psi(npwx,npw,nvec,spsi)
|
||||
! calculates S|psi> (if needed)
|
||||
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
|
||||
|
||||
CALL start_clock('protxpsik')
|
||||
!
|
||||
CALL laxlib_getval( do_distr_diag_inside_bgrp = do_distr_diag_inside_bgrp, &
|
||||
ortho_parent_comm = ortho_parent_comm )
|
||||
CALL desc_init( nstart, nx, la_proc, idesc, rank_ip, idesc_ip )
|
||||
!
|
||||
IF ( npol == 1 ) THEN
|
||||
!
|
||||
kdim = npw
|
||||
kdmx = npwx
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
kdim = npwx*npol
|
||||
kdmx = npwx*npol
|
||||
!
|
||||
END IF
|
||||
!
|
||||
ALLOCATE( tpsi( kdmx, nstart ) )
|
||||
ALLOCATE( hpsi( kdmx, nstart ) )
|
||||
IF ( overlap ) &
|
||||
ALLOCATE( spsi( kdmx, nstart ) )
|
||||
ALLOCATE( hc( nx, nx) )
|
||||
ALLOCATE( sc( nx, nx) )
|
||||
ALLOCATE( vc( nx, nx) )
|
||||
ALLOCATE( en( nstart ) )
|
||||
!
|
||||
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
|
||||
!
|
||||
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
|
||||
!
|
||||
CALL start_clock('protxpsik:hpsi')
|
||||
!
|
||||
CALL h_psi( npwx, npw, nstart, psi, hpsi )
|
||||
!
|
||||
CALL stop_clock('protxpsik:hpsi')
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
CALL start_clock('protxpsik:spsi')
|
||||
!
|
||||
CALL s_psi( npwx, npw, nstart, psi, spsi )
|
||||
!
|
||||
CALL stop_clock('protxpsik:spsi')
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL start_clock('protxpsik:hc')
|
||||
!
|
||||
CALL compute_distmat( hc, psi, hpsi )
|
||||
!
|
||||
CALL stop_clock('protxpsik:hc')
|
||||
!
|
||||
CALL start_clock('protxpsik:sc')
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
CALL compute_distmat( sc, psi, spsi )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
CALL compute_distmat( sc, psi, psi )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL stop_clock('protxpsik:sc')
|
||||
!
|
||||
! ... Diagonalize
|
||||
!
|
||||
CALL start_clock('protxpsik:diag')
|
||||
!
|
||||
IF ( do_distr_diag_inside_bgrp ) THEN ! NB on output of pdiaghg en and vc are the same across ortho_parent_comm
|
||||
! only the first bgrp performs the diagonalization
|
||||
IF( my_bgrp_id == root_bgrp_id ) CALL pdiaghg( nstart, hc, sc, nx, en, vc, idesc )
|
||||
IF( nbgrp > 1 ) THEN ! results must be brodcast to the other band groups
|
||||
CALL mp_bcast( vc, root_bgrp_id, inter_bgrp_comm )
|
||||
CALL mp_bcast( en, root_bgrp_id, inter_bgrp_comm )
|
||||
ENDIF
|
||||
ELSE
|
||||
CALL pdiaghg( nstart, hc, sc, nx, en, vc, idesc )
|
||||
END IF
|
||||
!
|
||||
e(:) = en(1:nbnd)
|
||||
!
|
||||
CALL stop_clock('protxpsik:diag')
|
||||
!
|
||||
! ... update the basis set
|
||||
!
|
||||
CALL start_clock('protxpsik:evc')
|
||||
!
|
||||
tpsi = psi
|
||||
!
|
||||
CALL refresh_evc()
|
||||
!
|
||||
CALL stop_clock('protxpsik:evc')
|
||||
!
|
||||
DEALLOCATE( en )
|
||||
DEALLOCATE( vc )
|
||||
DEALLOCATE( sc )
|
||||
DEALLOCATE( hc )
|
||||
IF ( overlap ) &
|
||||
DEALLOCATE( spsi )
|
||||
DEALLOCATE( hpsi )
|
||||
DEALLOCATE( tpsi )
|
||||
!
|
||||
DEALLOCATE( idesc_ip )
|
||||
DEALLOCATE( rank_ip )
|
||||
!
|
||||
CALL stop_clock('protxpsik')
|
||||
!
|
||||
!CALL print_clock('protxpsik')
|
||||
!CALL print_clock('protxpsik:hpsi')
|
||||
!CALL print_clock('protxpsik:spsi')
|
||||
!CALL print_clock('protxpsik:hc')
|
||||
!CALL print_clock('protxpsik:sc')
|
||||
!CALL print_clock('protxpsik:diag')
|
||||
!CALL print_clock('protxpsik:evc')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
!
|
||||
CONTAINS
|
||||
!
|
||||
SUBROUTINE compute_distmat( dm, v, w )
|
||||
!
|
||||
! This subroutine compute <vi|wj> and store the
|
||||
! result in distributed matrix dm
|
||||
!
|
||||
INTEGER :: ipc, ipr
|
||||
INTEGER :: nr, nc, ir, ic, root
|
||||
COMPLEX(DP), INTENT(OUT) :: dm( :, : )
|
||||
COMPLEX(DP) :: v(:,:), w(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: work( :, : )
|
||||
!
|
||||
ALLOCATE( work( nx, nx ) )
|
||||
!
|
||||
work = ( 0.0_DP, 0.0_DP )
|
||||
!
|
||||
DO ipc = 1, idesc(LAX_DESC_NPC) ! loop on column procs
|
||||
!
|
||||
nc = idesc_ip( LAX_DESC_NC, 1, ipc )
|
||||
ic = idesc_ip( LAX_DESC_IC, 1, ipc )
|
||||
!
|
||||
DO ipr = 1, ipc ! desc%npr ! ipc ! use symmetry for the loop on row procs
|
||||
!
|
||||
nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
|
||||
ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
|
||||
!
|
||||
! rank of the processor for which this block (ipr,ipc) is destinated
|
||||
!
|
||||
root = rank_ip( ipr, ipc )
|
||||
|
||||
! use blas subs. on the matrix block
|
||||
|
||||
CALL ZGEMM( 'C', 'N', nr, nc, kdim, ( 1.D0, 0.D0 ), v(1,ir), kdmx, w(1,ic), kdmx, ( 0.D0, 0.D0 ), work, nx )
|
||||
|
||||
! accumulate result on dm of root proc.
|
||||
CALL mp_root_sum( work, dm, root, ortho_parent_comm )
|
||||
|
||||
END DO
|
||||
!
|
||||
END DO
|
||||
if (ortho_parent_comm.ne.intra_bgrp_comm .and. nbgrp > 1) dm = dm/nbgrp
|
||||
!
|
||||
CALL laxlib_zsqmher( nstart, dm, nx, idesc )
|
||||
!
|
||||
DEALLOCATE( work )
|
||||
!
|
||||
RETURN
|
||||
END SUBROUTINE compute_distmat
|
||||
|
||||
|
||||
SUBROUTINE refresh_evc( )
|
||||
!
|
||||
INTEGER :: ipc, ipr
|
||||
INTEGER :: nr, nc, ir, ic, root
|
||||
COMPLEX(DP), ALLOCATABLE :: vtmp( :, : )
|
||||
COMPLEX(DP) :: beta
|
||||
|
||||
ALLOCATE( vtmp( nx, nx ) )
|
||||
!
|
||||
DO ipc = 1, idesc(LAX_DESC_NPC)
|
||||
!
|
||||
nc = idesc_ip( LAX_DESC_NC, 1, ipc )
|
||||
ic = idesc_ip( LAX_DESC_IC, 1, ipc )
|
||||
!
|
||||
IF( ic <= nbnd ) THEN
|
||||
!
|
||||
nc = min( nc, nbnd - ic + 1 )
|
||||
!
|
||||
beta = ( 0.D0, 0.D0 )
|
||||
|
||||
DO ipr = 1, idesc(LAX_DESC_NPR)
|
||||
!
|
||||
nr = idesc_ip( LAX_DESC_NR, ipr, ipc )
|
||||
ir = idesc_ip( LAX_DESC_IR, ipr, ipc )
|
||||
!
|
||||
root = rank_ip( ipr, ipc )
|
||||
|
||||
IF( ipr-1 == idesc(LAX_DESC_MYR) .AND. ipc-1 == idesc(LAX_DESC_MYC) .AND. la_proc ) THEN
|
||||
!
|
||||
! this proc sends his block
|
||||
!
|
||||
CALL mp_bcast( vc(:,1:nc), root, ortho_parent_comm )
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
|
||||
tpsi(1,ir), kdmx, vc, nx, beta, evc(1,ic), kdmx )
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
|
||||
hpsi(1,ir), kdmx, vc, nx, beta, hevc(1,ic), kdmx )
|
||||
!
|
||||
IF ( overlap ) &
|
||||
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
|
||||
spsi(1,ir), kdmx, vc, nx, beta, sevc(1,ic), kdmx )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
! all other procs receive
|
||||
!
|
||||
CALL mp_bcast( vtmp(:,1:nc), root, ortho_parent_comm )
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
|
||||
tpsi(1,ir), kdmx, vtmp, nx, beta, evc(1,ic), kdmx )
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
|
||||
hpsi(1,ir), kdmx, vtmp, nx, beta, hevc(1,ic), kdmx )
|
||||
!
|
||||
IF ( overlap ) &
|
||||
CALL ZGEMM( 'N', 'N', kdim, nc, nr, ( 1.D0, 0.D0 ), &
|
||||
spsi(1,ir), kdmx, vtmp, nx, beta, sevc(1,ic), kdmx )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
|
||||
beta = ( 1.D0, 0.D0 )
|
||||
|
||||
END DO
|
||||
!
|
||||
END IF
|
||||
!
|
||||
END DO
|
||||
!
|
||||
DEALLOCATE( vtmp )
|
||||
|
||||
RETURN
|
||||
END SUBROUTINE refresh_evc
|
||||
!
|
||||
END SUBROUTINE protate_xpsi_k
|
|
@ -0,0 +1,225 @@
|
|||
!
|
||||
! Copyright (C) 2019 National Institute of Advanced Industrial Science and Technology (AIST)
|
||||
!
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
||||
SUBROUTINE rotate_xpsi_k_gpu( h_psi_gpu, s_psi_gpu, overlap, &
|
||||
npwx, npw, nstart, nbnd, npol, psi_d, evc_d, hevc_d, sevc_d, e_d )
|
||||
!----------------------------------------------------------------------------
|
||||
!
|
||||
! ... Serial version of rotate_xpsi for colinear, k-point calculations
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
USE cudafor
|
||||
USE cublas
|
||||
#endif
|
||||
USE util_param, ONLY : DP
|
||||
USE mp_bands_util, ONLY : intra_bgrp_comm, inter_bgrp_comm, nbgrp, my_bgrp_id, &
|
||||
me_bgrp, root_bgrp
|
||||
USE mp, ONLY : mp_sum
|
||||
!
|
||||
IMPLICIT NONE
|
||||
!
|
||||
include 'laxlib.fh'
|
||||
!
|
||||
! ... I/O variables
|
||||
!
|
||||
INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
|
||||
! dimension of the matrix to be diagonalized
|
||||
! leading dimension of matrix psi, as declared in the calling pgm unit
|
||||
! input number of states
|
||||
! output number of states
|
||||
! number of spin polarizations
|
||||
LOGICAL, INTENT(IN) :: overlap
|
||||
! if .FALSE. : S|psi> not needed
|
||||
COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart)
|
||||
COMPLEX(DP), INTENT(OUT) :: evc_d(npwx*npol,nbnd) !(intent inout?)
|
||||
! input and output eigenvectors (may overlap)
|
||||
COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx*npol,nbnd), sevc_d(npwx*npol,nbnd)
|
||||
! H|psi> and S|psi>
|
||||
REAL(DP), INTENT(OUT) :: e_d(nbnd)
|
||||
! eigenvalues
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
|
||||
#endif
|
||||
!
|
||||
! ... local variables
|
||||
!
|
||||
INTEGER :: kdim, kdmx
|
||||
INTEGER :: n_start, n_end, my_n, i, j
|
||||
COMPLEX(DP), ALLOCATABLE :: hc_d(:,:), sc_d(:,:), vc_d(:,:)
|
||||
COMPLEX(DP), ALLOCATABLE :: tpsi_d(:,:), hpsi_d(:,:), spsi_d(:,:)
|
||||
REAL(DP), ALLOCATABLE :: en_d(:)
|
||||
#if defined(__CUDA)
|
||||
attributes(DEVICE) :: hc_d, sc_d, vc_d, tpsi_d, hpsi_d, spsi_d, en_d
|
||||
#endif
|
||||
!
|
||||
EXTERNAL :: h_psi_gpu, s_psi_gpu
|
||||
! h_psi(npwx,npw,nvec,psi,hpsi)
|
||||
! calculates H|psi>
|
||||
! s_psi(npwx,npw,nvec,spsi)
|
||||
! calculates S|psi> (if needed)
|
||||
! Vectors psi,hpsi,spsi are dimensioned (npwx,npol,nvec)
|
||||
|
||||
IF ( npol == 1 ) THEN
|
||||
!
|
||||
kdim = npw
|
||||
kdmx = npwx
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
kdim = npwx*npol
|
||||
kdmx = npwx*npol
|
||||
!
|
||||
END IF
|
||||
!
|
||||
!
|
||||
ALLOCATE( tpsi_d( kdmx, nstart ) )
|
||||
ALLOCATE( hpsi_d( kdmx, nstart ) )
|
||||
IF ( overlap ) &
|
||||
ALLOCATE( spsi_d(kdmx, nstart ) )
|
||||
ALLOCATE( hc_d( nstart, nstart) )
|
||||
ALLOCATE( sc_d( nstart, nstart) )
|
||||
ALLOCATE( vc_d( nstart, nstart) )
|
||||
ALLOCATE( en_d( nstart ) )
|
||||
!
|
||||
CALL start_clock('rotxpsik')
|
||||
!
|
||||
! ... Set up the Hamiltonian and Overlap matrix on the subspace :
|
||||
!
|
||||
! ... H_ij = <psi_i| H |psi_j> S_ij = <psi_i| S |psi_j>
|
||||
!
|
||||
CALL start_clock('rotxpsik:hpsi')
|
||||
!
|
||||
CALL h_psi_gpu( npwx, npw, nstart, psi_d, hpsi_d )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:hpsi')
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
CALL start_clock('rotxpsik:spsi')
|
||||
!
|
||||
CALL s_psi_gpu( npwx, npw, nstart, psi_d, spsi_d )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:spsi')
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL divide(inter_bgrp_comm, nstart, n_start, n_end)
|
||||
my_n = n_end - n_start + 1
|
||||
!
|
||||
CALL start_clock('rotxpsik:hc')
|
||||
!
|
||||
hc_d = (0.D0, 0.D0)
|
||||
!
|
||||
IF ( n_start .le. n_end ) &
|
||||
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), psi_d, &
|
||||
kdmx, hpsi_d(1,n_start), kdmx, (0.D0, 0.D0), hc_d(1,n_start), nstart )
|
||||
!
|
||||
CALL mp_sum( hc_d, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( hc_d, intra_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:hc')
|
||||
!
|
||||
CALL start_clock('rotxpsik:sc')
|
||||
!
|
||||
sc_d = (0.D0, 0.D0)
|
||||
!
|
||||
IF ( overlap ) THEN
|
||||
!
|
||||
IF ( n_start .le. n_end ) &
|
||||
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), psi_d, &
|
||||
kdmx, spsi_d(1,n_start), kdmx, (0.D0, 0.D0), sc_d(1,n_start), nstart )
|
||||
!
|
||||
ELSE
|
||||
!
|
||||
IF ( n_start .le. n_end ) &
|
||||
CALL ZGEMM( 'C', 'N', nstart, my_n, kdim, (1.D0, 0.D0), psi_d, &
|
||||
kdmx, psi_d(1,n_start), kdmx, (0.D0, 0.D0), sc_d(1,n_start), nstart )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( sc_d, inter_bgrp_comm )
|
||||
!
|
||||
CALL mp_sum( sc_d, intra_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:sc')
|
||||
!
|
||||
! ... Diagonalize
|
||||
!
|
||||
CALL start_clock('rotxpsik:diag')
|
||||
!
|
||||
CALL diaghg( nstart, nbnd, hc_d, sc_d, nstart, en_d, vc_d, me_bgrp, root_bgrp, intra_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:diag')
|
||||
CALL start_clock('rotxpsik:evc')
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i=1,nbnd
|
||||
e_d(i) = en_d(i)
|
||||
END DO
|
||||
!
|
||||
! ... update the basis set
|
||||
!
|
||||
!$cuf kernel do(1) <<<*,*>>>
|
||||
DO i=1, nbnd
|
||||
DO j=1, kdmx
|
||||
tpsi_d(j,i) = psi_d(j,i)
|
||||
END DO
|
||||
END DO
|
||||
!
|
||||
evc_d = (0.D0, 0.D0)
|
||||
hevc_d = (0.D0, 0.D0)
|
||||
!
|
||||
IF ( overlap ) sevc_d = (0.D0, 0.D0)
|
||||
!
|
||||
IF ( n_start .le. n_end ) THEN
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
|
||||
tpsi_d(1,n_start), kdmx, vc_d(n_start,1), nstart, (0.D0, 0.D0), evc_d, kdmx )
|
||||
!
|
||||
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
|
||||
hpsi_d(1,n_start), kdmx, vc_d(n_start,1), nstart, (0.D0, 0.D0), hevc_d, kdmx )
|
||||
!
|
||||
IF ( overlap ) &
|
||||
CALL ZGEMM( 'N', 'N', kdim, nbnd, my_n, (1.D0, 0.D0), &
|
||||
spsi_d(1,n_start), kdmx, vc_d(n_start,1), nstart, (0.D0, 0.D0), sevc_d, kdmx )
|
||||
!
|
||||
END IF
|
||||
!
|
||||
CALL mp_sum( evc_d, inter_bgrp_comm )
|
||||
CALL mp_sum( hevc_d, inter_bgrp_comm )
|
||||
IF ( overlap ) &
|
||||
CALL mp_sum( sevc_d, inter_bgrp_comm )
|
||||
!
|
||||
CALL stop_clock('rotxpsik:evc')
|
||||
!
|
||||
DEALLOCATE( en_d )
|
||||
DEALLOCATE( vc_d )
|
||||
DEALLOCATE( sc_d )
|
||||
DEALLOCATE( hc_d )
|
||||
IF ( overlap ) DEALLOCATE( spsi_d )
|
||||
DEALLOCATE( hpsi_d )
|
||||
DEALLOCATE( tpsi_d )
|
||||
!
|
||||
CALL stop_clock('rotxpsik')
|
||||
!
|
||||
!CALL print_clock('rotxpsik')
|
||||
!CALL print_clock('rotxpsik:hpsi')
|
||||
!CALL print_clock('rotxpsik:spsi')
|
||||
!CALL print_clock('rotxpsik:hc')
|
||||
!CALL print_clock('rotxpsik:sc')
|
||||
!CALL print_clock('rotxpsik:diag')
|
||||
!CALL print_clock('rotxpsik:evc')
|
||||
!
|
||||
RETURN
|
||||
!
|
||||
END SUBROUTINE rotate_xpsi_k_gpu
|
||||
!
|
||||
!
|
||||
!----------------------------------------------------------------------------
|
|
@ -1,5 +1,5 @@
|
|||
# Makefile for KS_Solvers
|
||||
sinclude ../make.inc
|
||||
include ../make.inc
|
||||
|
||||
# the following directive prevents execution of this makefile in parallel
|
||||
# Problem is, libks_solvers.a depends upon all libdavid libcg etc, but if
|
||||
|
@ -10,7 +10,7 @@ sinclude ../make.inc
|
|||
|
||||
default: all
|
||||
|
||||
all: libdavid_rci libdavid libcg libppcg libparo libdense libnewsolver libks_solvers.a
|
||||
all: libdavid_rci libdavid libcg libppcg libparo librmm libdense libnewsolver libks_solvers.a ks_solver_interfaces.fh
|
||||
|
||||
ALLOBJS = \
|
||||
Davidson_RCI/david_rci.o \
|
||||
|
@ -18,6 +18,7 @@ Davidson/cegterg.o \
|
|||
Davidson/regterg.o \
|
||||
CG/ccgdiagg.o \
|
||||
CG/rcgdiagg.o \
|
||||
PPCG/generic_cublas.o \
|
||||
PPCG/ppcg_gamma.o \
|
||||
PPCG/ppcg_k.o \
|
||||
ParO/bpcg_gamma.o \
|
||||
|
@ -31,7 +32,21 @@ ParO/paro_k_new.o \
|
|||
DENSE/rotate_HSpsi_gamma.o \
|
||||
DENSE/rotate_HSpsi_k.o \
|
||||
DENSE/rotate_wfc_gamma.o \
|
||||
DENSE/rotate_wfc_k.o
|
||||
DENSE/rotate_wfc_k.o \
|
||||
DENSE/rotate_driver.o \
|
||||
DENSE/gram_schmidt_gamma.o \
|
||||
DENSE/gram_schmidt_k.o \
|
||||
DENSE/gram_schmidt_k_gpu.o \
|
||||
DENSE/gram_schmidt_gamma_gpu.o \
|
||||
DENSE/rotate_xpsi_gamma.o \
|
||||
DENSE/rotate_xpsi_gamma_gpu.o \
|
||||
DENSE/rotate_xpsi_k_gpu.o \
|
||||
DENSE/rotate_xpsi_k.o \
|
||||
RMM/crmmdiagg.o \
|
||||
RMM/crmmdiagg_gpu.o \
|
||||
RMM/rrmmdiagg.o \
|
||||
RMM/rrmmdiagg_gpu.o
|
||||
|
||||
|
||||
# GPU-related objects
|
||||
ALLOBJS += \
|
||||
|
@ -39,6 +54,7 @@ Davidson/cegterg_gpu.o \
|
|||
Davidson/regterg_gpu.o \
|
||||
DENSE/rotate_wfc_k_gpu.o \
|
||||
DENSE/rotate_wfc_gamma_gpu.o \
|
||||
DENSE/rotate_driver_cuf.o \
|
||||
CG/rcgdiagg_gpu.o \
|
||||
CG/ccgdiagg_gpu.o \
|
||||
PPCG/generic_cublas.o \
|
||||
|
@ -53,10 +69,12 @@ DENSE/rotate_HSpsi_k_gpu.o
|
|||
|
||||
# add here other objects, e.g. ParO/*.o NewSolver/*.o
|
||||
|
||||
libks_solvers.a: $(ALLOBJS)
|
||||
libks_solvers.a: $(ALLOBJS)
|
||||
$(AR) $(ARFLAGS) $@ $?
|
||||
$(RANLIB) $@
|
||||
|
||||
|
||||
|
||||
libdavid_rci :
|
||||
if test -d Davidson_RCI ; then \
|
||||
( cd Davidson_RCI ; $(MAKE) all || exit 1 ) ; fi
|
||||
|
@ -81,6 +99,10 @@ libdense :
|
|||
if test -d DENSE ; then \
|
||||
( cd DENSE ; $(MAKE) all || exit 1 ) ; fi
|
||||
|
||||
librmm :
|
||||
if test -d RMM ; then \
|
||||
( cd RMM ; $(MAKE) all || exit 1 ) ; fi
|
||||
|
||||
libnewsolver :
|
||||
if test -d NewSolver ; then \
|
||||
( cd NewSolver ; $(MAKE) all || exit 1 ) ; fi
|
||||
|
@ -92,6 +114,9 @@ clean :
|
|||
if test -d PPCG ; then ( cd PPCG ; $(MAKE) clean ) ; fi
|
||||
if test -d ParO ; then ( cd ParO ; $(MAKE) clean ) ; fi
|
||||
if test -d DENSE ; then ( cd DENSE ; $(MAKE) clean ) ; fi
|
||||
if test -d RMM ; then ( cd RMM ; $(MAKE) clean ) ; fi
|
||||
if test -d NewSolver ; then ( cd NewSolver ; $(MAKE) clean ) ; fi
|
||||
rm -f *.fh
|
||||
rm -f libks_solvers.a
|
||||
|
||||
|
|
@ -82,6 +82,172 @@ implicit none
|
|||
return
|
||||
end subroutine gpu_DTRSM
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
DOUBLE COMPLEX function ZDOTC_gpu(n, zx, incx, zy, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
implicit none
|
||||
integer :: n, incx, incy
|
||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: zx, zy
|
||||
ZDOTC_gpu = cublasZDOTC(n, zx, incx, zy, incy)
|
||||
#endif
|
||||
return
|
||||
end function ZDOTC_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE ZSWAP_gpu(n, zx, incx, zy, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
implicit none
|
||||
integer :: n, incx, incy
|
||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: zx, zy
|
||||
CALL cublasZSWAP(n, zx, incx, zy, incy)
|
||||
#endif
|
||||
return
|
||||
END SUBROUTINE ZSWAP_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE ZCOPY_gpu(n, zx, incx, zy, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
INTEGER :: n, incx, incy
|
||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: zx, zy
|
||||
CALL cublasZCOPY(n, zx, incx, zy, incy)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE ZCOPY_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE ZAXPY_gpu(n, za, zx, incx, zy, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
INTEGER :: n, incx, incy
|
||||
DOUBLE COMPLEX :: za
|
||||
DOUBLE COMPLEX, dimension(*) :: zx, zy
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: zx, zy
|
||||
CALL cublasZAXPY(n, za, zx, incx, zy, incy)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE ZAXPY_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE ZGEMV_gpu(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
CHARACTER :: trans
|
||||
INTEGER :: lda, m, n, incx, incy
|
||||
DOUBLE COMPLEX :: alpha, beta
|
||||
DOUBLE COMPLEX, dimension(lda, *) :: a
|
||||
DOUBLE COMPLEX, dimension(*) :: x, y
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: a, x, y
|
||||
CALL cublasZGEMV(trans, m, n, alpha, a, lda, x, incx, beta, y, incy)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE ZGEMV_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE ZDSCAL_gpu(n, da, zx, incx)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
INTEGER :: n, incx
|
||||
DOUBLE PRECISION :: da
|
||||
DOUBLE COMPLEX, dimension(*) :: zx
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: zx
|
||||
CALL cublasZDSCAL(n, da, zx, incx)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE ZDSCAL_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE ZSCAL_gpu(n, za, zx, incx)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
INTEGER :: n, incx
|
||||
DOUBLE COMPLEX :: za
|
||||
DOUBLE COMPLEX, dimension(*) :: zx
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: zx
|
||||
CALL cublasZSCAL(n, za, zx, incx)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE ZSCAL_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE DGEMV_gpu(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
#if defined(__CUDA)
|
||||
use cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
DOUBLE PRECISION :: ALPHA,BETA
|
||||
INTEGER :: INCX,INCY,LDA,M,N
|
||||
CHARACTER :: TRANS
|
||||
DOUBLE PRECISION :: A(LDA,*),X(*),Y(*)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: A, X, Y
|
||||
call cublasDGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE DGEMV_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE DCOPY_gpu(n, x, incx, y, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
INTEGER :: n, incx, incy
|
||||
DOUBLE PRECISION, INTENT(IN) :: x(*)
|
||||
DOUBLE PRECISION, INTENT(OUT) :: y(*)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: x, y
|
||||
call cublasDCOPY(n, x, incx, y, incy)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE DCOPY_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE DAXPY_gpu(n, a, x, incx, y, incy)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
INTEGER :: n, incx, incy
|
||||
DOUBLE PRECISION, INTENT(IN) :: a
|
||||
DOUBLE PRECISION, INTENT(IN) :: x(*)
|
||||
DOUBLE PRECISION, INTENT(OUT) :: y(*)
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: x, y
|
||||
call cublasDAXPY( n, a, x, incx, y, incy)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE DAXPY_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE DSCAL_gpu(n, a, x, incx)
|
||||
#if defined(__CUDA)
|
||||
USE cublas
|
||||
#endif
|
||||
IMPLICIT NONE
|
||||
integer :: n, incx
|
||||
DOUBLE PRECISION :: a
|
||||
DOUBLE PRECISION, dimension(*) :: x
|
||||
#if defined(__CUDA)
|
||||
attributes(device) :: x
|
||||
call cublasDSCAL(n, a, x, incx)
|
||||
#endif
|
||||
RETURN
|
||||
END SUBROUTINE DSCAL_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
SUBROUTINE gpu_threaded_memset(array, val, length)
|
||||
!
|
||||
#if defined(__CUDA)
|
||||
|
@ -219,3 +385,17 @@ SUBROUTINE gpu_threaded_backassign(array_out, idx, array_in, kdimx, nact, use_a2
|
|||
!
|
||||
END SUBROUTINE gpu_threaded_backassign
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
!----------------------------------------------------------------------
! GPU wrapper for BLAS level-1 DSWAP: exchange dx and dy (double
! precision). When built with __CUDA the arrays are device-resident and
! the work is delegated to cuBLAS; in CPU-only builds the routine
! compiles to an empty stub so call sites do not need to be guarded.
!
! Fixes: replace the non-portable kind literal REAL(8) with
! DOUBLE PRECISION (kind numbers are processor dependent), and declare
! both arrays INTENT(INOUT) since a swap reads and writes both,
! matching the declaration style of the sibling wrappers above.
!----------------------------------------------------------------------
SUBROUTINE DSWAP_gpu(n, dx, incx, dy, incy)
#if defined(__CUDA)
  USE cublas
#endif
  IMPLICIT NONE
  INTEGER :: n, incx, incy                              ! element count and strides
  DOUBLE PRECISION, INTENT(INOUT) :: dx(*), dy(*)       ! vectors exchanged in place
#if defined(__CUDA)
  attributes(device) :: dx, dy
  CALL cublasDSWAP(n, dx, incx, dy, incy)
#endif
  RETURN
END SUBROUTINE DSWAP_gpu
|
||||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
|
|
@ -87,7 +87,6 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
|
|||
INTEGER :: kdim, kdmx, cg_iter, ibnd
|
||||
!
|
||||
REAL(DP), EXTERNAL :: DDOT
|
||||
COMPLEX(DP), EXTERNAL :: ZDOTC
|
||||
|
||||
EXTERNAL hs_1psi, g_1psi
|
||||
! hs_1psi( npwx, npw, psi, hpsi, spsi )
|
||||
|
@ -115,7 +114,7 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
|
|||
CALL stop_clock( 'pcg:ortho' )
|
||||
!-
|
||||
|
||||
g0 = ZDOTC( kdim, z ,1 ,r ,1)
|
||||
g0 = DDOT( 2*kdim, z ,1 ,r ,1)
|
||||
CALL mp_sum( g0, intra_bgrp_comm ) ! g0 = < initial z | initial r >
|
||||
|
||||
ff = 0.d0 ; ff0 = ff
|
||||
|
@ -143,13 +142,13 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
|
|||
CALL stop_clock( 'pcg:hs_1psi' )
|
||||
w = w - e* sp
|
||||
|
||||
gamma = ZDOTC( kdim, p ,1 ,w ,1)
|
||||
gamma = DDOT( 2*kdim, p ,1 ,w ,1)
|
||||
CALL mp_sum( gamma, intra_bgrp_comm )
|
||||
alpha = g0/gamma
|
||||
|
||||
psi(:) = psi(:) + alpha * p(:) ! updated solution
|
||||
r(:) = r(:) - alpha * w(:) ! updated gradient
|
||||
g2 = ZDOTC( kdim, z ,1 ,r ,1)
|
||||
g2 = DDOT ( 2*kdim, z ,1 ,r ,1)
|
||||
CALL mp_sum( g2, intra_bgrp_comm ) ! g2 = < old z | new r >
|
||||
z(:) = r(:) ; call g_1psi(npwx,npw,z,e) ! updated preconditioned gradient
|
||||
!- project on conduction bands
|
||||
|
@ -159,10 +158,10 @@ SUBROUTINE pcg_k( hs_1psi, g_1psi, psi0, spsi0, npw, npwx, nbnd, npol, psi, ethr
|
|||
CALL ZGEMV( 'N', kdim, nbnd, (-1.D0,0.D0), psi0, kdmx, spsi0vec, 1, ONE, z, 1 )
|
||||
CALL stop_clock( 'pcg:ortho' )
|
||||
!-
|
||||
g1 = ZDOTC( kdim, z, 1, r ,1)
|
||||
g1 = DDOT ( 2*kdim, z, 1, r ,1)
|
||||
CALL mp_sum( g1, intra_bgrp_comm ) ! g1 = < new z | new r >
|
||||
! evaluate the function
|
||||
ff = - 0.5_DP * (ZDOTC( kdim, psi, 1, r ,1) + ZDOTC( kdim, psi, 1, b ,1) )
|
||||
ff = - 0.5_DP * (DDOT(2*kdim, psi, 1, r ,1) + DDOT(2*kdim, psi, 1, b ,1) )
|
||||
CALL mp_sum( ff, intra_bgrp_comm )
|
||||
!write (6,*) cg_iter, g1, ff, gamma
|
||||
if ( ff > ff0 .AND. ff0 < 0.d0 ) psi(:) = psi(:) - alpha * p(:) ! fallback solution if last iteration failed to improve the function... exit and hope next time it'll be better
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
# Makefile for RMM
# Builds librmm.a, the RMM(-DIIS) iterative diagonalization kernels
# (complex/real, CPU and GPU variants), as part of KS_Solvers.

include ../../make.inc

# location of needed modules and included files (if any)
MODFLAGS= $(MOD_FLAG) ../../ELPA/src $(MOD_FLAG) ../../LAXlib $(MOD_FLAG) ../PPCG $(MOD_FLAG) ../../UtilXlib $(MOD_FLAG).

# objects archived into librmm.a
RMM = \
crmmdiagg.o \
crmmdiagg_gpu.o \
rrmmdiagg.o \
rrmmdiagg_gpu.o


all : librmm.a


# $? = only the members newer than the archive, so rebuilds are incremental
librmm.a: $(RMM)
	$(AR) $(ARFLAGS) $@ $?
	$(RANLIB) $@

clean :
	- /bin/rm -f *.o *.a *.d *.i *~ *_tmp.f90 *.mod *.L *.x

include make.depend
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,12 @@
|
|||
# Module dependencies of the RMM objects: every variant uses the
# MPI/band-group utility modules from UtilXlib.
crmmdiagg.o : ../../UtilXlib/mp.o
crmmdiagg.o : ../../UtilXlib/mp_bands_util.o
crmmdiagg.o : ../../UtilXlib/util_param.o
crmmdiagg_gpu.o : ../../UtilXlib/mp.o
crmmdiagg_gpu.o : ../../UtilXlib/mp_bands_util.o
crmmdiagg_gpu.o : ../../UtilXlib/util_param.o
rrmmdiagg.o : ../../UtilXlib/mp.o
rrmmdiagg.o : ../../UtilXlib/mp_bands_util.o
rrmmdiagg.o : ../../UtilXlib/util_param.o
rrmmdiagg_gpu.o : ../../UtilXlib/mp.o
rrmmdiagg_gpu.o : ../../UtilXlib/mp_bands_util.o
rrmmdiagg_gpu.o : ../../UtilXlib/util_param.o
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,72 @@
|
|||
! Copyright (C) 2013-2021 Quantum ESPRESSO group
|
||||
! This file is distributed under the terms of the
|
||||
! GNU General Public License. See the file `License'
|
||||
! in the root directory of the present distribution,
|
||||
! or http://www.gnu.org/copyleft/gpl.txt .
|
||||
!
|
||||
|
||||
INTERFACE rotate_xpsi
  SUBROUTINE rotate_xpsi_driver &
      ( npwx, npw, nstart, nbnd, psi, npol, overlap, evc, hevc, sevc, e, use_para_diag, gamma_only )
    !! Host-side driver for Hamiltonian diagonalization in the subspace
    !! spanned by nstart states psi; counterpart of the CUDA-Fortran
    !! interface rotate_xpsi_driver_cuf below.
    IMPORT :: DP
    IMPLICIT NONE
    INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
    !! dimension of the matrix to be diagonalized
    !! leading dimension of matrix psi, as declared in the calling pgm unit
    !! input number of states
    !! output number of states
    !! number of spin polarizations
    LOGICAL, INTENT(IN) :: overlap
    !! if .FALSE. : S|psi> not needed
    COMPLEX(DP), INTENT(INOUT) :: psi(npwx*npol,nstart)
    !! vectors spanning the subspace
    COMPLEX(DP), INTENT(OUT) :: evc(npwx*npol,nbnd)
    !! input and output eigenvectors (may overlap)
    COMPLEX(DP), INTENT(OUT) :: hevc(npwx*npol,nbnd), sevc(npwx*npol,nbnd)
    !! H|psi> and S|psi>
    REAL(DP), INTENT(OUT) :: e(nbnd)
    !! eigenvalues
    LOGICAL,INTENT(IN) :: use_para_diag
    !! if true parallel diagonalization will be used
    LOGICAL,INTENT(IN) :: gamma_only
    !! set to true when H is real
  END SUBROUTINE rotate_xpsi_driver
#if defined (__CUDA)
  SUBROUTINE rotate_xpsi_driver_cuf &
      ( npwx, npw, nstart, nbnd, psi_d, npol, overlap, evc_d, hevc_d, sevc_d, e_d, use_para_diag, gamma_only )
    !! Driver routine for Hamiltonian diagonalization in the subspace
    !! spanned by nstart states psi ( atomic or random wavefunctions ).
    !! Interface for the CUDA-Fortran case.
    !! Produces on output nbnd eigenvectors ( nbnd <= nstart ) in evc.
    !! Calls h_psi, s_psi to calculate H|psi> and S|psi>,
    !! which are saved in hevc and sevc.
    IMPORT :: DP
    IMPLICIT NONE
    INTEGER, INTENT(IN) :: npw, npwx, nstart, nbnd, npol
    !! dimension of the matrix to be diagonalized
    !! leading dimension of matrix psi, as declared in the calling pgm unit
    !! input number of states
    !! output number of states
    !! number of spin polarizations
    LOGICAL, INTENT(IN) :: overlap
    !! if .FALSE. : S|psi> not needed
    COMPLEX(DP), INTENT(INOUT) :: psi_d(npwx*npol,nstart)
    !! vectors spanning the subspace
    COMPLEX(DP), INTENT(INOUT) :: evc_d(npwx*npol,nbnd)
    !! input and output eigenvectors (may overlap)
    COMPLEX(DP), INTENT(OUT) :: hevc_d(npwx*npol,nbnd), sevc_d(npwx*npol,nbnd)
    !! H|psi> and S|psi>
    REAL(DP), INTENT(OUT) :: e_d(nbnd)
    !! eigenvalues
    LOGICAL, INTENT(IN) :: use_para_diag
    !! if true, use parallel diagonalization
    LOGICAL, INTENT(IN) :: gamma_only
    !! set to true if H matrix is real
    attributes(DEVICE) :: psi_d, evc_d, hevc_d, sevc_d, e_d
  END SUBROUTINE rotate_xpsi_driver_cuf
#endif
END INTERFACE
|
||||
|
||||
|
||||
|
|
@ -107,4 +107,6 @@ if(QE_ENABLE_TEST)
|
|||
add_unit_test(test_qe_lax-r1-t3 1 3 $<TARGET_FILE:qe_lax_test>)
|
||||
add_unit_test(test_qe_lax-r4-t1 4 1 $<TARGET_FILE:qe_lax_test>)
|
||||
add_unit_test(test_qe_lax-r9-t2 9 2 $<TARGET_FILE:qe_lax_test>)
|
||||
|
||||
add_subdirectory(tests)
|
||||
endif(QE_ENABLE_TEST)
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue