2004-03-15 08:08:26 +08:00
|
|
|
!
|
2013-04-08 00:29:01 +08:00
|
|
|
! Copyright (C) 2002-2013 Quantum ESPRESSO group
|
2004-03-15 08:08:26 +08:00
|
|
|
! This file is distributed under the terms of the
|
|
|
|
! GNU General Public License. See the file `License'
|
|
|
|
! in the root directory of the present distribution,
|
|
|
|
! or http://www.gnu.org/copyleft/gpl.txt .
|
|
|
|
!
|
2004-03-24 17:36:50 +08:00
|
|
|
!
|
2013-04-08 00:29:01 +08:00
|
|
|
! ... This module contains functions and variables used to check if the code
|
|
|
|
! ... should be smoothly stopped. In order to use this module, function
|
|
|
|
! ... check_stop_init must be called (only once) at the beginning of the calc.
|
|
|
|
! ... Function check_stop_now returns .TRUE. if either the user has created
|
|
|
|
! ... an "exit" file, or if the elapsed wall time is larger than max_seconds,
|
|
|
|
! ... or if these conditions have been met in a previous call of check_stop_now.
|
|
|
|
! ... Moreover, function check_stop_now removes the exit file and sets variable
|
|
|
|
! ... stopped_by_user to .true..
|
2004-03-24 17:36:50 +08:00
|
|
|
!
|
2004-03-15 08:08:26 +08:00
|
|
|
!------------------------------------------------------------------------------!
|
2004-03-24 17:36:50 +08:00
|
|
|
MODULE check_stop
!------------------------------------------------------------------------------!
  !
  ! ... Module state:
  ! ...   max_seconds     : wall-time budget in seconds; keeps its default
  ! ...                     unless check_stop_init reads a positive value
  ! ...                     from the input parameters
  ! ...   init_second     : value of cclock() recorded by check_stop_init
  ! ...   stopped_by_user : becomes .TRUE. once check_stop_now has returned
  ! ...                     .TRUE. (exit file, signal, or time limit), so that
  ! ...                     later calls return .TRUE. immediately
  ! ...   tinit           : .TRUE. after check_stop_init has been called
  !
  USE kinds
  !
  IMPLICIT NONE
  !
  SAVE
  !
  REAL(DP) :: max_seconds = 1.E+7_DP
  REAL(DP) :: init_second
  LOGICAL  :: stopped_by_user = .FALSE.
  LOGICAL, PRIVATE :: tinit = .FALSE.
  !
  PRIVATE :: remove_exit_file
  !
  CONTAINS
     !
     ! ... internal procedures
     !
     !-----------------------------------------------------------------------
     SUBROUTINE check_stop_init()
       !-----------------------------------------------------------------------
       !
       ! ... Initializes the module: sets the name of the exit file to
       ! ... prefix.EXIT, copies a positive max_seconds from the input
       ! ... parameters, records the starting time and, when compiled with
       ! ... __TRAP_SIGUSR1 or __TERMINATE_GRACEFULLY, calls signal_trap_init.
       ! ... A second call only prints a warning and then re-initializes.
       !
       USE input_parameters, ONLY : max_seconds_ => max_seconds
       USE io_global,        ONLY : stdout
       USE io_files,         ONLY : prefix, exit_file
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
       USE set_signal,       ONLY : signal_trap_init
#endif
       !
       IMPLICIT NONE
       !
       REAL(DP), EXTERNAL :: cclock
       !
       IF ( tinit ) &
          WRITE( UNIT = stdout, &
                 FMT = '(/,5X,"WARNING: check_stop already initialized")' )
       !
       ! ... the exit_file name is set here
       !
       exit_file = TRIM( prefix ) // '.EXIT'
       !
       ! ... a non-positive input value leaves the module default untouched
       !
       IF ( max_seconds_ > 0.0_DP ) max_seconds = max_seconds_
       !
       init_second = cclock()
       tinit = .TRUE.
       !
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
       CALL signal_trap_init ( )
#endif
       !
       RETURN
       !
     END SUBROUTINE check_stop_init
     !
     !-----------------------------------------------------------------------
     SUBROUTINE remove_exit_file( filename, unit )
       !-----------------------------------------------------------------------
       !
       ! ... Deletes the exit file "filename" by opening it on unit iunexit
       ! ... and closing it with STATUS = 'DELETE'. I/O errors are not fatal:
       ! ... the caller is about to request a smooth stop anyway, so a
       ! ... failure only produces a warning on "unit".
       !
       USE io_files, ONLY : iunexit
       !
       IMPLICIT NONE
       !
       CHARACTER(LEN=*), INTENT(IN) :: filename
       INTEGER,          INTENT(IN) :: unit
       !
       INTEGER :: ios
       !
       OPEN( UNIT = iunexit, FILE = filename, IOSTAT = ios )
       !
       IF ( ios == 0 ) &
          CLOSE( UNIT = iunexit, STATUS = 'DELETE', IOSTAT = ios )
       !
       IF ( ios /= 0 ) &
          WRITE( UNIT = unit, &
                 FMT = '(/,5X,"WARNING: could not remove exit file ",A)' ) &
                 filename
       !
       RETURN
       !
     END SUBROUTINE remove_exit_file
     !
     !-----------------------------------------------------------------------
     FUNCTION check_stop_now( inunit )
       !-----------------------------------------------------------------------
       !
       ! ... Returns .TRUE. if the calculation should be stopped smoothly:
       ! ...   - a previous call already returned .TRUE. (stopped_by_user), or
       ! ...   - the exit file exists in the current directory or in tmp_dir
       ! ...     (the file is then removed), or
       ! ...   - a signal was detected (only when compiled with
       ! ...     __TRAP_SIGUSR1 or __TERMINATE_GRACEFULLY), or
       ! ...   - the time elapsed since check_stop_init exceeds max_seconds.
       ! ... File and time checks are done on ionode only; the result is
       ! ... broadcast from ionode_id over intra_image_comm.
       !
       ! ... inunit (optional) : unit for the stop message (default stdout)
       !
       ! ... Calls errore if check_stop_init has not been called.
       !
       USE mp,         ONLY : mp_bcast
       USE mp_images,  ONLY : intra_image_comm
       USE io_global,  ONLY : ionode, ionode_id, meta_ionode, stdout
       USE io_files,   ONLY : tmp_dir, exit_file
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
       USE set_signal, ONLY : signal_detected
#endif
       !
       IMPLICIT NONE
       !
       INTEGER, OPTIONAL, INTENT(IN) :: inunit
       !
       INTEGER  :: unit
       LOGICAL  :: check_stop_now, tex
       LOGICAL  :: signaled
       REAL(DP) :: seconds
       REAL(DP), EXTERNAL :: cclock
       !
       ! ... once a stop has been requested, keep answering .TRUE.
       !
       IF ( stopped_by_user ) THEN
          check_stop_now = .TRUE.
          RETURN
       END IF
       !
       IF ( .NOT. tinit ) &
          CALL errore( 'check_stop_now', 'check_stop not initialized', 1 )
       !
       unit = stdout
       IF ( PRESENT( inunit ) ) unit = inunit
       !
       check_stop_now = .FALSE.
       signaled       = .FALSE.
       !
       ! ... tex is reset on every call: initializing it in its declaration
       ! ... would give it the SAVE attribute and carry a stale value over
       ! ... from one call to the next
       !
       tex     = .FALSE.
       seconds = 0.0_DP
       !
       IF ( ionode ) THEN
          !
          ! ... Check first if exit file exists in current directory
          !
          INQUIRE( FILE = TRIM( exit_file ), EXIST = tex )
          !
          IF ( tex ) THEN
             !
             check_stop_now = .TRUE.
             CALL remove_exit_file( TRIM( exit_file ), unit )
             !
          ELSE
             !
             ! ... Check if exit file exists in scratch directory
             !
             INQUIRE( FILE = TRIM(tmp_dir) // TRIM( exit_file ), EXIST = tex )
             !
             IF ( tex ) THEN
                !
                check_stop_now = .TRUE.
                CALL remove_exit_file( TRIM(tmp_dir) // TRIM(exit_file), unit )
                !
             ELSE
                !
                ! ... cclock is a C function returning the elapsed solar
                ! ... time in seconds since the Epoch ( 00:00:00 1/1/1970 )
                !
                seconds = cclock() - init_second
                check_stop_now = ( seconds > max_seconds )
                !
             END IF
             !
          END IF
          !
       END IF
       !
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
       !
       ! ... a detected signal is reported like a user request (exit file)
       !
       signaled = signal_detected()
       check_stop_now = check_stop_now .OR. signaled
       tex = tex .OR. signaled
#endif
       !
       CALL mp_bcast( check_stop_now, ionode_id, intra_image_comm )
       !
       IF ( check_stop_now .AND. meta_ionode ) THEN
          !
          IF ( tex ) THEN
             !
             WRITE( UNIT = unit, &
                    FMT = '(/,5X,"Program stopped by user request")' )
             !
          ELSE
             !
             WRITE( UNIT = unit, &
                    FMT = '(/,5X,"Maximum CPU time exceeded")' )
             WRITE( UNIT = unit, &
                    FMT = '(/,5X,"max_seconds     = ",F10.2)' ) max_seconds
             WRITE( UNIT = unit, &
                    FMT = '(5X,"elapsed seconds = ",F10.2)' ) seconds
             !
          END IF
          !
       END IF
       !
       ! ... remember the outcome so that subsequent calls return at once
       !
       stopped_by_user = check_stop_now
       !
       RETURN
       !
     END FUNCTION check_stop_now
     !
END MODULE check_stop
|