mirror of https://gitlab.com/QEF/q-e.git
Added the possibility to intercept several signals and trigger a proper stop of the code, as if the file prefix.EXIT was found.
Currently intercepted signals are SIGINT (caused by CTRL-C), SIGTERM (by default sent by PBS one minute before wall time expires), SIGUSR1 and SIGUSR2. In order to compile this code, add -D__TERMINATE_GRACEFULLY to MANUAL_DFLAGS in make.sys. This code partially conflicts with __TRAP_SIGUSR1: although they can both be compiled, only the last signal handler set will work. However, they do the same thing, so __TRAP_SIGUSR1 could eventually be removed as it is less general. You can change the delay between the alert and the end of the allocated wall time, depending on the queue system. Here are some examples: * with PBS: send the signal 120 seconds before the end: #PBS -l signal=@120; send signal SIGUSR1 10 minutes before the end: #PBS -l signal=SIGUSR1@600; you can also send a signal manually with qsig, or send a signal and then stop: qdel -W 120 jobid will send SIGTERM, wait 2 minutes, then force stop. * with LoadLeveler: according to the documentation, a SIGUSR1 or a SIGTERM is sent, depending on whether the job will be restarted or not; SIGKILL follows after 2 minutes. There is no mention of whether this delay can be changed. git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@11024 c92efa57-630b-4861-b058-cf58834340f0
This commit is contained in:
parent
302faf97c2
commit
63e85afa7f
|
@ -41,7 +41,7 @@ MODULE check_stop
|
|||
USE input_parameters, ONLY : max_seconds_ => max_seconds
|
||||
USE io_global, ONLY : stdout
|
||||
USE io_files, ONLY : prefix, exit_file
|
||||
#if defined __TRAP_SIGUSR1
|
||||
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
|
||||
USE set_signal, ONLY : signal_trap_init
|
||||
#endif
|
||||
!
|
||||
|
@ -62,7 +62,7 @@ MODULE check_stop
|
|||
init_second = cclock()
|
||||
tinit = .TRUE.
|
||||
!
|
||||
#if defined __TRAP_SIGUSR1
|
||||
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
|
||||
CALL signal_trap_init ( )
|
||||
#endif
|
||||
!
|
||||
|
@ -78,7 +78,7 @@ MODULE check_stop
|
|||
USE mp_images, ONLY : intra_image_comm
|
||||
USE io_global, ONLY : ionode, ionode_id, meta_ionode, stdout
|
||||
USE io_files, ONLY : tmp_dir, exit_file, iunexit
|
||||
#if defined __TRAP_SIGUSR1
|
||||
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
|
||||
USE set_signal, ONLY : signal_detected
|
||||
#endif
|
||||
!
|
||||
|
@ -143,7 +143,7 @@ MODULE check_stop
|
|||
!
|
||||
END IF
|
||||
!
|
||||
#if defined __TRAP_SIGUSR1
|
||||
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
|
||||
signaled = signal_detected()
|
||||
check_stop_now = check_stop_now .OR. signaled
|
||||
tex = tex .OR. signaled
|
||||
|
|
|
@ -4,7 +4,7 @@ MODULE set_signal
|
|||
|
||||
! This module is compiled only if the following preprocessing option
|
||||
! is enabled
|
||||
#if defined __TRAP_SIGUSR1
|
||||
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
|
||||
|
||||
USE iso_c_binding
|
||||
USE io_global, ONLY : stdout
|
||||
|
@ -14,6 +14,7 @@ USE mp, ONLY : mp_bcast
|
|||
IMPLICIT NONE
|
||||
|
||||
LOGICAL,VOLATILE::signal_trapped
|
||||
INTEGER(kind=c_int),PARAMETER :: SIGINT = 2_c_int
|
||||
|
||||
INTERFACE
|
||||
FUNCTION init_signal_USR1(new_handler) BIND(c, name = "init_signal_USR1")
|
||||
|
@ -22,6 +23,12 @@ INTERFACE
|
|||
INTEGER(C_INT)::init_signal_USR1
|
||||
END FUNCTION init_signal_USR1
|
||||
|
||||
FUNCTION init_TERMINATE_GRACEFULLY(new_handler) BIND(c, name = "init_TERMINATE_GRACEFULLY")
|
||||
USE iso_c_binding
|
||||
TYPE(C_FUNPTR),VALUE,INTENT(IN):: new_handler
|
||||
INTEGER(C_INT)::init_TERMINATE_GRACEFULLY
|
||||
END FUNCTION init_TERMINATE_GRACEFULLY
|
||||
|
||||
FUNCTION init_signal(signum, new_handler) BIND(c, name = "init_signal")
|
||||
USE iso_c_binding
|
||||
INTEGER(C_INT),VALUE :: signum
|
||||
|
@ -33,6 +40,7 @@ END INTERFACE
|
|||
|
||||
CONTAINS
|
||||
|
||||
#ifdef __TRAP_SIGUSR1
|
||||
SUBROUTINE set_signal_USR1(routine)
|
||||
USE iso_c_binding
|
||||
TYPE(C_FUNPTR),TARGET::ptr
|
||||
|
@ -51,6 +59,28 @@ SUBROUTINE set_signal_USR1(routine)
|
|||
ENDIF
|
||||
|
||||
END SUBROUTINE set_signal_USR1
|
||||
#endif
|
||||
|
||||
#ifdef __TERMINATE_GRACEFULLY
|
||||
! Install `routine` as the handler for every signal covered by the
! __TERMINATE_GRACEFULLY feature (SIGTERM, SIGINT, SIGUSR1, SIGUSR2 in the
! C helper init_TERMINATE_GRACEFULLY).
!
!   routine : C-interoperable handler taking the signal number by value.
!
! On failure errore() is called with a message naming the signal whose
! registration failed.
SUBROUTINE set_TERMINATE_GRACEFULLY(routine)
  USE iso_c_binding
  INTERFACE
     SUBROUTINE routine(signal) bind(C)
       USE iso_c_binding
       INTEGER(C_INT),VALUE, INTENT(IN)::signal
     END SUBROUTINE routine
  END INTERFACE
  !
  TYPE(C_FUNPTR)    :: ptr
  INTEGER(C_INT)    :: ierr
  CHARACTER(LEN=96) :: msg
  !
  ptr  = C_FUNLOC(routine)
  ierr = init_TERMINATE_GRACEFULLY(ptr)
  !
  IF (ierr /= 0) THEN
     ! The C helper returns minus the number of the first signal that could
     ! not be associated, so ABS(ierr) identifies the culprit.
     WRITE(msg, '(a,i0,a)') "The association of signal ", ABS(ierr), &
                            " (one of INT, TERM, USR1, USR2) failed!"
     CALL errore("set_TERMINATE_GRACEFULLY", TRIM(msg), 1)
  ENDIF
  !
END SUBROUTINE set_TERMINATE_GRACEFULLY
|
||||
#endif
|
||||
|
||||
! Unused. Here for possible future developments
|
||||
SUBROUTINE set_signal_action(signal, routine)
|
||||
|
@ -76,23 +106,48 @@ END SUBROUTINE set_signal_action
|
|||
! Only the master will use the signal, though
|
||||
! Signal handler installed by signal_trap_init.
!
!   signum : number of the signal being delivered (passed by value from C).
!
! First delivery of any signal: report it and raise the module flag
! signal_trapped so the code can stop cleanly.
! A SIGINT received while the flag is already raised (e.g. a second CTRL-C)
! terminates immediately: mp_abort under __MPI, STOP 1 otherwise.
! Only SIGINT triggers the immediate stop because some MPI implementations
! send SIGTERM to every process when SIGINT is received.
!
! NOTE(review): formatted I/O inside a signal handler is not
! async-signal-safe; kept because the messages are part of the feature.
SUBROUTINE custom_handler(signum) BIND(c)
  USE iso_c_binding
#ifdef __MPI
  USE mp_world, ONLY : world_comm
  USE mp,       ONLY : mp_abort
#endif
  INTEGER(C_INT),VALUE,INTENT(IN):: signum
  !
  ! The flag must be tested BEFORE it is raised, otherwise the very first
  ! SIGINT would already be treated as the second one.
  IF (signal_trapped .AND. signum == SIGINT) THEN
     WRITE(stdout, '(/,5x,a,1x,i0)') &
          "**** SIGNAL ALREADY TRAPPED: terminating immediately!!", signum
#ifdef __MPI
     CALL mp_abort(signum, world_comm)
#else
     STOP 1
#endif
  ELSE
     WRITE(stdout, '(/,5x,a,1x,i0)') &
          "**** Trapped signal: trying to terminate gracefully", signum
     IF (signum == SIGINT) &
        WRITE(stdout, '(5x,a)') &
          "**** press CTRL-C again to terminate immediately (no restart possible!)"
     !
     signal_trapped = .TRUE.
  ENDIF
  !
END SUBROUTINE custom_handler
|
||||
|
||||
|
||||
! Set the signal handler for SIGUSR1 to 'custom_handler'
|
||||
! Every processor will trap the signal, however only 0 will actually
|
||||
! use the result (required since the default action for SIGUSR1 is
|
||||
! exit)
|
||||
! use the result (required since the default action for SIGUSR1 is exit)
|
||||
! Install custom_handler for the signals selected at compile time and
! announce on stdout which signals are being trapped.
!
!   __TRAP_SIGUSR1         : handler set through set_signal_USR1
!   __TERMINATE_GRACEFULLY : handler set through set_TERMINATE_GRACEFULLY
!
! Each banner is printed only for the feature that is actually compiled in.
SUBROUTINE signal_trap_init
  USE iso_c_binding
#ifdef __TRAP_SIGUSR1
  WRITE(stdout, FMT='(5x,a)') &
       "signal trapping enabled: kill the code with -SIGUSR1 to stop cleanly the simulation "
  CALL set_signal_USR1(custom_handler)
#endif
#ifdef __TERMINATE_GRACEFULLY
  WRITE(stdout, FMT='(/,5x,a)') &
       "Signal trapping enabled: code will terminate cleanly with SIGINT, SIGTERM, SIGUSR1, SIGUSR2"
  WRITE(stdout, FMT='(5x,a)') &
       "Type CTRL-C twice to terminate immediately (no restart possible!)"
  CALL set_TERMINATE_GRACEFULLY(custom_handler)
#endif
END SUBROUTINE signal_trap_init
|
||||
|
||||
|
||||
FUNCTION signal_detected()
|
||||
LOGICAL::signal_detected
|
||||
! If the signal is trapped, set the exit status and broadcast it
|
||||
|
@ -112,8 +167,8 @@ CONTAINS
|
|||
|
||||
! Place holders to employ when the signal trapping feature is disabled
|
||||
! Placeholder used when signal trapping is compiled out: it only tells the
! user, once, which preprocessor flags enable the feature.
SUBROUTINE signal_trap_init
  WRITE(stdout, FMT=*) "signal trapping disabled: compile with "
  WRITE(stdout, FMT=*) "-D__TRAP_SIGUSR1 or -D__TERMINATE_GRACEFULLY to enable this feature"
END SUBROUTINE signal_trap_init
|
||||
|
||||
FUNCTION signal_detected()
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
#ifdef __TRAP_SIGUSR1
|
||||
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <signal.h>
|
||||
|
@ -23,6 +23,21 @@ int init_signal_USR1(void (*new_handler)(int))
|
|||
{
|
||||
return init_signal(SIGUSR1, new_handler);
|
||||
}
|
||||
|
||||
/*
 * Register new_handler for SIGTERM, SIGINT, SIGUSR1 and SIGUSR2, in that
 * order, through init_signal().
 *
 * Returns 0 when every registration succeeds; otherwise stops at the first
 * failure and returns minus the number of the signal that failed.
 */
int init_TERMINATE_GRACEFULLY(void (*new_handler)(int))
{
  static const int trapped[] = { SIGTERM, SIGINT, SIGUSR1, SIGUSR2 };
  const size_t count = sizeof trapped / sizeof trapped[0];
  size_t i;

  for (i = 0; i < count; ++i) {
    if (init_signal(trapped[i], new_handler))
      return -trapped[i];
  }
  return 0;
}
|
||||
|
||||
#else
|
||||
/* Fallback compiled when neither __TRAP_SIGUSR1 nor __TERMINATE_GRACEFULLY
 * is defined; presumably here so the translation unit is not empty. */
void dummy ( ) { }
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue