Added the possibility to intercept several signals and trigger a proper stop of the code, as if the file prefix.EXIT had been found.

Currently intercepted signals are SIGINT (caused by CTRL-C), SIGTERM (by default sent by PBS one minute before wall time expires), SIGUSR1, SIGUSR2

In order to compile this code add -D__TERMINATE_GRACEFULLY to MANUAL_DFLAGS in make.sys

This code partially conflicts with __TRAP_SIGUSR1: although they can both be compiled, only the last signal handler set will work. However, they do the same thing, so __TRAP_SIGUSR1 could eventually be removed, as it is less general.

You can change the delay between the alert and the end of allocated wall time depending on the queue system. Here are some examples:

* with PBS:
  send the signal 120 seconds before the end:
  #PBS -l signal=@120

  send signal SIGUSR1 10 minutes before the end:
  #PBS -l signal=SIGUSR1@600

  you can also send a signal manually with qsig, or send a signal and then stop:
   qdel -W 120 jobid
  will send SIGTERM, wait 2 minutes, then force stop.

* with LoadLeveler:
  According to the documentation, a SIGUSR1 or a SIGTERM is sent, depending on whether the job will be restarted or not; SIGKILL follows after 2 minutes. There is no mention of whether this delay can be changed.



git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@11024 c92efa57-630b-4861-b058-cf58834340f0
This commit is contained in:
paulatto 2014-06-04 14:12:13 +00:00
parent 302faf97c2
commit 63e85afa7f
3 changed files with 84 additions and 14 deletions

View File

@ -41,7 +41,7 @@ MODULE check_stop
USE input_parameters, ONLY : max_seconds_ => max_seconds
USE io_global, ONLY : stdout
USE io_files, ONLY : prefix, exit_file
#if defined __TRAP_SIGUSR1
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
USE set_signal, ONLY : signal_trap_init
#endif
!
@ -62,7 +62,7 @@ MODULE check_stop
init_second = cclock()
tinit = .TRUE.
!
#if defined __TRAP_SIGUSR1
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
CALL signal_trap_init ( )
#endif
!
@ -78,7 +78,7 @@ MODULE check_stop
USE mp_images, ONLY : intra_image_comm
USE io_global, ONLY : ionode, ionode_id, meta_ionode, stdout
USE io_files, ONLY : tmp_dir, exit_file, iunexit
#if defined __TRAP_SIGUSR1
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
USE set_signal, ONLY : signal_detected
#endif
!
@ -143,7 +143,7 @@ MODULE check_stop
!
END IF
!
#if defined __TRAP_SIGUSR1
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
signaled = signal_detected()
check_stop_now = check_stop_now .OR. signaled
tex = tex .OR. signaled

View File

@ -4,7 +4,7 @@ MODULE set_signal
! This module is compiled only if the following preprocessing option
! is enabled
#if defined __TRAP_SIGUSR1
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
USE iso_c_binding
USE io_global, ONLY : stdout
@ -14,6 +14,7 @@ USE mp, ONLY : mp_bcast
IMPLICIT NONE
LOGICAL,VOLATILE::signal_trapped
INTEGER(kind=c_int),PARAMETER :: SIGINT = 2_c_int
INTERFACE
FUNCTION init_signal_USR1(new_handler) BIND(c, name = "init_signal_USR1")
@ -22,6 +23,12 @@ INTERFACE
INTEGER(C_INT)::init_signal_USR1
END FUNCTION init_signal_USR1
! Interface to the C routine bound to the symbol "init_TERMINATE_GRACEFULLY".
!   new_handler : C function pointer to the handler to install (passed by value)
!   result      : C int status; the caller in this module treats any
!                 non-zero value as a failure
FUNCTION init_TERMINATE_GRACEFULLY(new_handler) &
         BIND(c, name = "init_TERMINATE_GRACEFULLY")
  USE iso_c_binding, ONLY : C_INT, C_FUNPTR
  INTEGER(C_INT) :: init_TERMINATE_GRACEFULLY
  TYPE(C_FUNPTR), VALUE, INTENT(IN) :: new_handler
END FUNCTION init_TERMINATE_GRACEFULLY
FUNCTION init_signal(signum, new_handler) BIND(c, name = "init_signal")
USE iso_c_binding
INTEGER(C_INT),VALUE :: signum
@ -33,6 +40,7 @@ END INTERFACE
CONTAINS
#ifdef __TRAP_SIGUSR1
SUBROUTINE set_signal_USR1(routine)
USE iso_c_binding
TYPE(C_FUNPTR),TARGET::ptr
@ -51,6 +59,28 @@ SUBROUTINE set_signal_USR1(routine)
ENDIF
END SUBROUTINE set_signal_USR1
#endif
#ifdef __TERMINATE_GRACEFULLY
! Install 'routine' as the handler for the signals registered by the C helper
! init_TERMINATE_GRACEFULLY (SIGTERM, SIGINT, SIGUSR1, SIGUSR2).
!   routine : BIND(C) subroutine taking the signal number by value
! On failure the C helper returns minus the number of the signal whose
! registration failed; that number is reported through errore.
SUBROUTINE set_TERMINATE_GRACEFULLY(routine)
  USE iso_c_binding
  INTERFACE
    SUBROUTINE routine(signal) bind(C)
      USE iso_c_binding
      INTEGER(C_INT),VALUE, INTENT(IN)::signal
    END SUBROUTINE routine
  END INTERFACE
  TYPE(C_FUNPTR)    :: handler_ptr
  INTEGER(C_INT)    :: ierr
  CHARACTER(LEN=96) :: msg

  handler_ptr = C_FUNLOC(routine)
  ierr = init_TERMINATE_GRACEFULLY(handler_ptr)
  IF (ierr /= 0) THEN
    ! Any of the four registrations may have failed, not only INT or TERM:
    ! name the offending signal number instead of guessing.
    WRITE(msg, '(a,i0,a)') "The association of signal ", ABS(ierr), &
                           " (one of INT, TERM, USR1, USR2) failed!"
    CALL errore("set_TERMINATE_GRACEFULLY", TRIM(msg), 1)
  ENDIF
END SUBROUTINE set_TERMINATE_GRACEFULLY
#endif
! Unused. Here for possible future developments
SUBROUTINE set_signal_action(signal, routine)
@ -76,23 +106,48 @@ END SUBROUTINE set_signal_action
! Only the master will use the signal, though
! Signal handler handed to the C installers (BIND(c) so that C can call it).
!   signum : number of the signal being delivered, passed by value
! First delivery of any signal: report it and raise the module flag
! 'signal_trapped', which signal_detected() later inspects.
! A SIGINT arriving while the flag is already raised terminates at once.
! NOTE(review): Fortran I/O and STOP inside a signal handler are presumably
! not async-signal-safe -- confirm this is acceptable on the target platforms.
SUBROUTINE custom_handler(signum) BIND(c)
  USE iso_c_binding
#ifdef __MPI
  USE mp_world, ONLY : world_comm
  USE mp,       ONLY : mp_abort
#endif
  INTEGER(C_INT),VALUE,INTENT(IN):: signum
  !
  ! The flag must be tested BEFORE it is set, otherwise the very first
  ! CTRL-C would already take the "terminate immediately" branch.
  !
  ! Double CTRL-C will stop immediately;
  ! This cannot be done with any signal because some implementation of MPI
  ! send SIGTERM to every process when SIGINT (aka CTRL-C) is received
  IF(signal_trapped.and.signum==SIGINT) THEN
    ! 'i0' consumes signum; with a lone 'a' descriptor format reversion
    ! would apply 'a' to an integer item, which is not standard-conforming.
    WRITE(stdout, '(/,5x,a,1x,i0)') "**** SIGNAL ALREADY TRAPPED: terminating immediately!!", signum
#ifdef __MPI
    CALL mp_abort(signum, world_comm)
#else
    STOP 1
#endif
  ELSE
    WRITE(stdout, '(/,5x,a,1x,i0)') "**** Trapped signal: trying to terminate gracefully", signum
    IF(signum==SIGINT) &
      WRITE(stdout, '(5x,a)') "**** press CTRL-C again to terminate immediately (no restart possible!)"
    !
    signal_trapped = .TRUE.
  ENDIF
  !
END SUBROUTINE custom_handler
! Set the signal handler for SIGUSR1 to 'custom_handler'
! Every processor will trap the signal, however only 0 will actually
! use the result (required since the default action for SIGUSR1 is
! exit)
! use the result (required since the default action for SIGUSR1 is exit)
! Install 'custom_handler' for the signals selected at compile time and
! announce on stdout which signals are being trapped.
!   __TRAP_SIGUSR1         : handler set through set_signal_USR1
!   __TERMINATE_GRACEFULLY : handler set through set_TERMINATE_GRACEFULLY
! Each announcement is printed only for the feature that is actually compiled
! in, so the output never advertises SIGUSR1-only trapping twice or when it
! is not the active mode.
SUBROUTINE signal_trap_init
  USE iso_c_binding
#ifdef __TRAP_SIGUSR1
  WRITE(stdout, FMT='(5x,a)') &
    "signal trapping enabled: kill the code with -SIGUSR1 to stop cleanly the simulation "
  CALL set_signal_USR1(custom_handler)
#endif
#ifdef __TERMINATE_GRACEFULLY
  WRITE(stdout, FMT='(/,5x,a)') &
    "Signal trapping enabled: code will terminate cleanly with SIGINT, SIGTERM, SIGUSR1, SIGUSR2"
  WRITE(stdout, FMT='(5x,a)') "Type CTRL-C twice to terminate immediately (no restart possible!)"
  CALL set_TERMINATE_GRACEFULLY(custom_handler)
#endif
END SUBROUTINE signal_trap_init
FUNCTION signal_detected()
LOGICAL::signal_detected
! If the signal is trapped, set the exit status and broadcast it
@ -112,8 +167,8 @@ CONTAINS
! Place holders to employ when the signal trapping feature is disabled
! Placeholder used when signal trapping is compiled out: it installs nothing
! and only tells the user, once, which preprocessor flags enable the feature.
SUBROUTINE signal_trap_init
  WRITE(stdout, FMT=*) "signal trapping disabled: compile with "
  WRITE(stdout, FMT=*) "-D__TRAP_SIGUSR1 or -D__TERMINATE_GRACEFULLY to enable this feature"
END SUBROUTINE signal_trap_init
FUNCTION signal_detected()

View File

@ -1,5 +1,5 @@
#ifdef __TRAP_SIGUSR1
#if defined(__TRAP_SIGUSR1) || defined(__TERMINATE_GRACEFULLY)
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
@ -23,6 +23,21 @@ int init_signal_USR1(void (*new_handler)(int))
{
return init_signal(SIGUSR1, new_handler);
}
/*
 * Register new_handler for SIGTERM, SIGINT, SIGUSR1 and SIGUSR2, in that
 * order, through init_signal().
 *
 * Returns 0 when every registration succeeds. On the first failure it stops
 * and returns minus the number of the signal that could not be registered;
 * handlers installed before the failure are left in place.
 */
int init_TERMINATE_GRACEFULLY(void (*new_handler)(int))
{
  static const int trapped[] = { SIGTERM, SIGINT, SIGUSR1, SIGUSR2 };
  const int count = (int)(sizeof(trapped) / sizeof(trapped[0]));
  int k;

  for (k = 0; k < count; ++k) {
    if (init_signal(trapped[k], new_handler))
      return -trapped[k];
  }
  return 0;
}
#else
void dummy ( ) { }
#endif