The Gaudi Framework  master (e68eea06)
Loading...
Searching...
No Matches
event_timeout_check.py
Go to the documentation of this file.
11"""
12## Use Case
13Prevent jobs from running forever because of a problem in an algorithm
14
15## Description
16Sometimes, because of a bug, an algorithm enters an infinite loop or a deadlock
17occurs. In these cases the application will never terminate.
18
19## Solution
20To prevent such cases, in particular for batch jobs that could waste a lot of
21resources before the problem is detected, it is possible to use the special algorithm
22`Gaudi::EventWatchdogAlg`.
23
24`Gaudi::EventWatchdogAlg` starts a secondary thread that sleeps until a timeout is
25reached. At that point it prints a warning message and optionally a stack trace of
26the process on stderr, then it sleeps for another timeout period unless it's
27configured to abort the process when the timeout occurs.
28"""
29
30# Run this configuration with `gaudirun.py <path_to_this_file>:config`
31
32from GaudiConfig2 import Configurable
33from GaudiConfig2 import Configurables as C
34
35
def base_config() -> list[Configurable]:
    """
    Build an example job configuration with no input and an algorithm that
    looks stuck.

    The job consists of a single `GaudiTesting.SleepyAlg` named "StuckAlg"
    (configured with `SleepTime=3600`) placed inside a `Gaudi.Sequencer`
    named "MainSequence", which is the only entry of the application's
    `TopAlg`.

    Returns:
        The `ApplicationMgr`, followed by the entries of its `TopAlg` and
        then the algorithms that are members of the main sequence.
    """
    stuck_alg = C.GaudiTesting.SleepyAlg("StuckAlg", SleepTime=3600)
    members = [stuck_alg]
    main_sequence = C.Gaudi.Sequencer("MainSequence", Members=members)
    app = C.ApplicationMgr(EvtSel="NONE", TopAlg=[main_sequence])
    # read TopAlg back from the application rather than reusing the local
    # sequence object, exactly as the configuration stores it
    return [app, *app.TopAlg, *members]
45
46
def add_event_timeout(
    conf: list[Configurable], timeout_seconds: int
) -> list[Configurable]:
    """
    Take a configuration and add a check on events reaching a timeout.

    The `ApplicationMgr` found in `conf` is modified in place: its `TopAlg`
    is replaced by a single sequence that runs a `Gaudi::EventWatchdogAlg`
    ahead of the original top-level algorithms.

    Args:
        conf: configurables describing the job; it must include the one
            named "ApplicationMgr".
        timeout_seconds: value passed as `EventTimeout` to the watchdog.

    Returns:
        A new list holding the entries of `conf` followed by the watchdog
        and the wrapping sequence.

    Raises:
        ValueError: if `conf` has no configurable named "ApplicationMgr".
    """
    # find the ApplicationMgr as we have to tweak its configuration
    app = next((c for c in conf if c.name == "ApplicationMgr"), None)
    if app is None:
        # a bare next() would leak an uninformative StopIteration here
        raise ValueError(
            "cannot add event timeout: no ApplicationMgr in the configuration"
        )
    # configure a Gaudi::EventWatchdogAlg
    watchdog = C.Gaudi.EventWatchdogAlg(
        EventTimeout=timeout_seconds,  # sleep for this number of seconds
        StackTrace=False,  # no stack trace (may be slow, see issue #349)
        AbortOnTimeout=True,  # kill the process on timeout
    )
    # wrap original list of algorithms into a sequence to ensure the watchdog
    # is executed before everything else
    wrapping_seq = C.Gaudi.Sequencer(
        "SequenceWithTimeout", Sequential=True, Members=[watchdog] + list(app.TopAlg)
    )
    # reset the main list of algorithms
    app.TopAlg = [wrapping_seq]
    # return the tweaked configuration
    return conf + [watchdog, wrapping_seq]
70
71
def config() -> list[Configurable]:
    """
    Return the example job configuration with an event timeout applied.

    This is the function to name on the `gaudirun.py` command line
    (`gaudirun.py <path_to_this_file>:config`).

    Returns:
        The configurables from `base_config()` after `add_event_timeout()`
        has added the watchdog with a 2 second timeout.
    """
    # get the normal job configuration
    conf = base_config()
    # make sure we stop if an event takes more than 2s
    conf = add_event_timeout(conf, 2)
    return conf
list[Configurable] add_event_timeout(list[Configurable] conf, int timeout_seconds)
list[Configurable] base_config()