The Gaudi Framework  master (37c0b60a)
event_timeout_check.py
Go to the documentation of this file.
1 
11 """
12 ## Use Case
13 Prevent jobs from running forever because of a problem in an algorithm
14 
15 ## Description
16 Sometimes, because of a bug, an algorithm enters an infinite loop or a deadlock
17 occurs. In these cases the application will never terminate.
18 
19 ## Solution
20 To prevent such cases, in particular for batch jobs that could waste a lot of
21 resources before the problem is detected, it is possible to use the special algorithm
22 `Gaudi::EventWatchdogAlg`.
23 
24 `Gaudi::EventWatchdogAlg` starts a secondary thread that sleeps until a timeout is
25 reached. At that point it prints a warning message and optionally a stack trace of
26 the process on stderr, then it sleeps for another timeout period unless it's
27 configured to abort the process when the timeout occurs.
28 """
29 
30 # Run this configuration with `gaudirun.py <path_to_this_file>:config`
31 
32 from GaudiConfig2 import Configurable
33 from GaudiConfig2 import Configurables as C
34 
35 
def base_config() -> list[Configurable]:
    """
    Example configuration of a job with no input and an algorithm that looks stuck.

    The returned list holds the ApplicationMgr first, then the entries of its
    ``TopAlg`` (the "MainSequence" sequencer), then the sleeping algorithm.
    """
    # a single algorithm configured with SleepTime=3600 plays the stuck one
    stuck_algs = [C.GaudiTesting.SleepyAlg("StuckAlg", SleepTime=3600)]
    main_sequence = C.Gaudi.Sequencer("MainSequence", Members=stuck_algs)
    # EvtSel="NONE": the job has no input
    app_mgr = C.ApplicationMgr(EvtSel="NONE", TopAlg=[main_sequence])
    return [app_mgr, *app_mgr.TopAlg, *stuck_algs]
45 
46 
def add_event_timeout(
    conf: list[Configurable], timeout_seconds: int
) -> list[Configurable]:
    """
    Take a configuration and add a check on events reaching a timeout.

    The ApplicationMgr found in ``conf`` is modified in place: its ``TopAlg``
    is replaced by a single sequence that contains a
    ``Gaudi::EventWatchdogAlg`` followed by the original top algorithms.

    Args:
        conf: configurables of the job; one of them must be named
            "ApplicationMgr".
        timeout_seconds: value passed as ``EventTimeout`` to the watchdog.

    Returns:
        A new list made of the entries of ``conf`` followed by the watchdog
        and the wrapping sequence (``conf`` itself is not extended).

    Raises:
        ValueError: if ``conf`` contains no configurable named
            "ApplicationMgr".
    """
    # find the ApplicationMgr as we have to tweak its configuration;
    # use a default so that a missing entry is reported explicitly instead
    # of leaking a bare StopIteration to the caller
    app = next((c for c in conf if c.name == "ApplicationMgr"), None)
    if app is None:
        raise ValueError(
            "cannot add an event timeout: no configurable named "
            "'ApplicationMgr' found in the configuration"
        )
    # configure a Gaudi::EventWatchdogAlg
    watchdog = C.Gaudi.EventWatchdogAlg(
        EventTimeout=timeout_seconds,  # sleep for this number of seconds
        StackTrace=True,  # print a stack trace when we wake up
        AbortOnTimeout=True,  # kill the process on timeout
    )
    # wrap original list of algorithms into a sequence to ensure the watchdog
    # is executed before everything else
    wrapping_seq = C.Gaudi.Sequencer(
        "SequenceWithTimeout", Sequential=True, Members=[watchdog] + list(app.TopAlg)
    )
    # reset the main list of algorithms
    app.TopAlg = [wrapping_seq]
    # return the tweaked configuration
    return conf + [watchdog, wrapping_seq]
70 
71 
def config():
    """
    Configuration entry point (see the `gaudirun.py` invocation noted above
    the imports): the normal job configuration extended with an event
    timeout check.
    """
    # take the normal job configuration and make sure we stop if an event
    # takes more than 2s
    return add_event_timeout(base_config(), 2)
event_timeout_check.base_config
list[Configurable] base_config()
Definition: event_timeout_check.py:36
event_timeout_check.config
def config()
Definition: event_timeout_check.py:72
event_timeout_check.add_event_timeout
list[Configurable] add_event_timeout(list[Configurable] conf, int timeout_seconds)
Definition: event_timeout_check.py:47