Gaudi Framework, version v25r2

Home   Generated: Wed Jun 4 2014
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
13 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace {
// File-local WatchdogThread specialisation used by StalledEventMonitor.
// It reports through its own MsgStream how long it has been since the last
// ping, prints the current memory usage, optionally dumps a stack trace via
// ROOT's gSystem, and can abort the process with SIGABRT after a configurable
// number of consecutive time-outs.
// NOTE(review): action(), onPing() and onStop() are presumably hooks invoked by
// WatchdogThread (on time-out, on ping() and on stop() respectively) - the base
// class is not visible here, confirm against WatchdogThread.
21  class EventWatchdog: public WatchdogThread {
22  public:
// Build the watchdog.
//   msgSvc     message service used to construct the internal MsgStream
//   name       source name used for the MsgStream
//   timeout    time-out forwarded to WatchdogThread
//   stackTrace if true, action() also prints a stack trace (when gSystem is set)
//   maxCount   number of consecutive action() calls after which SIGABRT is
//              raised; values <= 0 disable the abort
//   autostart  forwarded to WatchdogThread
23  EventWatchdog(const SmartIF<IMessageSvc> &msgSvc,
24  const std::string &name,
25  boost::posix_time::time_duration timeout,
26  bool stackTrace = false,
27  long maxCount = 0,
28  bool autostart = false):
29  WatchdogThread(timeout, autostart),
30  log(msgSvc, name),
31  m_counter(0),
32  m_maxCount(maxCount),
33  m_stackTrace(stackTrace){}
// Nothing to release explicitly.
34  virtual ~EventWatchdog() {}
35  private:
// Message stream used for all the reports of this watchdog.
37  MsgStream log;
// Number of action() calls since the counter was last reset by onPing().
39  long m_counter;
// Threshold on m_counter that triggers the abort (only used when > 0).
41  long m_maxCount;
// Whether action() should also print a stack trace.
43  bool m_stackTrace;
// Report a time-out: warn, print memory usage, optionally print the stack
// trace, increment the counter and abort if the configured maximum is reached.
45  void action() {
// First time-out since the last reset gets a dedicated message, the following
// ones only report that another time-out period elapsed.
46  if (!m_counter) {
47  log << MSG::WARNING << "More than " << getTimeout().total_seconds()
48  << "s since the last " << IncidentType::BeginEvent << endmsg;
49  } else {
50  log << MSG::WARNING << "Other " << getTimeout().total_seconds()
51  << "s passed" << endmsg;
52  }
// NOTE(review): the values are divided by 1024 and labelled "MB", which
// presumably means System::virtualMemory()/pagedMemory() return kB - confirm.
53  log << MSG::INFO << "Current memory usage is"
54  " virtual size = " << System::virtualMemory() / 1024. << " MB"
55  ", resident set size = " << System::pagedMemory() / 1024.<< " MB"
56  << endmsg;
// The trace is printed only if requested and if ROOT's gSystem is available.
57  if (m_stackTrace && gSystem) {
58  // TSystem::StackTrace() prints on the standard error, so the banner that
// introduces the trace is written to std::cerr too (not to the MsgStream).
59  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
60  gSystem->StackTrace();
61  }
62  ++m_counter;
// Abort only when a positive maximum was configured and has been reached.
63  if (m_maxCount > 0 && m_counter >= m_maxCount) {
64  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
65  std::raise(SIGABRT);
66  }
67  }
// Reset the time-out counter; if at least 3 time-outs were counted, first
// report the approximate elapsed time (counter * time-out in seconds).
68  void onPing() {
69  if (m_counter) {
70  if (m_counter >= 3)
71  log << MSG::INFO << "Starting a new event after ~"
72  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
73  m_counter = 0;
74  }
75  }
// If at least 3 time-outs were counted, report the approximate time taken by
// the last event. The counter is left untouched.
76  void onStop() {
77  if (m_counter >= 3)
78  log << MSG::INFO << "The last event took ~"
79  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
80  }
81  };
82 }
83 
84 // Constructor
// Forwards name and service locator to base_class and declares the three
// configurable properties together with their default values:
//   EventTimeout    (m_eventTimeout    = 600)   seconds, 0 disables the check
//   MaxTimeoutCount (m_maxTimeoutCount = 0)     0 means never abort
//   StackTrace      (m_stackTrace      = false) print a stack trace on time-out
85 StalledEventMonitor::StalledEventMonitor(const std::string& name, ISvcLocator* svcLoc):
86  base_class(name, svcLoc) {
87 
88  declareProperty("EventTimeout", m_eventTimeout = 600,
89  "Number of seconds allowed to process a single event (0 to disable the check).");
90 
91  declareProperty("MaxTimeoutCount", m_maxTimeoutCount = 0,
92  "Number timeouts before aborting the execution (0 means never abort).");
93 
94  declareProperty("StackTrace", m_stackTrace = false,
95  "Whether to print the stack-trace on timeout.");
96 }
97 
98 // Destructor
100 
101 }
102 
103 // Initialization of the service.
106  if (sc.isFailure()) return sc;
107 
108 
109  if (m_eventTimeout) {
110  // create the watchdog thread
111  m_watchdog = std::auto_ptr<WatchdogThread>(
112  new EventWatchdog(msgSvc(),
113  "EventWatchdog",
114  boost::posix_time::seconds(m_eventTimeout),
115  m_stackTrace,
117 
118  // register to the incident service
119  std::string serviceName = "IncidentSvc";
120  m_incidentSvc = serviceLocator()->service(serviceName);
121  if ( ! m_incidentSvc ) {
122  error() << "Cannot retrieve " << serviceName << endmsg;
123  return StatusCode::FAILURE;
124  }
125  debug() << "Register to the IncidentSvc" << endmsg;
126  m_incidentSvc->addListener(this, IncidentType::BeginEvent);
127  } else {
128  warning() << "StalledEventMonitor/" << name()
129  << " instantiated with 0 time-out: no monitoring performed" << endmsg;
130  }
131 
132  return StatusCode::SUCCESS;
133 }
134 
135 // Start the monitoring.
137  if (m_watchdog.get()) m_watchdog->start();
138  return StatusCode::SUCCESS;
139 }
140 
141 // Notify the watchdog that a new event has been started
// Incident handler: pings the watchdog, if one exists. The incident itself is
// not inspected (the parameter is intentionally left unnamed).
142 void StalledEventMonitor::handle(const Incident& /* incident */) {
// m_watchdog may hold no object, hence the check before dereferencing.
143  if (m_watchdog.get()) m_watchdog->ping();
144 }
145 
146 // Stop the monitoring.
148  if (m_watchdog.get()) m_watchdog->stop();
149  return StatusCode::SUCCESS;
150 }
151 
152 // Finalization of the service.
154  // destroy the watchdog thread (if any)
155  m_watchdog.reset();
156  // unregistering from the IncidentSvc
157  m_incidentSvc->removeListener(this, IncidentType::BeginEvent);
159  return base_class::finalize();
160 }
161 
162 // Declaration of the factory

Generated at Wed Jun 4 2014 14:48:58 for Gaudi Framework, version v25r2 by Doxygen version 1.8.2 written by Dimitri van Heesch, © 1997-2004