All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
13 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace {
21  class EventWatchdog: public WatchdogThread {
22  public:
23  EventWatchdog(const SmartIF<IMessageSvc> &msgSvc,
24  const std::string &name,
25  boost::posix_time::time_duration timeout,
26  bool stackTrace = false,
27  long maxCount = 0,
28  bool autostart = false):
29  WatchdogThread(timeout, autostart),
30  log(msgSvc, name),
31  m_counter(0),
32  m_maxCount(maxCount),
33  m_stackTrace(stackTrace){}
34  virtual ~EventWatchdog() {}
35  private:
37  MsgStream log;
39  long m_counter;
41  long m_maxCount;
43  bool m_stackTrace;
45  void action() {
46  if (!m_counter) {
47  log << MSG::WARNING << "More than " << getTimeout().total_seconds()
48  << "s since the last " << IncidentType::BeginEvent << endmsg;
49  } else {
50  log << MSG::WARNING << "Other " << getTimeout().total_seconds()
51  << "s passed" << endmsg;
52  }
53  log << MSG::INFO << "Current memory usage is"
54  " virtual size = " << System::virtualMemory() / 1024. << " MB"
55  ", resident set size = " << System::pagedMemory() / 1024.<< " MB"
56  << endmsg;
57  if (m_stackTrace && gSystem) {
58  // TSystem::StackTrace() prints on the standard error, so we
59  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
60  gSystem->StackTrace();
61  }
62  ++m_counter;
63  if (m_maxCount > 0 && m_counter >= m_maxCount) {
64  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
65  std::raise(SIGABRT);
66  }
67  }
68  void onPing() {
69  if (m_counter) {
70  if (m_counter >= 3)
71  log << MSG::INFO << "Starting a new event after ~"
72  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
73  m_counter = 0;
74  }
75  }
76  void onStop() {
77  if (m_counter >= 3)
78  log << MSG::INFO << "The last event took ~"
79  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
80  }
81  };
82 }
83 
84 // Constructor
85 StalledEventMonitor::StalledEventMonitor(const std::string& name, ISvcLocator* svcLoc):
86  base_class(name, svcLoc) {
87 
88  declareProperty("EventTimeout", m_eventTimeout = 600,
89  "Number of seconds allowed to process a single event (0 to disable the check).");
90 
91  declareProperty("MaxTimeoutCount", m_maxTimeoutCount = 0,
92  "Number timeouts before aborting the execution (0 means never abort).");
93 
94  declareProperty("StackTrace", m_stackTrace = false,
95  "Whether to print the stack-trace on timeout.");
96 }
97 
98 // Destructor
100 
101 }
102 
103 // Initialization of the service.
106  if (sc.isFailure()) return sc;
107 
108 
109  if (m_eventTimeout) {
110  // create the watchdog thread
111  m_watchdog = std::auto_ptr<WatchdogThread>(
112  new EventWatchdog(msgSvc(),
113  "EventWatchdog",
114  boost::posix_time::seconds(m_eventTimeout),
115  m_stackTrace,
117 
118  // register to the incident service
119  std::string serviceName = "IncidentSvc";
120  m_incidentSvc = serviceLocator()->service(serviceName);
121  if ( ! m_incidentSvc ) {
122  error() << "Cannot retrieve " << serviceName << endmsg;
123  return StatusCode::FAILURE;
124  }
125  debug() << "Register to the IncidentSvc" << endmsg;
126  m_incidentSvc->addListener(this, IncidentType::BeginEvent);
127  } else {
128  warning() << "StalledEventMonitor/" << name()
129  << " instantiated with 0 time-out: no monitoring performed" << endmsg;
130  }
131 
132  return StatusCode::SUCCESS;
133 }
134 
135 // Start the monitoring.
137  if (m_watchdog.get()) m_watchdog->start();
138  return StatusCode::SUCCESS;
139 }
140 
141 // Notify the watchdog that a new event has been started
142 void StalledEventMonitor::handle(const Incident& /* incident */) {
143  if (m_watchdog.get()) m_watchdog->ping();
144 }
145 
146 // Stop the monitoring.
148  if (m_watchdog.get()) m_watchdog->stop();
149  return StatusCode::SUCCESS;
150 }
151 
152 // Finalization of the service.
154  // destroy the watchdog thread (if any)
155  m_watchdog.reset();
156  // unregistering from the IncidentSvc
 // NOTE(review): m_incidentSvc is assigned only inside the `if (m_eventTimeout)`
 // branch of the initialization code in this file, but it is dereferenced here
 // unconditionally -- confirm it cannot be null when the service is configured
 // with a 0 time-out (a null check before removeListener() may be needed).
157  m_incidentSvc->removeListener(this, IncidentType::BeginEvent);
159  return base_class::finalize();
160 }
161 
162 // Declaration of the factory
const std::string BeginEvent
Processing of a new event has started.
Definition: Incident.h:60
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:24
The ISvcLocator is the interface implemented by the Service Factory in the Application Manager to loc...
Definition: ISvcLocator.h:26
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
virtual void onStop()
User implemented function that will be called when stopping.
virtual StatusCode start()
Start the watchdog thread (before entering the event loop).
SmartIF< IMessageSvc > & msgSvc() const
The standard message service.
virtual StatusCode initialize()
Initialization of the service.
MsgStream & warning() const
shortcut for the method msgStream(MSG::WARNING)
bool m_stackTrace
Whether to print a stack-trace on timeout.
bool isFailure() const
Test for a status code of FAILURE.
Definition: StatusCode.h:85
#define DECLARE_COMPONENT(type)
Definition: PluginService.h:36
Service that monitors the time taken by processing of single events using a separate thread...
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
MsgStream & debug() const
shortcut for the method msgStream(MSG::DEBUG)
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:30
int m_maxTimeoutCount
Number of timeouts before aborting the execution (0 means never abort).
unsigned int m_eventTimeout
Number of seconds allowed to process a single event.
StalledEventMonitor(const std::string &name, ISvcLocator *svcLoc)
Constructor.
virtual const std::string & name() const
Retrieve name of the service.
Definition: Service.cpp:331
virtual void handle(const Incident &)
Notify the watchdog thread for a new event.
virtual void onPing()
User implemented function that will be called when ping is called.
Simple class for asynchronous check of time-out.
Base class for all Incidents (computing events).
Definition: Incident.h:16
Templated class to add the standard messaging functionalities.
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition: Memory.cpp:213
virtual ~StalledEventMonitor()
Destructor.
virtual StatusCode finalize()
Finalization of the service.
Property * declareProperty(const std::string &name, T &property, const std::string &doc="none") const
Declare the named property.
Definition: Service.h:211
virtual void action()
User implemented function that will be called if the time-out is reached.
std::auto_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
virtual StatusCode stop()
Stop the watchdog thread (after the event loop).
void reset(TYPE *ptr=0)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:74
MsgStream & error() const
shortcut for the method msgStream(MSG::ERROR)
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'...
Definition: Memory.cpp:122
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:244
SmartIF< ISvcLocator > & serviceLocator() const
Retrieve pointer to service locator.
Definition: Service.cpp:336