All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
11 #include "GaudiKernel/WatchdogThread.h"
12 #include "GaudiKernel/IIncidentSvc.h"
13 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace {
21  class EventWatchdog: public WatchdogThread {
22  public:
23  EventWatchdog(const SmartIF<IMessageSvc> &msgSvc,
24  const std::string &name,
25  boost::posix_time::time_duration timeout,
26  bool stackTrace = false,
27  long maxCount = 0,
28  bool autostart = false):
29  WatchdogThread(timeout, autostart),
30  log(msgSvc, name),
31  m_maxCount(maxCount),
32  m_stackTrace(stackTrace){}
33  ~EventWatchdog() override = default;
34  private:
36  MsgStream log;
38  long m_counter = 0;
40  long m_maxCount = 0;
42  bool m_stackTrace = false;
44  void action() override {
45  if (!m_counter) {
46  log << MSG::WARNING << "More than " << getTimeout().total_seconds()
47  << "s since the last " << IncidentType::BeginEvent << endmsg;
48  } else {
49  log << MSG::WARNING << "Other " << getTimeout().total_seconds()
50  << "s passed" << endmsg;
51  }
52  log << MSG::INFO << "Current memory usage is"
53  " virtual size = " << System::virtualMemory() / 1024. << " MB"
54  ", resident set size = " << System::pagedMemory() / 1024.<< " MB"
55  << endmsg;
56  if (m_stackTrace && gSystem) {
57  // TSystem::StackTrace() prints on the standard error, so we
58  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
59  gSystem->StackTrace();
60  }
61  ++m_counter;
62  if (m_maxCount > 0 && m_counter >= m_maxCount) {
63  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
64  std::raise(SIGABRT);
65  }
66  }
67  void onPing() override {
68  if (m_counter) {
69  if (m_counter >= 3)
70  log << MSG::INFO << "Starting a new event after ~"
71  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
72  m_counter = 0;
73  }
74  }
75  void onStop() override {
76  if (m_counter >= 3)
77  log << MSG::INFO << "The last event took ~"
78  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
79  }
80  };
81 }
82 
83 // Constructor
85  base_class(name, svcLoc) {
86 
87  declareProperty("EventTimeout", m_eventTimeout = 600,
88  "Number of seconds allowed to process a single event (0 to disable the check).");
89 
90  declareProperty("MaxTimeoutCount", m_maxTimeoutCount = 0,
91  "Number timeouts before aborting the execution (0 means never abort).");
92 
93  declareProperty("StackTrace", m_stackTrace = false,
94  "Whether to print the stack-trace on timeout.");
95 }
96 
97 // Initialization of the service.
100  if (sc.isFailure()) return sc;
101 
102 
103  if (m_eventTimeout) {
104  // create the watchdog thread
105  m_watchdog.reset( new EventWatchdog(msgSvc(),
106  "EventWatchdog",
107  boost::posix_time::seconds(m_eventTimeout),
108  m_stackTrace,
110 
111  // register to the incident service
112  static const std::string serviceName = "IncidentSvc";
113  m_incidentSvc = serviceLocator()->service(serviceName);
114  if ( ! m_incidentSvc ) {
115  error() << "Cannot retrieve " << serviceName << endmsg;
116  return StatusCode::FAILURE;
117  }
118  debug() << "Register to the IncidentSvc" << endmsg;
119  m_incidentSvc->addListener(this, IncidentType::BeginEvent);
120  } else {
121  warning() << "StalledEventMonitor/" << name()
122  << " instantiated with 0 time-out: no monitoring performed" << endmsg;
123  }
124 
125  return StatusCode::SUCCESS;
126 }
127 
128 // Start the monitoring.
130  if (m_watchdog) m_watchdog->start();
131  return StatusCode::SUCCESS;
132 }
133 
134 // Notify the watchdog that a new event has been started
135 void StalledEventMonitor::handle(const Incident& /* incident */) {
136  if (m_watchdog) m_watchdog->ping();
137 }
138 
139 // Stop the monitoring.
141  if (m_watchdog) m_watchdog->stop();
142  return StatusCode::SUCCESS;
143 }
144 
145 // Finalization of the service.
147  // destroy the watchdog thread (if any)
148  m_watchdog.reset();
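149  // unregistering from the IncidentSvc -- NOTE(review): m_incidentSvc is only assigned in initialize() when EventTimeout is non-zero; confirm this call is safe when monitoring is disabled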
150  m_incidentSvc->removeListener(this, IncidentType::BeginEvent);
152  return base_class::finalize();
153 }
154 
155 // Declaration of the factory
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:24
def initialize()
Definition: AnalysisTest.py:12
The ISvcLocator is the interface implemented by the Service Factory in the Application Manager to loc...
Definition: ISvcLocator.h:25
StatusCode stop() override
Stop the watchdog thread (after the event loop).
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:244
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
virtual void onStop()
User implemented function that will be called when stopping.
std::unique_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
void handle(const Incident &) override
Notify the watchdog thread for a new event.
StatusCode start() override
Start the watchdog thread (before entering the event loop).
bool isFailure() const
Test for a status code of FAILURE.
Definition: StatusCode.h:86
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition: Memory.cpp:212
Service that monitors the time taken to process single events using a separate thread...
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:26
#define DECLARE_COMPONENT(type)
Definition: PluginService.h:36
int m_maxTimeoutCount
Number of timeouts before aborting the execution (0 means never abort).
unsigned int m_eventTimeout
Number of seconds allowed to process a single event.
StalledEventMonitor(const std::string &name, ISvcLocator *svcLoc)
Constructor.
virtual void onPing()
User implemented function that will be called when ping is called.
StatusCode initialize() override
Initialization of the service.
Simple class for asynchronous check of time-out.
Base class used to extend a class implementing other interfaces.
Definition: extends.h:10
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'...
Definition: Memory.cpp:121
Base class for all Incidents (computing events).
Definition: Incident.h:16
virtual void addListener(IIncidentListener *lis, const std::string &type="", long priority=0, bool rethrow=false, bool singleShot=false)=0
Add listener.
StatusCode finalize() override
Finalization of the service.
void reset(TYPE *ptr=nullptr)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:88
virtual void removeListener(IIncidentListener *lis, const std::string &type="")=0
Remove listener.
virtual void action()
User implemented function that will be called if the time-out is reached.