StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
13 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace {
21  class EventWatchdog: public WatchdogThread {
22  public:
23  EventWatchdog(const SmartIF<IMessageSvc> &msgSvc,
24  const std::string &name,
25  boost::posix_time::time_duration timeout,
26  bool stackTrace = false,
27  long maxCount = 0,
28  bool autostart = false):
29  WatchdogThread(timeout, autostart),
30  log(msgSvc, name),
31  m_maxCount(maxCount),
32  m_stackTrace(stackTrace){}
33  ~EventWatchdog() override = default;
34  private:
36  MsgStream log;
38  long m_counter = 0;
40  long m_maxCount = 0;
42  bool m_stackTrace = false;
44  void action() override {
45  if (!m_counter) {
46  log << MSG::WARNING << "More than " << getTimeout().total_seconds()
47  << "s since the last " << IncidentType::BeginEvent << endmsg;
48  } else {
49  log << MSG::WARNING << "Other " << getTimeout().total_seconds()
50  << "s passed" << endmsg;
51  }
52  log << MSG::INFO << "Current memory usage is"
53  " virtual size = " << System::virtualMemory() / 1024. << " MB"
54  ", resident set size = " << System::pagedMemory() / 1024.<< " MB"
55  << endmsg;
56  if (m_stackTrace && gSystem) {
57  // TSystem::StackTrace() prints on the standard error, so we
58  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
59  gSystem->StackTrace();
60  }
61  ++m_counter;
62  if (m_maxCount > 0 && m_counter >= m_maxCount) {
63  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
64  std::raise(SIGABRT);
65  }
66  }
67  void onPing() override {
68  if (m_counter) {
69  if (m_counter >= 3)
70  log << MSG::INFO << "Starting a new event after ~"
71  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
72  m_counter = 0;
73  }
74  }
75  void onStop() override {
76  if (m_counter >= 3)
77  log << MSG::INFO << "The last event took ~"
78  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
79  }
80  };
81 }
82 
83 // Constructor
85  base_class(name, svcLoc) {
86 
87  declareProperty("EventTimeout", m_eventTimeout = 600,
88  "Number of seconds allowed to process a single event (0 to disable the check).");
89 
90  declareProperty("MaxTimeoutCount", m_maxTimeoutCount = 0,
91  "Number timeouts before aborting the execution (0 means never abort).");
92 
93  declareProperty("StackTrace", m_stackTrace = false,
94  "Whether to print the stack-trace on timeout.");
95 }
96 
97 // Initialization of the service.
100  if (sc.isFailure()) return sc;
101 
102 
103  if (m_eventTimeout) {
104  // create the watchdog thread
105  m_watchdog.reset( new EventWatchdog(msgSvc(),
106  "EventWatchdog",
107  boost::posix_time::seconds(m_eventTimeout),
108  m_stackTrace,
110 
111  // register to the incident service
112  static const std::string serviceName = "IncidentSvc";
113  m_incidentSvc = serviceLocator()->service(serviceName);
114  if ( ! m_incidentSvc ) {
115  error() << "Cannot retrieve " << serviceName << endmsg;
116  return StatusCode::FAILURE;
117  }
118  debug() << "Register to the IncidentSvc" << endmsg;
119  m_incidentSvc->addListener(this, IncidentType::BeginEvent);
120  } else {
121  warning() << "StalledEventMonitor/" << name()
122  << " instantiated with 0 time-out: no monitoring performed" << endmsg;
123  }
124 
125  return StatusCode::SUCCESS;
126 }
127 
128 // Start the monitoring.
130  if (m_watchdog) m_watchdog->start();
131  return StatusCode::SUCCESS;
132 }
133 
134 // Notify the watchdog that a new event has been started
135 void StalledEventMonitor::handle(const Incident& /* incident */) {
136  if (m_watchdog) m_watchdog->ping();
137 }
138 
139 // Start the monitoring.
141  if (m_watchdog) m_watchdog->stop();
142  return StatusCode::SUCCESS;
143 }
144 
145 // Finalization of the service.
147  // destroy the watchdog thread (if any)
148  m_watchdog.reset();
149  // unregistering from the IncidentSvc
150  m_incidentSvc->removeListener(this, IncidentType::BeginEvent);
152  return base_class::finalize();
153 }
154 
155 // Declaration of the factory
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:24
SmartIF< ISvcLocator > & serviceLocator() const override
Retrieve pointer to service locator.
Definition: Service.cpp:324
def initialize()
Definition: AnalysisTest.py:12
The ISvcLocator is the interface implemented by the Service Factory in the Application Manager to loc...
Definition: ISvcLocator.h:25
StatusCode stop() override
Stop the watchdog thread (after the event loop).
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
void start()
Start the watchdog thread.
virtual void onStop()
User implemented function that will be called when stopping.
std::unique_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
void handle(const Incident &) override
Notify the watchdog thread for a new event.
T endl(T...args)
StatusCode start() override
Start the watchdog thread (before entering the event loop).
void ping()
Function to call to notify the watchdog thread that we are still alive.
bool isFailure() const
Test for a status code of FAILURE.
Definition: StatusCode.h:86
#define DECLARE_COMPONENT(type)
Definition: PluginService.h:36
STL class.
StatusCode service(const Gaudi::Utils::TypeNameString &name, T *&svc, bool createIf=true)
Templated method to access a service by name.
Definition: ISvcLocator.h:78
const std::string & name() const override
Retrieve name of the service.
Definition: Service.cpp:319
MsgStream & error() const
shortcut for the method msgStream(MSG::ERROR)
Service that monitors the time taken by processing of single events using a separate thread...
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
MsgStream & warning() const
shortcut for the method msgStream(MSG::WARNING)
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:26
int m_maxTimeoutCount
Number of timeouts before aborting the execution (0 means never abort).
unsigned int m_eventTimeout
Number of seconds allowed to process a single event.
T reset(T...args)
T raise(T...args)
StalledEventMonitor(const std::string &name, ISvcLocator *svcLoc)
Constructor.
virtual void onPing()
User implemented function that will be called when ping is called.
StatusCode initialize() override
Initialization of the service.
Simple class for asynchronous check of time-out.
MsgStream & debug() const
shortcut for the method msgStream(MSG::DEBUG)
void stop()
Signal the watchdog thread to stop and wait for it.
Base class for all Incidents (computing events).
Definition: Incident.h:17
virtual void addListener(IIncidentListener *lis, const std::string &type="", long priority=0, bool rethrow=false, bool singleShot=false)=0
Add listener.
SmartIF< IMessageSvc > & msgSvc() const
The standard message service.
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: priority boost.
Definition: Memory.cpp:212
StatusCode finalize() override
Finalization of the service.
void reset(TYPE *ptr=nullptr)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:88
virtual void removeListener(IIncidentListener *lis, const std::string &type="")=0
Remove listener.
Property * declareProperty(const std::string &name, T &property, const std::string &doc="none") const
Declare the named property.
Definition: Service.h:215
virtual void action()
User implemented function that will be called if the time-out is reached.
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'...
Definition: Memory.cpp:121
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:244