All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
13 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace {
21  class EventWatchdog: public WatchdogThread {
22  public:
23  EventWatchdog(const SmartIF<IMessageSvc> &msgSvc,
24  const std::string &name,
25  boost::posix_time::time_duration timeout,
26  bool stackTrace = false,
27  long maxCount = 0,
28  bool autostart = false):
29  WatchdogThread(timeout, autostart),
30  log(msgSvc, name),
31  m_maxCount(maxCount),
32  m_stackTrace(stackTrace){}
33  ~EventWatchdog() override = default;
34  private:
36  MsgStream log;
38  long m_counter = 0;
40  long m_maxCount = 0;
42  bool m_stackTrace = false;
44  void action() override {
45  if (!m_counter) {
46  log << MSG::WARNING << "More than " << getTimeout().total_seconds()
47  << "s since the last " << IncidentType::BeginEvent << endmsg;
48  } else {
49  log << MSG::WARNING << "Other " << getTimeout().total_seconds()
50  << "s passed" << endmsg;
51  }
52  log << MSG::INFO << "Current memory usage is"
53  " virtual size = " << System::virtualMemory() / 1024. << " MB"
54  ", resident set size = " << System::pagedMemory() / 1024.<< " MB"
55  << endmsg;
56  if (m_stackTrace && gSystem) {
57  // TSystem::StackTrace() prints on the standard error, so we
58  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
59  gSystem->StackTrace();
60  }
61  ++m_counter;
62  if (m_maxCount > 0 && m_counter >= m_maxCount) {
63  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
64  std::raise(SIGABRT);
65  }
66  }
67  void onPing() override {
68  if (m_counter) {
69  if (m_counter >= 3)
70  log << MSG::INFO << "Starting a new event after ~"
71  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
72  m_counter = 0;
73  }
74  }
75  void onStop() override {
76  if (m_counter >= 3)
77  log << MSG::INFO << "The last event took ~"
78  << m_counter * getTimeout().total_seconds() << "s" << endmsg;
79  }
80  };
81 }
82 
83 // Constructor
85  base_class(name, svcLoc) {
86 
87 }
88 
89 // Initialization of the service.
91  StatusCode sc = base_class::initialize();
92  if (sc.isFailure()) return sc;
93 
94 
95  if (m_eventTimeout) {
96  // create the watchdog thread
97  m_watchdog.reset( new EventWatchdog(msgSvc(),
98  "EventWatchdog",
99  boost::posix_time::seconds(m_eventTimeout),
100  m_stackTrace,
102 
103  // register to the incident service
104  static const std::string serviceName = "IncidentSvc";
105  m_incidentSvc = serviceLocator()->service(serviceName);
106  if ( ! m_incidentSvc ) {
107  error() << "Cannot retrieve " << serviceName << endmsg;
108  return StatusCode::FAILURE;
109  }
110  debug() << "Register to the IncidentSvc" << endmsg;
111  m_incidentSvc->addListener(this, IncidentType::BeginEvent);
112  } else {
113  warning() << "StalledEventMonitor/" << name()
114  << " instantiated with 0 time-out: no monitoring performed" << endmsg;
115  }
116 
117  return StatusCode::SUCCESS;
118 }
119 
120 // Start the monitoring.
122  if (m_watchdog) m_watchdog->start();
123  return StatusCode::SUCCESS;
124 }
125 
126 // Notify the watchdog that a new event has been started
127 void StalledEventMonitor::handle(const Incident& /* incident */) {
128  if (m_watchdog) m_watchdog->ping();
129 }
130 
131 // Stop the monitoring.
133  if (m_watchdog) m_watchdog->stop();
134  return StatusCode::SUCCESS;
135 }
136 
137 // Finalization of the service.
139  // destroy the watchdog thread (if any)
140  m_watchdog.reset();
141  // unregistering from the IncidentSvc
142  m_incidentSvc->removeListener(this, IncidentType::BeginEvent);
144  return base_class::finalize();
145 }
146 
147 // Declaration of the factory
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:24
The ISvcLocator is the interface implemented by the Service Factory in the Application Manager to loc...
Definition: ISvcLocator.h:25
StatusCode stop() override
Stop the watchdog thread (after the event loop).
const std::string & name() const override
Retrieve name of the service.
Definition: Service.cpp:289
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
void start()
Start the watchdog thread.
virtual void onStop()
User implemented function that will be called when stopping.
Gaudi::Property< unsigned int > m_eventTimeout
std::unique_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
void handle(const Incident &) override
Notify the watchdog thread for a new event.
T endl(T...args)
StatusCode start() override
Start the watchdog thread (before entering the event loop).
void ping()
Function to call to notify the watchdog thread that we are still alive.
bool isFailure() const
Test for a status code of FAILURE.
Definition: StatusCode.h:84
#define DECLARE_COMPONENT(type)
Definition: PluginService.h:36
STL class.
StatusCode service(const Gaudi::Utils::TypeNameString &name, T *&svc, bool createIf=true)
Templated method to access a service by name.
Definition: ISvcLocator.h:78
MsgStream & error() const
shortcut for the method msgStream(MSG::ERROR)
Service that monitor the time taken by processing of single events using a separate thread...
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
MsgStream & warning() const
shortcut for the method msgStream(MSG::WARNING)
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:26
T reset(T...args)
T raise(T...args)
StalledEventMonitor(const std::string &name, ISvcLocator *svcLoc)
Constructor.
virtual void onPing()
User implemented function that will be called when ping is called.
StatusCode initialize() override
Initialization of the service.
Simple class for asynchronous check of time-out.
MsgStream & debug() const
shortcut for the method msgStream(MSG::DEBUG)
void stop()
Signal the watchdog thread to stop and wait for it.
Base class for all Incidents (computing events).
Definition: Incident.h:17
virtual void addListener(IIncidentListener *lis, const std::string &type="", long priority=0, bool rethrow=false, bool singleShot=false)=0
Add listener.
SmartIF< IMessageSvc > & msgSvc() const
The standard message service.
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: priority boost.
Definition: Memory.cpp:212
StatusCode finalize() override
Finalization of the service.
void reset(TYPE *ptr=nullptr)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:88
virtual void removeListener(IIncidentListener *lis, const std::string &type="")=0
Remove listener.
virtual void action()
User implemented function that will be called if the time-out is reached.
SmartIF< ISvcLocator > & serviceLocator() const override
Retrieve pointer to service locator.
Definition: Service.cpp:292
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process &#39;pid&#39;...
Definition: Memory.cpp:121
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:244
Gaudi::Property< int > m_maxTimeoutCount