The Gaudi Framework  v30r3 (a5ef0a68)
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
12 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace
20 {
22  class EventWatchdog : public WatchdogThread
23  {
24  public:
25  EventWatchdog( const SmartIF<IMessageSvc>& msgSvc, const std::string& name,
26  boost::posix_time::time_duration timeout, bool stackTrace = false, long maxCount = 0,
27  bool autostart = false )
28  : WatchdogThread( timeout, autostart ), log( msgSvc, name ), m_maxCount( maxCount ), m_stackTrace( stackTrace )
29  {
30  }
31 
32  private:
34  MsgStream log;
36  long m_counter = 0;
38  long m_maxCount = 0;
40  bool m_stackTrace = false;
42  void action() override
43  {
44  if ( !m_counter ) {
45  log << MSG::WARNING << "More than " << getTimeout().total_seconds() << "s since the last "
46  << IncidentType::BeginEvent << endmsg;
47  } else {
48  log << MSG::WARNING << "Other " << getTimeout().total_seconds() << "s passed" << endmsg;
49  }
50  log << MSG::INFO << "Current memory usage is"
51  " virtual size = "
52  << System::virtualMemory() / 1024. << " MB"
53  ", resident set size = "
54  << System::pagedMemory() / 1024. << " MB" << endmsg;
55  if ( m_stackTrace && gSystem ) {
56  // TSystem::StackTrace() prints on the standard error, so we
57  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
58  gSystem->StackTrace();
59  }
60  ++m_counter;
61  if ( m_maxCount > 0 && m_counter >= m_maxCount ) {
62  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
63  std::raise( SIGABRT );
64  }
65  }
66  void onPing() override
67  {
68  if ( m_counter ) {
69  if ( m_counter >= 3 )
70  log << MSG::INFO << "Starting a new event after ~" << m_counter * getTimeout().total_seconds() << "s"
71  << endmsg;
72  m_counter = 0;
73  }
74  }
75  void onStop() override
76  {
77  if ( m_counter >= 3 )
78  log << MSG::INFO << "The last event took ~" << m_counter * getTimeout().total_seconds() << "s" << endmsg;
79  }
80  };
81 }
82 
83 // Initialization of the service.
85 {
86  StatusCode sc = base_class::initialize();
87  if ( sc.isFailure() ) return sc;
88 
89  if ( m_eventTimeout ) {
90  // create the watchdog thread
91  m_watchdog = std::make_unique<EventWatchdog>(
92  msgSvc(), "EventWatchdog", boost::posix_time::seconds( m_eventTimeout ), m_stackTrace, m_maxTimeoutCount );
93 
94  // register to the incident service
95  static const std::string serviceName = "IncidentSvc";
96  m_incidentSvc = serviceLocator()->service( serviceName );
97  if ( !m_incidentSvc ) {
98  error() << "Cannot retrieve " << serviceName << endmsg;
99  return StatusCode::FAILURE;
100  }
101  debug() << "Register to the IncidentSvc" << endmsg;
102  m_incidentSvc->addListener( this, IncidentType::BeginEvent );
103  } else {
104  warning() << "StalledEventMonitor/" << name() << " instantiated with 0 time-out: no monitoring performed" << endmsg;
105  }
106 
107  return StatusCode::SUCCESS;
108 }
109 
110 // Start the monitoring.
112 {
113  if ( m_watchdog ) m_watchdog->start();
114  return StatusCode::SUCCESS;
115 }
116 
117 // Notify the watchdog that a new event has been started
118 void StalledEventMonitor::handle( const Incident& /* incident */ )
119 {
120  if ( m_watchdog ) m_watchdog->ping();
121 }
122 
123 // Start the monitoring.
125 {
126  if ( m_watchdog ) m_watchdog->stop();
127  return StatusCode::SUCCESS;
128 }
129 
130 // Finalization of the service.
132 {
133  // destroy the watchdog thread (if any)
134  m_watchdog.reset();
135  // unregistering from the IncidentSvc
136  m_incidentSvc->removeListener( this, IncidentType::BeginEvent );
137  m_incidentSvc.reset();
138  return base_class::finalize();
139 }
140 
141 // Declaration of the factory
constexpr static const auto FAILURE
Definition: StatusCode.h:88
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:24
StatusCode stop() override
Stop the watchdog thread (after the event loop).
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
virtual void onStop()
User implemented function that will be called when stopping.
void handle(const Incident &) override
Notify the watchdog thread for a new event.
T endl(T...args)
StatusCode start() override
Start the watchdog thread (before entering the event loop).
bool isFailure() const
Definition: StatusCode.h:139
STL class.
#define DECLARE_COMPONENT(type)
Service that monitors the time taken by processing of single events using a separate thread...
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:51
T raise(T...args)
constexpr static const auto SUCCESS
Definition: StatusCode.h:87
virtual void onPing()
User implemented function that will be called when ping is called.
StatusCode initialize() override
Initialization of the service.
Simple class for asynchronous check of time-out.
Base class for all Incidents (computing events).
Definition: Incident.h:17
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: priority boost.
Definition: Memory.cpp:239
StatusCode finalize() override
Finalization of the service.
virtual void action()
User implemented function that will be called if the time-out is reached.
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'...
Definition: Memory.cpp:141
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:209