1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
8 // Include files
9 #include "StalledEventMonitor.h"
12 #include "GaudiKernel/Memory.h"
15 #include "TSystem.h"
17 #include <csignal>
19 namespace
20 {
22  class EventWatchdog : public WatchdogThread
23  {
24  public:
25  EventWatchdog( const SmartIF<IMessageSvc>& msgSvc, const std::string& name,
26  boost::posix_time::time_duration timeout, bool stackTrace = false, long maxCount = 0,
27  bool autostart = false )
28  : WatchdogThread( timeout, autostart ), log( msgSvc, name ), m_maxCount( maxCount ), m_stackTrace( stackTrace )
29  {
30  }
31  ~EventWatchdog() override = default;
33  private:
35  MsgStream log;
37  long m_counter = 0;
39  long m_maxCount = 0;
41  bool m_stackTrace = false;
43  void action() override
44  {
45  if ( !m_counter ) {
46  log << MSG::WARNING << "More than " << getTimeout().total_seconds() << "s since the last "
47  << IncidentType::BeginEvent << endmsg;
48  } else {
49  log << MSG::WARNING << "Other " << getTimeout().total_seconds() << "s passed" << endmsg;
50  }
51  log << MSG::INFO << "Current memory usage is"
52  " virtual size = "
53  << System::virtualMemory() / 1024. << " MB"
54  ", resident set size = "
55  << System::pagedMemory() / 1024. << " MB" << endmsg;
56  if ( m_stackTrace && gSystem ) {
57  // TSystem::StackTrace() prints on the standard error, so we
58  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
59  gSystem->StackTrace();
60  }
61  ++m_counter;
62  if ( m_maxCount > 0 && m_counter >= m_maxCount ) {
63  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
64  std::raise( SIGABRT );
65  }
66  }
67  void onPing() override
68  {
69  if ( m_counter ) {
70  if ( m_counter >= 3 )
71  log << MSG::INFO << "Starting a new event after ~" << m_counter * getTimeout().total_seconds() << "s"
72  << endmsg;
73  m_counter = 0;
74  }
75  }
76  void onStop() override
77  {
78  if ( m_counter >= 3 )
79  log << MSG::INFO << "The last event took ~" << m_counter * getTimeout().total_seconds() << "s" << endmsg;
80  }
81  };
82 }
84 // Constructor
87 // Initialization of the service.
89 {
90  StatusCode sc = base_class::initialize();
91  if ( sc.isFailure() ) return sc;
93  if ( m_eventTimeout ) {
94  // create the watchdog thread
95  m_watchdog.reset( new EventWatchdog( msgSvc(), "EventWatchdog", boost::posix_time::seconds( m_eventTimeout ),
96  m_stackTrace, m_maxTimeoutCount ) );
98  // register to the incident service
99  static const std::string serviceName = "IncidentSvc";
100  m_incidentSvc = serviceLocator()->service( serviceName );
101  if ( !m_incidentSvc ) {
102  error() << "Cannot retrieve " << serviceName << endmsg;
103  return StatusCode::FAILURE;
104  }
105  debug() << "Register to the IncidentSvc" << endmsg;
106  m_incidentSvc->addListener( this, IncidentType::BeginEvent );
107  } else {
108  warning() << "StalledEventMonitor/" << name() << " instantiated with 0 time-out: no monitoring performed" << endmsg;
109  }
111  return StatusCode::SUCCESS;
112 }
114 // Start the monitoring.
116 {
117  if ( m_watchdog ) m_watchdog->start();
118  return StatusCode::SUCCESS;
119 }
121 // Notify the watchdog that a new event has been started
122 void StalledEventMonitor::handle( const Incident& /* incident */ )
123 {
124  if ( m_watchdog ) m_watchdog->ping();
125 }
// Stop the monitoring.
129 {
130  if ( m_watchdog ) m_watchdog->stop();
131  return StatusCode::SUCCESS;
132 }
134 // Finalization of the service.
136 {
137  // destroy the watchdog thread (if any)
138  m_watchdog.reset();
139  // unregistering from the IncidentSvc
140  m_incidentSvc->removeListener( this, IncidentType::BeginEvent );
142  return base_class::finalize();
143 }
145 // Declaration of the factory
