Loading [MathJax]/extensions/tex2jax.js
The Gaudi Framework  v31r0 (aeb156f0)
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
12 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace {
21  class EventWatchdog : public WatchdogThread {
22  public:
23  EventWatchdog( const SmartIF<IMessageSvc>& msgSvc, const std::string& name,
24  boost::posix_time::time_duration timeout, bool stackTrace = false, long maxCount = 0,
25  bool autostart = false )
26  : WatchdogThread( timeout, autostart )
27  , log( msgSvc, name )
28  , m_maxCount( maxCount )
29  , m_stackTrace( stackTrace ) {}
30 
31  private:
33  MsgStream log;
35  long m_counter = 0;
37  long m_maxCount = 0;
39  bool m_stackTrace = false;
41  void action() override {
42  if ( !m_counter ) {
43  log << MSG::WARNING << "More than " << getTimeout().total_seconds() << "s since the last "
44  << IncidentType::BeginEvent << endmsg;
45  } else {
46  log << MSG::WARNING << "Other " << getTimeout().total_seconds() << "s passed" << endmsg;
47  }
48  log << MSG::INFO
49  << "Current memory usage is"
50  " virtual size = "
51  << System::virtualMemory() / 1024.
52  << " MB"
53  ", resident set size = "
54  << System::pagedMemory() / 1024. << " MB" << endmsg;
55  if ( m_stackTrace && gSystem ) {
56  // TSystem::StackTrace() prints on the standard error, so we
57  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
58  gSystem->StackTrace();
59  }
60  ++m_counter;
61  if ( m_maxCount > 0 && m_counter >= m_maxCount ) {
62  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
63  std::raise( SIGABRT );
64  }
65  }
66  void onPing() override {
67  if ( m_counter ) {
68  if ( m_counter >= 3 )
69  log << MSG::INFO << "Starting a new event after ~" << m_counter * getTimeout().total_seconds() << "s"
70  << endmsg;
71  m_counter = 0;
72  }
73  }
74  void onStop() override {
75  if ( m_counter >= 3 )
76  log << MSG::INFO << "The last event took ~" << m_counter * getTimeout().total_seconds() << "s" << endmsg;
77  }
78  };
79 } // namespace
80 
81 // Initialization of the service.
83  StatusCode sc = base_class::initialize();
84  if ( sc.isFailure() ) return sc;
85 
86  if ( m_eventTimeout ) {
87  // create the watchdog thread
88  m_watchdog = std::make_unique<EventWatchdog>( msgSvc(), "EventWatchdog",
89  boost::posix_time::seconds( m_eventTimeout.value() ), m_stackTrace,
90  m_maxTimeoutCount );
91 
92  // register to the incident service
93  static const std::string serviceName = "IncidentSvc";
94  m_incidentSvc = serviceLocator()->service( serviceName );
95  if ( !m_incidentSvc ) {
96  error() << "Cannot retrieve " << serviceName << endmsg;
97  return StatusCode::FAILURE;
98  }
99  debug() << "Register to the IncidentSvc" << endmsg;
100  m_incidentSvc->addListener( this, IncidentType::BeginEvent );
101  } else {
102  warning() << "StalledEventMonitor/" << name() << " instantiated with 0 time-out: no monitoring performed" << endmsg;
103  }
104 
105  return StatusCode::SUCCESS;
106 }
107 
108 // Start the monitoring.
110  if ( m_watchdog ) m_watchdog->start();
111  return StatusCode::SUCCESS;
112 }
113 
114 // Notify the watchdog that a new event has been started
115 void StalledEventMonitor::handle( const Incident& /* incident */ ) {
116  if ( m_watchdog ) m_watchdog->ping();
117 }
118 
119 // Start the monitoring.
121  if ( m_watchdog ) m_watchdog->stop();
122  return StatusCode::SUCCESS;
123 }
124 
125 // Finalization of the service.
127  // destroy the watchdog thread (if any)
128  m_watchdog.reset();
129  // unregistering from the IncidentSvc
130  m_incidentSvc->removeListener( this, IncidentType::BeginEvent );
131  m_incidentSvc.reset();
132  return base_class::finalize();
133 }
134 
135 // Declaration of the factory
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:24
StatusCode stop() override
Stop the watchdog thread (after the event loop).
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
virtual void onStop()
User implemented function that will be called when stopping.
void handle(const Incident &) override
Notify the watchdog thread for a new event.
T endl(T...args)
constexpr static const auto SUCCESS
Definition: StatusCode.h:85
StatusCode start() override
Start the watchdog thread (before entering the event loop).
bool isFailure() const
Definition: StatusCode.h:130
STL class.
#define DECLARE_COMPONENT(type)
Service that monitors the time taken by processing of single events using a separate thread...
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:50
T raise(T...args)
virtual void onPing()
User implemented function that will be called when ping is called.
StatusCode initialize() override
Initialization of the service.
Simple class for asynchronous check of time-out.
Base class for all Incidents (computing events).
Definition: Incident.h:17
constexpr static const auto FAILURE
Definition: StatusCode.h:86
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: priority boost.
Definition: Memory.cpp:218
StatusCode finalize() override
Finalization of the service.
virtual void action()
User implemented function that will be called if the time-out is reached.
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'...
Definition: Memory.cpp:131
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:192