The Gaudi Framework  v29r0 (ff2e7097)
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /*
2  * StalledEventMonitor.cpp
3  *
4  * Created on: Apr 19, 2010
5  * Author: Marco Clemencic
6  */
7 
8 // Include files
9 #include "StalledEventMonitor.h"
10 
12 #include "GaudiKernel/Memory.h"
14 
15 #include "TSystem.h"
16 
17 #include <csignal>
18 
19 namespace
20 {
22  class EventWatchdog : public WatchdogThread
23  {
24  public:
25  EventWatchdog( const SmartIF<IMessageSvc>& msgSvc, const std::string& name,
26  boost::posix_time::time_duration timeout, bool stackTrace = false, long maxCount = 0,
27  bool autostart = false )
28  : WatchdogThread( timeout, autostart ), log( msgSvc, name ), m_maxCount( maxCount ), m_stackTrace( stackTrace )
29  {
30  }
31  ~EventWatchdog() override = default;
32 
33  private:
35  MsgStream log;
37  long m_counter = 0;
39  long m_maxCount = 0;
41  bool m_stackTrace = false;
43  void action() override
44  {
45  if ( !m_counter ) {
46  log << MSG::WARNING << "More than " << getTimeout().total_seconds() << "s since the last "
47  << IncidentType::BeginEvent << endmsg;
48  } else {
49  log << MSG::WARNING << "Other " << getTimeout().total_seconds() << "s passed" << endmsg;
50  }
51  log << MSG::INFO << "Current memory usage is"
52  " virtual size = "
53  << System::virtualMemory() / 1024. << " MB"
54  ", resident set size = "
55  << System::pagedMemory() / 1024. << " MB" << endmsg;
56  if ( m_stackTrace && gSystem ) {
57  // TSystem::StackTrace() prints on the standard error, so we
58  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
59  gSystem->StackTrace();
60  }
61  ++m_counter;
62  if ( m_maxCount > 0 && m_counter >= m_maxCount ) {
63  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
64  std::raise( SIGABRT );
65  }
66  }
67  void onPing() override
68  {
69  if ( m_counter ) {
70  if ( m_counter >= 3 )
71  log << MSG::INFO << "Starting a new event after ~" << m_counter * getTimeout().total_seconds() << "s"
72  << endmsg;
73  m_counter = 0;
74  }
75  }
76  void onStop() override
77  {
78  if ( m_counter >= 3 )
79  log << MSG::INFO << "The last event took ~" << m_counter * getTimeout().total_seconds() << "s" << endmsg;
80  }
81  };
82 }
83 
84 // Constructor
86 
87 // Initialization of the service.
89 {
90  StatusCode sc = base_class::initialize();
91  if ( sc.isFailure() ) return sc;
92 
93  if ( m_eventTimeout ) {
94  // create the watchdog thread
95  m_watchdog.reset( new EventWatchdog( msgSvc(), "EventWatchdog", boost::posix_time::seconds( m_eventTimeout ),
96  m_stackTrace, m_maxTimeoutCount ) );
97 
98  // register to the incident service
99  static const std::string serviceName = "IncidentSvc";
100  m_incidentSvc = serviceLocator()->service( serviceName );
101  if ( !m_incidentSvc ) {
102  error() << "Cannot retrieve " << serviceName << endmsg;
103  return StatusCode::FAILURE;
104  }
105  debug() << "Register to the IncidentSvc" << endmsg;
106  m_incidentSvc->addListener( this, IncidentType::BeginEvent );
107  } else {
108  warning() << "StalledEventMonitor/" << name() << " instantiated with 0 time-out: no monitoring performed" << endmsg;
109  }
110 
111  return StatusCode::SUCCESS;
112 }
113 
114 // Start the monitoring.
116 {
117  if ( m_watchdog ) m_watchdog->start();
118  return StatusCode::SUCCESS;
119 }
120 
121 // Notify the watchdog that a new event has been started
122 void StalledEventMonitor::handle( const Incident& /* incident */ )
123 {
124  if ( m_watchdog ) m_watchdog->ping();
125 }
126 
127 // Start the monitoring.
129 {
130  if ( m_watchdog ) m_watchdog->stop();
131  return StatusCode::SUCCESS;
132 }
133 
134 // Finalization of the service.
136 {
137  // destroy the watchdog thread (if any)
138  m_watchdog.reset();
139  // unregistering from the IncidentSvc
140  m_incidentSvc->removeListener( this, IncidentType::BeginEvent );
142  return base_class::finalize();
143 }
144 
145 // Declaration of the factory
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:24
The ISvcLocator is the interface implemented by the Service Factory in the Application Manager to loc...
Definition: ISvcLocator.h:25
StatusCode stop() override
Stop the watchdog thread (after the event loop).
const std::string & name() const override
Retrieve name of the service.
Definition: Service.cpp:289
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
void start()
Start the watchdog thread.
virtual void onStop()
User implemented function that will be called when stopping.
Gaudi::Property< unsigned int > m_eventTimeout
std::unique_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
void handle(const Incident &) override
Notify the watchdog thread for a new event.
T endl(T...args)
StatusCode start() override
Start the watchdog thread (before entering the event loop).
void ping()
Function to call to notify the watchdog thread that we are still alive.
bool isFailure() const
Test for a status code of FAILURE.
Definition: StatusCode.h:86
#define DECLARE_COMPONENT(type)
Definition: PluginService.h:33
STL class.
StatusCode service(const Gaudi::Utils::TypeNameString &name, T *&svc, bool createIf=true)
Templated method to access a service by name.
Definition: ISvcLocator.h:79
MsgStream & error() const
shortcut for the method msgStream(MSG::ERROR)
Service that monitors the time taken by processing of single events using a separate thread...
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
MsgStream & warning() const
shortcut for the method msgStream(MSG::WARNING)
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:28
T reset(T...args)
T raise(T...args)
StalledEventMonitor(const std::string &name, ISvcLocator *svcLoc)
Constructor.
virtual void onPing()
User implemented function that will be called when ping is called.
StatusCode initialize() override
Initialization of the service.
Simple class for asynchronous check of time-out.
MsgStream & debug() const
shortcut for the method msgStream(MSG::DEBUG)
void stop()
Signal the watchdog thread to stop and wait for it.
Base class for all Incidents (computing events).
Definition: Incident.h:17
virtual void addListener(IIncidentListener *lis, const std::string &type="", long priority=0, bool rethrow=false, bool singleShot=false)=0
Add listener.
SmartIF< IMessageSvc > & msgSvc() const
The standard message service.
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: priority boost.
Definition: Memory.cpp:239
StatusCode finalize() override
Finalization of the service.
void reset(TYPE *ptr=nullptr)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:92
virtual void removeListener(IIncidentListener *lis, const std::string &type="")=0
Remove listener.
virtual void action()
User implemented function that will be called if the time-out is reached.
SmartIF< ISvcLocator > & serviceLocator() const override
Retrieve pointer to service locator.
Definition: Service.cpp:292
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'...
Definition: Memory.cpp:141
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:209
Gaudi::Property< int > m_maxTimeoutCount