The Gaudi Framework  v33r1 (b1225454)
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /***********************************************************************************\
2 * (c) Copyright 1998-2019 CERN for the benefit of the LHCb and ATLAS collaborations *
3 * *
4 * This software is distributed under the terms of the Apache version 2 licence, *
5 * copied verbatim in the file "LICENSE". *
6 * *
7 * In applying this licence, CERN does not waive the privileges and immunities *
8 * granted to it by virtue of its status as an Intergovernmental Organization *
9 * or submit itself to any jurisdiction. *
10 \***********************************************************************************/
11 /*
12  * StalledEventMonitor.cpp
13  *
14  * Created on: Apr 19, 2010
15  * Author: Marco Clemencic
16  */
17 
18 // Include files
19 #include "StalledEventMonitor.h"
20 
22 #include "GaudiKernel/Memory.h"
24 
25 #include "TSystem.h"
26 
27 #include <csignal>
28 
29 namespace {
31  class EventWatchdog : public WatchdogThread {
32  public:
33  EventWatchdog( const SmartIF<IMessageSvc>& msgSvc, const std::string& name,
34  boost::posix_time::time_duration timeout, bool stackTrace = false, long maxCount = 0,
35  bool autostart = false )
36  : WatchdogThread( timeout, autostart )
37  , log( msgSvc, name )
38  , m_maxCount( maxCount )
39  , m_stackTrace( stackTrace ) {}
40 
41  private:
43  MsgStream log;
45  long m_counter = 0;
47  long m_maxCount = 0;
49  bool m_stackTrace = false;
51  void action() override {
52  if ( !m_counter ) {
53  log << MSG::WARNING << "More than " << getTimeout().total_seconds() << "s since the last "
54  << IncidentType::BeginEvent << endmsg;
55  } else {
56  log << MSG::WARNING << "Other " << getTimeout().total_seconds() << "s passed" << endmsg;
57  }
58  log << MSG::INFO
59  << "Current memory usage is"
60  " virtual size = "
61  << System::virtualMemory() / 1024.
62  << " MB"
63  ", resident set size = "
64  << System::pagedMemory() / 1024. << " MB" << endmsg;
65  if ( m_stackTrace && gSystem ) {
66  // TSystem::StackTrace() prints on the standard error, so we
67  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
68  gSystem->StackTrace();
69  }
70  ++m_counter;
71  if ( m_maxCount > 0 && m_counter >= m_maxCount ) {
72  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
73  std::raise( SIGABRT );
74  }
75  }
76  void onPing() override {
77  if ( m_counter ) {
78  if ( m_counter >= 3 )
79  log << MSG::INFO << "Starting a new event after ~" << m_counter * getTimeout().total_seconds() << "s"
80  << endmsg;
81  m_counter = 0;
82  }
83  }
84  void onStop() override {
85  if ( m_counter >= 3 )
86  log << MSG::INFO << "The last event took ~" << m_counter * getTimeout().total_seconds() << "s" << endmsg;
87  }
88  };
89 } // namespace
90 
91 // Initialization of the service.
93  StatusCode sc = base_class::initialize();
94  if ( sc.isFailure() ) return sc;
95 
96  if ( m_eventTimeout ) {
97  // create the watchdog thread
98  m_watchdog = std::make_unique<EventWatchdog>( msgSvc(), "EventWatchdog",
99  boost::posix_time::seconds( m_eventTimeout.value() ), m_stackTrace,
101 
102  // register to the incident service
103  static const std::string serviceName = "IncidentSvc";
104  m_incidentSvc = serviceLocator()->service( serviceName );
105  if ( !m_incidentSvc ) {
106  error() << "Cannot retrieve " << serviceName << endmsg;
107  return StatusCode::FAILURE;
108  }
109  debug() << "Register to the IncidentSvc" << endmsg;
110  m_incidentSvc->addListener( this, IncidentType::BeginEvent );
111  } else {
112  warning() << "StalledEventMonitor/" << name() << " instantiated with 0 time-out: no monitoring performed" << endmsg;
113  }
114 
115  return StatusCode::SUCCESS;
116 }
117 
118 // Start the monitoring.
120  if ( m_watchdog ) m_watchdog->start();
121  return StatusCode::SUCCESS;
122 }
123 
124 // Notify the watchdog that a new event has been started
125 void StalledEventMonitor::handle( const Incident& /* incident */ ) {
126  if ( m_watchdog ) m_watchdog->ping();
127 }
128 
129 // Stop the monitoring.
131  if ( m_watchdog ) m_watchdog->stop();
132  return StatusCode::SUCCESS;
133 }
134 
135 // Finalization of the service.
137  // destroy the watchdog thread (if any)
138  m_watchdog.reset();
139  // unregistering from the IncidentSvc
140  m_incidentSvc->removeListener( this, IncidentType::BeginEvent );
142  return base_class::finalize();
143 }
144 
145 // Declaration of the factory
Definition of the MsgStream class used to transmit messages.
Definition: MsgStream.h:34
SmartIF< ISvcLocator > & serviceLocator() const override
Retrieve pointer to service locator.
Definition: Service.cpp:287
StatusCode stop() override
Stop the watchdog thread (after the event loop).
const SmartIF< IMessageSvc > & msgSvc() const
The standard message service.
void start()
Start the watchdog thread.
virtual void onStop()
User implemented function that will be called when stopping.
MsgStream & warning() const
shortcut for the method msgStream(MSG::WARNING)
Gaudi::Property< unsigned int > m_eventTimeout
std::unique_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
void handle(const Incident &) override
Notify the watchdog thread for a new event.
Gaudi::Property< bool > m_stackTrace
T endl(T... args)
constexpr static const auto SUCCESS
Definition: StatusCode.h:100
StatusCode start() override
Start the watchdog thread (before entering the event loop).
void ping()
Function to call to notify the watchdog thread that we are still alive.
STL class.
#define DECLARE_COMPONENT(type)
StatusCode service(const Gaudi::Utils::TypeNameString &name, T *&svc, bool createIf=true)
Templated method to access a service by name.
Definition: ISvcLocator.h:86
const std::string & name() const override
Retrieve name of the service.
Definition: Service.cpp:284
Service that monitors the time taken by the processing of single events using a separate thread.
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
MsgStream & error() const
shortcut for the method msgStream(MSG::ERROR)
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:61
MsgStream & debug() const
shortcut for the method msgStream(MSG::DEBUG)
T reset(T... args)
T raise(T... args)
virtual void onPing()
User implemented function that will be called when ping is called.
StatusCode initialize() override
Initialization of the service.
Simple class for asynchronous check of time-out.
void stop()
Signal the watchdog thread to stop and wait for it.
Base class for all Incidents (computing events).
Definition: Incident.h:27
virtual void addListener(IIncidentListener *lis, const std::string &type="", long priority=0, bool rethrow=false, bool singleShot=false)=0
Add listener.
constexpr static const auto FAILURE
Definition: StatusCode.h:101
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: priority boost.
Definition: Memory.cpp:228
StatusCode finalize() override
Finalization of the service.
void reset(TYPE *ptr=nullptr)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:96
virtual void removeListener(IIncidentListener *lis, const std::string &type="")=0
Remove listener.
bool isFailure() const
Definition: StatusCode.h:145
virtual void action()
User implemented function that will be called if the time-out is reached.
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition: Memory.cpp:141
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:202
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
Gaudi::Property< int > m_maxTimeoutCount