The Gaudi Framework  v38r0 (2143aa4c)
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /***********************************************************************************\
2 * (c) Copyright 1998-2019 CERN for the benefit of the LHCb and ATLAS collaborations *
3 * *
4 * This software is distributed under the terms of the Apache version 2 licence, *
5 * copied verbatim in the file "LICENSE". *
6 * *
7 * In applying this licence, CERN does not waive the privileges and immunities *
8 * granted to it by virtue of its status as an Intergovernmental Organization *
9 * or submit itself to any jurisdiction. *
10 \***********************************************************************************/
11 /*
12  * StalledEventMonitor.cpp
13  *
14  * Created on: Apr 19, 2010
15  * Author: Marco Clemencic
16  */
17 
18 // Include files
19 #include "StalledEventMonitor.h"
20 
22 #include "GaudiKernel/Memory.h"
24 
25 #include "TSystem.h"
26 
27 #include <csignal>
28 
29 namespace {
31  class EventWatchdog : public WatchdogThread {
32  public:
33  EventWatchdog( const SmartIF<IMessageSvc>& msgSvc, const std::string& name,
34  boost::posix_time::time_duration timeout, bool stackTrace = false, long maxCount = 0,
35  bool autostart = false )
36  : WatchdogThread( timeout, autostart )
37  , log( msgSvc, name )
38  , m_maxCount( maxCount )
39  , m_stackTrace( stackTrace ) {}
40 
41  private:
43  MsgStream log;
45  long m_counter = 0;
47  long m_maxCount = 0;
49  bool m_stackTrace = false;
51  void action() override {
52  if ( !m_counter ) {
53  log << MSG::WARNING << "More than " << getTimeout().total_seconds() << "s since the last "
54  << IncidentType::BeginEvent << endmsg;
55  } else {
56  log << MSG::WARNING << "Other " << getTimeout().total_seconds() << "s passed" << endmsg;
57  }
58  log << MSG::INFO
59  << "Current memory usage is"
60  " virtual size = "
61  << System::virtualMemory() / 1024.
62  << " MB"
63  ", resident set size = "
64  << System::pagedMemory() / 1024. << " MB" << endmsg;
65  if ( m_stackTrace && gSystem ) {
66  // TSystem::StackTrace() prints on the standard error, so we
67  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
68  gSystem->StackTrace();
69  }
70  ++m_counter;
71  if ( m_maxCount > 0 && m_counter >= m_maxCount ) {
72  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
73  std::raise( SIGQUIT );
74  }
75  }
76  void onPing() override {
77  if ( m_counter ) {
78  if ( m_counter >= 3 )
79  log << MSG::INFO << "Starting a new event after ~" << m_counter * getTimeout().total_seconds() << "s"
80  << endmsg;
81  m_counter = 0;
82  }
83  }
84  void onStop() override {
85  if ( m_counter >= 3 )
86  log << MSG::INFO << "The last event took ~" << m_counter * getTimeout().total_seconds() << "s" << endmsg;
87  }
88  };
89 } // namespace
90 
91 // Initialization of the service.
93  StatusCode sc = base_class::initialize();
94  if ( sc.isFailure() ) return sc;
95 
96  if ( m_eventTimeout ) {
97  // create the watchdog thread
98  m_watchdog = std::make_unique<EventWatchdog>( msgSvc(), "EventWatchdog",
99  boost::posix_time::seconds( m_eventTimeout.value() ), m_stackTrace,
101 
102  // register to the incident service
103  static const std::string serviceName = "IncidentSvc";
104  m_incidentSvc = serviceLocator()->service( serviceName );
105  if ( !m_incidentSvc ) {
106  error() << "Cannot retrieve " << serviceName << endmsg;
107  return StatusCode::FAILURE;
108  }
109  debug() << "Register to the IncidentSvc" << endmsg;
110  m_incidentSvc->addListener( this, IncidentType::BeginEvent );
111  } else {
112  warning() << "StalledEventMonitor/" << name() << " instantiated with 0 time-out: no monitoring performed" << endmsg;
113  }
114 
115  return StatusCode::SUCCESS;
116 }
117 
118 // Start the monitoring.
120  if ( m_watchdog ) m_watchdog->start();
121  return StatusCode::SUCCESS;
122 }
123 
124 // Notify the watchdog that a new event has been started
125 void StalledEventMonitor::handle( const Incident& /* incident */ ) {
126  if ( m_watchdog ) m_watchdog->ping();
127 }
128 
129 // Start the monitoring.
131  if ( m_watchdog ) m_watchdog->stop();
132  return StatusCode::SUCCESS;
133 }
134 
135 // Finalization of the service.
137  // destroy the watchdog thread (if any)
138  m_watchdog.reset();
139  // unregistering from the IncidentSvc
140  m_incidentSvc->removeListener( this, IncidentType::BeginEvent );
142  return base_class::finalize();
143 }
144 
145 // Declaration of the factory
WatchdogThread::stop
void stop()
Signal the watchdog thread to stop and wait for it.
Definition: WatchdogThread.cpp:45
WatchdogThread::ping
void ping()
Function to call to notify the watchdog thread that we are still alive.
Definition: WatchdogThread.h:56
std::string
STL class.
Gaudi.Configuration.log
log
Definition: Configuration.py:29
bug_34121.name
name
Definition: bug_34121.py:20
MSG::INFO
@ INFO
Definition: IMessageSvc.h:25
SmartIF::reset
void reset(TYPE *ptr=nullptr)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:96
Memory.h
std::raise
T raise(T... args)
MSG::WARNING
@ WARNING
Definition: IMessageSvc.h:25
StalledEventMonitor
Service that monitors the time taken to process single events, using a separate thread.
Definition: StalledEventMonitor.h:39
WatchdogThread::onPing
virtual void onPing()
User implemented function that will be called when ping is called.
Definition: WatchdogThread.cpp:91
WatchdogThread::getTimeout
boost::posix_time::time_duration getTimeout() const
Get the current time-out value.
Definition: WatchdogThread.h:66
StalledEventMonitor::initialize
StatusCode initialize() override
Initialization of the service.
Definition: StalledEventMonitor.cpp:92
std::unique_ptr::reset
T reset(T... args)
WatchdogThread::start
void start()
Start the watchdog thread.
Definition: WatchdogThread.cpp:33
IIncidentSvc.h
WatchdogThread
Definition: WatchdogThread.h:37
WatchdogThread.h
Service::name
const std::string & name() const override
Retrieve name of the service
Definition: Service.cpp:332
StatusCode
Definition: StatusCode.h:65
std::cerr
StalledEventMonitor::finalize
StatusCode finalize() override
Finalization of the service.
Definition: StalledEventMonitor.cpp:136
StalledEventMonitor::m_eventTimeout
Gaudi::Property< unsigned int > m_eventTimeout
Definition: StalledEventMonitor.h:61
WatchdogThread::action
virtual void action()
User implemented function that will be called if the time-out is reached.
Definition: WatchdogThread.cpp:88
StalledEventMonitor.h
Gaudi::Property::value
const ValueType & value() const
Definition: Property.h:239
SmartIF< IMessageSvc >
endmsg
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:203
MsgStream
Definition: MsgStream.h:34
MSG::FATAL
@ FATAL
Definition: IMessageSvc.h:25
StalledEventMonitor::m_maxTimeoutCount
Gaudi::Property< int > m_maxTimeoutCount
Definition: StalledEventMonitor.h:63
StalledEventMonitor::handle
void handle(const Incident &) override
Notify the watchdog thread for a new event.
Definition: StalledEventMonitor.cpp:125
StatusCode::isFailure
bool isFailure() const
Definition: StatusCode.h:129
StatusCode::SUCCESS
constexpr static const auto SUCCESS
Definition: StatusCode.h:100
std::endl
T endl(T... args)
StalledEventMonitor::m_watchdog
std::unique_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
Definition: StalledEventMonitor.h:68
StalledEventMonitor::stop
StatusCode stop() override
Stop the watchdog thread (after the event loop).
Definition: StalledEventMonitor.cpp:130
DECLARE_COMPONENT
#define DECLARE_COMPONENT(type)
Definition: PluginServiceV1.h:46
StalledEventMonitor::m_incidentSvc
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
Definition: StalledEventMonitor.h:71
StalledEventMonitor::m_stackTrace
Gaudi::Property< bool > m_stackTrace
Definition: StalledEventMonitor.h:65
WatchdogThread::onStop
virtual void onStop()
User implemented function that will be called when stopping.
Definition: WatchdogThread.cpp:97
AsyncIncidents.msgSvc
msgSvc
Definition: AsyncIncidents.py:34
StatusCode::FAILURE
constexpr static const auto FAILURE
Definition: StatusCode.h:101
System::pagedMemory
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition: Memory.cpp:141
Incident
Definition: Incident.h:27
System::virtualMemory
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition: Memory.cpp:228
StalledEventMonitor::start
StatusCode start() override
Start the watchdog thread (before entering the event loop).
Definition: StalledEventMonitor.cpp:119
Service::serviceLocator
SmartIF< ISvcLocator > & serviceLocator() const override
Retrieve pointer to service locator
Definition: Service.cpp:335