The Gaudi Framework  v38r3 (c3fc9673)
StalledEventMonitor.cpp
Go to the documentation of this file.
1 /***********************************************************************************\
2 * (c) Copyright 1998-2024 CERN for the benefit of the LHCb and ATLAS collaborations *
3 * *
4 * This software is distributed under the terms of the Apache version 2 licence, *
5 * copied verbatim in the file "LICENSE". *
6 * *
7 * In applying this licence, CERN does not waive the privileges and immunities *
8 * granted to it by virtue of its status as an Intergovernmental Organization *
9 * or submit itself to any jurisdiction. *
10 \***********************************************************************************/
11 /*
12  * StalledEventMonitor.cpp
13  *
14  * Created on: Apr 19, 2010
15  * Author: Marco Clemencic
16  */
17 
18 // Include files
19 #include "StalledEventMonitor.h"
21 #include <GaudiKernel/Memory.h>
23 #include <TSystem.h>
24 #include <chrono>
25 #include <csignal>
26 
27 namespace {
29  class EventWatchdog : public WatchdogThread {
30  public:
31  EventWatchdog( const SmartIF<IMessageSvc>& msgSvc, const std::string& name, std::chrono::seconds timeout,
32  bool stackTrace = false, long maxCount = 0, bool autostart = false )
33  : WatchdogThread( timeout, autostart )
34  , log( msgSvc, name )
35  , m_maxCount( maxCount )
36  , m_stackTrace( stackTrace ) {}
37 
38  private:
40  MsgStream log;
42  long m_counter = 0;
44  long m_maxCount = 0;
46  bool m_stackTrace = false;
48  void action() override {
49  if ( !m_counter ) {
50  log << MSG::WARNING << "More than " << getTimeout().count() << "s since the last " << IncidentType::BeginEvent
51  << endmsg;
52  } else {
53  log << MSG::WARNING << "Other " << getTimeout().count() << "s passed" << endmsg;
54  }
55  log << MSG::INFO
56  << "Current memory usage is"
57  " virtual size = "
58  << System::virtualMemory() / 1024.
59  << " MB"
60  ", resident set size = "
61  << System::pagedMemory() / 1024. << " MB" << endmsg;
62  if ( m_stackTrace && gSystem ) {
63  // TSystem::StackTrace() prints on the standard error, so we
64  std::cerr << "=== Stalled event: current stack trace ===" << std::endl;
65  gSystem->StackTrace();
66  }
67  ++m_counter;
68  if ( m_maxCount > 0 && m_counter >= m_maxCount ) {
69  log << MSG::FATAL << "too much time on a single event: aborting process" << endmsg;
70  std::raise( SIGQUIT );
71  }
72  }
73  void onPing() override {
74  if ( m_counter ) {
75  if ( m_counter >= 3 )
76  log << MSG::INFO << "Starting a new event after ~" << m_counter * getTimeout().count() << "s" << endmsg;
77  m_counter = 0;
78  }
79  }
80  void onStop() override {
81  if ( m_counter >= 3 )
82  log << MSG::INFO << "The last event took ~" << m_counter * getTimeout().count() << "s" << endmsg;
83  }
84  };
85 } // namespace
86 
87 // Initialization of the service.
89  StatusCode sc = base_class::initialize();
90  if ( sc.isFailure() ) return sc;
91 
92  warning() << "the service StalledEventMonitor is deprecated, please use Gaudi::EventWatchdogAlg" << endmsg;
93 
94  if ( m_eventTimeout ) {
95  // create the watchdog thread
96  m_watchdog = std::make_unique<EventWatchdog>(
98 
99  // register to the incident service
100  static const std::string serviceName = "IncidentSvc";
101  m_incidentSvc = serviceLocator()->service( serviceName );
102  if ( !m_incidentSvc ) {
103  error() << "Cannot retrieve " << serviceName << endmsg;
104  return StatusCode::FAILURE;
105  }
106  debug() << "Register to the IncidentSvc" << endmsg;
107  m_incidentSvc->addListener( this, IncidentType::BeginEvent );
108  } else {
109  warning() << "StalledEventMonitor/" << name() << " instantiated with 0 time-out: no monitoring performed" << endmsg;
110  }
111 
112  return StatusCode::SUCCESS;
113 }
114 
115 // Start the monitoring.
117  if ( m_watchdog ) m_watchdog->start();
118  return StatusCode::SUCCESS;
119 }
120 
121 // Notify the watchdog that a new event has been started
122 void StalledEventMonitor::handle( const Incident& /* incident */ ) {
123  if ( m_watchdog ) m_watchdog->ping();
124 }
125 
126 // Stop the monitoring.
128  if ( m_watchdog ) m_watchdog->stop();
129  return StatusCode::SUCCESS;
130 }
131 
132 // Finalization of the service.
134  // destroy the watchdog thread (if any)
135  m_watchdog.reset();
136  // unregistering from the IncidentSvc
137  m_incidentSvc->removeListener( this, IncidentType::BeginEvent );
139  return base_class::finalize();
140 }
141 
142 // Declaration of the factory
WatchdogThread::stop
void stop()
Signal the watchdog thread to stop and wait for it.
Definition: WatchdogThread.cpp:60
WatchdogThread::ping
void ping()
Function to call to notify the watchdog thread that we are still alive.
Definition: WatchdogThread.h:49
std::string
STL class.
Gaudi.Configuration.log
log
Definition: Configuration.py:28
MSG::INFO
@ INFO
Definition: IMessageSvc.h:25
SmartIF::reset
void reset(TYPE *ptr=nullptr)
Set the internal pointer to the passed one disposing of the old one.
Definition: SmartIF.h:96
std::chrono::seconds
Memory.h
std::raise
T raise(T... args)
MSG::WARNING
@ WARNING
Definition: IMessageSvc.h:25
StalledEventMonitor
Service that monitors the time taken to process single events, using a separate thread.
Definition: StalledEventMonitor.h:39
WatchdogThread::onPing
virtual void onPing()
User implemented function that will be called when ping is called.
Definition: WatchdogThread.cpp:73
StalledEventMonitor::initialize
StatusCode initialize() override
Initialization of the service.
Definition: StalledEventMonitor.cpp:88
std::unique_ptr::reset
T reset(T... args)
AvalancheSchedulerErrorTest.msgSvc
msgSvc
Definition: AvalancheSchedulerErrorTest.py:80
WatchdogThread::start
void start()
Start the watchdog thread.
Definition: WatchdogThread.cpp:28
IIncidentSvc.h
WatchdogThread
Definition: WatchdogThread.h:27
WatchdogThread.h
Service::name
const std::string & name() const override
Retrieve name of the service
Definition: Service.cpp:332
StatusCode
Definition: StatusCode.h:65
std::cerr
StalledEventMonitor::finalize
StatusCode finalize() override
Finalization of the service.
Definition: StalledEventMonitor.cpp:133
StalledEventMonitor::m_eventTimeout
Gaudi::Property< unsigned int > m_eventTimeout
Definition: StalledEventMonitor.h:61
WatchdogThread::action
virtual void action()
User implemented function that will be called if the time-out is reached.
Definition: WatchdogThread.cpp:70
StalledEventMonitor.h
Gaudi::Property::value
const ValueType & value() const
Definition: Property.h:239
SmartIF< IMessageSvc >
endmsg
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:203
MsgStream
Definition: MsgStream.h:34
MSG::FATAL
@ FATAL
Definition: IMessageSvc.h:25
StalledEventMonitor::m_maxTimeoutCount
Gaudi::Property< int > m_maxTimeoutCount
Definition: StalledEventMonitor.h:64
StalledEventMonitor::handle
void handle(const Incident &) override
Notify the watchdog thread for a new event.
Definition: StalledEventMonitor.cpp:122
StatusCode::isFailure
bool isFailure() const
Definition: StatusCode.h:129
StatusCode::SUCCESS
constexpr static const auto SUCCESS
Definition: StatusCode.h:100
ConditionsStallTest.name
name
Definition: ConditionsStallTest.py:77
std::endl
T endl(T... args)
StalledEventMonitor::m_watchdog
std::unique_ptr< WatchdogThread > m_watchdog
Pointer to the watchdog thread that checks for the event timeout.
Definition: StalledEventMonitor.h:72
StalledEventMonitor::stop
StatusCode stop() override
Stop the watchdog thread (after the event loop).
Definition: StalledEventMonitor.cpp:127
DECLARE_COMPONENT
#define DECLARE_COMPONENT(type)
Definition: PluginServiceV1.h:46
StalledEventMonitor::m_incidentSvc
SmartIF< IIncidentSvc > m_incidentSvc
Pointer to the incident service.
Definition: StalledEventMonitor.h:75
std::chrono::seconds::count
T count(T... args)
WatchdogThread::getTimeout
std::chrono::seconds getTimeout() const
Get the current time-out value.
Definition: WatchdogThread.h:58
StalledEventMonitor::m_stackTrace
Gaudi::Property< bool > m_stackTrace
Definition: StalledEventMonitor.h:67
WatchdogThread::onStop
virtual void onStop()
User implemented function that will be called when stopping.
Definition: WatchdogThread.cpp:79
StatusCode::FAILURE
constexpr static const auto FAILURE
Definition: StatusCode.h:101
System::pagedMemory
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition: Memory.cpp:141
Incident
Definition: Incident.h:27
System::virtualMemory
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition: Memory.cpp:228
StalledEventMonitor::start
StatusCode start() override
Start the watchdog thread (before entering the event loop).
Definition: StalledEventMonitor.cpp:116
Service::serviceLocator
SmartIF< ISvcLocator > & serviceLocator() const override
Retrieve pointer to service locator
Definition: Service.cpp:335