The Gaudi Framework  v38r3 (c3fc9673)
EventWatchdogAlg.cpp
Go to the documentation of this file.
1 /***********************************************************************************\
2 * (c) Copyright 2024 CERN for the benefit of the LHCb and ATLAS collaborations *
3 * *
4 * This software is distributed under the terms of the Apache version 2 licence, *
5 * copied verbatim in the file "LICENSE". *
6 * *
7 * In applying this licence, CERN does not waive the privileges and immunities *
8 * granted to it by virtue of its status as an Intergovernmental Organization *
9 * or submit itself to any jurisdiction. *
10 \***********************************************************************************/
14 #include <GaudiKernel/IScheduler.h>
15 #include <GaudiKernel/Memory.h>
16 #include <TSystem.h>
17 #include <algorithm>
18 #include <chrono>
19 #include <csignal>
20 #include <fmt/format.h>
21 #include <fmt/ostream.h>
22 #include <mutex>
23 #include <range/v3/range/conversion.hpp>
24 #include <range/v3/view/remove.hpp>
25 #include <range/v3/view/transform.hpp>
26 #include <string>
27 #include <string_view>
28 #include <utility>
29 #include <vector>
30 
namespace {
  /// Return a copy of @p input in which every ':' has been replaced by '_'.
  ///
  /// The constructor of Gaudi::EventWatchdogAlg passes the component name through this
  /// helper before embedding it in the default value of the "TimerLocation" key.
  ///
  /// @param input text to clean up (not modified)
  /// @return a new string with the same length as @p input
  std::string sanitize( std::string_view input ) {
    std::string output{ input };
    std::replace( output.begin(), output.end(), ':', '_' );
    return output;
  }

  // mutex to prevent reports from the watchdog to step on each other,
  // for example requiring a stack trace while already producing one
  std::mutex s_watchdogReportMutex;
} // namespace
43 
#if FMT_VERSION >= 90000
// Make EventContext formattable via fmt: the specialization inherits from
// ostream_formatter (declared in <fmt/ostream.h>), so fmt::format / fmt::print
// render an EventContext through its stream output.
// The specialization is only compiled for fmt version 9.0.0 or newer.
template <>
struct fmt::formatter<EventContext> : ostream_formatter {};
#endif
49 
51 
52 namespace Gaudi {
53 
55  class EventWatchdogAlg : public Gaudi::Functional::Transformer<PeriodicAction( EventContext const& )> {
56  public:
57  EventWatchdogAlg( const std::string& name, ISvcLocator* pSvcLocator )
58  : Transformer( name, pSvcLocator, KeyValue{ "TimerLocation", fmt::format( ".{}-timer", sanitize( name ) ) } ) {
59  // timeout period cannot be smaller than 1
60  m_eventTimeout.verifier().setLower( 1 );
61  }
62 
63  StatusCode initialize() override {
64  return Transformer::initialize().andThen( [this] {
65  if ( !service( "StalledEventMonitor", false, true ) ) {
66  // StalledEventMonitor was not instantiated, so we try to steal its options
67  auto& opts = serviceLocator()->getOptsSvc();
68  opts.bind( "StalledEventMonitor", &m_eventTimeout );
69  opts.bind( "StalledEventMonitor", &m_stackTrace );
70  if ( opts.isSet( "StalledEventMonitor.MaxTimeoutCount" ) ) {
71  Gaudi::Property<int> maxCount{ "MaxTimeoutCount", 0 };
72  opts.bind( "StalledEventMonitor", &maxCount );
73  m_abortOnTimeout = maxCount.value() >= 0;
74  }
75  }
76  } );
77  }
78 
// Build the watchdog for the event identified by `ctx`: defines the functor that
// produces the timeout reports and collects what it needs (declared return type:
// PeriodicAction).
// NOTE(review): this text is a source listing with its line numbers embedded, and
// listing lines 127, 143 and 146 are missing from it (see the notes below); restore
// them from the original file before compiling.
79  PeriodicAction operator()( EventContext const& ctx ) const override {
80  using namespace std::chrono_literals;
81  // we use a functor because we cannot pass mutable states to a lambda
82  struct Action {
// stream used for all the messages emitted by the functor
83  MsgStream log;
// when true, scheduler states and a stack trace are dumped at each invocation
84  const bool doStackTrace;
// when true, the process raises SIGQUIT at the first invocation
85  const bool abortOnTimeout;
// number of seconds quoted in the warning messages
// (presumably also the period between invocations -- TODO confirm against listing line 146)
86  const unsigned int timeout;
// event identification printed in every message
87  const EventContext ctx;
88 
// schedulers whose state is dumped together with the stack trace
89  const std::vector<IScheduler*> schedulers;
90 
// number of times operator()() has been invoked on this object
91  int counter{ 0 };
92 
// time at which this Action object was created
93  const std::chrono::steady_clock::time_point eventStart{ std::chrono::steady_clock::now() };
94 
// Report a timeout: warning message, optional memory usage, optional scheduler and
// stack dump, optional abort of the process.
95  void operator()() {
96  ++counter;
// the first invocation and the following ones use different warning texts
97  if ( counter == 1 ) {
98  log << MSG::WARNING << fmt::format( "More than {}s since the beginning of the event ({})", timeout, ctx )
99  << endmsg;
100  } else {
101  log << MSG::WARNING << fmt::format( "Another {}s passed since last timeout ({})", timeout, ctx ) << endmsg;
102  }
// the memory figures are only queried if the INFO message is going to be printed;
// the values are divided by 1024 before being labelled as MB
103  if ( log.level() <= MSG::INFO ) {
104  log << MSG::INFO
105  << fmt::format( "Current memory usage is virtual size = {} MB, resident set size = {} MB",
106  System::virtualMemory() / 1024., System::pagedMemory() / 1024. )
107  << endmsg;
108  }
109 
// from here to the end of the function the file-level report mutex is held, so
// dumps and abort of concurrent reports do not interleave
110  std::scoped_lock protectReport( s_watchdogReportMutex );
111  if ( doStackTrace ) {
112  for ( auto scheduler : schedulers ) { scheduler->dumpState(); }
// the stack trace is skipped when ROOT's gSystem is not set
113  if ( gSystem ) {
114  // TSystem::StackTrace() prints on the standard error, so we do the same
115  fmt::print( stderr, "=== Stalled event: current stack trace ({}) ===\n", ctx );
116  gSystem->StackTrace();
117  }
118  }
119  if ( abortOnTimeout ) {
120  log << MSG::FATAL << fmt::format( "too much time on a single event ({}): aborting process", ctx ) << endmsg;
121  std::raise( SIGQUIT );
122  }
123  }
124 
// On destruction, print the time elapsed since eventStart, but only if at least one
// timeout report was produced by this object.
125  ~Action() {
126  if ( counter ) {
// NOTE(review): listing line 127 is missing here; it must declare `duration`
// (a std::chrono duration with a floating point representation, given the
// "{:.3f}" format applied to duration.count() below) initialised with the
// expression on the next line.
128  std::chrono::steady_clock::now() - eventStart;
129  log << MSG::INFO << fmt::format( "An event ({}) took {:.3f}s", ctx, duration.count() ) << endmsg;
130  }
131  }
132  };
133 
// the list stays empty unless the StackTrace property is set
134  std::vector<IScheduler*> schedulers;
135  if ( m_stackTrace ) {
136  using namespace ranges;
137  // if we ask for the stack trace we are also interested in the state of the scheduler
// keep only the services for which the cast to IScheduler does not give nullptr
138  schedulers = svcLoc()->getServices() |
139  views::transform( []( auto svc ) { return Gaudi::Cast<IScheduler>( svc ); } ) |
140  views::remove( nullptr ) | to<std::vector>();
141  }
142 
// NOTE(review): listing line 143 is missing here; the two lines below are the tail of
// a braced initializer (presumably of an Action, given the member order above) and
// listing line 146, presumably the return statement, is missing as well.
144  // use a partial copy of the context to avoid copying the optional extension
145  EventContext{ ctx.evt(), ctx.slot(), ctx.subSlot() }, std::move( schedulers ) };
147  }
148 
149  private:
150  Gaudi::CheckedProperty<unsigned int> m_eventTimeout{ this, "EventTimeout", 600,
151  "Number of seconds allowed to process a single event." };
152  Gaudi::Property<bool> m_abortOnTimeout{ this, "AbortOnTimeout", false,
153  "If set to true, the application is killed when we reach the timeout." };
154  Gaudi::Property<bool> m_stackTrace{ this, "StackTrace", false, "Whether to print the stack-trace on timeout." };
155  };
157 } // namespace Gaudi
Gaudi::EventWatchdogAlg::operator()
PeriodicAction operator()(EventContext const &ctx) const override
Definition: EventWatchdogAlg.cpp:79
std::string
STL class.
Gaudi.Configuration.log
log
Definition: Configuration.py:28
StatusCode::andThen
StatusCode andThen(F &&f, ARGS &&... args) const
Chain code blocks, making the execution conditional on a success result.
Definition: StatusCode.h:163
std::move
T move(T... args)
MSG::INFO
@ INFO
Definition: IMessageSvc.h:25
std::vector
STL class.
ISvcLocator
Definition: ISvcLocator.h:46
std::chrono::duration
Memory.h
std::raise
T raise(T... args)
ranges
Definition: details.h:30
MSG::WARNING
@ WARNING
Definition: IMessageSvc.h:25
Gaudi::EventWatchdogAlg::m_eventTimeout
Gaudi::CheckedProperty< unsigned int > m_eventTimeout
Definition: EventWatchdogAlg.cpp:150
gaudirun.output
output
Definition: gaudirun.py:521
AvalancheSchedulerErrorTest.msgSvc
msgSvc
Definition: AvalancheSchedulerErrorTest.py:80
std::replace
T replace(T... args)
IScheduler.h
Gaudi::Utils::PeriodicAction
Helper to periodically run asynchronous tasks.
Definition: PeriodicAction.h:29
GaudiPython.Pythonizations.ctx
ctx
Definition: Pythonizations.py:578
StatusCode
Definition: StatusCode.h:65
gaudirun.opts
opts
Definition: gaudirun.py:336
Transformer.h
CLHEP::begin
double * begin(CLHEP::HepVector &v)
Definition: TupleAlg.cpp:45
Gaudi::EventWatchdogAlg::m_abortOnTimeout
Gaudi::Property< bool > m_abortOnTimeout
Definition: EventWatchdogAlg.cpp:152
Gaudi::Property::value
const ValueType & value() const
Definition: Property.h:239
AtlasMCRecoFullPrecedenceDump.scheduler
scheduler
Definition: AtlasMCRecoFullPrecedenceDump.py:47
format
GAUDI_API std::string format(const char *,...)
MsgStream format utility "a la sprintf(...)".
Definition: MsgStream.cpp:119
Gaudi::EventWatchdogAlg
Add to the transient store a tracker that detects events that are taking too long.
Definition: EventWatchdogAlg.cpp:55
endmsg
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:203
MsgStream
Definition: MsgStream.h:34
Gaudi
Header file for std::chrono::duration-based Counters.
Definition: __init__.py:1
MSG::FATAL
@ FATAL
Definition: IMessageSvc.h:25
PeriodicAction.h
ConditionsStallTest.name
name
Definition: ConditionsStallTest.py:77
EventContext.h
DECLARE_COMPONENT
#define DECLARE_COMPONENT(type)
Definition: PluginServiceV1.h:46
EventContext
Definition: EventContext.h:34
std::chrono::duration::count
T count(T... args)
Gaudi::EventWatchdogAlg::m_stackTrace
Gaudi::Property< bool > m_stackTrace
Definition: EventWatchdogAlg.cpp:154
Gaudi ::Functional::Transformer
details::Transformer< Signature, Traits_, details::isLegacy< Traits_ > > Transformer
Definition: Transformer.h:237
std::mutex
STL class.
gaudirun.action
action
Definition: gaudirun.py:153
plotSpeedupsPyRoot.counter
counter
Definition: plotSpeedupsPyRoot.py:175
IOTest.end
end
Definition: IOTest.py:125
Gaudi::EventWatchdogAlg::EventWatchdogAlg
EventWatchdogAlg(const std::string &name, ISvcLocator *pSvcLocator)
Definition: EventWatchdogAlg.cpp:57
Io::Action
Action
Definition: IFileMgr.h:278
System::pagedMemory
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition: Memory.cpp:141
Gaudi::EventWatchdogAlg::initialize
StatusCode initialize() override
Definition: EventWatchdogAlg.cpp:63
System::virtualMemory
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition: Memory.cpp:228
Gaudi::Property< int >
std::chrono::steady_clock::now
T now(T... args)