The Gaudi Framework  master (37c0b60a)
EventWatchdogAlg.cpp
Go to the documentation of this file.
1 /***********************************************************************************\
2 * (c) Copyright 2024 CERN for the benefit of the LHCb and ATLAS collaborations *
3 * *
4 * This software is distributed under the terms of the Apache version 2 licence, *
5 * copied verbatim in the file "LICENSE". *
6 * *
7 * In applying this licence, CERN does not waive the privileges and immunities *
8 * granted to it by virtue of its status as an Intergovernmental Organization *
9 * or submit itself to any jurisdiction. *
10 \***********************************************************************************/
14 #include <GaudiKernel/IScheduler.h>
15 #include <GaudiKernel/Memory.h>
16 #include <TSystem.h>
17 #include <algorithm>
18 #include <chrono>
19 #include <csignal>
20 #include <fmt/format.h>
21 #include <mutex>
22 #include <range/v3/range/conversion.hpp>
23 #include <range/v3/view/remove.hpp>
24 #include <range/v3/view/transform.hpp>
25 #include <string>
26 #include <string_view>
27 #include <utility>
28 #include <vector>
29 
namespace {
  /// Return a copy of @p input in which every ':' has been replaced by '_'.
  ///
  /// @param input text to sanitize (not modified)
  /// @return the sanitized copy; an empty input yields an empty string
  std::string sanitize( std::string_view input ) {
    std::string output{ input };
    std::replace( output.begin(), output.end(), ':', '_' );
    return output;
  }

  // Mutex preventing reports from the watchdog from stepping on each other,
  // for example requesting a stack trace while one is already being produced.
  std::mutex s_watchdogReportMutex;
} // namespace
42 
44 
45 namespace Gaudi {
46 
48  class EventWatchdogAlg : public Gaudi::Functional::Transformer<PeriodicAction( EventContext const& )> {
49  public:
50  EventWatchdogAlg( const std::string& name, ISvcLocator* pSvcLocator )
51  : Transformer( name, pSvcLocator, KeyValue{ "TimerLocation", fmt::format( ".{}-timer", sanitize( name ) ) } ) {
52  // timeout period cannot be smaller than 1
53  m_eventTimeout.verifier().setLower( 1 );
54  }
55 
56  PeriodicAction operator()( EventContext const& ctx ) const override {
57  using namespace std::chrono_literals;
58  // we use a functor because we cannot pass mutable states to a lambda
59  struct Action {
60  MsgStream log;
61  const bool doStackTrace;
62  const bool abortOnTimeout;
63  const unsigned int timeout;
64  const EventContext ctx;
65 
66  const std::vector<IScheduler*> schedulers;
67 
68  int counter{ 0 };
69 
70  const std::chrono::steady_clock::time_point eventStart{ std::chrono::steady_clock::now() };
71 
72  void operator()() {
73  ++counter;
74  if ( counter == 1 ) {
75  log << MSG::WARNING << fmt::format( "More than {}s since the beginning of the event ({})", timeout, ctx )
76  << endmsg;
77  } else {
78  log << MSG::WARNING << fmt::format( "Another {}s passed since last timeout ({})", timeout, ctx ) << endmsg;
79  }
80  if ( log.level() <= MSG::INFO ) {
81  log << MSG::INFO
82  << fmt::format( "Current memory usage is virtual size = {} MB, resident set size = {} MB",
83  System::virtualMemory() / 1024., System::pagedMemory() / 1024. )
84  << endmsg;
85  }
86 
87  std::scoped_lock protectReport( s_watchdogReportMutex );
88  if ( doStackTrace ) {
89  for ( auto scheduler : schedulers ) { scheduler->dumpState(); }
90  if ( gSystem ) {
91  // TSystem::StackTrace() prints on the standard error, so we do the same
92  fmt::print( stderr, "=== Stalled event: current stack trace ({}) ===\n", ctx );
93  gSystem->StackTrace();
94  }
95  }
96  if ( abortOnTimeout ) {
97  log << MSG::FATAL << fmt::format( "too much time on a single event ({}): aborting process", ctx ) << endmsg;
98  std::raise( SIGQUIT );
99  }
100  }
101 
102  ~Action() {
103  if ( counter ) {
105  std::chrono::steady_clock::now() - eventStart;
106  log << MSG::INFO << fmt::format( "An event ({}) took {:.3f}s", ctx, duration.count() ) << endmsg;
107  }
108  }
109  };
110 
111  std::vector<IScheduler*> schedulers;
112  if ( m_stackTrace ) {
113  using namespace ranges;
114  // if we ask for the stack trace we are also interested in the state of the scheduler
115  schedulers = svcLoc()->getServices() |
116  views::transform( []( auto svc ) { return Gaudi::Cast<IScheduler>( svc ); } ) |
117  views::remove( nullptr ) | to<std::vector>();
118  }
119 
121  // use a partial copy of the context to avoid copying the optional extension
122  EventContext{ ctx.evt(), ctx.slot(), ctx.subSlot() }, std::move( schedulers ) };
124  }
125 
126  private:
127  Gaudi::CheckedProperty<unsigned int> m_eventTimeout{ this, "EventTimeout", 600,
128  "Number of seconds allowed to process a single event." };
129  Gaudi::Property<bool> m_abortOnTimeout{ this, "AbortOnTimeout", false,
130  "If set to true, the application is killed when we reach the timeout." };
131  Gaudi::Property<bool> m_stackTrace{ this, "StackTrace", false, "Whether to print the stack-trace on timeout." };
132  };
134 } // namespace Gaudi
Gaudi::EventWatchdogAlg::operator()
PeriodicAction operator()(EventContext const &ctx) const override
Definition: EventWatchdogAlg.cpp:56
std::string
STL class.
Gaudi.Configuration.log
log
Definition: Configuration.py:28
std::move
T move(T... args)
MSG::INFO
@ INFO
Definition: IMessageSvc.h:25
std::vector
STL class.
ISvcLocator
Definition: ISvcLocator.h:46
std::chrono::duration
Memory.h
std::raise
T raise(T... args)
ranges
Definition: details.h:29
MSG::WARNING
@ WARNING
Definition: IMessageSvc.h:25
Gaudi::EventWatchdogAlg::m_eventTimeout
Gaudi::CheckedProperty< unsigned int > m_eventTimeout
Definition: EventWatchdogAlg.cpp:127
gaudirun.output
output
Definition: gaudirun.py:521
AvalancheSchedulerErrorTest.msgSvc
msgSvc
Definition: AvalancheSchedulerErrorTest.py:80
std::replace
T replace(T... args)
fixtures.stderr
Generator[bytes, None, None] stderr(subprocess.CompletedProcess completed_process)
Definition: fixtures.py:147
IScheduler.h
Gaudi::Utils::PeriodicAction
Helper to periodically run asynchronous tasks.
Definition: PeriodicAction.h:29
Gaudi::Utils::begin
AttribStringParser::Iterator begin(const AttribStringParser &parser)
Definition: AttribStringParser.h:136
GaudiPython.Pythonizations.ctx
ctx
Definition: Pythonizations.py:578
Gaudi::EventWatchdogAlg::m_abortOnTimeout
Gaudi::Property< bool > m_abortOnTimeout
Definition: EventWatchdogAlg.cpp:129
Transformer.h
format
GAUDI_API std::string format(const char *,...)
MsgStream format utility "a la sprintf(...)".
Definition: MsgStream.cpp:119
Gaudi::EventWatchdogAlg
Add to the transient store a tracker that detects events that are taking too long.
Definition: EventWatchdogAlg.cpp:48
endmsg
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:202
MsgStream
Definition: MsgStream.h:33
Gaudi
This file provides a Grammar for the type Gaudi::Accumulators::Axis It allows to use that type from p...
Definition: __init__.py:1
MSG::FATAL
@ FATAL
Definition: IMessageSvc.h:25
PeriodicAction.h
ConditionsStallTest.name
name
Definition: ConditionsStallTest.py:77
EventContext.h
DECLARE_COMPONENT
#define DECLARE_COMPONENT(type)
Definition: PluginServiceV1.h:46
EventContext
Definition: EventContext.h:34
std::chrono::duration::count
T count(T... args)
Gaudi::EventWatchdogAlg::m_stackTrace
Gaudi::Property< bool > m_stackTrace
Definition: EventWatchdogAlg.cpp:131
Gaudi::Functional::Transformer
details::Transformer< Signature, Traits_, details::isLegacy< Traits_ > > Transformer
Definition: Transformer.h:237
std::mutex
STL class.
gaudirun.action
action
Definition: gaudirun.py:153
plotSpeedupsPyRoot.counter
counter
Definition: plotSpeedupsPyRoot.py:175
IOTest.end
end
Definition: IOTest.py:125
Gaudi::EventWatchdogAlg::EventWatchdogAlg
EventWatchdogAlg(const std::string &name, ISvcLocator *pSvcLocator)
Definition: EventWatchdogAlg.cpp:50
Io::Action
Action
Definition: IFileMgr.h:278
System::pagedMemory
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition: Memory.cpp:141
System::virtualMemory
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition: Memory.cpp:228
GPUAvalancheSchedulerSimpleTest.scheduler
scheduler
Definition: GPUAvalancheSchedulerSimpleTest.py:84
Gaudi::Property< bool >
std::chrono::steady_clock::now
T now(T... args)