The Gaudi Framework  v38r3 (c3fc9673)
Gaudi::EventWatchdogAlg Class Reference

Add to the transient store a tracker that detects events that are taking too long. More...

Inheritance diagram for Gaudi::EventWatchdogAlg:
Collaboration diagram for Gaudi::EventWatchdogAlg:

Public Member Functions

 EventWatchdogAlg (const std::string &name, ISvcLocator *pSvcLocator)
 
StatusCode initialize () override
 
PeriodicAction operator() (EventContext const &ctx) const override
 

Private Attributes

Gaudi::CheckedProperty< unsigned int > m_eventTimeout
 
Gaudi::Property< bool > m_abortOnTimeout
 
Gaudi::Property< bool > m_stackTrace { this, "StackTrace", false, "Whether to print the stack-trace on timeout." }
 

Detailed Description

Add to the transient store a tracker that detects events that are taking too long.

Definition at line 55 of file EventWatchdogAlg.cpp.

Constructor & Destructor Documentation

◆ EventWatchdogAlg()

Gaudi::EventWatchdogAlg::EventWatchdogAlg ( const std::string & name,
ISvcLocator * pSvcLocator
)
inline

Definition at line 57 of file EventWatchdogAlg.cpp.

58  : Transformer( name, pSvcLocator, KeyValue{ "TimerLocation", fmt::format( ".{}-timer", sanitize( name ) ) } ) {
59  // timeout period cannot be smaller than 1
60  m_eventTimeout.verifier().setLower( 1 );
61  }

Member Function Documentation

◆ initialize()

StatusCode Gaudi::EventWatchdogAlg::initialize ( )
inline override

Definition at line 63 of file EventWatchdogAlg.cpp.

63  {
64  return Transformer::initialize().andThen( [this] {
65  if ( !service( "StalledEventMonitor", false, true ) ) {
66  // StalledEventMonitor was not instantiated, so we try to steal its options
67  auto& opts = serviceLocator()->getOptsSvc();
68  opts.bind( "StalledEventMonitor", &m_eventTimeout );
69  opts.bind( "StalledEventMonitor", &m_stackTrace );
70  if ( opts.isSet( "StalledEventMonitor.MaxTimeoutCount" ) ) {
71  Gaudi::Property<int> maxCount{ "MaxTimeoutCount", 0 };
72  opts.bind( "StalledEventMonitor", &maxCount );
73  m_abortOnTimeout = maxCount.value() >= 0;
74  }
75  }
76  } );
77  }

◆ operator()()

PeriodicAction Gaudi::EventWatchdogAlg::operator() ( EventContext const &  ctx) const
inline override

Definition at line 79 of file EventWatchdogAlg.cpp.

79  {
80  using namespace std::chrono_literals;
81  // we use a functor because we cannot pass mutable states to a lambda
82  struct Action {
83  MsgStream log;
84  const bool doStackTrace;
85  const bool abortOnTimeout;
86  const unsigned int timeout;
87  const EventContext ctx;
88 
89  const std::vector<IScheduler*> schedulers;
90 
91  int counter{ 0 };
92 
93  const std::chrono::steady_clock::time_point eventStart{ std::chrono::steady_clock::now() };
94 
95  void operator()() {
96  ++counter;
97  if ( counter == 1 ) {
98  log << MSG::WARNING << fmt::format( "More than {}s since the beginning of the event ({})", timeout, ctx )
99  << endmsg;
100  } else {
101  log << MSG::WARNING << fmt::format( "Another {}s passed since last timeout ({})", timeout, ctx ) << endmsg;
102  }
103  if ( log.level() <= MSG::INFO ) {
104  log << MSG::INFO
105  << fmt::format( "Current memory usage is virtual size = {} MB, resident set size = {} MB",
106  System::virtualMemory() / 1024., System::pagedMemory() / 1024. )
107  << endmsg;
108  }
109 
110  std::scoped_lock protectReport( s_watchdogReportMutex );
111  if ( doStackTrace ) {
112  for ( auto scheduler : schedulers ) { scheduler->dumpState(); }
113  if ( gSystem ) {
114  // TSystem::StackTrace() prints on the standard error, so we do the same
115  fmt::print( stderr, "=== Stalled event: current stack trace ({}) ===\n", ctx );
116  gSystem->StackTrace();
117  }
118  }
119  if ( abortOnTimeout ) {
120  log << MSG::FATAL << fmt::format( "too much time on a single event ({}): aborting process", ctx ) << endmsg;
121  std::raise( SIGQUIT );
122  }
123  }
124 
125  ~Action() {
126  if ( counter ) {
127  const std::chrono::duration<float> duration =
128  std::chrono::steady_clock::now() - eventStart;
129  log << MSG::INFO << fmt::format( "An event ({}) took {:.3f}s", ctx, duration.count() ) << endmsg;
130  }
131  }
132  };
133 
134  std::vector<IScheduler*> schedulers;
135  if ( m_stackTrace ) {
136  using namespace ranges;
137  // if we ask for the stack trace we are also interested in the state of the scheduler
138  schedulers = svcLoc()->getServices() |
139  views::transform( []( auto svc ) { return Gaudi::Cast<IScheduler>( svc ); } ) |
140  views::remove( nullptr ) | to<std::vector>();
141  }
142 
144  // use a partial copy of the context to avoid copying the optional extension
145  EventContext{ ctx.evt(), ctx.slot(), ctx.subSlot() }, std::move( schedulers ) };
147  }

Member Data Documentation

◆ m_abortOnTimeout

Gaudi::Property<bool> Gaudi::EventWatchdogAlg::m_abortOnTimeout
private
Initial value:
{ this, "AbortOnTimeout", false,
"If set to true, the application is killed when we reach the timeout." }

Definition at line 152 of file EventWatchdogAlg.cpp.

◆ m_eventTimeout

Gaudi::CheckedProperty<unsigned int> Gaudi::EventWatchdogAlg::m_eventTimeout
private
Initial value:
{ this, "EventTimeout", 600,
"Number of seconds allowed to process a single event." }

Definition at line 150 of file EventWatchdogAlg.cpp.

◆ m_stackTrace

Gaudi::Property<bool> Gaudi::EventWatchdogAlg::m_stackTrace { this, "StackTrace", false, "Whether to print the stack-trace on timeout." }
private

Definition at line 154 of file EventWatchdogAlg.cpp.


The documentation for this class was generated from the following file:
EventWatchdogAlg.cpp
Gaudi::EventWatchdogAlg::operator()
PeriodicAction operator()(EventContext const &ctx) const override
Definition: EventWatchdogAlg.cpp:79
Gaudi.Configuration.log
log
Definition: Configuration.py:28
std::move
T move(T... args)
MSG::INFO
@ INFO
Definition: IMessageSvc.h:25
std::vector
STL class.
std::chrono::duration
std::raise
T raise(T... args)
ranges
Definition: details.h:30
MSG::WARNING
@ WARNING
Definition: IMessageSvc.h:25
Gaudi::EventWatchdogAlg::m_eventTimeout
Gaudi::CheckedProperty< unsigned int > m_eventTimeout
Definition: EventWatchdogAlg.cpp:150
AvalancheSchedulerErrorTest.msgSvc
msgSvc
Definition: AvalancheSchedulerErrorTest.py:80
Gaudi::Utils::PeriodicAction
Helper to periodically run asynchronous tasks.
Definition: PeriodicAction.h:29
GaudiPython.Pythonizations.ctx
ctx
Definition: Pythonizations.py:578
gaudirun.opts
opts
Definition: gaudirun.py:336
Gaudi::EventWatchdogAlg::m_abortOnTimeout
Gaudi::Property< bool > m_abortOnTimeout
Definition: EventWatchdogAlg.cpp:152
Gaudi::Property::value
const ValueType & value() const
Definition: Property.h:239
AtlasMCRecoFullPrecedenceDump.scheduler
scheduler
Definition: AtlasMCRecoFullPrecedenceDump.py:47
format
GAUDI_API std::string format(const char *,...)
MsgStream format utility "a la sprintf(...)".
Definition: MsgStream.cpp:119
endmsg
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:203
MsgStream
Definition: MsgStream.h:34
MSG::FATAL
@ FATAL
Definition: IMessageSvc.h:25
ConditionsStallTest.name
name
Definition: ConditionsStallTest.py:77
EventContext
Definition: EventContext.h:34
std::chrono::duration::count
T count(T... args)
Gaudi::EventWatchdogAlg::m_stackTrace
Gaudi::Property< bool > m_stackTrace
Definition: EventWatchdogAlg.cpp:154
Gaudi::Functional::Transformer
details::Transformer< Signature, Traits_, details::isLegacy< Traits_ > > Transformer
Definition: Transformer.h:237
gaudirun.action
action
Definition: gaudirun.py:153
plotSpeedupsPyRoot.counter
counter
Definition: plotSpeedupsPyRoot.py:175
Io::Action
Action
Definition: IFileMgr.h:278
System::pagedMemory
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition: Memory.cpp:141
System::virtualMemory
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: priority boost.
Definition: Memory.cpp:228
Gaudi::Property< int >
std::chrono::steady_clock::now
T now(T... args)