The Gaudi Framework  master (ff829712)
Loading...
Searching...
No Matches
Gaudi::EventWatchdogAlg Class Reference

Add to the transient store a tracker that detects events that are taking too long. More...

Inheritance diagram for Gaudi::EventWatchdogAlg:
Collaboration diagram for Gaudi::EventWatchdogAlg:

Public Member Functions

 EventWatchdogAlg (const std::string &name, ISvcLocator *pSvcLocator)
 
PeriodicAction operator() (EventContext const &ctx) const override
 

Private Attributes

Gaudi::CheckedProperty< unsigned int > m_eventTimeout
 
Gaudi::Property< bool > m_abortOnTimeout
 
Gaudi::Property< bool > m_stackTrace { this, "StackTrace", false, "Whether to print the stack-trace on timeout." }
 

Detailed Description

Add to the transient store a tracker that detects events that are taking too long.

Definition at line 48 of file EventWatchdogAlg.cpp.

Constructor & Destructor Documentation

◆ EventWatchdogAlg()

Gaudi::EventWatchdogAlg::EventWatchdogAlg ( const std::string & name,
ISvcLocator * pSvcLocator )
inline

Definition at line 50 of file EventWatchdogAlg.cpp.

51 : Transformer( name, pSvcLocator, KeyValue{ "TimerLocation", fmt::format( ".{}-timer", sanitize( name ) ) } ) {
52 // timeout period cannot be smaller than 1
53 m_eventTimeout.verifier().setLower( 1 );
54 }
Gaudi::CheckedProperty< unsigned int > m_eventTimeout
details::Transformer< Signature, Traits_, details::isLegacy< Traits_ > > Transformer

Member Function Documentation

◆ operator()()

PeriodicAction Gaudi::EventWatchdogAlg::operator() ( EventContext const & ctx) const
inline override

Definition at line 56 of file EventWatchdogAlg.cpp.

56 {
57 using namespace std::chrono_literals;
58 // we use a functor because we cannot pass mutable states to a lambda
59 struct Action {
60 MsgStream log;
61 const bool doStackTrace;
62 const bool abortOnTimeout;
63 const unsigned int timeout;
64 const EventContext ctx;
65
66 const std::vector<IScheduler*> schedulers;
67
68 int counter{ 0 };
69
70 const std::chrono::steady_clock::time_point eventStart{ std::chrono::steady_clock::now() };
71
72 void operator()() {
73 ++counter;
74 if ( counter == 1 ) {
75 log << MSG::WARNING << fmt::format( "More than {}s since the beginning of the event ({})", timeout, ctx )
76 << endmsg;
77 } else {
78 log << MSG::WARNING << fmt::format( "Another {}s passed since last timeout ({})", timeout, ctx ) << endmsg;
79 }
80 if ( log.level() <= MSG::INFO ) {
81 log << MSG::INFO
82 << fmt::format( "Current memory usage is virtual size = {} MB, resident set size = {} MB",
83 System::virtualMemory() / 1024., System::pagedMemory() / 1024. )
84 << endmsg;
85 }
86
87 std::scoped_lock protectReport( s_watchdogReportMutex );
88 if ( doStackTrace ) {
89 for ( auto scheduler : schedulers ) { scheduler->dumpState(); }
90 if ( gSystem ) {
91 // TSystem::StackTrace() prints on the standard error, so we do the same
92 fmt::print( stderr, "=== Stalled event: current stack trace ({}) ===\n", ctx );
93 gSystem->StackTrace();
94 }
95 }
96 if ( abortOnTimeout ) {
97 log << MSG::FATAL << fmt::format( "too much time on a single event ({}): aborting process", ctx ) << endmsg;
98 std::raise( SIGQUIT );
99 }
100 }
101
102 ~Action() {
103 if ( counter ) {
104 const std::chrono::duration<float, std::chrono::seconds::period> duration =
105 std::chrono::steady_clock::now() - eventStart;
106 log << MSG::INFO << fmt::format( "An event ({}) took {:.3f}s", ctx, duration.count() ) << endmsg;
107 }
108 }
109 };
110
111 std::vector<IScheduler*> schedulers;
112 if ( m_stackTrace ) {
113 using namespace ranges;
114 // if we ask for the stack trace we are also interested in the state of the scheduler
115 schedulers = svcLoc()->getServices() |
116 views::transform( []( auto svc ) { return Gaudi::Cast<IScheduler>( svc ); } ) |
117 views::remove( nullptr ) | to<std::vector>();
118 }
119
120 Action action{ MsgStream{ msgSvc(), "EventWatchdog" }, m_stackTrace, m_abortOnTimeout, m_eventTimeout,
121 // use a partial copy of the context to avoid copying the optional extension
122 EventContext{ ctx.evt(), ctx.slot(), ctx.subSlot() }, std::move( schedulers ) };
123 return PeriodicAction( std::move( action ), std::chrono::seconds{ m_eventTimeout } );
124 }
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition MsgStream.h:198
Gaudi::Property< bool > m_abortOnTimeout
Gaudi::Property< bool > m_stackTrace
TARGET * Cast(IInterface *i)
Definition IInterface.h:333
Action
Definition IFileMgr.h:263
@ WARNING
Definition IMessageSvc.h:22
@ FATAL
Definition IMessageSvc.h:22
@ INFO
Definition IMessageSvc.h:22
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition Memory.cpp:116
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition Memory.cpp:203

Member Data Documentation

◆ m_abortOnTimeout

Gaudi::Property<bool> Gaudi::EventWatchdogAlg::m_abortOnTimeout
private
Initial value:
{ this, "AbortOnTimeout", false,
"If set to true, the application is killed when we reach the timeout." }

Definition at line 129 of file EventWatchdogAlg.cpp.

129 { this, "AbortOnTimeout", false,
130 "If set to true, the application is killed when we reach the timeout." };

◆ m_eventTimeout

Gaudi::CheckedProperty<unsigned int> Gaudi::EventWatchdogAlg::m_eventTimeout
private
Initial value:
{ this, "EventTimeout", 600,
"Number of seconds allowed to process a single event." }

Definition at line 127 of file EventWatchdogAlg.cpp.

127 { this, "EventTimeout", 600,
128 "Number of seconds allowed to process a single event." };

◆ m_stackTrace

Gaudi::Property<bool> Gaudi::EventWatchdogAlg::m_stackTrace { this, "StackTrace", false, "Whether to print the stack-trace on timeout." }
private

Definition at line 131 of file EventWatchdogAlg.cpp.

131 { this, "StackTrace", false, "Whether to print the stack-trace on timeout." };

The documentation for this class was generated from the following file: