The Gaudi Framework  master (1304469f)
Loading...
Searching...
No Matches
Gaudi::EventWatchdogAlg Class Reference

Add to the transient store a tracker that detects events that are taking too long. More...

Inheritance diagram for Gaudi::EventWatchdogAlg:
Collaboration diagram for Gaudi::EventWatchdogAlg:

Public Member Functions

 EventWatchdogAlg (const std::string &name, ISvcLocator *pSvcLocator)
 
PeriodicAction operator() (EventContext const &ctx) const override
 

Private Attributes

Gaudi::CheckedProperty< unsigned int > m_eventTimeout
 
Gaudi::Property< bool > m_abortOnTimeout
 
Gaudi::Property< bool > m_stackTrace { this, "StackTrace", false, "Whether to print the stack-trace on timeout." }
 

Detailed Description

Add to the transient store a tracker that detects events that are taking too long.

Definition at line 50 of file EventWatchdogAlg.cpp.

Constructor & Destructor Documentation

◆ EventWatchdogAlg()

Gaudi::EventWatchdogAlg::EventWatchdogAlg ( const std::string & name,
ISvcLocator * pSvcLocator )
inline

Definition at line 52 of file EventWatchdogAlg.cpp.

53 : Transformer( name, pSvcLocator, KeyValue{ "TimerLocation", std::format( ".{}-timer", sanitize( name ) ) } ) {
54 // timeout period cannot be smaller than 1
55 m_eventTimeout.verifier().setLower( 1 );
56 }
Gaudi::CheckedProperty< unsigned int > m_eventTimeout
details::Transformer< Signature, Traits_, details::isLegacy< Traits_ > > Transformer

Member Function Documentation

◆ operator()()

PeriodicAction Gaudi::EventWatchdogAlg::operator() ( EventContext const & ctx) const
inline override

Definition at line 58 of file EventWatchdogAlg.cpp.

58 {
59 using namespace std::chrono_literals;
60 // we use a functor because we cannot pass mutable states to a lambda
61 struct Action {
62 MsgStream log;
63 const bool doStackTrace;
64 const bool abortOnTimeout;
65 const unsigned int timeout;
66 const EventContext ctx;
67
68 const std::vector<IScheduler*> schedulers;
69
70 int counter{ 0 };
71
72 const std::chrono::steady_clock::time_point eventStart{ std::chrono::steady_clock::now() };
73
74 void operator()() {
75 ++counter;
76 if ( counter == 1 ) {
77 log << MSG::WARNING << std::format( "More than {}s since the beginning of the event ({})", timeout, ctx )
78 << endmsg;
79 } else {
80 log << MSG::WARNING << std::format( "Another {}s passed since last timeout ({})", timeout, ctx ) << endmsg;
81 }
82 if ( log.level() <= MSG::INFO ) {
83 log << MSG::INFO
84 << std::format( "Current memory usage is virtual size = {} MB, resident set size = {} MB",
85 System::virtualMemory() / 1024., System::pagedMemory() / 1024. )
86 << endmsg;
87 }
88
89 std::scoped_lock protectReport( s_watchdogReportMutex );
90 if ( doStackTrace ) {
91 for ( auto scheduler : schedulers ) { scheduler->dumpState(); }
92 if ( gSystem ) {
93 // TSystem::StackTrace() prints on the standard error, so we do the same
94 std::cerr << std::format( "=== Stalled event: current stack trace ({}) ===\n", ctx );
95 gSystem->StackTrace();
96 }
97 }
98 if ( abortOnTimeout ) {
99 log << MSG::FATAL << std::format( "too much time on a single event ({}): aborting process", ctx ) << endmsg;
100 std::raise( SIGQUIT );
101 }
102 }
103
104 ~Action() {
105 if ( counter ) {
106 const std::chrono::duration<float, std::chrono::seconds::period> duration =
107 std::chrono::steady_clock::now() - eventStart;
108 log << MSG::INFO << std::format( "An event ({}) took {:.3f}s", ctx, duration.count() ) << endmsg;
109 }
110 }
111 };
112
113 std::vector<IScheduler*> schedulers;
114 if ( m_stackTrace ) {
115 // if we ask for the stack trace we are also interested in the state of the scheduler
116 std::ranges::copy( svcLoc()->getServices() |
117 std::views::transform( []( auto svc ) { return Gaudi::Cast<IScheduler>( svc ); } ) |
118 std::views::filter( []( auto* p ) { return p != nullptr; } ),
119 std::back_inserter( schedulers ) );
120 }
121
122 Action action{ MsgStream{ msgSvc(), "EventWatchdog" }, m_stackTrace, m_abortOnTimeout, m_eventTimeout,
123 // use a partial copy of the context to avoid copying the optional extension
124 EventContext{ ctx.evt(), ctx.slot(), ctx.subSlot() }, std::move( schedulers ) };
125 return PeriodicAction( std::move( action ), std::chrono::seconds{ m_eventTimeout } );
126 }
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition MsgStream.h:198
Gaudi::Property< bool > m_abortOnTimeout
Gaudi::Property< bool > m_stackTrace
TARGET * Cast(IInterface *i)
Definition IInterface.h:340
Action
Definition IFileMgr.h:263
@ WARNING
Definition IMessageSvc.h:22
@ FATAL
Definition IMessageSvc.h:22
@ INFO
Definition IMessageSvc.h:22
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition Memory.cpp:116
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition Memory.cpp:203

Member Data Documentation

◆ m_abortOnTimeout

Gaudi::Property<bool> Gaudi::EventWatchdogAlg::m_abortOnTimeout
private
Initial value:
{ this, "AbortOnTimeout", false,
"If set to true, the application is killed when we reach the timeout." }

Definition at line 131 of file EventWatchdogAlg.cpp.

131 { this, "AbortOnTimeout", false,
132 "If set to true, the application is killed when we reach the timeout." };

◆ m_eventTimeout

Gaudi::CheckedProperty<unsigned int> Gaudi::EventWatchdogAlg::m_eventTimeout
private
Initial value:
{ this, "EventTimeout", 600,
"Number of seconds allowed to process a single event." }

Definition at line 129 of file EventWatchdogAlg.cpp.

129 { this, "EventTimeout", 600,
130 "Number of seconds allowed to process a single event." };

◆ m_stackTrace

Gaudi::Property<bool> Gaudi::EventWatchdogAlg::m_stackTrace { this, "StackTrace", false, "Whether to print the stack-trace on timeout." }
private

Definition at line 133 of file EventWatchdogAlg.cpp.

133 { this, "StackTrace", false, "Whether to print the stack-trace on timeout." };

The documentation for this class was generated from the following file: