The Gaudi Framework  master (181af51f)
Loading...
Searching...
No Matches
EventWatchdogAlg.cpp
Go to the documentation of this file.
1/***********************************************************************************\
2* (c) Copyright 2024-2025 CERN for the benefit of the LHCb and ATLAS collaborations *
3* *
4* This software is distributed under the terms of the Apache version 2 licence, *
5* copied verbatim in the file "LICENSE". *
6* *
7* In applying this licence, CERN does not waive the privileges and immunities *
8* granted to it by virtue of its status as an Intergovernmental Organization *
9* or submit itself to any jurisdiction. *
10\***********************************************************************************/
15#include <GaudiKernel/Memory.h>
16#include <TSystem.h>
17#include <algorithm>
18#include <chrono>
19#include <csignal>
20#include <fmt/format.h>
21#include <mutex>
22#include <ranges>
23#include <string>
24#include <string_view>
25#include <utility>
26#include <vector>
27
namespace {
  /// Build a copy of @p input in which every ':' character is replaced by '_'.
  ///
  /// All other characters are copied unchanged.
  std::string sanitize( std::string_view input ) {
    std::string cleaned;
    cleaned.reserve( input.size() );
    for ( const char ch : input ) { cleaned.push_back( ch == ':' ? '_' : ch ); }
    return cleaned;
  }

  // Serializes the reports issued by the watchdog, so that one report cannot
  // interleave with another (e.g. a stack trace being requested while a
  // previous one is still being produced).
  std::mutex s_watchdogReportMutex;
} // namespace
40
42
43namespace Gaudi {
44
46 class EventWatchdogAlg : public Gaudi::Functional::Transformer<PeriodicAction( EventContext const& )> {
47 public:
48 EventWatchdogAlg( const std::string& name, ISvcLocator* pSvcLocator )
49 : Transformer( name, pSvcLocator, KeyValue{ "TimerLocation", fmt::format( ".{}-timer", sanitize( name ) ) } ) {
50 // timeout period cannot be smaller than 1
51 m_eventTimeout.verifier().setLower( 1 );
52 }
53
54 PeriodicAction operator()( EventContext const& ctx ) const override {
55 using namespace std::chrono_literals;
56 // we use a functor because we cannot pass mutable states to a lambda
57 struct Action {
58 MsgStream log;
59 const bool doStackTrace;
60 const bool abortOnTimeout;
61 const unsigned int timeout;
62 const EventContext ctx;
63
64 const std::vector<IScheduler*> schedulers;
65
66 int counter{ 0 };
67
68 const std::chrono::steady_clock::time_point eventStart{ std::chrono::steady_clock::now() };
69
70 void operator()() {
71 ++counter;
72 if ( counter == 1 ) {
73 log << MSG::WARNING << fmt::format( "More than {}s since the beginning of the event ({})", timeout, ctx )
74 << endmsg;
75 } else {
76 log << MSG::WARNING << fmt::format( "Another {}s passed since last timeout ({})", timeout, ctx ) << endmsg;
77 }
78 if ( log.level() <= MSG::INFO ) {
79 log << MSG::INFO
80 << fmt::format( "Current memory usage is virtual size = {} MB, resident set size = {} MB",
81 System::virtualMemory() / 1024., System::pagedMemory() / 1024. )
82 << endmsg;
83 }
84
85 std::scoped_lock protectReport( s_watchdogReportMutex );
86 if ( doStackTrace ) {
87 for ( auto scheduler : schedulers ) { scheduler->dumpState(); }
88 if ( gSystem ) {
89 // TSystem::StackTrace() prints on the standard error, so we do the same
90 fmt::print( stderr, "=== Stalled event: current stack trace ({}) ===\n", ctx );
91 gSystem->StackTrace();
92 }
93 }
94 if ( abortOnTimeout ) {
95 log << MSG::FATAL << fmt::format( "too much time on a single event ({}): aborting process", ctx ) << endmsg;
96 std::raise( SIGQUIT );
97 }
98 }
99
100 ~Action() {
101 if ( counter ) {
102 const std::chrono::duration<float, std::chrono::seconds::period> duration =
103 std::chrono::steady_clock::now() - eventStart;
104 log << MSG::INFO << fmt::format( "An event ({}) took {:.3f}s", ctx, duration.count() ) << endmsg;
105 }
106 }
107 };
108
109 std::vector<IScheduler*> schedulers;
110 if ( m_stackTrace ) {
111 // if we ask for the stack trace we are also interested in the state of the scheduler
112 std::ranges::copy( svcLoc()->getServices() |
113 std::views::transform( []( auto svc ) { return Gaudi::Cast<IScheduler>( svc ); } ) |
114 std::views::filter( []( auto* p ) { return p != nullptr; } ),
115 std::back_inserter( schedulers ) );
116 }
117
118 Action action{ MsgStream{ msgSvc(), "EventWatchdog" }, m_stackTrace, m_abortOnTimeout, m_eventTimeout,
119 // use a partial copy of the context to avoid copying the optional extension
120 EventContext{ ctx.evt(), ctx.slot(), ctx.subSlot() }, std::move( schedulers ) };
121 return PeriodicAction( std::move( action ), std::chrono::seconds{ m_eventTimeout } );
122 }
123
124 private:
125 Gaudi::CheckedProperty<unsigned int> m_eventTimeout{ this, "EventTimeout", 600,
126 "Number of seconds allowed to process a single event." };
127 Gaudi::Property<bool> m_abortOnTimeout{ this, "AbortOnTimeout", false,
128 "If set to true, the application is killed when we reach the timeout." };
129 Gaudi::Property<bool> m_stackTrace{ this, "StackTrace", false, "Whether to print the stack-trace on timeout." };
130 };
131 DECLARE_COMPONENT( EventWatchdogAlg )
132} // namespace Gaudi
GAUDI_API std::string format(const char *,...)
MsgStream format utility "a la sprintf(...)".
Definition MsgStream.cpp:93
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition MsgStream.h:198
#define DECLARE_COMPONENT(type)
This class represents an entry point to all the event specific data.
EventWatchdogAlg(const std::string &name, ISvcLocator *pSvcLocator)
PeriodicAction operator()(EventContext const &ctx) const override
Gaudi::CheckedProperty< unsigned int > m_eventTimeout
Gaudi::Property< bool > m_abortOnTimeout
Gaudi::Property< bool > m_stackTrace
Implementation of property with value of concrete type.
Definition PropertyFwd.h:27
Helper to periodically run asynchronous tasks.
The ISvcLocator is the interface implemented by the Service Factory in the Application Manager to loc...
Definition ISvcLocator.h:42
Definition of the MsgStream class used to transmit messages.
Definition MsgStream.h:29
AttribStringParser::Iterator begin(const AttribStringParser &parser)
This file provides a Grammar for the type Gaudi::Accumulators::Axis It allows to use that type from p...
Definition __init__.py:1
TARGET * Cast(IInterface *i)
Definition IInterface.h:333
@ WARNING
Definition IMessageSvc.h:22
@ FATAL
Definition IMessageSvc.h:22
@ INFO
Definition IMessageSvc.h:22
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition Memory.cpp:116
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition Memory.cpp:203