The Gaudi Framework  master (b9786168)
Loading...
Searching...
No Matches
EventWatchdogAlg.cpp
Go to the documentation of this file.
1/***********************************************************************************\
2* (c) Copyright 2024-2025 CERN for the benefit of the LHCb and ATLAS collaborations *
7* *
8* This software is distributed under the terms of the Apache version 2 licence, *
9* copied verbatim in the file "LICENSE". *
10* *
11* In applying this licence, CERN does not waive the privileges and immunities *
12* granted to it by virtue of its status as an Intergovernmental Organization *
13* or submit itself to any jurisdiction. *
14\***********************************************************************************/
19#include <GaudiKernel/Memory.h>
20#include <TSystem.h>
21#include <algorithm>
22#include <chrono>
23#include <csignal>
24#include <format>
25#include <mutex>
26#include <ranges>
27#include <string>
28#include <string_view>
29#include <utility>
30#include <vector>
31
namespace {
  /// Return a copy of @p input in which every ':' has been replaced by '_'.
  ///
  /// Used to turn an algorithm name into the text embedded in the default
  /// "TimerLocation" key.
  std::string sanitize( std::string_view input ) {
    std::string cleaned;
    cleaned.reserve( input.size() );
    for ( const char ch : input ) { cleaned.push_back( ch == ':' ? '_' : ch ); }
    return cleaned;
  }

  // Serializes the reports produced by the watchdog so that they do not step
  // on each other, for example when a stack trace is requested while another
  // one is still being produced.
  std::mutex s_watchdogReportMutex;
} // namespace
44
46
47namespace Gaudi {
48
50 class EventWatchdogAlg : public Gaudi::Functional::Transformer<PeriodicAction( EventContext const& )> {
51 public:
52 EventWatchdogAlg( const std::string& name, ISvcLocator* pSvcLocator )
53 : Transformer( name, pSvcLocator, KeyValue{ "TimerLocation", std::format( ".{}-timer", sanitize( name ) ) } ) {
54 // timeout period cannot be smaller than 1
55 m_eventTimeout.verifier().setLower( 1 );
56 }
57
58 PeriodicAction operator()( EventContext const& ctx ) const override {
59 using namespace std::chrono_literals;
60 // we use a functor because we cannot pass mutable states to a lambda
61 struct Action {
62 MsgStream log;
63 const bool doStackTrace;
64 const bool abortOnTimeout;
65 const unsigned int timeout;
66 const EventContext ctx;
67
68 const std::vector<IScheduler*> schedulers;
69
70 int counter{ 0 };
71
72 const std::chrono::steady_clock::time_point eventStart{ std::chrono::steady_clock::now() };
73
74 void operator()() {
75 ++counter;
76 if ( counter == 1 ) {
77 log << MSG::WARNING << std::format( "More than {}s since the beginning of the event ({})", timeout, ctx )
78 << endmsg;
79 } else {
80 log << MSG::WARNING << std::format( "Another {}s passed since last timeout ({})", timeout, ctx ) << endmsg;
81 }
82 if ( log.level() <= MSG::INFO ) {
83 log << MSG::INFO
84 << std::format( "Current memory usage is virtual size = {} MB, resident set size = {} MB",
85 System::virtualMemory() / 1024., System::pagedMemory() / 1024. )
86 << endmsg;
87 }
88
89 std::scoped_lock protectReport( s_watchdogReportMutex );
90 if ( doStackTrace ) {
91 for ( auto scheduler : schedulers ) { scheduler->dumpState(); }
92 if ( gSystem ) {
93 // TSystem::StackTrace() prints on the standard error, so we do the same
94 std::cerr << std::format( "=== Stalled event: current stack trace ({}) ===\n", ctx );
95 gSystem->StackTrace();
96 }
97 }
98 if ( abortOnTimeout ) {
99 log << MSG::FATAL << std::format( "too much time on a single event ({}): aborting process", ctx ) << endmsg;
100 std::raise( SIGQUIT );
101 }
102 }
103
104 ~Action() {
105 if ( counter ) {
106 const std::chrono::duration<float, std::chrono::seconds::period> duration =
107 std::chrono::steady_clock::now() - eventStart;
108 log << MSG::INFO << std::format( "An event ({}) took {:.3f}s", ctx, duration.count() ) << endmsg;
109 }
110 }
111 };
112
113 std::vector<IScheduler*> schedulers;
114 if ( m_stackTrace ) {
115 // if we ask for the stack trace we are also interested in the state of the scheduler
116 std::ranges::copy( svcLoc()->getServices() |
117 std::views::transform( []( auto svc ) { return Gaudi::Cast<IScheduler>( svc ); } ) |
118 std::views::filter( []( auto* p ) { return p != nullptr; } ),
119 std::back_inserter( schedulers ) );
120 }
121
122 Action action{ MsgStream{ msgSvc(), "EventWatchdog" }, m_stackTrace, m_abortOnTimeout, m_eventTimeout,
123 // use a partial copy of the context to avoid copying the optional extension
124 EventContext{ ctx.evt(), ctx.slot(), ctx.subSlot() }, std::move( schedulers ) };
125 return PeriodicAction( std::move( action ), std::chrono::seconds{ m_eventTimeout } );
126 }
127
128 private:
129 Gaudi::CheckedProperty<unsigned int> m_eventTimeout{ this, "EventTimeout", 600,
130 "Number of seconds allowed to process a single event." };
131 Gaudi::Property<bool> m_abortOnTimeout{ this, "AbortOnTimeout", false,
132 "If set to true, the application is killed when we reach the timeout." };
133 Gaudi::Property<bool> m_stackTrace{ this, "StackTrace", false, "Whether to print the stack-trace on timeout." };
134 };
135 DECLARE_COMPONENT( EventWatchdogAlg )
136} // namespace Gaudi
GAUDI_API std::string format(const char *,...)
MsgStream format utility "a la sprintf(...)".
Definition MsgStream.cpp:93
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition MsgStream.h:198
#define DECLARE_COMPONENT(type)
This class represents an entry point to all the event specific data.
EventWatchdogAlg(const std::string &name, ISvcLocator *pSvcLocator)
PeriodicAction operator()(EventContext const &ctx) const override
Gaudi::CheckedProperty< unsigned int > m_eventTimeout
Gaudi::Property< bool > m_abortOnTimeout
Gaudi::Property< bool > m_stackTrace
Implementation of property with value of concrete type.
Definition PropertyFwd.h:27
Helper to periodically run asynchronous tasks.
The ISvcLocator is the interface implemented by the Service Factory in the Application Manager to loc...
Definition ISvcLocator.h:42
Definition of the MsgStream class used to transmit messages.
Definition MsgStream.h:29
AttribStringParser::Iterator begin(const AttribStringParser &parser)
This file provides a Grammar for the type Gaudi::Accumulators::Axis It allows to use that type from p...
Definition __init__.py:1
TARGET * Cast(IInterface *i)
Definition IInterface.h:340
@ WARNING
Definition IMessageSvc.h:22
@ FATAL
Definition IMessageSvc.h:22
@ INFO
Definition IMessageSvc.h:22
GAUDI_API long pagedMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of paged memory currently occupied by the process 'pid'.
Definition Memory.cpp:116
GAUDI_API long virtualMemory(MemoryUnit unit=kByte, InfoType fetch=Memory, long pid=-1)
Basic Process Information: Amount of virtual memory currently occupied by the process 'pid'.
Definition Memory.cpp:203
STL namespace.