The Gaudi Framework  v30r0 (c919700c)
AvalancheSchedulerSvc.cpp
Go to the documentation of this file.
2 #include "AlgoExecutionTask.h"
3 #include "IOBoundAlgTask.h"
4 
5 // Framework includes
7 #include "GaudiKernel/Algorithm.h" // will be IAlgorithm if context getter promoted to interface
10 #include "GaudiKernel/IAlgorithm.h"
12 #include "GaudiKernel/SvcFactory.h"
14 
15 // C++
16 #include <algorithm>
17 #include <map>
18 #include <queue>
19 #include <sstream>
20 #include <unordered_set>
21 
22 // External libs
23 #include "boost/algorithm/string.hpp"
24 #include "boost/thread.hpp"
25 #include "boost/tokenizer.hpp"
26 // DP waiting for the TBB service
27 #include "tbb/task_scheduler_init.h"
28 
31 
32 // Instantiation of a static factory class used by clients to create instances of this service
34 
// File-local helpers used when producing the data-dependency dumps.
35 namespace
36 {
    // Comparator for DataObjID pointers: orders them by the value returned by fullKey().
37  struct DataObjIDSorter {
38  bool operator()( const DataObjID* a, const DataObjID* b ) { return a->fullKey() < b->fullKey(); }
39  };
40 
41  // Sort a DataObjIDColl in a well-defined, reproducible manner.
42  // Used for making debugging dumps.
    // The result holds pointers to the elements of `coll` (not copies), so it must not
    // be used after `coll` has been modified or destroyed.
43  std::vector<const DataObjID*> sortedDataObjIDColl( const DataObjIDColl& coll )
44  {
    // NOTE(review): the listing jumps from line 44 to 46 here; the declaration of `v`
    // (presumably `std::vector<const DataObjID*> v;`, matching the return type) is not
    // visible in this copy of the file - confirm against the real source.
46  v.reserve( coll.size() );
47  for ( const DataObjID& id : coll ) v.push_back( &id );
48  std::sort( v.begin(), v.end(), DataObjIDSorter() );
49  return v;
50  }
51 }
52 
53 //===========================================================================
54 // Infrastructure methods
55 
62 {
63 
64  // Initialise mother class (read properties, ...)
66  if ( !sc.isSuccess() ) warning() << "Base class could not be initialized" << endmsg;
67 
68  // Get hold of the ThreadPoolSvc. The pool itself is initialised later, via initPool() in the scheduler thread
69  m_threadPoolSvc = serviceLocator()->service( "ThreadPoolSvc" );
70  if ( !m_threadPoolSvc.isValid() ) {
71  fatal() << "Error retrieving ThreadPoolSvc" << endmsg;
72  return StatusCode::FAILURE;
73  }
74 
75  // Activate the scheduler in another thread.
76  info() << "Activating scheduler in a separate thread" << endmsg;
77  m_thread = std::thread( [this]() { this->activate(); } );
78 
79  while ( m_isActive != ACTIVE ) {
80  if ( m_isActive == FAILURE ) {
81  fatal() << "Terminating initialization" << endmsg;
82  return StatusCode::FAILURE;
83  } else {
84  info() << "Waiting for AvalancheSchedulerSvc to activate" << endmsg;
85  sleep( 1 );
86  }
87  }
88 
89  if ( m_enableCondSvc ) {
90  // Get hold of the CondSvc
91  m_condSvc = serviceLocator()->service( "CondSvc" );
92  if ( !m_condSvc.isValid() ) {
93  warning() << "No CondSvc found, or not enabled. "
94  << "Will not manage CondAlgorithms" << endmsg;
95  m_enableCondSvc = false;
96  }
97  }
98 
99  // Get the algo resource pool
100  m_algResourcePool = serviceLocator()->service( "AlgResourcePool" );
101  if ( !m_algResourcePool.isValid() ) {
102  fatal() << "Error retrieving AlgoResourcePool" << endmsg;
103  return StatusCode::FAILURE;
104  }
105 
106  m_algExecStateSvc = serviceLocator()->service( "AlgExecStateSvc" );
107  if ( !m_algExecStateSvc.isValid() ) {
108  fatal() << "Error retrieving AlgExecStateSvc" << endmsg;
109  return StatusCode::FAILURE;
110  }
111 
112  // Get Whiteboard
113  m_whiteboard = serviceLocator()->service( m_whiteboardSvcName );
114  if ( !m_whiteboard.isValid() ) {
115  fatal() << "Error retrieving EventDataSvc interface IHiveWhiteBoard." << endmsg;
116  return StatusCode::FAILURE;
117  }
118 
119  // Get dedicated scheduler for I/O-bound algorithms
120  if ( m_useIOBoundAlgScheduler ) {
121  m_IOBoundAlgScheduler = serviceLocator()->service( m_IOBoundAlgSchedulerSvcName );
122  if ( !m_IOBoundAlgScheduler.isValid() )
123  fatal() << "Error retrieving IOBoundSchedulerAlgSvc interface IAccelerator." << endmsg;
124  }
125 
126  // Set the MaxEventsInFlight parameters from the number of WB stores
127  m_maxEventsInFlight = m_whiteboard->getNumberOfStores();
128 
129  // Set the number of free slots
130  m_freeSlots = m_maxEventsInFlight;
131 
132  // set global concurrency flags
134 
135  // Get the list of algorithms
136  const std::list<IAlgorithm*>& algos = m_algResourcePool->getFlatAlgList();
137  const unsigned int algsNumber = algos.size();
138  info() << "Found " << algsNumber << " algorithms" << endmsg;
139 
140  /* Dependencies
141  1) Look for handles in algo, if none
142  2) Assume none are required
143  */
144 
145  DataObjIDColl globalInp, globalOutp;
146 
147  // figure out all outputs
148  for ( IAlgorithm* ialgoPtr : algos ) {
149  Algorithm* algoPtr = dynamic_cast<Algorithm*>( ialgoPtr );
150  if ( !algoPtr ) {
151  fatal() << "Could not convert IAlgorithm into Algorithm: this will result in a crash." << endmsg;
152  }
153  for ( auto id : algoPtr->outputDataObjs() ) {
154  auto r = globalOutp.insert( id );
155  if ( !r.second ) {
156  warning() << "multiple algorithms declare " << id << " as output! could be a single instance in multiple paths "
157  "though, or control flow may guarantee only one runs...!"
158  << endmsg;
159  }
160  }
161  }
162 
163  std::ostringstream ostdd;
164  ostdd << "Data Dependencies for Algorithms:";
165 
167  for ( IAlgorithm* ialgoPtr : algos ) {
168  Algorithm* algoPtr = dynamic_cast<Algorithm*>( ialgoPtr );
169  if ( nullptr == algoPtr ) {
170  fatal() << "Could not convert IAlgorithm into Algorithm for " << ialgoPtr->name()
171  << ": this will result in a crash." << endmsg;
172  return StatusCode::FAILURE;
173  }
174 
175  ostdd << "\n " << algoPtr->name();
176 
177  DataObjIDColl algoDependencies;
178  if ( !algoPtr->inputDataObjs().empty() || !algoPtr->outputDataObjs().empty() ) {
179  for ( const DataObjID* idp : sortedDataObjIDColl( algoPtr->inputDataObjs() ) ) {
180  DataObjID id = *idp;
181  ostdd << "\n o INPUT " << id;
182  if ( id.key().find( ":" ) != std::string::npos ) {
183  ostdd << " contains alternatives which require resolution...\n";
184  auto tokens = boost::tokenizer<boost::char_separator<char>>{id.key(), boost::char_separator<char>{":"}};
185  auto itok = std::find_if( tokens.begin(), tokens.end(), [&]( const std::string& t ) {
186  return globalOutp.find( DataObjID{t} ) != globalOutp.end();
187  } );
188  if ( itok != tokens.end() ) {
189  ostdd << "found matching output for " << *itok << " -- updating scheduler info\n";
190  id.updateKey( *itok );
191  } else {
192  error() << "failed to find alternate in global output list"
193  << " for id: " << id << " in Alg " << algoPtr->name() << endmsg;
194  m_showDataDeps = true;
195  }
196  }
197  algoDependencies.insert( id );
198  globalInp.insert( id );
199  }
200  for ( const DataObjID* id : sortedDataObjIDColl( algoPtr->outputDataObjs() ) ) {
201  ostdd << "\n o OUTPUT " << *id;
202  if ( id->key().find( ":" ) != std::string::npos ) {
203  error() << " in Alg " << algoPtr->name() << " alternatives are NOT allowed for outputs! id: " << *id
204  << endmsg;
205  m_showDataDeps = true;
206  }
207  }
208  } else {
209  ostdd << "\n none";
210  }
211  algosDependenciesMap[algoPtr->name()] = algoDependencies;
212  }
213 
214  if ( m_showDataDeps ) {
215  info() << ostdd.str() << endmsg;
216  }
217 
218  // Check if we have unmet global input dependencies, and, optionally, heal them
219  // WARNING: this step must be done BEFORE the Precedence Service is initialized
220  if ( m_checkDeps ) {
221  DataObjIDColl unmetDep;
222  for ( auto o : globalInp )
223  if ( globalOutp.find( o ) == globalOutp.end() ) unmetDep.insert( o );
224 
225  if ( unmetDep.size() > 0 ) {
226 
227  std::ostringstream ost;
228  for ( const DataObjID* o : sortedDataObjIDColl( unmetDep ) ) {
229  ost << "\n o " << *o << " required by Algorithm: ";
230 
231  for ( const auto& p : algosDependenciesMap )
232  if ( p.second.find( *o ) != p.second.end() ) ost << "\n * " << p.first;
233  }
234 
235  if ( !m_useDataLoader.empty() ) {
236 
237  // Find the DataLoader Alg
238  IAlgorithm* dataLoaderAlg( nullptr );
239  for ( IAlgorithm* algo : algos )
240  if ( algo->name() == m_useDataLoader ) {
241  dataLoaderAlg = algo;
242  break;
243  }
244 
245  if ( dataLoaderAlg == nullptr ) {
246  fatal() << "No DataLoader Algorithm \"" << m_useDataLoader.value()
247  << "\" found, and unmet INPUT dependencies "
248  << "detected:\n"
249  << ost.str() << endmsg;
250  return StatusCode::FAILURE;
251  }
252 
253  info() << "Will attribute the following unmet INPUT dependencies to \"" << dataLoaderAlg->type() << "/"
254  << dataLoaderAlg->name() << "\" Algorithm" << ost.str() << endmsg;
255 
256  // Set the property Load of DataLoader Alg
257  Algorithm* dataAlg = dynamic_cast<Algorithm*>( dataLoaderAlg );
258  if ( !dataAlg ) {
259  fatal() << "Unable to dcast DataLoader \"" << m_useDataLoader.value() << "\" IAlg to Algorithm" << endmsg;
260  return StatusCode::FAILURE;
261  }
262 
263  for ( auto& id : unmetDep ) {
264  debug() << "adding OUTPUT dep \"" << id << "\" to " << dataLoaderAlg->type() << "/" << dataLoaderAlg->name()
265  << endmsg;
267  }
268 
269  } else {
270  fatal() << "Auto DataLoading not requested, "
271  << "and the following unmet INPUT dependencies were found:" << ost.str() << endmsg;
272  return StatusCode::FAILURE;
273  }
274 
275  } else {
276  info() << "No unmet INPUT data dependencies were found" << endmsg;
277  }
278  }
279 
280  // Get the precedence service
281  m_precSvc = serviceLocator()->service( "PrecedenceSvc" );
282  if ( !m_precSvc.isValid() ) {
283  fatal() << "Error retrieving PrecedenceSvc" << endmsg;
284  return StatusCode::FAILURE;
285  }
286  const PrecedenceSvc* precSvc = dynamic_cast<const PrecedenceSvc*>( m_precSvc.get() );
287  if ( !precSvc ) {
288  fatal() << "Unable to dcast PrecedenceSvc" << endmsg;
289  return StatusCode::FAILURE;
290  }
291 
292  // Fill the containers to convert algo names to index
293  m_algname_vect.resize( algsNumber );
294  for ( IAlgorithm* algo : algos ) {
295  const std::string& name = algo->name();
296  auto index = precSvc->getRules()->getAlgorithmNode( name )->getAlgoIndex();
297  m_algname_index_map[name] = index;
298  m_algname_vect.at( index ) = name;
299  }
300 
301  // Shortcut for the message service
302  SmartIF<IMessageSvc> messageSvc( serviceLocator() );
303  if ( !messageSvc.isValid() ) error() << "Error retrieving MessageSvc interface IMessageSvc." << endmsg;
304 
305  m_eventSlots.assign( m_maxEventsInFlight,
306  EventSlot( algsNumber, precSvc->getRules()->getControlFlowNodeCounter(), messageSvc ) );
307  std::for_each( m_eventSlots.begin(), m_eventSlots.end(), []( EventSlot& slot ) { slot.complete = true; } );
308 
309  if ( m_threadPoolSize > 1 ) {
310  m_maxAlgosInFlight = (size_t)m_threadPoolSize;
311  }
312 
313  // Clearly inform about the level of concurrency
314  info() << "Concurrency level information:" << endmsg;
315  info() << " o Number of events in flight: " << m_maxEventsInFlight << endmsg;
316  info() << " o TBB thread pool size: " << m_threadPoolSize << endmsg;
317 
318  if ( m_showControlFlow ) m_precSvc->dumpControlFlow();
319 
320  if ( m_showDataFlow ) m_precSvc->dumpDataFlow();
321 
322  // Simulate execution flow
323  if ( m_simulateExecution ) m_precSvc->simulate( m_eventSlots[0] );
324 
325  return sc;
326 }
327 //---------------------------------------------------------------------------
328 
333 {
334 
336  if ( !sc.isSuccess() ) warning() << "Base class could not be finalized" << endmsg;
337 
338  sc = deactivate();
339  if ( !sc.isSuccess() ) warning() << "Scheduler could not be deactivated" << endmsg;
340 
341  info() << "Joining Scheduler thread" << endmsg;
342  m_thread.join();
343 
344  // Final error check after thread pool termination
345  if ( m_isActive == FAILURE ) {
346  error() << "problems in scheduler thread" << endmsg;
347  return StatusCode::FAILURE;
348  }
349 
350  return sc;
351 }
352 //---------------------------------------------------------------------------
364 {
365 
366  if ( msgLevel( MSG::DEBUG ) ) debug() << "AvalancheSchedulerSvc::activate()" << endmsg;
367 
368  if ( m_threadPoolSvc->initPool( m_threadPoolSize ).isFailure() ) {
369  error() << "problems initializing ThreadPoolSvc" << endmsg;
370  m_isActive = FAILURE;
371  return;
372  }
373 
374  // Wait for actions pushed into the queue by finishing tasks.
375  action thisAction;
377 
378  m_isActive = ACTIVE;
379 
380  // Continue to wait if the scheduler is running or there is something to do
381  info() << "Start checking the actionsQueue" << endmsg;
382  while ( m_isActive == ACTIVE or m_actionsQueue.size() != 0 ) {
383  m_actionsQueue.pop( thisAction );
384  sc = thisAction();
385  if ( sc != StatusCode::SUCCESS )
386  verbose() << "Action did not succeed (which is not bad per se)." << endmsg;
387  else
388  verbose() << "Action succeeded." << endmsg;
389  }
390 
391  info() << "Terminating thread-pool resources" << endmsg;
392  if ( m_threadPoolSvc->terminatePool().isFailure() ) {
393  error() << "Problems terminating thread pool" << endmsg;
394  m_isActive = FAILURE;
395  }
396 }
397 
398 //---------------------------------------------------------------------------
399 
407 {
408 
409  if ( m_isActive == ACTIVE ) {
410  // Drain the scheduler
411  m_actionsQueue.push( [this]() { return this->m_drain(); } );
412  // This would be the last action
413  m_actionsQueue.push( [this]() -> StatusCode {
414  m_isActive = INACTIVE;
415  return StatusCode::SUCCESS;
416  } );
417  }
418 
419  return StatusCode::SUCCESS;
420 }
421 
422 //===========================================================================
423 
424 //===========================================================================
425 // Utils and shortcuts
426 
427 inline const std::string& AvalancheSchedulerSvc::index2algname( unsigned int index ) { return m_algname_vect[index]; }
428 
429 //---------------------------------------------------------------------------
430 
431 inline unsigned int AvalancheSchedulerSvc::algname2index( const std::string& algoname )
432 {
433  unsigned int index = m_algname_index_map[algoname];
434  return index;
435 }
436 
437 //===========================================================================
438 // EventSlot management
446 {
447 
448  if ( m_first ) {
449  m_first = false;
450  }
451 
452  if ( !eventContext ) {
453  fatal() << "Event context is nullptr" << endmsg;
454  return StatusCode::FAILURE;
455  }
456 
457  if ( m_freeSlots.load() == 0 ) {
458  if ( msgLevel( MSG::DEBUG ) ) debug() << "A free processing slot could not be found." << endmsg;
459  return StatusCode::FAILURE;
460  }
461 
462  // no problem as push new event is only called from one thread (event loop manager)
463  m_freeSlots--;
464 
465  auto action = [this, eventContext]() -> StatusCode {
466  // Event processing slot forced to be the same as the wb slot
467  const unsigned int thisSlotNum = eventContext->slot();
468  EventSlot& thisSlot = m_eventSlots[thisSlotNum];
469  if ( !thisSlot.complete ) {
470  fatal() << "The slot " << thisSlotNum << " is supposed to be a finished event but it's not" << endmsg;
471  return StatusCode::FAILURE;
472  }
473 
474  debug() << "Executing event " << eventContext->evt() << " on slot " << thisSlotNum << endmsg;
475  thisSlot.reset( eventContext );
476 
477  // Result status code:
479 
480  // promote to CR and DR the initial set of algorithms
481  Cause cs = {Cause::source::Root, "RootDecisionHub"};
482  if ( m_precSvc->iterate( thisSlot, cs ).isFailure() ) {
483  error() << "Failed to call IPrecedenceSvc::iterate for slot " << thisSlotNum << endmsg;
484  result = StatusCode::FAILURE;
485  }
486 
487  if ( this->updateStates( thisSlotNum ).isFailure() ) {
488  error() << "Failed to call AvalancheSchedulerSvc::updateStates for slot " << thisSlotNum << endmsg;
489  result = StatusCode::FAILURE;
490  }
491 
492  return result;
493  }; // end of lambda
494 
495  // Kick off the scheduling!
496  if ( msgLevel( MSG::VERBOSE ) ) {
497  verbose() << "Pushing the action to update the scheduler for slot " << eventContext->slot() << endmsg;
498  verbose() << "Free slots available " << m_freeSlots.load() << endmsg;
499  }
500  m_actionsQueue.push( action );
501 
502  return StatusCode::SUCCESS;
503 }
504 
505 //---------------------------------------------------------------------------
507 {
508  StatusCode sc;
509  for ( auto context : eventContexts ) {
510  sc = pushNewEvent( context );
511  if ( sc != StatusCode::SUCCESS ) return sc;
512  }
513  return sc;
514 }
515 
516 //---------------------------------------------------------------------------
517 unsigned int AvalancheSchedulerSvc::freeSlots() { return std::max( m_freeSlots.load(), 0 ); }
518 
519 //---------------------------------------------------------------------------
524 {
525 
526  unsigned int slotNum = 0;
527  for ( auto& thisSlot : m_eventSlots ) {
528  if ( not thisSlot.algsStates.allAlgsExecuted() and not thisSlot.complete ) {
529  updateStates( slotNum );
530  }
531  slotNum++;
532  }
533  return StatusCode::SUCCESS;
534 }
535 
536 //---------------------------------------------------------------------------
541 {
542  // debug() << "popFinishedEvent: queue size: " << m_finishedEvents.size() << endmsg;
543  if ( m_freeSlots.load() == (int)m_maxEventsInFlight or m_isActive == INACTIVE ) {
544  // debug() << "freeslots: " << m_freeSlots << "/" << m_maxEventsInFlight
545  // << " active: " << m_isActive << endmsg;
546  return StatusCode::FAILURE;
547  } else {
548  // debug() << "freeslots: " << m_freeSlots << "/" << m_maxEventsInFlight
549  // << " active: " << m_isActive << endmsg;
550  m_finishedEvents.pop( eventContext );
551  m_freeSlots++;
552  if ( msgLevel( MSG::DEBUG ) )
553  debug() << "Popped slot " << eventContext->slot() << "(event " << eventContext->evt() << ")" << endmsg;
554  return StatusCode::SUCCESS;
555  }
556 }
557 
558 //---------------------------------------------------------------------------
563 {
564  if ( m_finishedEvents.try_pop( eventContext ) ) {
565  if ( msgLevel( MSG::DEBUG ) )
566  debug() << "Try Pop successful slot " << eventContext->slot() << "(event " << eventContext->evt() << ")"
567  << endmsg;
568  m_freeSlots++;
569  return StatusCode::SUCCESS;
570  }
571  return StatusCode::FAILURE;
572 }
573 
574 //---------------------------------------------------------------------------
581 {
582 
583  // Set the number of free slots to zero, so that requests for a free processing slot fail from now on
584  m_freeSlots.store( 0 );
585 
586  fatal() << "*** Event " << eventContext->evt() << " on slot " << eventContext->slot() << " failed! ***" << endmsg;
587 
588  std::ostringstream ost;
589  m_algExecStateSvc->dump( ost, *eventContext );
590 
591  info() << "Dumping Alg Exec State for slot " << eventContext->slot() << ":\n" << ost.str() << endmsg;
592 
593  dumpSchedulerState( -1 );
594  // dump temporal and topological precedence analysis (if enabled in the PrecedenceSvc)
595  m_precSvc->dumpPrecedenceRules( m_eventSlots[eventContext->slot()] );
596 
597  // Empty queue and deactivate the service
598  action thisAction;
599  while ( m_actionsQueue.try_pop( thisAction ) ) {
600  };
601  deactivate();
602 
603  // Push into the finished events queue the failed context
604  EventContext* thisEvtContext;
605  while ( m_finishedEvents.try_pop( thisEvtContext ) ) {
606  m_finishedEvents.push( thisEvtContext );
607  };
608  m_finishedEvents.push( eventContext );
609 
610  return StatusCode::FAILURE;
611 }
612 
613 //===========================================================================
614 
615 //===========================================================================
616 // States Management
617 
// Advance the algorithm state machines of one or all event slots.
//
// Parameters:
//   si          slot to update; a negative value means "every slot that is not complete",
//               processed in increasing order of event number.
//   algo_index  index of the algorithm whose completion triggered this update; when it is
//               >= 0 the precedence service is iterated with a Task cause naming that
//               algorithm. Negative values skip that step.
//
// Returns SUCCESS unless a call to m_precSvc->iterate() failed. Failures to promote an
// individual algorithm (partial_sc) are only logged at VERBOSE level and are not
// propagated to the return value.
627 StatusCode AvalancheSchedulerSvc::updateStates( int si, const int algo_index )
628 {
629 
630  StatusCode global_sc( StatusCode::SUCCESS );
631 
632  // Sort from the oldest to the newest event
633  // Prepare a vector of pointers to the slots to avoid copies
634  std::vector<EventSlot*> eventSlotsPtrs;
635 
636  // Consider all slots if si <0 or just one otherwise
637  if ( si < 0 ) {
638  const int eventsSlotsSize( m_eventSlots.size() );
639  eventSlotsPtrs.reserve( eventsSlotsSize );
    // Only slots that are still being processed are collected.
640  for ( auto slotIt = m_eventSlots.begin(); slotIt != m_eventSlots.end(); slotIt++ ) {
641  if ( !slotIt->complete ) eventSlotsPtrs.push_back( &( *slotIt ) );
642  }
    // Order by event number so that older events are served first.
643  std::sort( eventSlotsPtrs.begin(), eventSlotsPtrs.end(),
644  []( EventSlot* a, EventSlot* b ) { return a->eventContext->evt() < b->eventContext->evt(); } );
645  } else {
    // NOTE(review): the requested slot is taken without checking `complete`. A slot that
    // was completed below has its eventContext reset to nullptr, which the loop
    // dereferences unconditionally - confirm callers never pass a completed slot.
646  eventSlotsPtrs.push_back( &m_eventSlots[si] );
647  }
648 
649  for ( EventSlot* thisSlotPtr : eventSlotsPtrs ) {
650  int iSlot = thisSlotPtr->eventContext->slot();
651 
652  // Cache the states of the algos to improve readability and performance
653  auto& thisSlot = m_eventSlots[iSlot];
654  AlgsExecutionStates& thisAlgsStates = thisSlot.algsStates;
655 
656  // Perform the I->CR->DR transitions
657  if ( algo_index >= 0 ) {
658  Cause cs = {Cause::source::Task, index2algname( algo_index )};
659  if ( m_precSvc->iterate( thisSlot, cs ).isFailure() ) {
660  error() << "Failed to call IPrecedenceSvc::iterate for slot " << iSlot << endmsg;
661  global_sc = StatusCode::FAILURE;
662  }
663  }
664 
    // Result of the most recent promotion attempt; starts out as FAILURE.
665  StatusCode partial_sc( StatusCode::FAILURE, true );
666 
667  // Perform DR->SCHEDULED
    // With an optimisation mode set, DATAREADY algorithms are first pushed into `buffer`
    // and then promoted in the order given by buffer.top(), using comp_nodes (a comparison
    // of the priorities returned by m_precSvc->getPriority) as ordering criterion.
668  if ( !m_optimizationMode.empty() ) {
669  auto comp_nodes = [this]( const uint& i, const uint& j ) {
670  return ( m_precSvc->getPriority( index2algname( i ) ) < m_precSvc->getPriority( index2algname( j ) ) );
671  };
    // NOTE(review): listing line 672 is missing; it presumably starts the declaration of
    // `buffer` (a priority queue of uint, judging by push/top/pop/empty below) - confirm.
673  comp_nodes, std::vector<uint>() );
674  for ( auto it = thisAlgsStates.begin( AlgsExecutionStates::State::DATAREADY );
675  it != thisAlgsStates.end( AlgsExecutionStates::State::DATAREADY ); ++it )
676  buffer.push( *it );
677  while ( !buffer.empty() ) {
    // Algorithms flagged as blocking go to the asynchronous (I/O-bound) scheduler,
    // but only when that scheduler is enabled.
678  bool IOBound = false;
679  if ( m_useIOBoundAlgScheduler ) IOBound = m_precSvc->isBlocking( index2algname( buffer.top() ) );
680 
681  if ( !IOBound )
682  partial_sc = promoteToScheduled( buffer.top(), iSlot );
683  else
684  partial_sc = promoteToAsyncScheduled( buffer.top(), iSlot );
685 
686  if ( msgLevel( MSG::VERBOSE ) )
687  if ( partial_sc.isFailure() )
688  verbose() << "Could not apply transition from "
689  << AlgsExecutionStates::stateNames[AlgsExecutionStates::State::DATAREADY] << " for algorithm "
690  << index2algname( buffer.top() ) << " on processing slot " << iSlot << endmsg;
691 
692  buffer.pop();
693  }
694 
695  } else {
    // Without an optimisation mode, DATAREADY algorithms are promoted in iteration order.
696  for ( auto it = thisAlgsStates.begin( AlgsExecutionStates::State::DATAREADY );
697  it != thisAlgsStates.end( AlgsExecutionStates::State::DATAREADY ); ++it ) {
698  uint algIndex = *it;
699 
700  bool IOBound = false;
701  if ( m_useIOBoundAlgScheduler ) IOBound = m_precSvc->isBlocking( index2algname( algIndex ) );
702 
703  if ( !IOBound )
704  partial_sc = promoteToScheduled( algIndex, iSlot );
705  else
706  partial_sc = promoteToAsyncScheduled( algIndex, iSlot );
707 
708  if ( msgLevel( MSG::VERBOSE ) )
709  if ( partial_sc.isFailure() )
710  verbose() << "Could not apply transition from "
711  << AlgsExecutionStates::stateNames[AlgsExecutionStates::State::DATAREADY] << " for algorithm "
712  << index2algname( algIndex ) << " on processing slot " << iSlot << endmsg;
713  }
714  }
715 
    // Optional diagnostics: append one CSV record (triggering algorithm, number of
    // CONTROLREADY / DATAREADY / SCHEDULED algorithms, timestamp) to a file whose name
    // contains the thread count.
716  if ( m_dumpIntraEventDynamics ) {
    // NOTE(review): listing line 717 is missing; it presumably declares the stream `s`.
    // NOTE(review): index2algname() takes an unsigned index and is called here without the
    // `algo_index >= 0` guard used above; a negative algo_index would be converted to a
    // very large index - confirm this path is only reached with a valid algo_index.
718  s << index2algname( algo_index ) << ", " << thisAlgsStates.sizeOfSubset( State::CONTROLREADY ) << ", "
719  << thisAlgsStates.sizeOfSubset( State::DATAREADY ) << ", " << thisAlgsStates.sizeOfSubset( State::SCHEDULED )
720  << ", " << std::chrono::high_resolution_clock::now().time_since_epoch().count() << "\n";
721  auto threads = ( m_threadPoolSize != -1 ) ? std::to_string( m_threadPoolSize )
722  : std::to_string( tbb::task_scheduler_init::default_num_threads() );
723  std::ofstream myfile;
724  myfile.open( "IntraEventConcurrencyDynamics_" + threads + "T.csv", std::ios::app );
725  myfile << s.str();
726  myfile.close();
727  }
728 
    // The event is finished once the control-flow rules are resolved and no algorithm is
    // left in CONTROLREADY, DATAREADY or SCHEDULED state.
729  // Not complete because this would mean that the slot is already free!
730  if ( !thisSlot.complete && m_precSvc->CFRulesResolved( thisSlot ) &&
731  !thisSlot.algsStates.algsPresent( AlgsExecutionStates::CONTROLREADY ) &&
732  !thisSlot.algsStates.algsPresent( AlgsExecutionStates::DATAREADY ) &&
733  !thisSlot.algsStates.algsPresent( AlgsExecutionStates::SCHEDULED ) ) {
734 
735  thisSlot.complete = true;
736  // if the event did not fail, add it to the finished events
737  // otherwise it is taken care of in the error handling already
738  if ( m_algExecStateSvc->eventStatus( *thisSlot.eventContext ) == EventStatus::Success ) {
739  m_finishedEvents.push( thisSlot.eventContext );
740  if ( msgLevel( MSG::DEBUG ) )
741  debug() << "Event " << thisSlot.eventContext->evt() << " finished (slot " << thisSlot.eventContext->slot()
742  << ")." << endmsg;
743  }
744 
745  // now let's return the fully evaluated result of the control flow
746  if ( msgLevel( MSG::DEBUG ) ) debug() << m_precSvc->printState( thisSlot ) << endmsg;
747 
    // The slot no longer refers to the event context once it is complete.
748  thisSlot.eventContext = nullptr;
749  } else {
    // Event not finished: check for a stall; if one is reported, mark the event as
    // AlgStall and run the failure handling (its return code is deliberately ignored).
750  StatusCode eventStalledSC = isStalled( iSlot );
751  if ( !eventStalledSC.isSuccess() ) {
752  m_algExecStateSvc->setEventStatus( EventStatus::AlgStall, *thisSlot.eventContext );
753  eventFailed( thisSlot.eventContext ).ignore();
754  }
755  }
756  } // end loop on slots
757 
758  verbose() << "States Updated." << endmsg;
759 
760  return global_sc;
761 }
762 
763 //---------------------------------------------------------------------------
764 
772 {
773  // Get the slot
774  EventSlot& thisSlot = m_eventSlots[iSlot];
775 
776  if ( m_actionsQueue.empty() && m_algosInFlight == 0 && m_IOBoundAlgosInFlight == 0 &&
778 
779  info() << "About to declare a stall" << endmsg;
780  fatal() << "*** Stall detected! ***\n" << endmsg;
781  dumpSchedulerState( iSlot );
782  // throw GaudiException ("Stall detected",name(),StatusCode::FAILURE);
783 
784  return StatusCode::FAILURE;
785  }
786  return StatusCode::SUCCESS;
787 }
788 
789 //---------------------------------------------------------------------------
790 
797 {
798 
799  // To have just one big message
800  std::ostringstream outputMessageStream;
801 
802  outputMessageStream << "============================== Execution Task State ============================="
803  << std::endl;
804  dumpState( outputMessageStream );
805 
806  outputMessageStream << std::endl
807  << "============================== Scheduler State ================================="
808  << std::endl;
809 
810  int slotCount = -1;
811  for ( auto& thisSlot : m_eventSlots ) {
812  slotCount++;
813  if ( thisSlot.complete ) continue;
814 
815  outputMessageStream << "----------- slot: " << thisSlot.eventContext->slot()
816  << " event: " << thisSlot.eventContext->evt() << " -----------" << std::endl;
817 
818  if ( 0 > iSlot or iSlot == slotCount ) {
819 
820  // Snapshot of the Control Flow and FSM states
821  outputMessageStream << "\nControl Flow and FSM states:" << std::endl;
822  outputMessageStream << m_precSvc->printState( thisSlot ) << std::endl;
823  }
824  }
825 
826  outputMessageStream << "=================================== END ======================================" << std::endl;
827 
828  info() << "Dumping Scheduler State " << std::endl << outputMessageStream.str() << endmsg;
829 }
830 
831 //---------------------------------------------------------------------------
832 
834 {
835 
836  if ( m_algosInFlight == m_maxAlgosInFlight ) return StatusCode::FAILURE;
837 
838  const std::string& algName( index2algname( iAlgo ) );
839  IAlgorithm* ialgoPtr = nullptr;
840  StatusCode sc( m_algResourcePool->acquireAlgorithm( algName, ialgoPtr ) );
841 
842  if ( sc.isSuccess() ) { // if we managed to get an algorithm instance try to schedule it
843  EventContext* eventContext( m_eventSlots[si].eventContext );
844  if ( !eventContext ) {
845  fatal() << "Event context for algorithm " << algName << " is a nullptr (slot " << si << ")" << endmsg;
846  return StatusCode::FAILURE;
847  }
848 
849  ++m_algosInFlight;
850  auto promote2ExecutedClosure = [this, iAlgo, ialgoPtr, eventContext]() {
851  this->m_actionsQueue.push( [this, iAlgo, ialgoPtr, eventContext]() {
852  return this->AvalancheSchedulerSvc::promoteToExecuted( iAlgo, eventContext->slot(), ialgoPtr, eventContext );
853  } );
854  return StatusCode::SUCCESS;
855  };
856 
857  // Enqueue the algorithm as a TBB task, unless the pool size is the special value -100: then run it in this thread
858  if ( -100 != m_threadPoolSize ) {
859  // the child task that executes an Algorithm
860  tbb::task* algoTask = new ( tbb::task::allocate_root() )
861  AlgoExecutionTask( ialgoPtr, eventContext, serviceLocator(), m_algExecStateSvc, promote2ExecutedClosure );
862  // schedule the algoTask
863  tbb::task::enqueue( *algoTask );
864 
865  } else {
866  AlgoExecutionTask theTask( ialgoPtr, eventContext, serviceLocator(), m_algExecStateSvc, promote2ExecutedClosure );
867  theTask.execute();
868  }
869 
870  if ( msgLevel( MSG::DEBUG ) )
871  debug() << "Algorithm " << algName << " was submitted on event " << eventContext->evt() << " in slot " << si
872  << ". Algorithms scheduled are " << m_algosInFlight << endmsg;
873 
874  StatusCode updateSc( m_eventSlots[si].algsStates.updateState( iAlgo, AlgsExecutionStates::SCHEDULED ) );
875 
876  if ( msgLevel( MSG::VERBOSE ) ) dumpSchedulerState( -1 );
877 
878  if ( updateSc.isSuccess() )
879  if ( msgLevel( MSG::VERBOSE ) ) verbose() << "Promoting " << algName << " to SCHEDULED on slot " << si << endmsg;
880  return updateSc;
881  } else {
882  if ( msgLevel( MSG::DEBUG ) )
883  debug() << "Could not acquire instance for algorithm " << index2algname( iAlgo ) << " on slot " << si << endmsg;
884  return sc;
885  }
886 }
887 
888 //---------------------------------------------------------------------------
889 
891 {
892 
893  if ( m_IOBoundAlgosInFlight == m_maxIOBoundAlgosInFlight ) return StatusCode::FAILURE;
894 
895  // bool IOBound = m_precSvc->isBlocking(algName);
896 
897  const std::string& algName( index2algname( iAlgo ) );
898  IAlgorithm* ialgoPtr = nullptr;
899  StatusCode sc( m_algResourcePool->acquireAlgorithm( algName, ialgoPtr ) );
900 
901  if ( sc.isSuccess() ) { // if we managed to get an algorithm instance try to schedule it
902  EventContext* eventContext( m_eventSlots[si].eventContext );
903  if ( !eventContext ) {
904  fatal() << "[Asynchronous] Event context for algorithm " << algName << " is a nullptr (slot " << si << ")"
905  << endmsg;
906  return StatusCode::FAILURE;
907  }
908 
909  ++m_IOBoundAlgosInFlight;
910  // Can we use tbb-based overloaded new-operator for a "custom" task (an algorithm wrapper, not derived from
911  // tbb::task)? it seems it works..
912  IOBoundAlgTask* theTask = new ( tbb::task::allocate_root() )
913  IOBoundAlgTask( ialgoPtr, eventContext, serviceLocator(), m_algExecStateSvc );
914  m_IOBoundAlgScheduler->push( *theTask );
915 
916  if ( msgLevel( MSG::DEBUG ) )
917  debug() << "[Asynchronous] Algorithm " << algName << " was submitted on event " << eventContext->evt()
918  << " in slot " << si << ". algorithms scheduled are " << m_IOBoundAlgosInFlight << endmsg;
919 
920  StatusCode updateSc( m_eventSlots[si].algsStates.updateState( iAlgo, AlgsExecutionStates::SCHEDULED ) );
921 
922  if ( updateSc.isSuccess() )
923  if ( msgLevel( MSG::VERBOSE ) )
924  verbose() << "[Asynchronous] Promoting " << algName << " to SCHEDULED on slot " << si << endmsg;
925  return updateSc;
926  } else {
927  if ( msgLevel( MSG::DEBUG ) )
928  debug() << "[Asynchronous] Could not acquire instance for algorithm " << index2algname( iAlgo ) << " on slot "
929  << si << endmsg;
930  return sc;
931  }
932 }
933 
934 //---------------------------------------------------------------------------
939  EventContext* eventContext )
940 {
     // Post-execution bookkeeping for algorithm `iAlgo` in event slot `si`:
     //   1. flag the event as failed if the execution-state service does not report Success,
     //   2. hand the algorithm instance back to m_algResourcePool,
     //   3. decrement m_algosInFlight,
     //   4. record EVTACCEPTED / EVTREJECTED (from algo->filterPassed()) in the slot's state table,
     //   5. queue an updateStates() action.
     // Returns FAILURE if the instance could not be released; otherwise the status
     // returned by algsStates.updateState().
     // (`iAlgo`, `si` and `algo` are declared on the part of the signature that this
     // listing does not show.)
941  // Check if the execution failed
     // The status returned by eventFailed() is explicitly discarded via ignore().
942  if ( m_algExecStateSvc->eventStatus( *eventContext ) != EventStatus::Success ) eventFailed( eventContext ).ignore();
943 
     // Make this event's context the current one before the instance is released.
944  Gaudi::Hive::setCurrentContext( eventContext );
945  StatusCode sc = m_algResourcePool->releaseAlgorithm( algo->name(), algo );
946 
     // Early exit: on a failed release the in-flight counter is NOT decremented,
     // the algorithm state is NOT updated and no updateStates() action is queued.
947  if ( !sc.isSuccess() ) {
948  error() << "[Event " << eventContext->evt() << ", Slot " << eventContext->slot() << "] "
949  << "Instance of algorithm " << algo->name() << " could not be properly put back." << endmsg;
950  return StatusCode::FAILURE;
951  }
952 
953  m_algosInFlight--;
954 
955  EventSlot& thisSlot = m_eventSlots[si];
956 
957  if ( msgLevel( MSG::DEBUG ) )
958  debug() << "Trying to handle execution result of " << algo->name() << " on slot " << si << endmsg;
     // The new state depends only on the algorithm's filter decision.
959  State state;
960  if ( algo->filterPassed() ) {
961  state = State::EVTACCEPTED;
962  } else {
963  state = State::EVTREJECTED;
964  }
965 
     // `sc` is reused: from here on it carries the result of the state transition.
966  sc = thisSlot.algsStates.updateState( iAlgo, state );
967 
968  if ( sc.isSuccess() )
969  if ( msgLevel( MSG::VERBOSE ) )
970  verbose() << "Promoting " << algo->name() << " on slot " << si << " to " << AlgsExecutionStates::stateNames[state]
971  << endmsg;
972 
973  if ( msgLevel( MSG::DEBUG ) )
974  debug() << "Algorithm " << algo->name() << " executed in slot " << si << ". Algorithms scheduled are "
975  << m_algosInFlight << endmsg;
976 
977  // Schedule an update of the status of the algorithms
     // The update is only queued here, not run. It is queued even when updateState()
     // above failed. Arguments are (si = -1, algo_index = iAlgo).
978  m_actionsQueue.push( [this, iAlgo]() { return this->updateStates( -1, iAlgo ); } );
979 
980  return sc;
981 }
982 
983 //---------------------------------------------------------------------------
988  EventContext* eventContext )
989 {
     // Post-execution bookkeeping for the I/O-bound ("[Asynchronous]") algorithm `iAlgo`
     // in event slot `si`:
     //   1. flag the event as failed if the execution-state service does not report Success,
     //   2. hand the algorithm instance back to m_algResourcePool,
     //   3. decrement m_IOBoundAlgosInFlight,
     //   4. record EVTACCEPTED / EVTREJECTED (from algo->filterPassed()) in the slot's state table,
     //   5. queue an updateStates() action.
     // Returns FAILURE if the instance could not be released; otherwise the status
     // returned by algsStates.updateState().
     // (`iAlgo`, `si` and `algo` are declared on the part of the signature that this
     // listing does not show.)
990  // Check if the execution failed
     // The status returned by eventFailed() is explicitly discarded via ignore().
991  if ( m_algExecStateSvc->eventStatus( *eventContext ) != EventStatus::Success ) eventFailed( eventContext ).ignore();
992 
993  StatusCode sc = m_algResourcePool->releaseAlgorithm( algo->name(), algo );
994 
     // Early exit: on a failed release the in-flight counter is NOT decremented,
     // the algorithm state is NOT updated and no updateStates() action is queued.
995  if ( !sc.isSuccess() ) {
996  error() << "[Asynchronous] [Event " << eventContext->evt() << ", Slot " << eventContext->slot() << "] "
997  << "Instance of algorithm " << algo->name() << " could not be properly put back." << endmsg;
998  return StatusCode::FAILURE;
999  }
1000 
1001  m_IOBoundAlgosInFlight--;
1002 
1003  EventSlot& thisSlot = m_eventSlots[si];
1004 
1005  if ( msgLevel( MSG::DEBUG ) )
1006  debug() << "[Asynchronous] Trying to handle execution result of " << algo->name() << " on slot " << si << endmsg;
     // The new state depends only on the algorithm's filter decision.
1007  State state;
1008  if ( algo->filterPassed() ) {
1009  state = State::EVTACCEPTED;
1010  } else {
1011  state = State::EVTREJECTED;
1012  }
1013 
     // `sc` is reused: from here on it carries the result of the state transition.
1014  sc = thisSlot.algsStates.updateState( iAlgo, state );
1015 
1016  if ( sc.isSuccess() )
1017  if ( msgLevel( MSG::VERBOSE ) )
1018  verbose() << "[Asynchronous] Promoting " << algo->name() << " on slot " << si << " to "
     // Listing line 1019 restored: without it the statement above has no state name
     // and no terminator, and runs into the following `if`.
1019  << AlgsExecutionStates::stateNames[state] << endmsg;
1020 
1021  if ( msgLevel( MSG::DEBUG ) )
1022  debug() << "[Asynchronous] Algorithm " << algo->name() << " executed in slot " << si
1023  << ". Algorithms scheduled are " << m_IOBoundAlgosInFlight << endmsg;
1024 
1025  // Schedule an update of the status of the algorithms
     // The update is only queued here, not run. It is queued even when updateState()
     // above failed. Arguments are (si = -1, algo_index = iAlgo).
1026  m_actionsQueue.push( [this, iAlgo]() { return this->updateStates( -1, iAlgo ); } );
1027 
1028  return sc;
1029 }
1030 
1031 //===========================================================================
1033 {
     // Appends a SchedulerState built from the three arguments (a, e, t) to the
     // m_sState list. m_ssMut is held for the duration of the push.
     // (The signature line is not shown in this listing; the argument list matches
     // addAlg(Algorithm*, EventContext*, pthread_t) from the cross-reference index.)
1034 
1035  std::lock_guard<std::mutex> lock( m_ssMut );
1036  m_sState.push_back( SchedulerState( a, e, t ) );
1037 }
1038 
1039 //===========================================================================
1041 {
     // Removes the first entry of m_sState that compares equal to `a`.
     // Returns true if an entry was erased; otherwise logs an error and returns false.
     // m_ssMut is held for the whole function, including the error message.
     // (The signature line is not shown in this listing.)
1042 
1043  std::lock_guard<std::mutex> lock( m_ssMut );
1044 
1045  for ( std::list<SchedulerState>::iterator itr = m_sState.begin(); itr != m_sState.end(); ++itr ) {
1046  if ( *itr == a ) {
     // Return straight after erase(): `itr` is invalidated by it and must not be advanced.
1047  m_sState.erase( itr );
1048  return true;
1049  }
1050  }
1051 
1052  error() << "could not find Alg " << a->name() << " in Scheduler!" << endmsg;
1053  return false;
1054 }
1055 
1056 //===========================================================================
1058 {
     // Writes every entry of m_sState to `ost`, one entry per line, while holding m_ssMut.
     // m_ssMut is locked as a std::mutex (not recursive), so the calling thread must
     // not already hold it when entering this function.
     // (The signature line is not shown in this listing.)
1059 
1060  std::lock_guard<std::mutex> lock( m_ssMut );
1061 
     // NOTE(review): `auto it` takes each entry by value, i.e. one copy per iteration.
1062  for ( auto it : m_sState ) {
1063  ost << " " << it << std::endl;
1064  }
1065 }
1066 
1067 //===========================================================================
1069 {
     // Logs, at INFO level, the number of entries in m_sState followed by one line per entry.
     //
     // m_ssMut is held only while the header line is built. The per-entry dump is
     // delegated to dumpState( ost ), which acquires m_ssMut itself. Keeping the lock
     // across that call would lock the same std::mutex twice on one thread, which is
     // undefined behaviour (in practice a self-deadlock).
     // Consequence: the list may change between the header and the entries, so the
     // count in the header is a snapshot and can differ from the lines that follow.
     // (The signature line is not shown in this listing.)
1070 
1071  std::ostringstream ost;
1072  {
1073  std::lock_guard<std::mutex> lock( m_ssMut );
1074  ost << "dumping Executing Threads: [" << m_sState.size() << "]" << std::endl;
1075  }
1076  dumpState( ost );
1077  info() << ost.str() << endmsg;
1078 }
bool algsPresent(State state) const
const concurrency::PrecedenceRulesGraph * getRules() const
Precedence rules accessor.
Definition: PrecedenceSvc.h:68
Wrapper around I/O-bound Gaudi-algorithms.
StatusCode tryPopFinishedEvent(EventContext *&eventContext) override
Try to fetch an event from the scheduler.
StatusCode initialize() override
Definition: Service.cpp:64
const unsigned int & getAlgoIndex() const
Get algorithm index.
T empty(T...args)
T open(T...args)
const std::string & name() const override
The identifying name of the algorithm object.
Definition: Algorithm.cpp:737
StatusCode finalize() override
Definition: Service.cpp:174
ContextID_t slot() const
Definition: EventContext.h:40
StatusCode initialize() override
Initialise.
void dumpSchedulerState(int iSlot)
Dump the state of the scheduler.
StatusCode promoteToScheduled(unsigned int iAlgo, int si)
Algorithm promotion.
AlgsExecutionStates algsStates
Vector of algorithms states.
Definition: EventSlot.h:33
const DataObjIDColl & outputDataObjs() const override
bool isSuccess() const
Test for a status code of SUCCESS.
Definition: StatusCode.h:50
EventContext * eventContext
Cache for the eventContext.
Definition: EventSlot.h:28
StatusCode isStalled(int si)
Check if the scheduling is in a stall.
A service to resolve the task execution precedence.
Definition: PrecedenceSvc.h:21
Header file for class GaudiAlgorithm.
T to_string(T...args)
T endl(T...args)
virtual bool filterPassed() const =0
Did this algorithm pass or fail its filter criterion for the last event?
void activate()
Activate scheduler.
T end(T...args)
AlgorithmNode * getAlgorithmNode(const std::string &algoName) const
Get the AlgorithmNode by algorithm name using graph index.
size_t sizeOfSubset(State state) const
StatusCode promoteToAsyncScheduled(unsigned int iAlgo, int si)
This class represents an entry point to all the event specific data.
Definition: EventContext.h:24
bool isFailure() const
Test for a status code of FAILURE.
Definition: StatusCode.h:61
unsigned int algname2index(const std::string &algoname)
Convert a name to an integer.
void addAlg(Algorithm *, EventContext *, pthread_t)
virtual const std::string & type() const =0
The type of the algorithm.
tbb::task * execute() override
ContextEvt_t evt() const
Definition: EventContext.h:39
STL class.
StatusCode pushNewEvents(std::vector< EventContext * > &eventContexts) override
T push_back(T...args)
static std::list< SchedulerState > m_sState
STL class.
The AlgsExecutionStates encodes the state machine for the execution of algorithms within a single eve...
StatusCode popFinishedEvent(EventContext *&eventContext) override
Blocks until an event is available.
This class is used for returning status codes from appropriate routines.
Definition: StatusCode.h:26
const DataObjIDColl & inputDataObjs() const override
T close(T...args)
StatusCode finalize() override
Finalise.
#define DECLARE_SERVICE_FACTORY(x)
Definition: Service.h:211
bool complete
Flags completion of the event.
Definition: EventSlot.h:35
T max(T...args)
The IAlgorithm is the interface implemented by the Algorithm base class.
Definition: IAlgorithm.h:28
GAUDI_API void setCurrentContext(const EventContext *ctx)
T insert(T...args)
void addDependency(const DataObjID &id, const Gaudi::DataHandle::Mode &mode) override
Base class from which all concrete algorithm classes should be derived.
Definition: Algorithm.h:79
T find_if(T...args)
T size(T...args)
StatusCode pushNewEvent(EventContext *eventContext) override
Make an event available to the scheduler.
STL class.
void reset(EventContext *theeventContext)
Reset all resources in order to reuse the slot.
Definition: EventSlot.h:22
virtual Out operator()(const vector_of_const_< In > &inputs) const =0
bool isValid() const
Allow for check if smart pointer is valid.
Definition: SmartIF.h:68
StatusCode eventFailed(EventContext *eventContext)
Method to check if an event failed and take appropriate actions.
T begin(T...args)
Iterator begin(State kind)
const std::string & index2algname(unsigned int index)
Convert an integer to a name.
Class representing the event slot.
Definition: EventSlot.h:10
string s
Definition: gaudirun.py:253
StatusCode promoteToExecuted(unsigned int iAlgo, int si, IAlgorithm *algo, EventContext *)
The call to this method is triggered only from within the AlgoExecutionTask.
unsigned int freeSlots() override
Get free slots number.
T sort(T...args)
StatusCode promoteToAsyncExecuted(unsigned int iAlgo, int si, IAlgorithm *algo, EventContext *)
The call to this method is triggered only from within the IOBoundAlgTask.
StatusCode deactivate()
Deactivate scheduler.
StatusCode updateStates(int si=-1, int algo_index=-1)
Loop on algorithm in the slots and promote them to successive states (-1 for algo_index means skippin...
void ignore() const
Definition: StatusCode.h:84
State
Execution states of the algorithms.
T for_each(T...args)
std::string fullKey() const
Definition: DataObjID.cpp:54
STL class.
MsgStream & endmsg(MsgStream &s)
MsgStream Modifier: endmsg. Calls the output method of the MsgStream.
Definition: MsgStream.h:209
static GAUDI_API void setNumConcEvents(const std::size_t &nE)
unsigned int getControlFlowNodeCounter() const
Get total number of control flow graph nodes.
T reserve(T...args)
static std::map< State, std::string > stateNames
StatusCode m_drain()
Drain the actions present in the queue.
Iterator end(State kind)
StatusCode updateState(unsigned int iAlgo, State newState)