The Gaudi Framework  master (37c0b60a)
CUDADeviceArray.cpp
Go to the documentation of this file.
1 /***********************************************************************************\
2 * (c) Copyright 2024 CERN for the benefit of the LHCb and ATLAS collaborations *
3 * *
4 * This software is distributed under the terms of the Apache version 2 licence, *
5 * copied verbatim in the file "LICENSE". *
6 * *
7 * In applying this licence, CERN does not waive the privileges and immunities *
8 * granted to it by virtue of its status as an Intergovernmental Organization *
9 * or submit itself to any jurisdiction. *
10 \***********************************************************************************/
11 #include "CUDADeviceArray.h"
12 
13 // Gaudi
15 #include <Gaudi/CUDA/CUDAStream.h>
16 
17 // CUDA
18 #ifndef __CUDACC__
19 # include <cuda_runtime.h>
20 #endif
21 
22 // Fibers
23 #include <boost/fiber/condition_variable.hpp>
24 #include <boost/fiber/mutex.hpp>
25 
26 // standard library
27 #include <chrono>
28 #include <format>
29 #include <string>
30 #include <thread>
31 
33  using namespace std::chrono_literals;
namespace {
  /// Exception "tag" used for every GaudiException thrown from this file.
  const std::string DEVARREXC = "CUDADeviceArrayException";

  /// Build a human-readable message for a CUDA runtime error.
  ///
  /// \param err  the CUDA error code to describe
  /// \param file source file of the failing call (callers pass __FILE__)
  /// \param line source line of the failing call (callers pass __LINE__)
  /// \return formatted message with error name, numeric code, description and location
  ///
  /// The file name is taken as a `const char*` (all call sites pass the
  /// __FILE__ literal) to avoid constructing a temporary std::string per call.
  std::string err_fmt( cudaError_t err, const char* file, int line ) {
    const char* errname = cudaGetErrorName( err );
    const char* errstr  = cudaGetErrorString( err );
    return std::format( "Encountered CUDA error {} [{}]: {} on {}:{}", errname, int( err ), errstr, file, line );
  }

  /// Mutex / condition-variable pair used to park fibers whose device
  /// allocation failed with out-of-memory; the free* functions below call
  /// notify_all() after releasing device memory so waiters can retry.
  boost::fibers::mutex              gpu_mem_mtx;
  boost::fibers::condition_variable gpu_mem_cv;
} // namespace
47 
49  void* devPtr = nullptr;
50  cudaError_t err = cudaSuccess;
51  auto start_time = std::chrono::steady_clock::now();
52  do {
53  err = cudaMallocAsync( &devPtr, size, stream );
54  if ( err == cudaSuccess ) { break; }
55  if ( err == cudaErrorMemoryAllocation ) {
56  cudaGetLastError();
57  std::unique_lock lck( gpu_mem_mtx );
58  gpu_mem_cv.wait( lck );
59  } else {
60  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
61  }
62  } while ( err == cudaErrorMemoryAllocation );
63  // In case we suspended
64  stream.parent()->restoreAfterSuspend().orThrow( "Error restoring", DEVARREXC );
65  return devPtr;
66  }
67 
// --- body of allocateNoStream( std::size_t size, Gaudi::AsynchronousAlgorithm* parent ) ---
// NOTE(review): the signature line and one statement in the empty else-branch
// below (a std::this_thread::sleep_for call, per this file's symbol index)
// were elided by the documentation generator; restore them from the original
// source before editing this function.
void* devPtr = nullptr;
cudaError_t err = cudaSuccess;
// NOTE(review): start_time appears unused in the visible code — confirm
// against the original source (the elided sleep/backoff may have used it).
auto start_time = std::chrono::steady_clock::now();
do {
  err = cudaMalloc( &devPtr, size );
  if ( err == cudaSuccess ) { break; }
  if ( err == cudaErrorMemoryAllocation ) {
    // Clear the sticky out-of-memory error before retrying.
    cudaGetLastError();
    // If called from an AsynchronousAlgorithm, wait as in the with stream variant
    // Otherwise, the thread should sleep
    if ( parent != nullptr ) {
      // Park this fiber until a free* call signals memory was released,
      // then re-establish the algorithm's context after the suspension.
      std::unique_lock lck( gpu_mem_mtx );
      gpu_mem_cv.wait( lck );
      parent->restoreAfterSuspend().orThrow( "Error restoring", DEVARREXC );
    } else {
      // (elided by the doc generator: a std::this_thread::sleep_for call — see NOTE above)
    }
  } else {
    // Any error other than out-of-memory is fatal.
    throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
  }
} while ( err == cudaErrorMemoryAllocation );
return devPtr;
}
92 
93  void freeWithStream( void* ptr, Stream& stream ) {
94  cudaError_t err = cudaFreeAsync( ptr, stream );
95  if ( err != cudaSuccess ) {
96  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
97  }
98  gpu_mem_cv.notify_all();
99  }
100 
101  void freeNoStream( void* ptr ) {
102  cudaError_t err = cudaFree( ptr );
103  if ( err != cudaSuccess ) {
104  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
105  }
106  gpu_mem_cv.notify_all();
107  }
108 
109  void copyHostToDeviceWithStream( void* devPtr, const void* hstPtr, std::size_t size, Stream& stream ) {
110  cudaError_t err = cudaMemcpyAsync( devPtr, hstPtr, size, cudaMemcpyHostToDevice, stream );
111  if ( err != cudaSuccess ) {
112  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
113  }
114  // await stream to avoid deleting host memory before copy is done
115  stream.await().orThrow( "Await error", DEVARREXC );
116  }
117 
118  void copyHostToDeviceNoStream( void* devPtr, const void* hstPtr, std::size_t size ) {
119  cudaError_t err = cudaMemcpy( devPtr, hstPtr, size, cudaMemcpyHostToDevice );
120  if ( err != cudaSuccess ) {
121  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
122  }
123  }
124 
125  void copyDeviceToHostWithStream( void* hstPtr, const void* devPtr, std::size_t size, Stream& stream ) {
126  cudaError_t err = cudaMemcpyAsync( hstPtr, devPtr, size, cudaMemcpyDeviceToHost, stream );
127  if ( err != cudaSuccess ) {
128  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
129  }
130  // await stream to avoid deleting host memory before copy is done
131  stream.await().orThrow( "Await error", DEVARREXC );
132  }
133 
134  void copyDeviceToHostNoStream( void* hstPtr, const void* devPtr, std::size_t size ) {
135  cudaError_t err = cudaMemcpy( hstPtr, devPtr, size, cudaMemcpyDeviceToHost );
136  if ( err != cudaSuccess ) {
137  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
138  }
139  }
140 
141  void copyDeviceToDeviceWithStream( void* destDevPtr, const void* srcDevPtr, std::size_t size, Stream& stream ) {
142  cudaError_t err = cudaMemcpyAsync( destDevPtr, srcDevPtr, size, cudaMemcpyDeviceToDevice, stream );
143  if ( err != cudaSuccess ) {
144  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
145  }
146  }
147 
148  void copyDeviceToDeviceNoStream( void* destDevPtr, const void* srcDevPtr, std::size_t size ) {
149  cudaError_t err = cudaMemcpy( destDevPtr, srcDevPtr, size, cudaMemcpyDeviceToDevice );
150  if ( err != cudaSuccess ) {
151  throw GaudiException( err_fmt( err, __FILE__, __LINE__ ), DEVARREXC, StatusCode::FAILURE );
152  }
153  }
154 } // namespace Gaudi::CUDA::Detail
CUDADeviceArray.h
std::this_thread::sleep_for
T sleep_for(T... args)
Write.stream
stream
Definition: Write.py:32
std::string
STL class.
details::size
constexpr auto size(const T &, Args &&...) noexcept
Definition: AnyDataWrapper.h:23
StatusCode::orThrow
const StatusCode & orThrow(std::string_view message, std::string_view tag) const
Throw a GaudiException in case of failures.
Definition: StatusCode.h:206
Gaudi::CUDA::Detail::copyDeviceToDeviceNoStream
void copyDeviceToDeviceNoStream(void *destDevPtr, const void *srcDevPtr, std::size_t size)
Definition: CUDADeviceArray.cpp:148
GaudiException
Definition: GaudiException.h:31
Gaudi::CUDA::Detail::copyDeviceToDeviceWithStream
void copyDeviceToDeviceWithStream(void *destDevPtr, const void *srcDevPtr, std::size_t size, Stream &stream)
Definition: CUDADeviceArray.cpp:141
CUDAStream.h
Gaudi::CUDA::Detail::copyHostToDeviceNoStream
void copyHostToDeviceNoStream(void *devPtr, const void *hstPtr, std::size_t size)
Definition: CUDADeviceArray.cpp:118
Gaudi::Units::ms
constexpr double ms
Definition: SystemOfUnits.h:154
AsynchronousAlgorithm.h
Gaudi::CUDA::Detail::freeWithStream
void freeWithStream(void *ptr, Stream &stream)
Definition: CUDADeviceArray.cpp:93
Gaudi::CUDA::Detail::allocateNoStream
void * allocateNoStream(std::size_t size, Gaudi::AsynchronousAlgorithm *parent)
Definition: CUDADeviceArray.cpp:68
Gaudi::CUDA::Detail::allocateWithStream
void * allocateWithStream(std::size_t size, Stream &stream)
Definition: CUDADeviceArray.cpp:48
std::unique_lock
STL class.
Gaudi::CUDA::Stream
Definition: CUDAStream.h:21
Gaudi::AsynchronousAlgorithm::restoreAfterSuspend
virtual StatusCode restoreAfterSuspend() const
Restore after suspend.
Definition: AsynchronousAlgorithm.cpp:38
Gaudi::AsynchronousAlgorithm
Base class for asynchronous algorithms.
Definition: AsynchronousAlgorithm.h:34
format
GAUDI_API std::string format(const char *,...)
MsgStream format utility "a la sprintf(...)".
Definition: MsgStream.cpp:119
Gaudi::CUDA::Detail::copyDeviceToHostWithStream
void copyDeviceToHostWithStream(void *hstPtr, const void *devPtr, std::size_t size, Stream &stream)
Definition: CUDADeviceArray.cpp:125
Gaudi::CUDA::Detail
Definition: CUDADeviceArray.cpp:32
Gaudi::CUDA::Detail::freeNoStream
void freeNoStream(void *ptr)
Definition: CUDADeviceArray.cpp:101
Gaudi::CUDA::Detail::copyHostToDeviceWithStream
void copyHostToDeviceWithStream(void *devPtr, const void *hstPtr, std::size_t size, Stream &stream)
Definition: CUDADeviceArray.cpp:109
plotSpeedupsPyRoot.line
line
Definition: plotSpeedupsPyRoot.py:198
std::size_t
StatusCode::FAILURE
constexpr static const auto FAILURE
Definition: StatusCode.h:101
Gaudi::CUDA::Detail::copyDeviceToHostNoStream
void copyDeviceToHostNoStream(void *hstPtr, const void *devPtr, std::size_t size)
Definition: CUDADeviceArray.cpp:134
std::chrono::steady_clock::now
T now(T... args)