// cudamatrix/cu-device.h

// Copyright 2009-2012  Karel Vesely
//           2012-2015  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
#define KALDI_CUDAMATRIX_CU_DEVICE_H_

#if HAVE_CUDA == 1
#include <cublas_v2.h>
#include <cusparse.h>
#include <curand.h>
#include <map>
#include <string>
#include <iostream>
#include <mutex>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "base/kaldi-common.h"
#include "base/timer.h"
#include "cudamatrix/cu-allocator.h"
#include "cudamatrix/cu-common.h"

#if CUDA_VERSION >= 9010
#include <cusolverDn.h>
#else
// cusolver is not supported.
// Define a few types here to minimize the compiler guards needed elsewhere.
// If a user tries to use cusolver it will throw an error.
typedef void* cusolverDnHandle_t;
typedef int cusolverStatus_t;
#endif

namespace kaldi {

class CuTimer;

class CuDevice {
 public:

  // You obtain the CuDevice for the current thread by calling
  // CuDevice::Instantiate().
  // At the beginning of the program, if you want to use a GPU, you
  // should call CuDevice::Instantiate().SelectGpuId(..).
  static inline CuDevice& Instantiate() {
    CuDevice &ans = this_thread_device_;
    if (!ans.initialized_)
      ans.Initialize();
    return ans;
  }
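
  // A minimal usage sketch (not part of the original header): a typical
  // program selects the GPU once, from the main thread, right after option
  // parsing, e.g.:
  //
  //   CuDevice::Instantiate().SelectGpuId("yes");
  //
  // After this, the per-thread CuDevice objects returned by Instantiate()
  // attach lazily to the selected device.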

  inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
  inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
  inline curandGenerator_t GetCurandHandle() { return curand_handle_; }

  inline cusolverDnHandle_t GetCusolverDnHandle() {
#if CUDA_VERSION < 9010
    KALDI_ERR << "CUDA VERSION '" << CUDA_VERSION << "' not new enough to "
              << "support cusolver. Upgrade to at least 9.1.";
#endif
    return cusolverdn_handle_;
  }

  inline void SeedGpu() {
    if (CuDevice::Instantiate().Enabled()) {
      // To get the same random sequence on each run, call srand() before this
      // method is invoked.
      CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(
          curand_handle_, RandInt(128, RAND_MAX)));
      CURAND_SAFE_CALL(curandSetGeneratorOffset(curand_handle_, 0));
    }
  }
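
  // Sketch (illustrative only): seeding the GPU reproducibly. SeedGpu() draws
  // its curand seed via RandInt(), which follows the process RNG, so:
  //
  //   srand(seed);                         // 'seed' is a hypothetical int
  //   CuDevice::Instantiate().SeedGpu();   // same seed -> same GPU sequence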
  // We provide functions Malloc(), MallocPitch() and Free() which replace
  // cudaMalloc(), cudaMallocPitch() and cudaFree(). Their function is to cache
  // the results of previous allocations to avoid the very large overhead that
  // CUDA's allocation seems to give for some setups.
  inline void* Malloc(size_t size) {
    return multi_threaded_ ? g_cuda_allocator.MallocLocking(size) :
        g_cuda_allocator.Malloc(size);
  }

  inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
    if (multi_threaded_) {
      return g_cuda_allocator.MallocPitchLocking(row_bytes, num_rows, pitch);
    } else if (debug_stride_mode_) {
      // The pitch bucket size is hardware dependent.
      // It is 512 on K40c with CUDA 7.5.
      // Adding a random multiple (0 to 4) of the bucket size makes it likely
      // that adjacent allocations get different pitches, even if their
      // requested row_bytes would give the same pitch in normal mode.
      return g_cuda_allocator.MallocPitch(
          row_bytes + 512 * RandInt(0, 4), num_rows,
          pitch);
    } else {
      return g_cuda_allocator.MallocPitch(row_bytes, num_rows, pitch);
    }
  }

  inline void Free(void *ptr) {
    if (multi_threaded_) g_cuda_allocator.FreeLocking(ptr);
    else g_cuda_allocator.Free(ptr);
  }
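
  // Sketch (illustrative only): the cached-allocator calls mirror the CUDA
  // ones they replace; num_rows and num_cols below are hypothetical:
  //
  //   size_t pitch;
  //   void *data = CuDevice::Instantiate().MallocPitch(
  //       num_cols * sizeof(float), num_rows, &pitch);
  //   // ... use 'data' as a device pointer with row stride 'pitch' ...
  //   CuDevice::Instantiate().Free(data);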

  // Selects the GPU to use. The argument is the value of the --use-gpu
  // option; it is expected to be "yes", "no", "optional" or "wait".
  void SelectGpuId(std::string use_gpu);

  // Select a specific GPU for computation. Will reuse the existing CUDA
  // context for that device. Initializes the necessary handles for GPU use
  // (e.g. the cublas handle).
  bool SelectAndInitializeGpuIdWithExistingCudaContext(int dev_id);

  // Returns true if a GPU is currently enabled for use.
  bool Enabled() const {
    return (device_id_ > -1);
  }

  // Returns true if the active GPU supports double precision.
  bool DoublePrecisionSupported();

  // Accumulates timing information under the given function name, if the
  // verbose level is >= 1; see profile_map_ below.
  void AccuProfile(const char *function_name, const CuTimer &timer);

  // Prints the accumulated profiling information.
  void PrintProfile();

  // Prints the device's memory usage.
  void PrintMemoryUsage() const;

  // Call this to indicate that multiple threads will access the GPU, so that
  // locking is used in the allocator and the profiling code.
  inline void AllowMultithreading() { multi_threaded_ = true; }

  // Gets the name of the device with the given ID.
  void DeviceGetName(char* name, int32 len, int32 dev);

  // Checks that the GPU is in a healthy state (e.g. by running a small test
  // computation).
  void CheckGpuHealth();

  int32 GetMatrixAlignment() const;

  // Activates or deactivates the debug-stride mode (see debug_stride_mode_
  // below); returns the previous mode.
  bool SetDebugStrideMode(bool mode) {
    bool old_mode = debug_stride_mode_;
    debug_stride_mode_ = mode;
    return old_mode;
  }
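
  // Sketch (illustrative only): because SetDebugStrideMode() returns the
  // previous mode, a test can enable it temporarily and restore it afterwards:
  //
  //   bool old_mode = CuDevice::Instantiate().SetDebugStrideMode(true);
  //   // ... run the code under test ...
  //   CuDevice::Instantiate().SetDebugStrideMode(old_mode);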

  // Returns true if the GPU is set to compute-exclusive mode.
  bool IsComputeExclusive();

  // Register command line options for the CUDA device.
  // This must be done before calling CuDevice::Initialize().
  // Example:
  //  CuDevice::RegisterDeviceOptions(&po);
  //  po.Read(argc, argv);
  //  CuDevice::Initialize();
  static void RegisterDeviceOptions(OptionsItf *po) {
    CuDevice::device_options_.Register(po);
  }
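
  // A slightly fuller sketch of the example above (illustrative only; assumes
  // a typical Kaldi binary using ParseOptions, with a hypothetical 'usage'
  // string):
  //
  //   ParseOptions po(usage);
  //   CuDevice::RegisterDeviceOptions(&po);
  //   po.Read(argc, argv);
  //   CuDevice::Instantiate().SelectGpuId("yes");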
  ~CuDevice();
 private:

  struct CuDeviceOptions {
    bool use_tensor_cores;  // Enable tensor cores
    CuDeviceOptions(): use_tensor_cores(false) { }
    void Register(OptionsItf *po) {
      po->Register("cuda-use-tensor-cores", &use_tensor_cores,
                   "Enable FP16 tensor math. "
                   "This gives higher performance but lower accuracy. "
                   "This is only recommended for inference.");
    }
  };

  static CuDeviceOptions device_options_;

  // Default constructor used to initialize this_thread_device_.
  CuDevice();
  CuDevice(CuDevice&);  // Disallow.
  CuDevice &operator=(CuDevice&);  // Disallow.


  // Sets up the CUDA context and the cublas/cusparse/curand handles for this
  // thread; called lazily from Instantiate().
  void Initialize();

  // Automatically selects a GPU to use.
  bool SelectGpuIdAuto();

  // Selects the GPU given its ID. Called from SelectGpuIdAuto or
  // SelectGpuIdWithExistingCudaContext.
  bool SelectGpuId(int dev_id);

  // Finishes initialization of the GPU that was selected.
  void FinalizeActiveGpu();

  // Returns the major version of the device's compute capability.
  int32 MajorDeviceVersion();

  // Returns the minor version of the device's compute capability.
  int32 MinorDeviceVersion();


  // Each thread has its own CuDevice object, which contains the cublas and
  // cusparse handles. These are unique to the thread (which is what is
  // recommended by NVidia).
  static thread_local CuDevice this_thread_device_;

  // The GPU device-id that we are using. This will be initialized to -1, and
  // will be set when the user calls
  //  CuDevice::Instantiate().SelectGpuId(...)
  // from the main thread. Background threads will call
  // cudaSetDevice(device_id_) the first time CuDevice::Instantiate() is
  // called from them after they are spawned.
  static int32 device_id_;

  // This will automatically be set to true if the application has multiple
  // threads that access the GPU device. It is used to know whether to
  // use locks when accessing the allocator and the profiling-related code.
  static bool multi_threaded_;

  // The variable profile_map_ will only be used if the verbose level is >= 1;
  // it will accumulate some function-level timing information that is printed
  // out at program end. This makes things a bit slower as we have to call
  // cudaDeviceSynchronize() to make the timing information meaningful.
  static unordered_map<std::string, double, StringHasher> profile_map_;
  // profile_mutex_ guards profile_map_ in case multi_threaded_ is true.
  static std::mutex profile_mutex_;

  // free_memory_at_startup_ is just used in printing the memory used according
  // to the device.
  static int64 free_memory_at_startup_;
  static cudaDeviceProp properties_;

  // If set to true by SetDebugStrideMode(), code will be activated to use
  // pseudo-random stride values when allocating data (to detect errors which
  // otherwise would be rare).
  static bool debug_stride_mode_;


  // The following member variable is initialized to false; if the user calls
  // Instantiate() in a thread where it is still false, Initialize() will be
  // called, in order to -- if a GPU is being used -- call cudaSetDevice() and
  // set up the cublas and cusparse handles.
  bool initialized_;

  // This variable is just a copy of the static variable device_id_. It's used
  // to detect when this code is called in the wrong way.
  int32 device_id_copy_;

  cublasHandle_t cublas_handle_;
  cusparseHandle_t cusparse_handle_;
  curandGenerator_t curand_handle_;
  cusolverDnHandle_t cusolverdn_handle_;
};  // class CuDevice


// Class CuTimer is a convenience wrapper for class Timer which only
// sets the time if the verbose level is >= 1. This helps avoid
// an unnecessary system call if the verbose level is 0 and you
// won't be accumulating the timing stats.
class CuTimer: public Timer {
 public:
  CuTimer(): Timer(GetVerboseLevel() >= 1) { }
};
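
// Sketch (illustrative only): the intended pattern for profiling a function
// that launches GPU work, using CuTimer together with AccuProfile();
// SomeCudaFunction is a hypothetical name:
//
//   void SomeCudaFunction() {
//     CuTimer tim;
//     // ... launch kernels ...
//     CuDevice::Instantiate().AccuProfile(__func__, tim);
//   }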

// This function is declared as a more convenient way to get the cuBLAS handle
// for use in the CUBLAS v2 API, since we so frequently need to access it.
inline cublasHandle_t GetCublasHandle() {
  return CuDevice::Instantiate().GetCublasHandle();
}

inline cusolverDnHandle_t GetCusolverDnHandle() {
  return CuDevice::Instantiate().GetCusolverDnHandle();
}

// A more convenient way to get the handle to use cuSPARSE APIs.
inline cusparseHandle_t GetCusparseHandle() {
  return CuDevice::Instantiate().GetCusparseHandle();
}

inline curandGenerator_t GetCurandHandle() {
  return CuDevice::Instantiate().GetCurandHandle();
}
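
// Sketch (illustrative only): using the free-function getter in a cuBLAS
// call; n, x and y are hypothetical, with x and y device pointers of length n:
//
//   float alpha = 1.0f;
//   CUBLAS_SAFE_CALL(cublasSaxpy(GetCublasHandle(), n, &alpha, x, 1, y, 1));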


}  // namespace kaldi

#endif  // HAVE_CUDA


namespace kaldi {

// The function SynchronizeGpu() is, for convenience, defined whether or not
// we have compiled for CUDA.
void SynchronizeGpu();

}  // namespace kaldi

#endif  // KALDI_CUDAMATRIX_CU_DEVICE_H_