cu-device.cc
Go to the documentation of this file.
1 // cudamatrix/cu-device.cc
2 
3 // Copyright 2009-2012 Karel Vesely
4 // 2013 Lucas Ondel
5 // 2013-2015 Johns Hopkins University (author: Daniel Povey)
6 // 2015 Guoguo Chen
7 
8 // See ../../COPYING for clarification regarding multiple authors
9 //
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 //
14 // http://www.apache.org/licenses/LICENSE-2.0
15 //
16 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
18 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
19 // MERCHANTABLITY OR NON-INFRINGEMENT.
20 // See the Apache 2 License for the specific language governing permissions and
21 // limitations under the License.
22 
23 
24 
25 #if HAVE_CUDA == 1
26 #include <cublas_v2.h>
27 #include <cuda.h>
28 #include <cuda_runtime_api.h>
29 
30 #include <string>
31 #include <vector>
32 #include <algorithm>
33 #ifndef _MSC_VER
34 #include <dlfcn.h>
35 #endif
36 
37 #include "cudamatrix/cu-common.h"
38 #include "cudamatrix/cu-device.h"
39 #include "cudamatrix/cu-matrix.h"
40 #include "base/kaldi-error.h"
41 #include "base/kaldi-utils.h"
42 #include "util/common-utils.h"
43 #include "util/kaldi-io.h"
// the following is for cuda_legacy_noop().
#include "cudamatrix/cu-kernels.h"
47 namespace kaldi {
48 
// Tries to obtain a CUDA context; returns true on success.  On failure, if
// 'debug_str' is non-NULL it is set to a human-readable summary of the
// per-device errors encountered (useful for diagnosing why no context could
// be created).
static bool GetCudaContext(int32 num_gpus, std::string *debug_str) {
  // Our first attempt to get a device context is: we do cudaFree(0) and see if
  // that returns no error code.  If it succeeds then we have a device
  // context.  Apparently this is the canonical way to get a context.
  if (cudaFree(0) == 0) {
    cudaGetLastError();  // Clear any error status.
    return true;
  }

  // The rest of this code represents how we used to get a device context, but
  // now its purpose is mainly a debugging one: try each device in turn and
  // record why each one failed.
  std::ostringstream debug_stream;
  debug_stream << "num-gpus=" << num_gpus << ". ";
  for (int32 device = 0; device < num_gpus; device++) {
    cudaSetDevice(device);
    cudaError_t e = cudaFree(0);  // CUDA context gets created here.
    if (e == cudaSuccess) {
      if (debug_str)
        *debug_str = debug_stream.str();
      cudaGetLastError();  // Make sure the error state doesn't get returned in
                           // the next cudaGetLastError().
      return true;
    }
    debug_stream << "Device " << device << ": " << cudaGetErrorString(e) << ". ";
  }
  if (debug_str)
    *debug_str = debug_stream.str();
  return false;
}
81 
82 
void CuDevice::Initialize() {
  // This function may be called in the following two situations:
  //
  // (1) in the main thread, only when a GPU is not currently being used, either
  // within a call like CuDevice()::Instantiate().SelectGpuId(..)
  // (where the Instantiate() call will call Initialize() before SelectGpuId()
  // is called, just because of how Instantiate() works), or in a call
  // to 'CuDevice::Instantiate().Enabled()'.  In this case it will just
  // set initialized_ to true, notice that device_id_ == -1, and do nothing.
  //
  // (2) in threads created by the user, as soon as someone calls something that
  // might potentially use the GPU, via CuDevice()::Instantiate().
  // If device_id_ is >= 0, this will create the cuBLAS and cuSparse handles.
  KALDI_ASSERT(!initialized_);
  initialized_ = true;
  if (device_id_ == -1) {
    // There is nothing to do; we are not using a GPU.
    return;
  } else {
    // A GPU was selected by the main thread, but this thread has not set up
    // its per-thread handles yet; warn if the user never declared the
    // program multi-threaded.
    if (!multi_threaded_) {
      multi_threaded_ = true;
      KALDI_WARN << "For multi-threaded code that might use GPU, you should call "
          "CuDevice::Instantiate().AllowMultithreading() at the start of "
          "the program.";
    }
    device_id_copy_ = device_id_;
    cudaSetDevice(device_id_);
    // Initialize CUBLAS; its work is issued on the per-thread default stream.
    CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
    CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));

#if CUDA_VERSION >= 9010
    // cuSOLVER (dense) is only available from CUDA 9.1 onwards.
    CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_));
    CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_,
                                           cudaStreamPerThread));
#endif

#if CUDA_VERSION >= 9000
    if (device_options_.use_tensor_cores) {
      // Enable tensor cores in CUBLAS.
      // Note: if the device does not support tensor cores this will fall back
      // to normal math mode.
      CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_,
                                         CUBLAS_TENSOR_OP_MATH));
    }
#endif

    // Initialize the cuSPARSE library.
    CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
    CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));

    // Initialize the cuRAND generator.
    CURAND_SAFE_CALL(curandCreateGenerator(
        &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
    // To get the same random sequence, call srand() before the constructor is
    // invoked.
    CURAND_SAFE_CALL(curandSetGeneratorOrdering(
        curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));
    CURAND_SAFE_CALL(curandSetStream(curand_handle_, cudaStreamPerThread));
    SeedGpu();
  }
}
143 
// Selects (or declines to select) a GPU according to 'use_gpu', which must be
// one of "yes" | "no" | "optional" | "wait".  With "yes"/"wait" failure to get
// a GPU is fatal; with "optional" we fall back to CPU; with "wait" we retry
// indefinitely until a context can be created.  On success, FinalizeActiveGpu()
// is called to create the library handles.
void CuDevice::SelectGpuId(std::string use_gpu) {
  if (device_id_ != -1) {
    KALDI_ERR << "You cannot call SelectGpuId twice if, on the first time, "
        "you requested a GPU.";
  }
  if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional" && use_gpu != "wait") {
    KALDI_ERR << "Please choose : --use-gpu=yes|no|optional|wait, passed '" << use_gpu << "'";
  }
  if (use_gpu == "no") {
    KALDI_LOG << "Manually selected to compute on CPU.";
    return;
  }
  // Check that we have a gpu available.
  int32 num_gpus = 0;

  cudaError_t e = cudaGetDeviceCount(&num_gpus);

  // Make sure the global allocator object has the up-to-date options.
  g_cuda_allocator.SetOptions(g_allocator_options);

  if (num_gpus == 0) {
    if (use_gpu == "yes" || use_gpu == "wait") {
      KALDI_CUDA_ERR(e, "No CUDA GPU detected!");
    }
    if (use_gpu == "optional") {
      KALDI_WARN << "No CUDA GPU detected; running on CPU since --use-gpu=optional specified.";
      return;
    }
  }

  // Create a CUDA context.
  std::string debug_str;
  bool got_context = GetCudaContext(num_gpus, &debug_str);

  if (use_gpu != "wait") {
    if (!got_context) {
      // So far we don't have a context; sleep a bit and retry once.
      // Sleep longer in the "yes" case, since failure there is fatal.
      int32 sec_sleep = (use_gpu == "yes" ? 20 : 2);
      KALDI_WARN << "Will try again to get a GPU after " << sec_sleep
                 << " seconds.";
      Sleep(sec_sleep);
      if (!GetCudaContext(num_gpus, &debug_str)) {
        if (use_gpu == "yes") {
          {
            // Run nvidia-smi with its output sent to stderr, to help the user
            // diagnose which GPUs are busy.  The Input object's scope closes
            // the pipe.
            Input input;
            input.Open("nvidia-smi 1>&2 |");
          }
          KALDI_LOG << debug_str;
          KALDI_ERR << "Failed to create CUDA context, no more unused GPUs? ";
        }
        if (use_gpu == "optional") {
          KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?";
          return;
        }
      }
    }
  } else {
    // use_gpu == "wait": retry indefinitely, every 'sec_sleep' seconds.
    int32 num_times = 0;
    BaseFloat wait_time = 0.0;
    while (!got_context) {
      int32 sec_sleep = 5;
      if (num_times == 0)
        KALDI_WARN << "Will try again indefinitely every " << sec_sleep
                   << " seconds to get a GPU.";
      num_times++;
      wait_time += sec_sleep;
      Sleep(sec_sleep);
      got_context = GetCudaContext(num_gpus, NULL);
    }

    KALDI_WARN << "Waited " << wait_time
               << " seconds before creating CUDA context";
  }

  // Double check that we have the context.
  KALDI_ASSERT(cudaSuccess == cudaDeviceSynchronize());

  // Check if the machine uses compute exclusive mode.
  if (IsComputeExclusive()) {
    // In exclusive mode the driver already gave us a free GPU, so keep it.
    KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.";
    FinalizeActiveGpu();
    return;
  } else {
    // Suggest to use compute exclusive mode.
    KALDI_WARN << "Not in compute-exclusive mode.  Suggestion: use "
        "'nvidia-smi -c 3' to set compute exclusive mode";
    // We want to choose the device more carefully, so release the CUDA context.
    e = cudaDeviceReset();
    if (e != cudaSuccess) {
      KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU");
    }

    // And select the GPU according to proportion of free memory.
    if (SelectGpuIdAuto()) {
      FinalizeActiveGpu();
      return;
    } else {
      // We could not get a GPU the second time, after previously having the
      // CUDA context.  Strange but not impossible.
      if (use_gpu == "yes") {
        KALDI_ERR << "Error acquiring GPU.";
      }
      if (use_gpu == "optional") {
        KALDI_WARN << "Running on CPU!!! Error acquiring GPU.";
        return;
      }
    }
  }
}
253 
254 
void CuDevice::FinalizeActiveGpu() {
  // The device at this point should have an active GPU, so we can query its
  // name and memory stats and notify user which GPU is being used.

  // Get the device-id of the active device.
  {
    int device_id;
    cudaError_t e = cudaGetDevice(&device_id);
    if (e != cudaSuccess) {
      KALDI_CUDA_ERR(e, "Failed to get device-id of active device.");
    }
    device_id_ = device_id;
    device_id_copy_ = device_id;
    initialized_ = true;  // Prevent Initialize() from being called on this,
                          // the main thread.
    // Initialize CUBLAS; its work is issued on the per-thread default stream.
    CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
    CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));

#if CUDA_VERSION >= 9010
    // cuSOLVER (dense) is only available from CUDA 9.1 onwards.
    CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_));
    CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_,
                                           cudaStreamPerThread));
#endif

#if CUDA_VERSION >= 9000
    if (device_options_.use_tensor_cores) {
      // Enable tensor cores in CUBLAS.
      // Note: if the device does not support tensor cores this will fall back
      // to normal math mode.
      CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_,
                                         CUBLAS_TENSOR_OP_MATH));
    }
#endif


    // Initialize the cuSPARSE library.
    CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
    CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));

    // Initialize the cuRAND generator.
    // NOTE(review): unlike Initialize(), this path never calls
    // curandSetStream(curand_handle_, cudaStreamPerThread) -- confirm whether
    // that asymmetry is intentional.
    CURAND_SAFE_CALL(curandCreateGenerator(
        &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
    // To get the same random sequence, call srand() before the constructor is
    // invoked.
    CURAND_SAFE_CALL(curandSetGeneratorOrdering(
        curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));
    SeedGpu();

    // Notify the user which GPU is being used.
    char name[128];
    DeviceGetName(name,128, device_id);

    CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, device_id));

    KALDI_LOG << "The active GPU is [" << device_id << "]: " << name << "\t"
              << GetFreeGpuMemory(&free_memory_at_startup_, NULL) << " version "
              << properties_.major << "." << properties_.minor;
  }
  return;
}
314 
315 bool CuDevice::DoublePrecisionSupported() {
316  if (!Enabled()) return true;
317  return properties_.major > 1 || (properties_.major == 1 && properties_.minor >= 3);
318  // Double precision is supported from version 1.3
319 }
320 
321 
322 bool CuDevice::IsComputeExclusive() {
323  // assume we already have an CUDA context created
324  KALDI_ASSERT(cudaSuccess == cudaDeviceSynchronize());
325 
326  // get the device-id and its device-properties
327  int gpu_id = -1;
328  cudaError_t e = cudaGetDevice(&gpu_id);
329  if (e != cudaSuccess) {
330  KALDI_CUDA_ERR(e, "Failed to get current device");
331  }
332  struct cudaDeviceProp gpu_prop;
333  e = cudaGetDeviceProperties(&gpu_prop, gpu_id);
334  if (e != cudaSuccess) {
335  KALDI_CUDA_ERR(e, "Failed to get device properties");
336  }
337  // find out whether compute exclusive mode is used
338  switch (gpu_prop.computeMode) {
339  case cudaComputeModeExclusive :
340  return true;
341  break;
342  case cudaComputeModeExclusiveProcess :
343  return true;
344  break;
345  default :
346  // in this case we release the GPU context...
347  return false;
348  }
349 }
350 
351 bool CuDevice::SelectGpuId(int dev_id) {
352  KALDI_LOG << "Trying to select device: " << dev_id;
353  cudaError_t e = cudaSetDevice(dev_id);
354  if (e != cudaSuccess) {
355  KALDI_WARN << "Cannot select this device: return code " << e
356  << ", Error message: \"" << cudaGetErrorString(e) << "\"";
357  return false;
358  } else {
359  e = cudaDeviceSynchronize();
360  if (e != cudaSuccess) {
361  KALDI_WARN << "Cannot select this device: return code " << e
362  << ", Error message: \"" << cudaGetErrorString(e) << "\"";
363  return false;
364  }
365  }
366 
367  std::string debug_str;
368  int num_gpus = dev_id + 1; // used for debugging purposes
369  bool got_context = GetCudaContext(num_gpus, &debug_str);
370  if (!got_context) {
371  KALDI_WARN << "Cannot get Cuda Context, Error message: \"" << debug_str
372  << "\"";
373  }
374 
375  return true;
376 }
377 
378 bool CuDevice::SelectAndInitializeGpuIdWithExistingCudaContext(int dev_id) {
379  // Make sure the global allocator object has the up-to-date options.
380  g_cuda_allocator.SetOptions(g_allocator_options);
381  if (!CuDevice::SelectGpuId(dev_id)) return false;
382  FinalizeActiveGpu();
383  return true;
384 }
385 
386 template <typename TA, typename TB>
387 bool greater_pair(const std::pair<TA, TB> &left, const std::pair<TA, TB> &right) {
388  return left.second > right.second;
389 }
390 
391 bool CuDevice::SelectGpuIdAuto() {
392  // Check that we have at least one gpu
393  int32 num_gpus = 0;
394  cudaError_t e = cudaGetDeviceCount(&num_gpus);
395  if (num_gpus == 0) {
396  KALDI_WARN << "No CUDA devices found";
397  if (e != cudaSuccess) {
398  KALDI_WARN << "cudaGetDeviceCount() returned " << e
399  <<", meaning: \"" << cudaGetErrorString(e) << "\"";
400  }
401  return false;
402  }
403 
404  // The GPU is selected according to maximal free memory ratio
405  std::vector< std::pair<int, float> > free_mem_ratio(num_gpus);
406 
407  // Get ratios of memory use, if possible
408  KALDI_LOG << "Selecting from " << num_gpus << " GPUs";
409  for(int32 n = 0; n < num_gpus; n++) {
410  int32 ret = cudaSetDevice(n);
411  switch(ret) {
412  case cudaSuccess : {
413  // create the CUDA context for the thread
414  cudaDeviceSynchronize();
415  // get GPU name
416  char name[128];
417  DeviceGetName(name,128,n);
418  // get GPU memory stats
419  int64 free, total;
420  std::string mem_stats;
421  mem_stats = GetFreeGpuMemory(&free, &total);
422  // log
423  KALDI_LOG << "cudaSetDevice(" << n << "): "
424  << name << "\t" << mem_stats;
425 
426  // We have seen that in some cases GetFreeGpuMemory returns zero
427  // That will produce nan after division, which might confuse
428  // the sorting routine. Or maybe not, but let's keep it clean
429  if (total <= 0) {
430  KALDI_LOG << "Total memory reported for device " << n
431  << " is zero (or less).";
432  }
433  float mem_ratio = total > 0 ? free/(float)total : 0;
434  free_mem_ratio[n] = std::make_pair(n, mem_ratio);
435 
436  // destroy the CUDA context for the thread
437  cudaDeviceReset();
438  } break;
439  case cudaErrorDeviceAlreadyInUse :
440  KALDI_LOG << "cudaSetDevice(" << n << "): "
441  << "Device cannot be accessed, used EXCLUSIVE-THREAD mode...";
442  break;
443  case cudaErrorInvalidDevice :
444  KALDI_LOG << "cudaSetDevice(" << n << "): "
445  << "Device cannot be accessed, not a VALID CUDA device!";
446  break;
447  default :
448  KALDI_LOG << "cudaSetDevice(" << n << "): "
449  << "returned " << ret << ", "
450  << cudaGetErrorString((cudaError_t)ret);
451  }
452  }
453  // find GPU with max free memory
454  int32 max_id=0;
455  std::sort(free_mem_ratio.begin(), free_mem_ratio.end(),
456  greater_pair<int, float>);
457  // the free_mem_ratio should be bigger than zero
458  KALDI_ASSERT(free_mem_ratio[max_id].second > 0.0);
459 
460  int dev_id;
461  float mem_ratio;
462  bool success;
463  do {
464  // try to select the GPU in the best to worst order
465  // Note we have to check the return codes manually, as the CU_SAFE_CALL
466  // contains call to KALDI_ERR (which will cause the program to abort)
467 
468  dev_id = free_mem_ratio[max_id].first;
469  mem_ratio = free_mem_ratio[max_id].second;
470 
471  KALDI_LOG << "Device: " << dev_id << ", mem_ratio: " << mem_ratio;
472  success = SelectGpuId(dev_id);
473 
474  max_id++;
475  } while (!success && (max_id < free_mem_ratio.size()));
476 
477  if (e != cudaSuccess) {
478  KALDI_WARN << "Failed to (automatically) select any device";
479  return false;
480  }
481  KALDI_LOG << "Success selecting device " << dev_id << " free mem ratio: " << mem_ratio;
482  return true;
483 }
484 
485 
486 void CuDevice::AccuProfile(const char *function_name,
487  const CuTimer &timer) {
488  if (GetVerboseLevel() >= 1) {
489  std::unique_lock<std::mutex> lock(profile_mutex_, std::defer_lock_t());
490  if (multi_threaded_)
491  lock.lock();
492  std::string key(function_name);
493  // by passing 0 as the stream to cudaStreamSynchronize, we are using the
494  // per-thread default stream. Since we compile with
495  // -DCUDA_API_PER_THREAD_DEFAULT_STREAM, this equates to a per-thread
496  // stream.
497  CU_SAFE_CALL(cudaStreamSynchronize(0));
498  double elapsed = timer.Elapsed();
499  if (profile_map_.find(key) == profile_map_.end())
500  profile_map_[key] = elapsed;
501  else
502  profile_map_[key] += elapsed;
503  }
504 }
505 
506 void CuDevice::PrintMemoryUsage() const {
507  if (Enabled())
508  g_cuda_allocator.PrintMemoryUsage();
509 }
510 
511 void CuDevice::PrintProfile() {
512  if (GetVerboseLevel() >= 1) {
513  std::ostringstream os;
514  os << "-----\n[cudevice profile]\n";
515  unordered_map<std::string, double, StringHasher>::iterator it;
516  std::vector<std::pair<double, std::string> > pairs;
517  double total_time = 0.0;
518  for(it = profile_map_.begin(); it != profile_map_.end(); ++it) {
519  std::string function_name = it->first;
520  double elapsed_time = it->second;
521  total_time += elapsed_time;
522  pairs.push_back(std::make_pair(elapsed_time, function_name));
523  }
524  // display from shortest to longest time, so tail will show the longest
525  // times at the end.
526  std::sort(pairs.begin(), pairs.end());
527  size_t max_print = 15, start_pos = (pairs.size() <= max_print ?
528  0 : pairs.size() - max_print);
529  for (size_t i = start_pos; i < pairs.size(); i++)
530  os << pairs[i].second << "\t" << pairs[i].first << "s\n";
531  os << "Total GPU time:\t" << total_time << "s (may involve some double-counting)\n";
532  os << "-----";
533  KALDI_LOG << os.str();
534  PrintMemoryUsage();
535  }
536 }
537 
538 
539 void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
540  // prefill with something reasonable
541  strncpy(name,"Unknown GPU",len);
542 #ifdef _MSC_VER
543  cuDeviceGetName(name, len, dev);
544 #else
545  // open libcuda.so
546  void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
547  if (NULL == libcuda) {
548  KALDI_WARN << "cannot open libcuda.so";
549  } else {
550  // define the function signature type
551  typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
552  // get the symbol
553  cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName");
554  if (NULL == cuDeviceGetName_ptr) {
555  KALDI_WARN << "cannot load cuDeviceGetName from libcuda.so";
556  } else {
557  // call the function
558  cuDeviceGetName_ptr(name, len, dev);
559  }
560  // close the library
561  dlclose(libcuda);
562  }
563 #endif
564 }
565 
566 
567 void CuDevice::CheckGpuHealth() {
568  if (!Enabled()) return;
569  CuTimer t;
570  // prepare small matrices for a quick test
571  Matrix<BaseFloat> a(50, 100);
572  Matrix<BaseFloat> b(100 ,50);
573  a.SetRandn();
574  b.SetRandUniform();
575  // multiply 2 small matrices in CPU:
576  Matrix<BaseFloat> c(50, 50);
577  c.AddMatMat(1.0, a, kNoTrans, b, kNoTrans, 0.0);
578  // multiply same matrices in GPU:
579  CuMatrix<BaseFloat> c1(50, 50);
580  c1.AddMatMat(1.0, CuMatrix<BaseFloat>(a), kNoTrans, CuMatrix<BaseFloat>(b), kNoTrans, 0.0);
581  // check that relative differnence is <1%
582  AssertEqual(c, Matrix<BaseFloat>(c1), 0.01);
583  // measure time spent in this check
584  AccuProfile(__func__, t);
585 }
586 
587 CuDevice::CuDevice():
588  initialized_(false),
589  device_id_copy_(-1),
590  cublas_handle_(NULL),
591  cusparse_handle_(NULL),
592  cusolverdn_handle_(NULL) {
593 }
594 
// Destructor: destroys whichever per-thread library handles were created by
// Initialize() / FinalizeActiveGpu().  Each handle is tested before being
// destroyed, since threads that never touched the GPU leave them NULL.
CuDevice::~CuDevice() {
  if (cublas_handle_)
    CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_));
  if (cusparse_handle_)
    CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_));
  if (curand_handle_) {
    CURAND_SAFE_CALL(curandDestroyGenerator(curand_handle_));
  }
#if CUDA_VERSION >= 9010
  // The cuSOLVER handle only exists when built against CUDA 9.1+.
  if (cusolverdn_handle_) {
    CUSOLVER_SAFE_CALL(cusolverDnDestroy(cusolverdn_handle_));
  }
#endif
}
609 
610 
// Each thread has its own copy of the CuDevice object.
// Note: this was declared "static".
thread_local CuDevice CuDevice::this_thread_device_;

// Options shared by all threads' CuDevice instances.
CuDevice::CuDeviceOptions CuDevice::device_options_;

// Define and initialize the static members of the CuDevice object.
int32 CuDevice::device_id_ = -1;  // -1 means no GPU has been selected.
bool CuDevice::multi_threaded_ = false;
unordered_map<std::string, double, StringHasher> CuDevice::profile_map_;
std::mutex CuDevice::profile_mutex_;
int64 CuDevice::free_memory_at_startup_;
cudaDeviceProp CuDevice::properties_;
bool CuDevice::debug_stride_mode_ = false;
625 
626 
// Synchronizes the calling thread with the GPU, then checks for any pending
// CUDA error.
void SynchronizeGpu() {
  cuda_legacy_noop();  // Defined elsewhere; presumably launches a no-op kernel
                       // on the legacy default stream, which synchronizes with
                       // all other streams -- TODO confirm against cu-kernels.
  CU_SAFE_CALL(cudaGetLastError());
}
631 
632 } // namespace kaldi
633 
634 #else // #if HAVE_CUDA == 1
635 
namespace kaldi {
// SynchronizeGpu() does nothing if we didn't compile for GPU; it exists so
// that callers need not be conditionally compiled on HAVE_CUDA.
void SynchronizeGpu() { }
}
640 
641 #endif // #if HAVE_CUDA == 1
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
CuAllocatorOptions g_allocator_options
int32 GetVerboseLevel()
Get verbosity level, usually set via command line &#39;–verbose=&#39; switch.
Definition: kaldi-error.h:60
void Sleep(float seconds)
Definition: kaldi-utils.cc:45
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
struct rnnlm::@11::@12 n
void SynchronizeGpu()
The function SynchronizeGpu(), which for convenience is defined whether or not we have compiled for C...
Definition: cu-device.cc:638
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
Definition: kaldi-math.h:276
#define KALDI_LOG
Definition: kaldi-error.h:153