// Attempts to acquire a CUDA context. First tries cudaFree(0) on the current
// device (a cheap call whose success implies a usable context); on failure it
// iterates over devices 0..num_gpus-1, calling cudaSetDevice + cudaFree(0) on
// each, and accumulates per-device error strings into *debug_str for the
// caller's diagnostics.
// NOTE(review): the embedded original line numbers (52, 56, 63, ...) show this
// chunk is a lossy extraction; return statements and closing braces at the
// dropped lines are not visible here.
26 #include <cublas_v2.h> 28 #include <cuda_runtime_api.h> 52 static bool GetCudaContext(
int32 num_gpus, std::string *debug_str) {
// cudaFree(0) returning 0 (cudaSuccess) means a context already exists or
// could be created on the currently selected device.
56 if (cudaFree(0) == 0) {
63 std::ostringstream debug_stream;
64 debug_stream <<
"num-gpus=" << num_gpus <<
". ";
// Probe each device in turn until one yields a context.
65 for (
int32 device = 0; device < num_gpus; device++) {
66 cudaSetDevice(device);
67 cudaError_t e = cudaFree(0);
68 if (e == cudaSuccess) {
70 *debug_str = debug_stream.str();
// Record why this device failed, then (presumably) continue to the next one.
75 debug_stream <<
"Device " << device <<
": " << cudaGetErrorString(e) <<
". ";
// All devices failed: hand the accumulated diagnostics back to the caller.
78 *debug_str = debug_stream.str();
// Per-thread initialization of the GPU libraries for an already-selected
// device: binds cuBLAS, cuSOLVER (CUDA >= 9.1), cuSPARSE and cuRAND handles
// to cudaStreamPerThread so each thread's work runs on its own stream.
// Warns (once, via the multi_threaded_ flag) if called from a secondary
// thread without AllowMultithreading() having been requested.
// NOTE(review): lossy extraction — several original lines (e.g. 84-97,
// 105-107) are missing from this view.
83 void CuDevice::Initialize() {
98 if (device_id_ == -1) {
102 if (!multi_threaded_) {
// Latch the flag so the warning below is only emitted once.
103 multi_threaded_ =
true;
104 KALDI_WARN <<
"For multi-threaded code that might use GPU, you should call " 105 "CuDevice::Instantiate().AllowMultithreading() at the start of " 108 device_id_copy_ = device_id_;
109 cudaSetDevice(device_id_);
// Create the per-thread cuBLAS handle and attach it to the per-thread stream.
111 CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
112 CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));
// cuSOLVER is only available/used from CUDA 9.1 onwards.
114 #if CUDA_VERSION >= 9010 115 CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_));
116 CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_,
117 cudaStreamPerThread));
// Optionally enable tensor-core math (CUDA >= 9.0) when requested in options.
120 #if CUDA_VERSION >= 9000 121 if (device_options_.use_tensor_cores) {
124 CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_,
125 CUBLAS_TENSOR_OP_MATH));
130 CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
131 CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));
// Per-thread cuRAND generator; ordering left at the default pseudo-random
// ordering, stream bound to the per-thread stream like the other handles.
134 CURAND_SAFE_CALL(curandCreateGenerator(
135 &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
137 CURAND_SAFE_CALL(curandSetGeneratorOrdering(
138 curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));
139 CURAND_SAFE_CALL(curandSetStream(curand_handle_, cudaStreamPerThread));
// Selects whether/which GPU to use according to the --use-gpu option:
//   "yes"      - require a GPU; fatal error if none can be acquired.
//   "no"       - compute on CPU.
//   "optional" - try to get a GPU, fall back to CPU with a warning.
//   "wait"     - retry indefinitely until a GPU context is obtained.
// After a context is obtained it prefers compute-exclusive mode; otherwise it
// releases the context and falls back to SelectGpuIdAuto() (memory-based
// auto-selection). Calling this twice after a successful GPU request is an
// error.
// NOTE(review): lossy extraction — returns, braces and some statements at the
// dropped original lines (148, 151, 154-158, 170-174, 179-180, 183-184, ...)
// are not visible in this view.
144 void CuDevice::SelectGpuId(std::string use_gpu) {
145 if (device_id_ != -1) {
146 KALDI_ERR <<
"You cannot call SelectGpuId twice if, on the first time, " 147 "you requested a GPU.";
// Validate the option string before doing anything else.
149 if (use_gpu !=
"yes" && use_gpu !=
"no" && use_gpu !=
"optional" && use_gpu !=
"wait") {
150 KALDI_ERR <<
"Please choose : --use-gpu=yes|no|optional|wait, passed '" << use_gpu <<
"'";
152 if (use_gpu ==
"no") {
153 KALDI_LOG <<
"Manually selected to compute on CPU.";
159 cudaError_t e = cudaGetDeviceCount(&num_gpus);
// No GPUs found: fatal for "yes"/"wait", warning + CPU fallback for
// "optional".
165 if (use_gpu ==
"yes" || use_gpu ==
"wait") {
166 KALDI_CUDA_ERR(e,
"No CUDA GPU detected!");
168 if (use_gpu ==
"optional") {
169 KALDI_WARN <<
"No CUDA GPU detected; running on CPU since --use-gpu=optional specified.";
175 std::string debug_str;
176 bool got_context = GetCudaContext(num_gpus, &debug_str);
178 if (use_gpu !=
"wait") {
// Single retry after a short sleep; wait longer when a GPU is mandatory.
181 int32 sec_sleep = (use_gpu ==
"yes" ? 20 : 2);
182 KALDI_WARN <<
"Will try again to get a GPU after " << sec_sleep
185 if (!GetCudaContext(num_gpus, &debug_str)) {
186 if (use_gpu ==
"yes") {
// Dump nvidia-smi output to stderr to help the user diagnose the failure.
189 input.Open(
"nvidia-smi 1>&2 |");
192 KALDI_ERR <<
"Failed to create CUDA context, no more unused GPUs? ";
194 if (use_gpu ==
"optional") {
195 KALDI_WARN <<
"Running on CPU!!! No more unused CUDA GPUs?";
// --use-gpu=wait: poll forever until a context becomes available.
203 while (!got_context) {
206 KALDI_WARN <<
"Will try again indefinitely every " << sec_sleep
207 <<
" seconds to get a GPU.";
209 wait_time += sec_sleep;
// debug_str is not needed while polling, hence NULL here.
211 got_context = GetCudaContext(num_gpus, NULL);
215 <<
" seconds before creating CUDA context";
// With a context in hand: keep it if the GPU runs compute-exclusive,
// otherwise release it and auto-select by free-memory ratio.
222 if (IsComputeExclusive()) {
223 KALDI_LOG <<
"CUDA setup operating under Compute Exclusive Mode.";
228 KALDI_WARN <<
"Not in compute-exclusive mode. Suggestion: use " 229 "'nvidia-smi -c 3' to set compute exclusive mode";
231 e = cudaDeviceReset();
232 if (e != cudaSuccess) {
233 KALDI_CUDA_ERR(e,
"Failed to release CUDA context on a GPU");
237 if (SelectGpuIdAuto()) {
// Auto-selection failed: fatal for "yes", CPU fallback for "optional".
243 if (use_gpu ==
"yes") {
246 if (use_gpu ==
"optional") {
247 KALDI_WARN <<
"Running on CPU!!! Error acquiring GPU.";
// Completes setup once a device has been chosen and a context exists:
// records the active device id, creates the per-thread cuBLAS / cuSOLVER /
// cuSPARSE / cuRAND handles bound to cudaStreamPerThread (mirroring
// Initialize()), caches the device properties, and logs a summary line with
// the device name, free memory and compute capability.
// NOTE(review): lossy extraction — several original lines are missing from
// this view (e.g. 256-261, 268-270).
255 void CuDevice::FinalizeActiveGpu() {
262 cudaError_t e = cudaGetDevice(&device_id);
263 if (e != cudaSuccess) {
264 KALDI_CUDA_ERR(e,
"Failed to get device-id of active device.");
266 device_id_ = device_id;
267 device_id_copy_ = device_id;
271 CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
272 CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));
// cuSOLVER only from CUDA 9.1 onwards.
274 #if CUDA_VERSION >= 9010 275 CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_));
276 CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_,
277 cudaStreamPerThread));
// Tensor-core math mode (CUDA >= 9.0) when enabled in the device options.
280 #if CUDA_VERSION >= 9000 281 if (device_options_.use_tensor_cores) {
284 CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_,
285 CUBLAS_TENSOR_OP_MATH));
291 CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
292 CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));
295 CURAND_SAFE_CALL(curandCreateGenerator(
296 &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
298 CURAND_SAFE_CALL(curandSetGeneratorOrdering(
299 curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));
// Cache the device name and properties, and remember the free memory at
// startup for later reporting.
304 DeviceGetName(name,128, device_id);
306 CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, device_id));
308 KALDI_LOG <<
"The active GPU is [" << device_id <<
"]: " << name <<
"\t" 309 << GetFreeGpuMemory(&free_memory_at_startup_, NULL) <<
" version " 310 << properties_.major <<
"." << properties_.minor;
// Returns true when double precision is usable: trivially true when the GPU
// is not enabled (CPU path), otherwise requires compute capability >= 1.3
// (major > 1, or major == 1 with minor >= 3).
315 bool CuDevice::DoublePrecisionSupported() {
316 if (!Enabled())
return true;
317 return properties_.major > 1 || (properties_.major == 1 && properties_.minor >= 3);
// Queries the current device's properties and reports whether it runs in a
// compute-exclusive mode (exclusive-thread or exclusive-process).
// NOTE(review): the return statements inside the switch cases sit on dropped
// original lines (340-349) and are not visible here.
322 bool CuDevice::IsComputeExclusive() {
328 cudaError_t e = cudaGetDevice(&gpu_id);
329 if (e != cudaSuccess) {
330 KALDI_CUDA_ERR(e,
"Failed to get current device");
332 struct cudaDeviceProp gpu_prop;
333 e = cudaGetDeviceProperties(&gpu_prop, gpu_id);
334 if (e != cudaSuccess) {
335 KALDI_CUDA_ERR(e,
"Failed to get device properties");
// Both exclusive modes count as "compute exclusive" for our purposes.
338 switch (gpu_prop.computeMode) {
339 case cudaComputeModeExclusive :
342 case cudaComputeModeExclusiveProcess :
// Tries to select a specific device by index: cudaSetDevice, then a
// cudaDeviceSynchronize to confirm the device is actually usable, then
// context acquisition via GetCudaContext. Each failure is logged as a
// warning; the (dropped) return statements presumably report success/failure
// to the caller.
// NOTE(review): lossy extraction — returns/braces at dropped original lines
// (357-358, 363-366, 370, 372-376) are not visible here.
351 bool CuDevice::SelectGpuId(
int dev_id) {
352 KALDI_LOG <<
"Trying to select device: " << dev_id;
353 cudaError_t e = cudaSetDevice(dev_id);
354 if (e != cudaSuccess) {
355 KALDI_WARN <<
"Cannot select this device: return code " << e
356 <<
", Error message: \"" << cudaGetErrorString(e) <<
"\"";
// Synchronize to surface any deferred error from the device selection.
359 e = cudaDeviceSynchronize();
360 if (e != cudaSuccess) {
361 KALDI_WARN <<
"Cannot select this device: return code " << e
362 <<
", Error message: \"" << cudaGetErrorString(e) <<
"\"";
367 std::string debug_str;
// GetCudaContext scans devices 0..num_gpus-1, so dev_id+1 covers dev_id.
368 int num_gpus = dev_id + 1;
369 bool got_context = GetCudaContext(num_gpus, &debug_str);
371 KALDI_WARN <<
"Cannot get Cuda Context, Error message: \"" << debug_str
// Convenience wrapper: selects the given device via SelectGpuId(int) and, on
// success, presumably finalizes the setup (the remainder of the body sits on
// dropped original lines 379-384).
378 bool CuDevice::SelectAndInitializeGpuIdWithExistingCudaContext(
int dev_id) {
381 if (!CuDevice::SelectGpuId(dev_id))
return false;
// Comparator for std::sort that orders pairs by DESCENDING .second value
// (used below to sort devices by free-memory ratio, best first).
386 template <
typename TA,
typename TB>
387 bool greater_pair(
const std::pair<TA, TB> &left,
const std::pair<TA, TB> &right) {
388 return left.second > right.second;
// Automatic device selection: probes every visible GPU, computes its
// free/total memory ratio, sorts devices by that ratio (descending, via
// greater_pair), and tries to select them in order until one succeeds.
// NOTE(review): lossy extraction — the switch header around line 438, the
// do-loop header, and several declarations/braces sit on dropped original
// lines and are not visible here.
391 bool CuDevice::SelectGpuIdAuto() {
394 cudaError_t e = cudaGetDeviceCount(&num_gpus);
397 if (e != cudaSuccess) {
398 KALDI_WARN <<
"cudaGetDeviceCount() returned " << e
399 <<
", meaning: \"" << cudaGetErrorString(e) <<
"\"";
// One (device-index, free-memory-ratio) entry per visible GPU.
405 std::vector< std::pair<int, float> > free_mem_ratio(num_gpus);
408 KALDI_LOG <<
"Selecting from " << num_gpus <<
" GPUs";
409 for(
int32 n = 0;
n < num_gpus;
n++) {
410 int32 ret = cudaSetDevice(
n);
// Make sure the device is really usable before querying its memory.
414 cudaDeviceSynchronize();
417 DeviceGetName(name,128,
n);
420 std::string mem_stats;
421 mem_stats = GetFreeGpuMemory(&free, &total);
424 << name <<
"\t" << mem_stats;
430 KALDI_LOG <<
"Total memory reported for device " <<
n 431 <<
" is zero (or less).";
// Guard against division by zero for devices reporting no total memory.
433 float mem_ratio = total > 0 ? free/(
float)total : 0;
434 free_mem_ratio[
n] = std::make_pair(
n, mem_ratio);
// cudaSetDevice failed: log a per-error-code explanation.
439 case cudaErrorDeviceAlreadyInUse :
441 <<
"Device cannot be accessed, used EXCLUSIVE-THREAD mode...";
443 case cudaErrorInvalidDevice :
444 KALDI_LOG <<
"cudaSetDevice(" << n <<
"): " 445 <<
"Device cannot be accessed, not a VALID CUDA device!";
448 KALDI_LOG <<
"cudaSetDevice(" << n <<
"): " 449 <<
"returned " << ret <<
", " 450 << cudaGetErrorString((cudaError_t)ret);
// Best free-memory ratio first.
455 std::sort(free_mem_ratio.begin(), free_mem_ratio.end(),
456 greater_pair<int, float>);
// Try candidates in sorted order until one can actually be selected.
468 dev_id = free_mem_ratio[max_id].first;
469 mem_ratio = free_mem_ratio[max_id].second;
471 KALDI_LOG <<
"Device: " << dev_id <<
", mem_ratio: " << mem_ratio;
472 success = SelectGpuId(dev_id);
475 }
while (!success && (max_id < free_mem_ratio.size()));
477 if (e != cudaSuccess) {
478 KALDI_WARN <<
"Failed to (automatically) select any device";
481 KALDI_LOG <<
"Success selecting device " << dev_id <<
" free mem ratio: " << mem_ratio;
// Accumulates elapsed time for function_name into profile_map_ (printed later
// by PrintProfile()). Synchronizes the default stream first so the timer
// covers the GPU work, and takes profile_mutex_ (deferred lock) since the map
// is shared across threads.
// NOTE(review): the code that actually acquires the deferred lock sits on
// dropped original lines (490-496) and is not visible here.
486 void CuDevice::AccuProfile(
const char *function_name,
487 const CuTimer &timer) {
489 std::unique_lock<std::mutex> lock(profile_mutex_, std::defer_lock_t());
492 std::string key(function_name);
// Wait for outstanding GPU work so timer.Elapsed() includes it.
497 CU_SAFE_CALL(cudaStreamSynchronize(0));
498 double elapsed = timer.Elapsed();
// First occurrence inserts; later occurrences accumulate.
499 if (profile_map_.find(key) == profile_map_.end())
500 profile_map_[key] = elapsed;
502 profile_map_[key] += elapsed;
// Delegates memory-usage reporting to the global CUDA allocator.
506 void CuDevice::PrintMemoryUsage()
const {
508 g_cuda_allocator.PrintMemoryUsage();
// Prints the accumulated per-function GPU timing profile: the slowest
// (up to) 15 functions in ascending order of time, followed by the total.
// The total may double-count because nested profiled calls both accumulate.
511 void CuDevice::PrintProfile() {
513 std::ostringstream os;
514 os <<
"-----\n[cudevice profile]\n";
515 unordered_map<std::string, double, StringHasher>::iterator it;
// (time, name) pairs so that std::sort orders by elapsed time.
516 std::vector<std::pair<double, std::string> > pairs;
517 double total_time = 0.0;
518 for(it = profile_map_.begin(); it != profile_map_.end(); ++it) {
519 std::string function_name = it->first;
520 double elapsed_time = it->second;
521 total_time += elapsed_time;
522 pairs.push_back(std::make_pair(elapsed_time, function_name));
526 std::sort(pairs.begin(), pairs.end());
// Only print the top max_print entries (the tail of the ascending sort).
527 size_t max_print = 15, start_pos = (pairs.size() <= max_print ?
528 0 : pairs.size() - max_print);
529 for (
size_t i = start_pos;
i < pairs.size();
i++)
530 os << pairs[
i].second <<
"\t" << pairs[
i].first <<
"s\n";
531 os <<
"Total GPU time:\t" << total_time <<
"s (may involve some double-counting)\n";
// Fills `name` (up to `len` bytes) with the name of device `dev`, defaulting
// to "Unknown GPU". One path calls cuDeviceGetName directly; the other looks
// the symbol up at runtime via dlopen/dlsym on libcuda.so (the preprocessor
// conditional choosing between them sits on dropped original lines).
539 void CuDevice::DeviceGetName(
char* name,
int32 len,
int32 dev) {
// Fallback value in case the lookup below fails.
541 strncpy(name,
"Unknown GPU",len);
543 cuDeviceGetName(name, len, dev);
// Dynamic-lookup path: load the driver library lazily.
546 void* libcuda = dlopen(
"libcuda.so",RTLD_LAZY);
547 if (NULL == libcuda) {
551 typedef CUresult (*cu_fun_ptr)(
char*,int,CUdevice);
553 cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,
"cuDeviceGetName");
554 if (NULL == cuDeviceGetName_ptr) {
555 KALDI_WARN <<
"cannot load cuDeviceGetName from libcuda.so";
558 cuDeviceGetName_ptr(name, len, dev);
// Sanity-checks the GPU by multiplying two small matrices on the device
// (c1 = a * b via AddMatMat); the result is presumably compared against a
// CPU-computed reference on the dropped original lines. No-op when the GPU
// is not enabled. The time spent is folded into the profile.
567 void CuDevice::CheckGpuHealth() {
568 if (!Enabled())
return;
// Small host-side operands (50x100 times 100x50 -> 50x50).
571 Matrix<BaseFloat> a(50, 100);
572 Matrix<BaseFloat> b(100 ,50);
576 Matrix<BaseFloat> c(50, 50);
579 CuMatrix<BaseFloat> c1(50, 50);
580 c1.AddMatMat(1.0, CuMatrix<BaseFloat>(a),
kNoTrans, CuMatrix<BaseFloat>(b),
kNoTrans, 0.0);
584 AccuProfile(__func__, t);
// Constructor: library handles start out NULL so the destructor can tell
// which ones were actually created. (Initializers on the dropped original
// lines 588-589 — e.g. for curand_handle_ — are not visible here.)
587 CuDevice::CuDevice():
590 cublas_handle_(NULL),
591 cusparse_handle_(NULL),
592 cusolverdn_handle_(NULL) {
// Destructor: releases whichever library handles were created.
// NOTE(review): cublasDestroy appears unguarded here, unlike the other
// handles — a NULL check likely exists on the dropped original line 596;
// confirm against the full source.
595 CuDevice::~CuDevice() {
597 CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_));
598 if (cusparse_handle_)
599 CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_));
600 if (curand_handle_) {
601 CURAND_SAFE_CALL(curandDestroyGenerator(curand_handle_));
// cuSOLVER handle only exists when built against CUDA >= 9.1.
603 #if CUDA_VERSION >= 9010 604 if (cusolverdn_handle_) {
605 CUSOLVER_SAFE_CALL(cusolverDnDestroy(cusolverdn_handle_));
// Definitions of CuDevice's static data members. device_id_ starts at -1
// ("no device selected"); the per-thread CuDevice instance is thread_local;
// profile_map_ is guarded by profile_mutex_ (see AccuProfile).
613 thread_local CuDevice CuDevice::this_thread_device_;
615 CuDevice::CuDeviceOptions CuDevice::device_options_;
618 int32 CuDevice::device_id_ = -1;
619 bool CuDevice::multi_threaded_ =
false;
620 unordered_map<std::string, double, StringHasher> CuDevice::profile_map_;
621 std::mutex CuDevice::profile_mutex_;
622 int64 CuDevice::free_memory_at_startup_;
623 cudaDeviceProp CuDevice::properties_;
624 bool CuDevice::debug_stride_mode_ =
false;
// NOTE(review): this statement belongs to a function whose header (original
// lines 625-628, likely SynchronizeGpu or similar) was dropped by the
// extraction.
629 CU_SAFE_CALL(cudaGetLastError());
634 #else // #if HAVE_CUDA == 1 641 #endif // #if HAVE_CUDA == 1
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
CuAllocatorOptions g_allocator_options
int32 GetVerboseLevel()
Get verbosity level, usually set via command line '--verbose=' switch.
void Sleep(float seconds)
void SynchronizeGpu()
The function SynchronizeGpu(), which for convenience is defined whether or not we have compiled for C...
#define KALDI_ASSERT(cond)
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))