#include <cublas_v2.h>
#include <cuda_runtime_api.h>

static bool GetCudaContext(int32 num_gpus, std::string *debug_str) {
  // A successful cudaFree(0) means a CUDA context already exists for this
  // process; this is the canonical way to force context creation.
  if (cudaFree(0) == 0) {
    cudaGetLastError();  // Clear the error state.
    return true;
  }
  // Otherwise try each device in turn, recording per-device errors for
  // debugging.
  std::ostringstream debug_stream;
  debug_stream << "num-gpus=" << num_gpus << ". ";
  for (int32 device = 0; device < num_gpus; device++) {
    cudaSetDevice(device);
    cudaError_t e = cudaFree(0);  // The CUDA context gets created here.
    if (e == cudaSuccess) {
      if (debug_str)
        *debug_str = debug_stream.str();
      cudaGetLastError();  // Clear the error state.
      return true;
    }
    debug_stream << "Device " << device << ": " << cudaGetErrorString(e)
                 << ".  ";
  }
  if (debug_str)
    *debug_str = debug_stream.str();
  return false;
}
void CuDevice::Initialize() {
  // Called when the per-thread CuDevice object is first used.  If no GPU has
  // been selected (device_id_ == -1) there is nothing to do; otherwise this
  // thread is bound to the selected device and the per-thread CUBLAS,
  // cuSOLVER, cuSPARSE and cuRAND handles are created.
  if (device_id_ == -1) {
    // No GPU is being used; nothing to do.
  } else {
    if (!multi_threaded_) {
      multi_threaded_ = true;
      KALDI_WARN << "For multi-threaded code that might use GPU, you should call "
          "CuDevice::Instantiate().AllowMultithreading() at the start of "
          "the program.";
    }
    device_id_copy_ = device_id_;
    cudaSetDevice(device_id_);
    // Initialize CUBLAS.
    CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
    CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));

#if CUDA_VERSION >= 9010
    CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_));
    CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_,
                                           cudaStreamPerThread));
#endif

#if CUDA_VERSION >= 9000
    if (device_options_.use_tensor_cores) {
      // Enable tensor cores in CUBLAS.
      CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_,
                                         CUBLAS_TENSOR_OP_MATH));
    }
#endif

    // Initialize the cuSPARSE library.
    CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
    CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));

    // Initialize the cuRAND generator.
    CURAND_SAFE_CALL(curandCreateGenerator(
        &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_SAFE_CALL(curandSetGeneratorOrdering(
        curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));
    CURAND_SAFE_CALL(curandSetStream(curand_handle_, cudaStreamPerThread));
  }
}
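// Illustrative sketch (not part of cu-device.cc): a program that will use the
// GPU from threads it creates should call AllowMithreading() -- the call named
// in the warning above -- in the main thread; otherwise that warning is
// printed the first time a worker thread instantiates CuDevice.
static void PrepareGpuForWorkerThreadsSketch() {
#if HAVE_CUDA == 1
  kaldi::CuDevice::Instantiate().SelectGpuId("yes");
  kaldi::CuDevice::Instantiate().AllowMultithreading();
#endif
  // ... it is now safe to spawn std::thread workers that use CuMatrix etc.
}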
void CuDevice::SelectGpuId(std::string use_gpu) {
  if (device_id_ != -1) {
    KALDI_ERR << "You cannot call SelectGpuId twice if, on the first time, "
        "you requested a GPU.";
  }
  if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional" &&
      use_gpu != "wait") {
    KALDI_ERR << "Please choose : --use-gpu=yes|no|optional|wait, passed '"
              << use_gpu << "'";
  }
  if (use_gpu == "no") {
    KALDI_LOG << "Manually selected to compute on CPU.";
    return;
  }
  // Check that a GPU is available.
  int32 num_gpus = 0;
  cudaError_t e = cudaGetDeviceCount(&num_gpus);
  if (num_gpus == 0) {
    if (use_gpu == "yes" || use_gpu == "wait") {
      KALDI_CUDA_ERR(e, "No CUDA GPU detected!");
    }
    if (use_gpu == "optional") {
      KALDI_WARN << "No CUDA GPU detected; running on CPU since "
          "--use-gpu=optional specified.";
      return;
    }
  }
  // Create a CUDA context.
  std::string debug_str;
  bool got_context = GetCudaContext(num_gpus, &debug_str);
  if (use_gpu != "wait") {
    if (!got_context) {
      // We don't have a context yet; sleep a bit and retry once.
      int32 sec_sleep = (use_gpu == "yes" ? 20 : 2);
      KALDI_WARN << "Will try again to get a GPU after " << sec_sleep
                 << " seconds.";
      Sleep(sec_sleep);
      if (!GetCudaContext(num_gpus, &debug_str)) {
        if (use_gpu == "yes") {
          {
            Input input;
            input.Open("nvidia-smi 1>&2 |");
          }
          KALDI_LOG << debug_str;
          KALDI_ERR << "Failed to create CUDA context, no more unused GPUs? ";
        }
        if (use_gpu == "optional") {
          KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?";
          return;
        }
      }
    }
  } else {
    // With --use-gpu=wait, keep retrying indefinitely until a GPU frees up.
    int32 num_times = 0;
    BaseFloat wait_time = 0.0;
    while (!got_context) {
      int32 sec_sleep = 5;
      if (num_times == 0)
        KALDI_WARN << "Will try again indefinitely every " << sec_sleep
                   << " seconds to get a GPU.";
      num_times++;
      wait_time += sec_sleep;
      Sleep(sec_sleep);
      got_context = GetCudaContext(num_gpus, NULL);
    }
    KALDI_WARN << "Waited " << wait_time
               << " seconds before creating CUDA context";
  }
  // Check whether the machine uses compute-exclusive mode.
  if (IsComputeExclusive()) {
    KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.";
    FinalizeActiveGpu();
    return;
  } else {
    // Suggest compute-exclusive mode, then pick the GPU with the largest
    // proportion of free memory ourselves.
    KALDI_WARN << "Not in compute-exclusive mode.  Suggestion: use "
        "'nvidia-smi -c 3' to set compute exclusive mode";
    // Release the CUDA context so that SelectGpuIdAuto() can choose freely.
    e = cudaDeviceReset();
    if (e != cudaSuccess) {
      KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU");
    }
    if (SelectGpuIdAuto()) {
      FinalizeActiveGpu();
      return;
    } else {
      if (use_gpu == "yes") {
        KALDI_ERR << "Error acquiring GPU.";
      }
      if (use_gpu == "optional") {
        KALDI_WARN << "Running on CPU!!! Error acquiring GPU.";
        return;
      }
    }
  }
}
void CuDevice::FinalizeActiveGpu() {
  // At this point a CUDA context exists on the chosen device; record its id,
  // create the library handles and report which GPU is being used.
  {
    int device_id;
    cudaError_t e = cudaGetDevice(&device_id);
    if (e != cudaSuccess) {
      KALDI_CUDA_ERR(e, "Failed to get device-id of active device.");
    }
    device_id_ = device_id;
    device_id_copy_ = device_id;

    // Initialize CUBLAS.
    CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_));
    CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread));

#if CUDA_VERSION >= 9010
    CUSOLVER_SAFE_CALL(cusolverDnCreate(&cusolverdn_handle_));
    CUSOLVER_SAFE_CALL(cusolverDnSetStream(cusolverdn_handle_,
                                           cudaStreamPerThread));
#endif

#if CUDA_VERSION >= 9000
    if (device_options_.use_tensor_cores) {
      // Enable tensor cores in CUBLAS.
      CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_,
                                         CUBLAS_TENSOR_OP_MATH));
    }
#endif

    // Initialize the cuSPARSE library.
    CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_));
    CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread));

    // Initialize the cuRAND generator.
    CURAND_SAFE_CALL(curandCreateGenerator(
        &curand_handle_, CURAND_RNG_PSEUDO_DEFAULT));
    CURAND_SAFE_CALL(curandSetGeneratorOrdering(
        curand_handle_, CURAND_ORDERING_PSEUDO_DEFAULT));

    // Notify the user which GPU is being used.
    char name[128];
    DeviceGetName(name, 128, device_id);

    CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, device_id));

    KALDI_LOG << "The active GPU is [" << device_id << "]: " << name << "\t"
              << GetFreeGpuMemory(&free_memory_at_startup_, NULL) << " version "
              << properties_.major << "." << properties_.minor;
  }
}
bool CuDevice::DoublePrecisionSupported() {
  if (!Enabled()) return true;
  return properties_.major > 1 ||
         (properties_.major == 1 && properties_.minor >= 3);
}
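// Illustrative sketch (not part of cu-device.cc): callers that want to run
// double-precision GPU code can consult this flag first; as the check above
// encodes, devices below compute capability 1.3 lack double-precision support.
static bool CanUseDoublePrecisionSketch() {
#if HAVE_CUDA == 1
  return kaldi::CuDevice::Instantiate().DoublePrecisionSupported();
#else
  return true;
#endif
}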
bool CuDevice::IsComputeExclusive() {
  // Find out which device we are on.
  int gpu_id = -1;
  cudaError_t e = cudaGetDevice(&gpu_id);
  if (e != cudaSuccess) {
    KALDI_CUDA_ERR(e, "Failed to get current device");
  }
  struct cudaDeviceProp gpu_prop;
  e = cudaGetDeviceProperties(&gpu_prop, gpu_id);
  if (e != cudaSuccess) {
    KALDI_CUDA_ERR(e, "Failed to get device properties");
  }
  // Find out whether compute-exclusive mode is used.
  switch (gpu_prop.computeMode) {
    case cudaComputeModeExclusive :
      return true;
    case cudaComputeModeExclusiveProcess :
      return true;
    default :
      // In the default compute mode the GPU may be shared with other processes.
      return false;
  }
}
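// Illustrative sketch (not part of cu-device.cc): the same compute-mode query
// expressed with the CUDA runtime alone, for an arbitrary device index 'dev'
// (a hypothetical parameter).  Compute-exclusive mode is what the
// "nvidia-smi -c 3" suggestion printed by SelectGpuId() above enables.
static bool DeviceIsComputeExclusiveSketch(int dev) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
    return false;
  return prop.computeMode == cudaComputeModeExclusive ||
         prop.computeMode == cudaComputeModeExclusiveProcess;
}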
bool CuDevice::SelectGpuId(int dev_id) {
  KALDI_LOG << "Trying to select device: " << dev_id;
  cudaError_t e = cudaSetDevice(dev_id);
  if (e != cudaSuccess) {
    KALDI_WARN << "Cannot select this device: return code " << e
               << ", Error message: \"" << cudaGetErrorString(e) << "\"";
    return false;
  } else {
    e = cudaDeviceSynchronize();
    if (e != cudaSuccess) {
      KALDI_WARN << "Cannot select this device: return code " << e
                 << ", Error message: \"" << cudaGetErrorString(e) << "\"";
      return false;
    }
  }

  std::string debug_str;
  int num_gpus = dev_id + 1;  // enough for GetCudaContext() to cover dev_id.
  bool got_context = GetCudaContext(num_gpus, &debug_str);
  if (!got_context) {
    KALDI_WARN << "Cannot get Cuda Context, Error message: \"" << debug_str
               << "\"";
    return false;
  }
  return true;
}
bool CuDevice::SelectAndInitializeGpuIdWithExistingCudaContext(int dev_id) {
  if (!CuDevice::SelectGpuId(dev_id)) return false;
  FinalizeActiveGpu();
  return true;
}
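// Illustrative sketch (not part of cu-device.cc, and assuming this method is
// reachable via CuDevice::Instantiate()): if another library has already
// created a CUDA context on device 'dev_id' (a hypothetical variable), a
// program can ask Kaldi to adopt that device instead of running the normal
// GPU-selection logic.
static void AdoptExistingCudaContextSketch(int dev_id) {
#if HAVE_CUDA == 1
  if (!kaldi::CuDevice::Instantiate()
           .SelectAndInitializeGpuIdWithExistingCudaContext(dev_id))
    KALDI_ERR << "Could not initialize Kaldi GPU code on device " << dev_id;
#endif
}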
template <typename TA, typename TB>
bool greater_pair(const std::pair<TA, TB> &left,
                  const std::pair<TA, TB> &right) {
  return left.second > right.second;
}
bool CuDevice::SelectGpuIdAuto() {
  // Check that we have at least one GPU.
  int32 num_gpus = 0;
  cudaError_t e = cudaGetDeviceCount(&num_gpus);
  if (num_gpus == 0) {
    if (e != cudaSuccess) {
      KALDI_WARN << "cudaGetDeviceCount() returned " << e
                 << ", meaning: \"" << cudaGetErrorString(e) << "\"";
    }
    return false;
  }

  // The GPU is selected according to the maximal free-memory ratio.
  std::vector< std::pair<int, float> > free_mem_ratio(num_gpus);

  // Get the ratios of memory use, if possible.
  KALDI_LOG << "Selecting from " << num_gpus << " GPUs";
  for (int32 n = 0; n < num_gpus; n++) {
    int32 ret = cudaSetDevice(n);
    switch (ret) {
      case cudaSuccess : {
        // Create the CUDA context for the thread.
        cudaDeviceSynchronize();
        // Get the GPU name.
        char name[128];
        DeviceGetName(name, 128, n);
        // Get the GPU memory stats.
        int64 free, total;
        std::string mem_stats;
        mem_stats = GetFreeGpuMemory(&free, &total);
        KALDI_LOG << "cudaSetDevice(" << n << "): "
                  << name << "\t" << mem_stats;
        if (total <= 0) {
          KALDI_LOG << "Total memory reported for device " << n
                    << " is zero (or less).";
        }
        float mem_ratio = total > 0 ? free / (float)total : 0;
        free_mem_ratio[n] = std::make_pair(n, mem_ratio);
        // Destroy the CUDA context for the thread.
        cudaDeviceReset();
      } break;
      case cudaErrorDeviceAlreadyInUse :
        KALDI_LOG << "cudaSetDevice(" << n << "): "
                  << "Device cannot be accessed, used EXCLUSIVE-THREAD mode...";
        break;
      case cudaErrorInvalidDevice :
        KALDI_LOG << "cudaSetDevice(" << n << "): "
                  << "Device cannot be accessed, not a VALID CUDA device!";
        break;
      default :
        KALDI_LOG << "cudaSetDevice(" << n << "): "
                  << "returned " << ret << ", "
                  << cudaGetErrorString((cudaError_t)ret);
    }
  }

  // Try the GPUs in order from the largest to the smallest free-memory ratio.
  std::sort(free_mem_ratio.begin(), free_mem_ratio.end(),
            greater_pair<int, float>);

  size_t max_id = 0;
  float mem_ratio = 0.0;
  int32 dev_id = -1;
  bool success = false;
  do {
    dev_id = free_mem_ratio[max_id].first;
    mem_ratio = free_mem_ratio[max_id].second;

    KALDI_LOG << "Device: " << dev_id << ", mem_ratio: " << mem_ratio;
    success = SelectGpuId(dev_id);

    max_id++;
  } while (!success && (max_id < free_mem_ratio.size()));

  if (!success) {
    KALDI_WARN << "Failed to (automatically) select any device";
    return false;
  }
  KALDI_LOG << "Success selecting device " << dev_id
            << " free mem ratio: " << mem_ratio;
  return true;
}
void CuDevice::AccuProfile(const char *function_name,
                           const CuTimer &timer) {
  if (GetVerboseLevel() >= 1) {
    std::unique_lock<std::mutex> lock(profile_mutex_, std::defer_lock_t());
    if (multi_threaded_)
      lock.lock();
    std::string key(function_name);
    // Synchronize the per-thread default stream so that timer.Elapsed()
    // reflects the GPU work that has actually completed.
    CU_SAFE_CALL(cudaStreamSynchronize(0));
    double elapsed = timer.Elapsed();
    if (profile_map_.find(key) == profile_map_.end())
      profile_map_[key] = elapsed;
    else
      profile_map_[key] += elapsed;
  }
}
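// Illustrative sketch (not part of cu-device.cc): the timing pattern used by
// the GPU wrappers in the cudamatrix code and by CheckGpuHealth() below:
// start a CuTimer, do the GPU work, then accumulate the elapsed time under the
// calling function's name.  'ZeroOnGpuSketch' is a hypothetical example.
static void ZeroOnGpuSketch(kaldi::CuMatrix<kaldi::BaseFloat> *mat) {
  kaldi::CuTimer tim;
  mat->SetZero();  // stand-in for real GPU work
  kaldi::CuDevice::Instantiate().AccuProfile(__func__, tim);
}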
void CuDevice::PrintMemoryUsage() const {
  if (Enabled())
    g_cuda_allocator.PrintMemoryUsage();
}
void CuDevice::PrintProfile() {
  if (GetVerboseLevel() >= 1) {
    std::ostringstream os;
    os << "-----\n[cudevice profile]\n";
    unordered_map<std::string, double, StringHasher>::iterator it;
    std::vector<std::pair<double, std::string> > pairs;
    double total_time = 0.0;
    for (it = profile_map_.begin(); it != profile_map_.end(); ++it) {
      std::string function_name = it->first;
      double elapsed_time = it->second;
      total_time += elapsed_time;
      pairs.push_back(std::make_pair(elapsed_time, function_name));
    }
    // Sort ascending by time, so the tail of the log shows the most
    // expensive functions.
    std::sort(pairs.begin(), pairs.end());
    size_t max_print = 15, start_pos = (pairs.size() <= max_print ?
                                        0 : pairs.size() - max_print);
    for (size_t i = start_pos; i < pairs.size(); i++)
      os << pairs[i].second << "\t" << pairs[i].first << "s\n";
    os << "Total GPU time:\t" << total_time
       << "s (may involve some double-counting)\n";
    KALDI_LOG << os.str();
  }
}
void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
  // Prefill with something sensible in case the query fails.
  strncpy(name, "Unknown GPU", len);
#ifdef _MSC_VER
  cuDeviceGetName(name, len, dev);
#else
  // Look up cuDeviceGetName() dynamically from the driver library.
  void* libcuda = dlopen("libcuda.so", RTLD_LAZY);
  if (NULL == libcuda) {
    KALDI_WARN << "cannot open libcuda.so";
  } else {
    // Pointer type of cuDeviceGetName().
    typedef CUresult (*cu_fun_ptr)(char*, int, CUdevice);
    cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda, "cuDeviceGetName");
    if (NULL == cuDeviceGetName_ptr) {
      KALDI_WARN << "cannot load cuDeviceGetName from libcuda.so";
    } else {
      cuDeviceGetName_ptr(name, len, dev);
    }
    dlclose(libcuda);
  }
#endif
}
void CuDevice::CheckGpuHealth() {
  if (!Enabled()) return;
  CuTimer t;
  // Prepare small random matrices for a quick test.
  Matrix<BaseFloat> a(50, 100);
  Matrix<BaseFloat> b(100, 50);
  a.SetRandn();
  b.SetRandUniform();
  // Multiply the two small matrices on the CPU:
  Matrix<BaseFloat> c(50, 50);
  c.AddMatMat(1.0, a, kNoTrans, b, kNoTrans, 0.0);
  // Multiply the same matrices on the GPU:
  CuMatrix<BaseFloat> c1(50, 50);
  c1.AddMatMat(1.0, CuMatrix<BaseFloat>(a), kNoTrans, CuMatrix<BaseFloat>(b),
               kNoTrans, 0.0);
  // Check that the results agree.
  AssertEqual(c, Matrix<BaseFloat>(c1));
  // Measure the time spent in this check.
  AccuProfile(__func__, t);
}
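// Illustrative sketch (not part of cu-device.cc): a program can sanity-check
// the selected GPU right after choosing it; CheckGpuHealth() above compares a
// small CPU and GPU matrix product and aborts if they disagree.
static void SelectAndCheckGpuSketch(const std::string &use_gpu) {
#if HAVE_CUDA == 1
  kaldi::CuDevice::Instantiate().SelectGpuId(use_gpu);
  kaldi::CuDevice::Instantiate().CheckGpuHealth();
#endif
}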
CuDevice::CuDevice():
    cublas_handle_(NULL),
    cusparse_handle_(NULL),
    cusolverdn_handle_(NULL) {
}

CuDevice::~CuDevice() {
  if (cublas_handle_)
    CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_));
  if (cusparse_handle_)
    CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_));
  if (curand_handle_) {
    CURAND_SAFE_CALL(curandDestroyGenerator(curand_handle_));
  }
#if CUDA_VERSION >= 9010
  if (cusolverdn_handle_) {
    CUSOLVER_SAFE_CALL(cusolverDnDestroy(cusolverdn_handle_));
  }
#endif
}
// The per-thread CuDevice instance returned by CuDevice::Instantiate().
thread_local CuDevice CuDevice::this_thread_device_;

CuDevice::CuDeviceOptions CuDevice::device_options_;

// Define and initialize the remaining static members.
int32 CuDevice::device_id_ = -1;
bool CuDevice::multi_threaded_ = false;
unordered_map<std::string, double, StringHasher> CuDevice::profile_map_;
std::mutex CuDevice::profile_mutex_;
int64 CuDevice::free_memory_at_startup_;
cudaDeviceProp CuDevice::properties_;
bool CuDevice::debug_stride_mode_ = false;

// ...
  CU_SAFE_CALL(cudaGetLastError());
// ...

#else  // #if HAVE_CUDA == 1
// ...
#endif  // #if HAVE_CUDA == 1