// cudamatrix/cu-device.h

// Copyright 2009-2012  Karel Vesely
//           2012-2015  Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_
#define KALDI_CUDAMATRIX_CU_DEVICE_H_

#if HAVE_CUDA == 1
#include <cublas_v2.h>
#include <cusparse.h>
#include <curand.h>
#include <map>
#include <string>
#include <iostream>
#include <mutex>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "base/kaldi-common.h"
#include "base/timer.h"
#include "cudamatrix/cu-allocator.h"
#include "cudamatrix/cu-common.h"

#if CUDA_VERSION >= 9010
#include <cusolverDn.h>
#else
// cusolver is not supported.
// Define a few types here to minimize the compiler guards needed elsewhere.
// If a user tries to use cusolver it will throw an error.
typedef void* cusolverDnHandle_t;
typedef int cusolverStatus_t;
#endif

namespace kaldi {

class CuTimer;

class CuDevice {
 public:

  // You obtain the CuDevice for the current thread by calling
  // CuDevice::Instantiate().
  // At the beginning of the program, if you want to use a GPU, you
  // should call CuDevice::Instantiate().SelectGpuId(..).
  static inline CuDevice& Instantiate() {
    CuDevice &ans = this_thread_device_;
    if (!ans.initialized_)
      ans.Initialize();
    return ans;
  }
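
  // A minimal usage sketch (not part of the original header): a typical
  // program selects the GPU once, from the main thread, right after option
  // parsing, e.g.:
  //
  //   CuDevice::Instantiate().SelectGpuId("yes");
  //
  // After this, the per-thread CuDevice objects returned by Instantiate()
  // attach lazily to the selected device.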

  inline cublasHandle_t GetCublasHandle() { return cublas_handle_; }
  inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; }
  inline curandGenerator_t GetCurandHandle() { return curand_handle_; }

  inline cusolverDnHandle_t GetCusolverDnHandle() {
#if CUDA_VERSION < 9010
    KALDI_ERR << "CUDA VERSION '" << CUDA_VERSION << "' not new enough to "
              << "support cusolver. Upgrade to at least 9.1.";
#endif
    return cusolverdn_handle_;
  }

  inline void SeedGpu() {
    if (CuDevice::Instantiate().Enabled()) {
      // To get the same random sequence on each run, call srand() before this
      // method is invoked.
      CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(
          curand_handle_, RandInt(128, RAND_MAX)));
      CURAND_SAFE_CALL(curandSetGeneratorOffset(curand_handle_, 0));
    }
  }
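
  // Sketch (illustrative only): seeding the GPU reproducibly. SeedGpu() draws
  // its curand seed via RandInt(), which follows the process RNG, so:
  //
  //   srand(seed);                         // 'seed' is a hypothetical int
  //   CuDevice::Instantiate().SeedGpu();   // same seed -> same GPU sequence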
  // We provide functions Malloc(), MallocPitch() and Free() which replace
  // cudaMalloc(), cudaMallocPitch() and cudaFree(). Their function is to cache
  // the results of previous allocations to avoid the very large overhead that
  // CUDA's allocation seems to give for some setups.
  inline void* Malloc(size_t size) {
    return multi_threaded_ ? g_cuda_allocator.MallocLocking(size) :
        g_cuda_allocator.Malloc(size);
  }

  inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) {
    if (multi_threaded_) {
      return g_cuda_allocator.MallocPitchLocking(row_bytes, num_rows, pitch);
    } else if (debug_stride_mode_) {
      // The pitch bucket size is hardware dependent.
      // It is 512 on K40c with CUDA 7.5.
      // Adding a random multiple (0 to 4) of the bucket size makes it likely
      // that adjacent allocations get different pitches, even if their
      // requested row_bytes would give the same pitch in normal mode.
      return g_cuda_allocator.MallocPitch(
          row_bytes + 512 * RandInt(0, 4), num_rows,
          pitch);
    } else {
      return g_cuda_allocator.MallocPitch(row_bytes, num_rows, pitch);
    }
  }

  inline void Free(void *ptr) {
    if (multi_threaded_) g_cuda_allocator.FreeLocking(ptr);
    else g_cuda_allocator.Free(ptr);
  }
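
  // Sketch (illustrative only): the cached-allocator calls mirror the CUDA
  // ones they replace; num_rows and num_cols below are hypothetical:
  //
  //   size_t pitch;
  //   void *data = CuDevice::Instantiate().MallocPitch(
  //       num_cols * sizeof(float), num_rows, &pitch);
  //   // ... use 'data' as a device pointer with row stride 'pitch' ...
  //   CuDevice::Instantiate().Free(data);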

  // Selects the GPU to use. The argument is the value of the --use-gpu
  // option; it is expected to be "yes", "no", "optional" or "wait".
  void SelectGpuId(std::string use_gpu);

  // Select a specific GPU for computation. Will reuse the existing CUDA
  // context for that device. Initializes the necessary handles for GPU use
  // (e.g. the cublas handle).
  bool SelectAndInitializeGpuIdWithExistingCudaContext(int dev_id);

  // Returns true if a GPU is currently enabled for use.
  bool Enabled() const {
    return (device_id_ > -1);
  }

  // Returns true if the active GPU supports double precision.
  bool DoublePrecisionSupported();

  // Accumulates timing information under the given function name, if the
  // verbose level is >= 1; see profile_map_ below.
  void AccuProfile(const char *function_name, const CuTimer &timer);

  // Prints the accumulated profiling information.
  void PrintProfile();

  // Prints the device's memory usage.
  void PrintMemoryUsage() const;

  // Call this to indicate that multiple threads will access the GPU, so that
  // locking is used in the allocator and the profiling code.
  inline void AllowMultithreading() { multi_threaded_ = true; }

  // Gets the name of the device with the given ID.
  void DeviceGetName(char* name, int32 len, int32 dev);

  // Checks that the GPU is in a healthy state (e.g. by running a small test
  // computation).
  void CheckGpuHealth();

  int32 GetMatrixAlignment() const;

  // Activates or deactivates the debug-stride mode (see debug_stride_mode_
  // below); returns the previous mode.
  bool SetDebugStrideMode(bool mode) {
    bool old_mode = debug_stride_mode_;
    debug_stride_mode_ = mode;
    return old_mode;
  }
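
  // Sketch (illustrative only): because SetDebugStrideMode() returns the
  // previous mode, a test can enable it temporarily and restore it afterwards:
  //
  //   bool old_mode = CuDevice::Instantiate().SetDebugStrideMode(true);
  //   // ... run the code under test ...
  //   CuDevice::Instantiate().SetDebugStrideMode(old_mode);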

  // Returns true if the GPU is set to compute-exclusive mode.
  bool IsComputeExclusive();

  // Register command line options for the CUDA device.
  // This must be done before calling CuDevice::Initialize().
  // Example:
  //  CuDevice::RegisterDeviceOptions(&po);
  //  po.Read(argc, argv);
  //  CuDevice::Initialize();
  static void RegisterDeviceOptions(OptionsItf *po) {
    CuDevice::device_options_.Register(po);
  }
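
  // A slightly fuller sketch of the example above (illustrative only; assumes
  // a typical Kaldi binary using ParseOptions, with a hypothetical 'usage'
  // string):
  //
  //   ParseOptions po(usage);
  //   CuDevice::RegisterDeviceOptions(&po);
  //   po.Read(argc, argv);
  //   CuDevice::Instantiate().SelectGpuId("yes");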
  ~CuDevice();
 private:

  struct CuDeviceOptions {
    bool use_tensor_cores;  // Enable tensor cores
    CuDeviceOptions(): use_tensor_cores(false) { }
    void Register(OptionsItf *po) {
      po->Register("cuda-use-tensor-cores", &use_tensor_cores,
                   "Enable FP16 tensor math. "
                   "This gives higher performance but lower accuracy. "
                   "This is only recommended for inference.");
    }
  };

  static CuDeviceOptions device_options_;

  // Default constructor used to initialize this_thread_device_.
  CuDevice();
  CuDevice(CuDevice&);  // Disallow.
  CuDevice &operator=(CuDevice&);  // Disallow.


  // Sets up the CUDA context and the cublas/cusparse/curand handles for this
  // thread; called lazily from Instantiate().
  void Initialize();

  // Automatically selects a GPU to use.
  bool SelectGpuIdAuto();

  // Selects the GPU given its ID. Called from SelectGpuIdAuto or
  // SelectGpuIdWithExistingCudaContext.
  bool SelectGpuId(int dev_id);

  // Finishes initialization of the GPU that was selected.
  void FinalizeActiveGpu();

  // Returns the major version of the device's compute capability.
  int32 MajorDeviceVersion();

  // Returns the minor version of the device's compute capability.
  int32 MinorDeviceVersion();


  // Each thread has its own CuDevice object, which contains the cublas and
  // cusparse handles. These are unique to the thread (which is what is
  // recommended by NVidia).
  static thread_local CuDevice this_thread_device_;

  // The GPU device-id that we are using. This will be initialized to -1, and
  // will be set when the user calls
  //  CuDevice::Instantiate().SelectGpuId(...)
  // from the main thread. Background threads will call
  // cudaSetDevice(device_id_) the first time CuDevice::Instantiate() is
  // called from them after they are spawned.
  static int32 device_id_;

  // This will automatically be set to true if the application has multiple
  // threads that access the GPU device. It is used to know whether to
  // use locks when accessing the allocator and the profiling-related code.
  static bool multi_threaded_;

  // The variable profile_map_ will only be used if the verbose level is >= 1;
  // it will accumulate some function-level timing information that is printed
  // out at program end. This makes things a bit slower as we have to call
  // cudaDeviceSynchronize() to make the timing information meaningful.
  static unordered_map<std::string, double, StringHasher> profile_map_;
  // profile_mutex_ guards profile_map_ in case multi_threaded_ is true.
  static std::mutex profile_mutex_;

  // free_memory_at_startup_ is just used in printing the memory used according
  // to the device.
  static int64 free_memory_at_startup_;
  static cudaDeviceProp properties_;

  // If set to true by SetDebugStrideMode(), code will be activated to use
  // pseudo-random stride values when allocating data (to detect errors which
  // otherwise would be rare).
  static bool debug_stride_mode_;


  // The following member variable is initialized to false; if the user calls
  // Instantiate() in a thread where it is still false, Initialize() will be
  // called, in order to -- if a GPU is being used -- call cudaSetDevice() and
  // set up the cublas and cusparse handles.
  bool initialized_;

  // This variable is just a copy of the static variable device_id_. It's used
  // to detect when this code is called in the wrong way.
  int32 device_id_copy_;

  cublasHandle_t cublas_handle_;
  cusparseHandle_t cusparse_handle_;
  curandGenerator_t curand_handle_;
  cusolverDnHandle_t cusolverdn_handle_;
};  // class CuDevice


// Class CuTimer is a convenience wrapper for class Timer which only
// sets the time if the verbose level is >= 1. This helps avoid
// an unnecessary system call if the verbose level is 0 and you
// won't be accumulating the timing stats.
class CuTimer: public Timer {
 public:
  CuTimer(): Timer(GetVerboseLevel() >= 1) { }
};
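
// Sketch (illustrative only): the intended pattern for profiling a function
// that launches GPU work, using CuTimer together with AccuProfile();
// SomeCudaFunction is a hypothetical name:
//
//   void SomeCudaFunction() {
//     CuTimer tim;
//     // ... launch kernels ...
//     CuDevice::Instantiate().AccuProfile(__func__, tim);
//   }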

// This function is declared as a more convenient way to get the cuBLAS handle
// for use in the CUBLAS v2 API, since we so frequently need to access it.
inline cublasHandle_t GetCublasHandle() {
  return CuDevice::Instantiate().GetCublasHandle();
}

inline cusolverDnHandle_t GetCusolverDnHandle() {
  return CuDevice::Instantiate().GetCusolverDnHandle();
}

// A more convenient way to get the handle to use cuSPARSE APIs.
inline cusparseHandle_t GetCusparseHandle() {
  return CuDevice::Instantiate().GetCusparseHandle();
}

inline curandGenerator_t GetCurandHandle() {
  return CuDevice::Instantiate().GetCurandHandle();
}
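
// Sketch (illustrative only): using the free-function getter in a cuBLAS
// call; n, x and y are hypothetical, with x and y device pointers of length n:
//
//   float alpha = 1.0f;
//   CUBLAS_SAFE_CALL(cublasSaxpy(GetCublasHandle(), n, &alpha, x, 1, y, 1));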


}  // namespace kaldi

#endif  // HAVE_CUDA


namespace kaldi {

// The function SynchronizeGpu() is, for convenience, defined whether or not
// we have compiled for CUDA.
void SynchronizeGpu();

}  // namespace kaldi

#endif  // KALDI_CUDAMATRIX_CU_DEVICE_H_