23 #ifndef KALDI_CUDAMATRIX_CU_DEVICE_H_ 24 #define KALDI_CUDAMATRIX_CU_DEVICE_H_ 27 #include <cublas_v2.h> 34 #include <cuda_runtime_api.h> 40 #if CUDA_VERSION >= 9010 41 #include <cusolverDn.h> 46 typedef void* cusolverDnHandle_t;
47 typedef int cusolverStatus_t;
// Accessor for the calling thread's CuDevice object: binds a reference to the
// static thread_local member this_thread_device_ and tests its initialized_
// flag.
// NOTE(review): the statement guarded by the `if` and the final return are not
// visible in this excerpt -- presumably a lazy initialization followed by
// returning `ans`; confirm against the full header.
86 static inline CuDevice& Instantiate() {
87 CuDevice &ans = this_thread_device_;
88 if (!ans.initialized_)
93 inline cublasHandle_t GetCublasHandle() {
return cublas_handle_; }
94 inline cusparseHandle_t GetCusparseHandle() {
return cusparse_handle_; }
95 inline curandGenerator_t GetCurandHandle() {
return curand_handle_; }
// Returns the cusolverdn_handle_ member.
// When CUDA_VERSION < 9010 the KALDI_ERR statement below is compiled in ahead
// of the return and reports that the CUDA version is too old for cusolver.
// NOTE(review): the matching #endif and the closing brace are not visible in
// this excerpt.
96 inline cusolverDnHandle_t GetCusolverDnHandle() {
97 #if CUDA_VERSION < 9010 98 KALDI_ERR <<
"CUDA VERSION '" << CUDA_VERSION <<
"' not new enough to support " 99 <<
"cusolver. Upgrade to at least 9.1";
101 return cusolverdn_handle_;
// Re-seeds the cuRAND generator held in curand_handle_.
// Does nothing unless CuDevice::Instantiate().Enabled() is true. When it is,
// the pseudo-random seed is set to RandInt(128, RAND_MAX) and the generator
// offset is then reset to 0. Both cuRAND calls go through CURAND_SAFE_CALL.
104 inline void SeedGpu() {
105 if (CuDevice::Instantiate().Enabled()) {
107 CURAND_SAFE_CALL(curandSetPseudoRandomGeneratorSeed(
108 curand_handle_,
RandInt(128, RAND_MAX)));
109 CURAND_SAFE_CALL(curandSetGeneratorOffset(curand_handle_, 0));
// Forwards an allocation request for `size` to g_cuda_allocator and returns
// the pointer it yields. MallocLocking() is used when the static
// multi_threaded_ flag is set, plain Malloc() otherwise.
116 inline void* Malloc(
size_t size) {
117 return multi_threaded_ ? g_cuda_allocator.MallocLocking(size) :
118 g_cuda_allocator.Malloc(size);
// Pitched allocation through g_cuda_allocator; `pitch` is passed through to
// the allocator as an output pointer. Three paths:
//   * multi_threaded_ set       -> MallocPitchLocking(row_bytes, num_rows, pitch)
//   * debug_stride_mode_ set    -> MallocPitch with a randomly widened row
//   * otherwise                 -> MallocPitch(row_bytes, num_rows, pitch)
// NOTE(review): the tail of the debug-mode call and the closing braces are not
// visible in this excerpt.
121 inline void* MallocPitch(
size_t row_bytes,
size_t num_rows,
size_t *pitch) {
122 if (multi_threaded_) {
123 return g_cuda_allocator.MallocPitchLocking(row_bytes, num_rows, pitch);
124 }
else if (debug_stride_mode_) {
// Debug stride mode: widen the requested row by 512 * RandInt(0, 4) bytes --
// presumably so that code relying on one fixed stride gets exposed; confirm.
129 return g_cuda_allocator.MallocPitch(
130 row_bytes + 512 *
RandInt(0, 4), num_rows,
133 return g_cuda_allocator.MallocPitch(row_bytes, num_rows, pitch);
// Hands `ptr` back to g_cuda_allocator: FreeLocking() when the static
// multi_threaded_ flag is set, plain Free() otherwise.
137 inline void Free(
void *ptr) {
138 if (multi_threaded_) g_cuda_allocator.FreeLocking(ptr);
139 else g_cuda_allocator.Free(ptr);
// Declaration only; the definition is not in this header excerpt.
// NOTE(review): `use_gpu` is presumably a mode string selecting whether/how a
// GPU is used -- the accepted values are not visible here; confirm.
155 void SelectGpuId(std::string use_gpu);
// Declaration only. Takes an integer device id and reports a bool result.
// NOTE(review): judging by the name, this is meant for callers that already
// own a CUDA context -- confirm against the definition.
160 bool SelectAndInitializeGpuIdWithExistingCudaContext(
int dev_id);
// True when the static device_id_ is greater than -1.
// NOTE(review): presumably a negative device_id_ means "no GPU selected";
// confirm where device_id_ is assigned.
163 bool Enabled()
const {
164 return (device_id_ > -1);
// Declaration only; returns a bool. NOTE(review): presumably reports whether
// the selected device supports double precision -- confirm in the definition.
169 bool DoublePrecisionSupported();
// Declaration only. Takes a function name and a CuTimer.
// NOTE(review): presumably accumulates the timer's elapsed time under
// `function_name` in profile_map_ -- confirm in the definition.
174 void AccuProfile(
const char *function_name,
const CuTimer &timer);
// Declaration only; const member, so it does not modify non-mutable
// per-object state.
180 void PrintMemoryUsage()
const;
186 inline void AllowMultithreading() { multi_threaded_ =
true; }
// Declaration only. NOTE(review): presumably writes the name of device `dev`
// into the caller-supplied buffer `name` of capacity `len` -- confirm.
189 void DeviceGetName(
char* name,
int32 len,
int32 dev);
// Declaration only; no parameters, no return value.
193 void CheckGpuHealth();
// Declaration only; const member returning an int32.
// NOTE(review): the unit of the returned alignment is not visible here.
198 int32 GetMatrixAlignment()
const;
// Sets the static debug_stride_mode_ flag to `mode`, after saving the previous
// value in old_mode. MallocPitch() reads this flag to decide whether to widen
// rows randomly.
// NOTE(review): the return statement is not visible in this excerpt --
// presumably it returns old_mode; confirm.
207 bool SetDebugStrideMode(
bool mode) {
208 bool old_mode = debug_stride_mode_;
209 debug_stride_mode_ = mode;
// Declaration only; returns a bool. NOTE(review): presumably reports whether
// the device is in an exclusive compute mode -- confirm in the definition.
218 bool IsComputeExclusive();
// Static helper: registers the options held in the static
// CuDevice::device_options_ object with the option parser `po` by forwarding
// to CuDeviceOptions::Register().
226 static void RegisterDeviceOptions(OptionsItf *po) {
227 CuDevice::device_options_.Register(po);
// Option bundle for the CUDA device; only one option is visible here.
// NOTE(review): the closing braces of Register() and of the struct are not
// visible in this excerpt.
232 struct CuDeviceOptions {
// Backing storage for the "cuda-use-tensor-cores" option.
233 bool use_tensor_cores;
// Default-constructs with use_tensor_cores = false.
234 CuDeviceOptions () : use_tensor_cores(false) {};
// Registers "cuda-use-tensor-cores" with `po`, bound to use_tensor_cores,
// together with the help text below.
235 void Register(OptionsItf *po) {
236 po->Register(
"cuda-use-tensor-cores", &use_tensor_cores,
237 "Enable FP16 tensor math. " 238 "This is higher performance but less accuracy. " 239 "This is only recommended for inference.");
// Static options object; RegisterDeviceOptions() registers it with the option
// parser.
243 static CuDeviceOptions device_options_;
// Assignment operator, declared only.
// NOTE(review): presumably declared without a definition (or privately) to
// disallow assignment -- the access specifier is not visible here; confirm.
248 CuDevice &operator=(CuDevice&);
// Helper declarations; the definitions are not in this header excerpt.
// NOTE(review): presumably SelectGpuIdAuto() picks a device automatically and
// SelectGpuId(int) selects a specific one, each returning success -- confirm.
261 bool SelectGpuIdAuto();
265 bool SelectGpuId(
int dev_id);
// Declaration only; no parameters, no return value.
274 void FinalizeActiveGpu();
// Declarations only; each returns an int32.
// NOTE(review): presumably the major/minor compute-capability numbers of the
// device -- confirm in the definitions.
277 int32 MajorDeviceVersion();
280 int32 MinorDeviceVersion();
// One CuDevice object per thread (thread_local); Instantiate() hands out a
// reference to it.
286 static thread_local CuDevice this_thread_device_;
// Static, so shared by every CuDevice object; Enabled() tests it for > -1.
294 static int32 device_id_;
// Static flag set by AllowMultithreading(); makes Malloc/MallocPitch/Free use
// the *Locking allocator entry points.
299 static bool multi_threaded_;
// Static map from a string key to a double.
// NOTE(review): presumably function name -> accumulated time, filled by
// AccuProfile() -- confirm.
305 static unordered_map<std::string, double, StringHasher> profile_map_;
// NOTE(review): presumably guards profile_map_ -- confirm at the use sites.
307 static std::mutex profile_mutex_;
311 static int64 free_memory_at_startup_;
312 static cudaDeviceProp properties_;
// Static flag toggled by SetDebugStrideMode() and read by MallocPitch().
317 static bool debug_stride_mode_;
// Non-static (per-object) members. Because the CuDevice instance handed out by
// Instantiate() is thread_local, each thread's object carries its own copies.
// NOTE(review): device_id_copy_ is presumably a per-object copy of the static
// device_id_ -- confirm where it is assigned.
328 int32 device_id_copy_;
// Library handles returned by the Get*Handle() accessors.
330 cublasHandle_t cublas_handle_;
331 cusparseHandle_t cusparse_handle_;
332 curandGenerator_t curand_handle_;
333 cusolverDnHandle_t cusolverdn_handle_;
// CuTimer publicly derives from Timer; AccuProfile() takes one by const
// reference. NOTE(review): the class body is not visible in this excerpt.
341 class CuTimer:
public Timer {
// Free-function shortcut: returns the cuBLAS handle of the CuDevice object
// obtained from CuDevice::Instantiate().
348 inline cublasHandle_t GetCublasHandle() {
349 return CuDevice::Instantiate().GetCublasHandle();
// Free-function shortcut: returns the cuSOLVER dense handle of the CuDevice
// object obtained from CuDevice::Instantiate().
352 inline cusolverDnHandle_t GetCusolverDnHandle() {
353 return CuDevice::Instantiate().GetCusolverDnHandle();
// Free-function shortcut: returns the cuSPARSE handle of the CuDevice object
// obtained from CuDevice::Instantiate().
357 inline cusparseHandle_t GetCusparseHandle() {
358 return CuDevice::Instantiate().GetCusparseHandle();
// Free-function shortcut: returns the cuRAND generator of the CuDevice object
// obtained from CuDevice::Instantiate().
361 inline curandGenerator_t GetCurandHandle() {
362 return CuDevice::Instantiate().GetCurandHandle();
395 #endif // KALDI_CUDAMATRIX_CU_DEVICE_H_ This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
int32 GetVerboseLevel()
Get the verbosity level, usually set via the command-line '--verbose=' switch.
void SynchronizeGpu()
The function SynchronizeGpu(), which for convenience is defined whether or not we have compiled for C...
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)