22 #ifndef KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ 23 #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ 26 #include <cublas_v2.h> 28 #include <cuda_runtime_api.h> 68 cache_memory(true), memory_proportion(0.5), num_subregions(20) { }
// Registers the cache_memory flag with the options interface under the name
// "cuda-cache-memory"; the string literal that follows is its help text.
71 po->
Register(
"cuda-cache-memory", &cache_memory,
"True if you want " 72 "to use the caching allocator. Set this to false only if you " 73 "want to use cuda-memcheck or cuda-gdb; it will be slower.");
// Registers memory_proportion under the name "cuda-memory-proportion".
// The accepted range is enforced separately by a KALDI_ASSERT on this member.
74 po->
Register(
"cuda-memory-proportion", &memory_proportion,
75 "Proportion of the GPU device memory that the allocator " 76 "should allocate at the start");
// Sanity check on the configured value: memory_proportion must lie in the
// half-open interval [0.05, 0.99), i.e. 0.99 itself is rejected.
81 KALDI_ASSERT(memory_proportion >= 0.05 && memory_proportion < 0.99);
153 class CuMemoryAllocator {
// Declaration only; the definition is not in this header section.
// Takes a byte count and returns an untyped pointer.
// NOTE(review): this signature shows no locking; MallocLocking is the variant
// that acquires mutex_. Presumably callers of this one must serialize access
// themselves -- confirm.
157 void* Malloc(
size_t size);
// Declaration only; the definition is not in this header section.
// MallocPitchLocking acquires mutex_ and then forwards its three arguments to
// this function unchanged, so this is the variant that is called with the lock
// already held (or with no lock at all).
//   row_bytes, num_rows: presumably the width in bytes of one row and the
//                        number of rows requested -- TODO confirm units.
//   pitch:               pointer argument; presumably receives the row stride
//                        chosen by the allocator -- TODO confirm.
160 void* MallocPitch(
size_t row_bytes,
size_t num_rows,
size_t *pitch);
// Declaration only; the definition is not in this header section.
// Takes an untyped pointer and returns nothing.
// NOTE(review): presumably `ptr` must have come from Malloc/MallocPitch of
// this allocator -- confirm. FreeLocking is the variant that acquires mutex_.
163 void Free(
void *ptr);
// Mutex-guarded allocation entry point. The first statement constructs a
// std::unique_lock on mutex_, so the mutex stays held until `lock` goes out
// of scope.
// NOTE(review): the statements after the lock are missing from this copy of
// the file; presumably the body forwards to Malloc(size), mirroring what
// MallocPitchLocking does with MallocPitch -- confirm against the full header.
166 inline void* MallocLocking(
size_t size) {
167 std::unique_lock<std::mutex> lock(mutex_);
// Mutex-guarded wrapper around MallocPitch.
// Acquires mutex_ through a std::unique_lock, then forwards row_bytes,
// num_rows and pitch to MallocPitch unchanged and returns its result. The
// lock object is a local, so the mutex is released when the function exits.
171 inline void* MallocPitchLocking(
size_t row_bytes,
size_t num_rows,
size_t *pitch) {
172 std::unique_lock<std::mutex> lock(mutex_);
173 return MallocPitch(row_bytes, num_rows, pitch);
// Mutex-guarded deallocation entry point. The first statement constructs a
// std::unique_lock on mutex_, so the mutex stays held until `lock` goes out
// of scope.
// NOTE(review): the statements after the lock are missing from this copy of
// the file; presumably the body calls Free(ptr) -- confirm against the full
// header.
176 void FreeLocking(
void *ptr) {
177 std::unique_lock<std::mutex> lock(mutex_);
// Declaration only; the definition is not in this header section.
// Const-qualified, so it can be called on a const allocator object.
// NOTE(review): where the report is written (log, stdout, ...) cannot be
// determined from this declaration -- see the definition.
181 void PrintMemoryUsage()
const;
// Returns the current value of the allocated_memory_ member.
// NOTE(review): this read does not take mutex_, unlike the *Locking methods,
// and the accessor is not const-qualified even though it only reads a member
// (PrintMemoryUsage is const) -- confirm both are intentional.
184 size_t GetAllocatedMemory() {
return allocated_memory_; }
// Returns the current value of the max_allocated_memory_ member.
// NOTE(review): this read does not take mutex_, unlike the *Locking methods,
// and the accessor is not const-qualified even though it only reads a member
// (PrintMemoryUsage is const) -- confirm both are intentional.
187 size_t GetMaxAllocatedMemory() {
return max_allocated_memory_; }
196 ~CuMemoryAllocator();
205 SubRegion *subregion;
215 std::thread::id thread_id;
238 struct MemoryRegion {
241 SubRegion *subregion_begin;
243 MemoryBlock *block_begin;
260 size_t memory_region;
263 size_t subregion_index;
270 std::set<std::pair<size_t, MemoryBlock*> > free_blocks;
// Internal helper declarations. Only the signatures are visible here, so the
// notes below describe the signatures; anything about behavior is marked as
// an assumption to be checked against the definitions.
//
// NOTE(review): MallocInternal, MallocFromSubregion and SplitBlock are
// declared `inline` with no body in view. An inline function must be defined
// in every translation unit that uses it -- confirm the definitions live
// where all callers can see them.

// Takes a byte count, returns an untyped pointer.
279 inline void* MallocInternal(
size_t size);
// Takes a SubRegion pointer and a byte count, returns an untyped pointer.
// Presumably returns a null pointer when the subregion cannot satisfy the
// request -- TODO confirm.
283 inline void* MallocFromSubregion(SubRegion *subregion,
size_t size);
// Takes a MemoryBlock pointer and a byte count, returns a MemoryBlock pointer.
// Which part of the block the returned pointer refers to is not visible
// here -- TODO confirm.
294 inline MemoryBlock *SplitBlock(MemoryBlock *block,
size_t size);
// Presumably erases `block` from a `free_blocks` set such as the one declared
// earlier in this header -- TODO confirm.
298 void RemoveFromFreeBlocks(MemoryBlock *block);
// Presumably the inverse of RemoveFromFreeBlocks -- TODO confirm.
302 void AddToFreeBlocks(MemoryBlock *block);
// Takes a byte count; presumably grows memory_regions_ -- TODO confirm.
308 void AllocateNewRegion(
size_t size);
// Takes no arguments; presumably reorders subregions_ -- TODO confirm the
// sort key.
313 void SortSubregions();
319 std::vector<MemoryRegion> memory_regions_;
321 std::vector<SubRegion*> subregions_;
328 std::vector<size_t> largest_free_block_;
331 size_t synchronize_gpu_t_;
333 size_t num_synchronizations_;
334 double tot_time_taken_;
335 double malloc_time_taken_;
339 std::unordered_map<void*, MemoryBlock*> allocated_block_map_;
347 size_t max_allocated_memory_;
348 size_t allocated_memory_;
// Declaration only. Returns a std::string and takes two int64 pointers.
// NOTE(review): presumably `free` and `total` are outputs that receive the
// device's free and total memory, and the string is a human-readable summary
// -- confirm units and whether null pointers are accepted.
357 std::string GetFreeGpuMemory(int64* free, int64* total);
359 extern CuMemoryAllocator g_cuda_allocator;
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
CuAllocatorOptions g_allocator_options
void Register(OptionsItf *po)
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
BaseFloat memory_proportion
#define KALDI_ASSERT(cond)
void RegisterCuAllocatorOptions(OptionsItf *po)