26 #include <cublas_v2.h> 28 #include <cuda_runtime_api.h> 47 void* CuMemoryAllocator::Malloc(
// Allocate 'size' bytes of device memory.  When caching is disabled
// (!opts_.cache_memory) this falls straight through to cudaMalloc and
// accounts the elapsed time; otherwise the request is rounded up to a
// multiple of 256 bytes and served from the cached pool via MallocInternal().
// NOTE(review): this extraction is elided (embedded line numbers jump, e.g.
// 54 -> 64) -- the Timer 'tim', the uncached 'ans' declaration/return, and
// the else keyword are not visible here; confirm against the full source.
size_t size) {
49 if (!opts_.cache_memory) {
// Uncached path: defer directly to the CUDA runtime allocator.
51 CU_SAFE_CALL(cudaMalloc(&ans, size));
52 double elapsed = tim.Elapsed();
// tot_time_taken_ accumulates all allocator time; malloc_time_taken_ only
// the time spent inside the CUDA allocation calls themselves.
53 tot_time_taken_ += elapsed;
54 malloc_time_taken_ += elapsed;
// Cached path: round up to a 256-byte multiple so every cached block keeps
// 256-byte alignment.
64 size = (size + 255) & ~((
size_t)255);
65 void *ans = MallocInternal(size);
66 tot_time_taken_ += tim.Elapsed();
71 CuMemoryAllocator::MemoryBlock *CuMemoryAllocator::SplitBlock(
// Split a free block larger than needed into an exactly-'size' piece for
// the caller plus a leftover free piece.  Which end of the block becomes
// the caller's piece depends on the neighbors' logical times 't' --
// presumably to keep recently-used memory adjacent; confirm the rationale
// against the full source.
// NOTE(review): elided extraction -- among others the declaration of
// 'new_begin', the else keyword, the return statement and several closing
// braces (original lines 74-96, 100-101, 104-113, 125, 129-131, 134-135
// and onward) are missing here.
72 MemoryBlock *block,
size_t size) {
73 SubRegion *subregion = block->subregion;
76 MemoryBlock *new_block =
new MemoryBlock;
// True if the newly created block (rather than the shrunk original) is the
// piece handed back to the caller.
77 bool return_new_block;
// Heuristic: when both neighbors exist, the previous one is older than the
// next one, and the next block shares this subregion, carve the caller's
// piece from the END of the block; otherwise from the beginning.
97 if (block->next != NULL && block->prev != NULL &&
98 block->prev->t < block->next->t &&
99 block->next->subregion == subregion) {
102 return_new_block =
true;
103 new_begin = block->end - size;
106 return_new_block =
false;
107 new_begin = block->begin + size;
// Advance to the subregion that actually contains new_begin (the second
// half of a split can start past the current subregion's end).
114 while (new_begin >= subregion->end)
115 subregion = subregion->next;
// Link new_block into the doubly-linked list right after 'block': it covers
// [new_begin, old end) and 'block' shrinks to [begin, new_begin).
116 MemoryBlock *next_block = block->next;
117 new_block->begin = new_begin;
118 new_block->end = block->end;
119 new_block->subregion = subregion;
120 new_block->allocated =
false;
121 new_block->thread_id = block->thread_id;
122 new_block->t = block->t;
123 new_block->next = next_block;
124 new_block->prev = block;
126 next_block->prev = new_block;
127 block->next = new_block;
128 block->end = new_begin;
// Whichever half is NOT returned to the caller goes back on the free list.
132 if (return_new_block) {
133 AddToFreeBlocks(block);
136 AddToFreeBlocks(new_block);
142 void CuMemoryAllocator::RemoveFromFreeBlocks(MemoryBlock *block) {
// Remove 'block' from its subregion's size-ordered free-block set and, if
// it was the largest free block there, refresh the cached
// largest_free_block_ entry: 0 when the set became empty, otherwise the
// new maximum (rbegin() of the ordered set).
// NOTE(review): elided extraction -- original lines 147-148 (presumably an
// assertion on num_removed), the 'else' keyword (153) and closing braces
// (156-157) are missing.
143 SubRegion *subregion = block->subregion;
144 size_t block_size = block->end - block->begin;
// The set is keyed on (size, pointer), so erase needs both.
145 std::pair<size_t, MemoryBlock*> p(block_size, block);
146 size_t num_removed = subregion->free_blocks.erase(p);
149 size_t subregion_index = subregion->subregion_index;
150 if (block_size == largest_free_block_[subregion_index]) {
151 if (subregion->free_blocks.empty())
152 largest_free_block_[subregion_index] = 0;
154 largest_free_block_[subregion_index] =
155 subregion->free_blocks.rbegin()->first;
159 void CuMemoryAllocator::AddToFreeBlocks(MemoryBlock *block) {
// Insert 'block' into its subregion's size-ordered free-block set and keep
// the cached per-subregion maximum (largest_free_block_) up to date.
// NOTE(review): elided extraction -- line 162 below is the tail of an
// assertion (presumably KALDI_PARANOID_ASSERT) whose opening is missing,
// and the closing braces are not visible.
160 SubRegion *subregion = block->subregion;
162 block->begin < subregion->end);
163 size_t block_size = block->end - block->begin,
164 subregion_index = subregion->subregion_index;
166 if (block_size > largest_free_block_[subregion_index]) {
167 largest_free_block_[subregion_index] = block_size;
169 subregion->free_blocks.insert(std::pair<size_t, MemoryBlock*>(block_size, block));
173 void* CuMemoryAllocator::MallocFromSubregion(SubRegion *subregion,
// Allocate 'size' bytes from a specific subregion: lower_bound finds the
// smallest free block of at least 'size' bytes; a bounded forward search
// then prefers a candidate that either was last used by this thread or is
// old enough (t <= synchronize_gpu_t_) to not require synchronization.
// The chosen block is removed from the free set, split if its surplus is
// large, and may trigger a device-wide synchronization when memory crosses
// threads.  NOTE(review): heavily elided extraction -- the second
// signature line, 'max_iters', the loop header's init, several else
// branches/braces and the return statement are missing (original lines
// 174-177, 179, 182-189, 191, 196-200, 202-204, 206-207, 213-221,
// 224-228, 231-232, 235, 237, 242-246).
178 std::pair<size_t, MemoryBlock*> p(size, (MemoryBlock*)0);
// Iterator to the smallest free block whose size is >= 'size'.
180 std::set<std::pair<size_t, MemoryBlock*> >::iterator iter =
181 subregion->free_blocks.lower_bound(p);
// Bounded search for a block we can reuse without synchronizing the GPU.
190 auto search_iter = iter;
192 search_iter != subregion->free_blocks.end() &&
i < max_iters;
193 ++
i, ++search_iter) {
194 if (search_iter->second->thread_id == std::this_thread::get_id() ||
195 search_iter->second->t <= synchronize_gpu_t_) {
201 MemoryBlock *block = iter->second;
205 subregion->free_blocks.erase(iter);
// Maintain the cached largest-free-block size for this subregion (same
// bookkeeping as RemoveFromFreeBlocks).
208 size_t block_size = block->end - block->begin,
209 subregion_index = subregion->subregion_index;
210 if (block_size == largest_free_block_[subregion_index]) {
211 if (subregion->free_blocks.empty())
212 largest_free_block_[subregion_index] = 0;
214 largest_free_block_[subregion_index] =
215 subregion->free_blocks.rbegin()->first;
// Only split when the surplus exceeds min(size/16, 512KB); tiny leftovers
// are not worth a free-list entry.
222 size_t allowed_extra_size = std::min<size_t>(size >> 4, 524288);
223 if (block_size > size + allowed_extra_size) {
226 block = SplitBlock(block, size);
// Block last touched by a different thread more recently than the last
// device-wide sync: synchronize before handing its memory over.
229 if (std::this_thread::get_id() != block->thread_id &&
230 block->t > synchronize_gpu_t_) {
233 synchronize_gpu_t_ = t_;
234 num_synchronizations_++;
236 block->allocated =
true;
238 allocated_block_map_[block->begin] = block;
// Track current and peak allocated byte counts.
239 allocated_memory_ += (block->end - block->begin);
240 if (allocated_memory_ > max_allocated_memory_)
241 max_allocated_memory_ = allocated_memory_;
248 void* CuMemoryAllocator::MallocInternal(
// Core cached-allocation routine: scan subregions in their stored priority
// order for one whose cached largest-free-block can satisfy the request,
// delegating to MallocFromSubregion on success; if none can, allocate a
// fresh region from the device.  NOTE(review): elided extraction -- the
// comparison inside the loop (original line 254) and the function's tail
// after AllocateNewRegion (lines 256-265, presumably a retry/return) are
// missing.
size_t size) {
// largest_free_block_ is indexed parallel to subregions_.
250 std::vector<size_t>::const_iterator iter = largest_free_block_.begin(),
251 end = largest_free_block_.end();
252 size_t subregion_index = 0;
253 for (; iter != end; ++iter, ++subregion_index) {
255 return MallocFromSubregion(subregions_[subregion_index], size);
// No subregion could satisfy the request: grab a new device region.
260 AllocateNewRegion(size);
268 static inline size_t IntegerLog2(
// Presumably computes floor(log2(i)); the entire body (original lines
// 269-283) is elided from this extraction -- confirm against the full
// source before relying on its exact semantics.
size_t i) {
285 std::string GetFreeGpuMemory(int64* free, int64* total) {
// Query free/total device memory and return a human-readable summary
// string ("free:...M, used:...M, total:...M, free/total:...").  When
// 'free'/'total' are non-NULL, the raw byte counts are also written
// through them.  NOTE(review): elided extraction -- the preprocessor
// conditionals (original lines 286, 289-294, 296, 299-302, 307-308,
// 310-316, 319, 325-326) are missing: one branch calls cuMemGetInfo_v2
// directly, the other loads it from libcuda.so via dlopen/dlsym.
287 size_t mem_free, mem_total;
288 cuMemGetInfo_v2(&mem_free, &mem_total);
291 size_t mem_free, mem_total;
// Defaults of 1/1 keep the free/total ratio below well-defined if the
// driver query fails.
295 mem_free = 1; mem_total = 1;
// Open libcuda dynamically so this file need not link the CUDA driver
// library directly.
297 void* libcuda = dlopen(
"libcuda.so", RTLD_LAZY);
298 if (NULL == libcuda) {
303 typedef CUresult (*cu_fun_ptr)(
size_t*,
size_t*);
304 cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,
"cuMemGetInfo_v2");
305 if (NULL == dl_cuMemGetInfo) {
306 KALDI_WARN <<
"cannot load cuMemGetInfo from libcuda.so";
309 dl_cuMemGetInfo(&mem_free, &mem_total);
// Report raw byte counts to the caller if requested.
317 if (NULL != free) *free = mem_free;
318 if (NULL != total) *total = mem_total;
// Format the summary in megabytes plus a free/total ratio.
320 std::ostringstream os;
321 os <<
"free:" << mem_free/(1024*1024) <<
"M, " 322 <<
"used:" << (mem_total-mem_free)/(1024*1024) <<
"M, " 323 <<
"total:" << mem_total/(1024*1024) <<
"M, " 324 <<
"free/total:" << mem_free/(
float)mem_total;
328 void CuMemoryAllocator::PrintMemoryUsage()
// Log a summary of allocator state.  With caching disabled only the timing
// counters and device memory info are reported; with caching it walks
// every memory region's doubly-linked block list, tallies allocated/free
// block counts and sizes, sanity-checks list invariants via KALDI_ASSERT,
// then logs the aggregate statistics.
// NOTE(review): elided extraction -- else branches, some assertions, the
// free-side tallies and several braces (original lines 336-338, 342, 345,
// 348, 353-354, 357-361, 363, 365-367, 381-383) are missing.
const {
329 if (!opts_.cache_memory) {
330 KALDI_LOG <<
"Not caching allocations; time taken in " 331 <<
"malloc/free is " << malloc_time_taken_
332 <<
"/" << (tot_time_taken_ - malloc_time_taken_)
333 <<
", num operations is " << t_
334 <<
"; device memory info: " 335 << GetFreeGpuMemory(NULL, NULL);
// Aggregate statistics over every block in every region.
339 size_t num_blocks_allocated = 0, num_blocks_free = 0,
340 memory_allocated = 0, memory_held = 0,
341 largest_free_block = 0, largest_allocated_block = 0;
343 for (
size_t i = 0; i < memory_regions_.size(); i++) {
344 MemoryBlock *m = memory_regions_[
i].block_begin;
// Walk this region's block list from its first block.
346 for (; m != NULL; m = m->next) {
347 size_t size = m->end - m->begin;
349 num_blocks_allocated++;
350 memory_allocated += size;
351 if (size > largest_allocated_block)
352 largest_allocated_block = size;
355 if (size > largest_free_block)
356 largest_free_block = size;
// Invariants: adjacent blocks are mutually linked and contiguous, and the
// region's last block ends exactly at the region's end.
362 KALDI_ASSERT(m->next->prev == m && m->end == m->next->begin);
364 KALDI_ASSERT(m->end == memory_regions_[m->subregion->memory_region].end);
368 KALDI_LOG <<
"Memory usage: " << memory_allocated <<
"/" 369 << memory_held <<
" bytes currently allocated/total-held; " 370 << num_blocks_allocated <<
"/" << num_blocks_free
371 <<
" blocks currently allocated/free; largest " 372 <<
"free/allocated block sizes are " 373 << largest_allocated_block <<
"/" << largest_free_block
374 <<
"; time taken total/cudaMalloc is " 375 << tot_time_taken_ <<
"/" << malloc_time_taken_
376 <<
", synchronized the GPU " << num_synchronizations_
377 <<
" times out of " << (t_/2) <<
" frees; " 378 <<
"device memory info: " << GetFreeGpuMemory(NULL, NULL)
379 <<
"maximum allocated: " << max_allocated_memory_
380 <<
"current allocated: " << allocated_memory_;
385 CuMemoryAllocator::CuMemoryAllocator():
// Default-construct the allocator: default options, all timing/operation
// counters zeroed.  NOTE(review): one initializer (original line 387,
// presumably the operation counter t_) and the constructor body (lines
// 394-397) are elided from this extraction.
386 opts_(CuAllocatorOptions()),
388 synchronize_gpu_t_(0),
389 num_synchronizations_(0),
390 tot_time_taken_(0.0),
391 malloc_time_taken_(0.0),
392 max_allocated_memory_(0),
393 allocated_memory_(0) {
399 void* CuMemoryAllocator::MallocPitch(
// Pitched (2-D) allocation.  With caching disabled this defers to
// cudaMallocPitch; with caching it rounds row_bytes up to a multiple of
// 256 (which then serves as the pitch) and allocates row_bytes * num_rows
// bytes from the cached pool.  NOTE(review): elided extraction -- the
// remaining parameters (presumably num_rows and pitch), 'Timer tim', the
// '*pitch' assignment and the return statements (original lines 400-402,
// 404, 409-412, 414, 417-418) are missing.
size_t row_bytes,
403 if (!opts_.cache_memory) {
405 CU_SAFE_CALL(cudaMallocPitch(&ans, pitch, row_bytes, num_rows));
406 double elapsed = tim.Elapsed();
// Same accounting as Malloc(): total time vs. time inside the CUDA call.
407 tot_time_taken_ += elapsed;
408 malloc_time_taken_ += elapsed;
// Round the row width up to a 256-byte multiple for alignment.
413 row_bytes = (row_bytes + 255) & ~((
size_t)255);
415 void *ans = MallocInternal(row_bytes * num_rows);
416 tot_time_taken_ += tim.Elapsed();
420 void CuMemoryAllocator::Free(
// Return 'ptr' to the allocator.  With caching disabled this is a plain
// cudaFree.  Otherwise the pointer is looked up in allocated_block_map_
// (KALDI_ERR if absent), the block is marked free and stamped with the
// freeing thread's id, then merged with a free previous and/or next
// neighbor before going back on the free list.  When a merge joins memory
// freed by different threads the owner is reset to the null
// std::thread::id() -- presumably so no thread can reuse it without a
// synchronization; confirm against the full source.
// NOTE(review): elided extraction -- 'Timer tim', the t_/block->t
// updates, several braces, else branches and early returns (original
// lines 421, 425-428, 433-434, 438, 441-443, 449-454, 456-457, 459,
// 461-466, 469-471, 475-479, 481-483, 485, 487-488, 491) are missing.
void *ptr) {
422 if (!opts_.cache_memory) {
423 CU_SAFE_CALL(cudaFree(ptr));
424 tot_time_taken_ += tim.Elapsed();
429 unordered_map<void*, MemoryBlock*>::iterator iter =
430 allocated_block_map_.find(ptr);
431 if (iter == allocated_block_map_.end()) {
432 KALDI_ERR <<
"Attempt to free CUDA memory pointer that was not allocated: " 435 MemoryBlock *block = iter->second;
436 allocated_memory_ -= (block->end - block->begin);
437 allocated_block_map_.erase(iter);
// Record which thread freed the block; used by the allocation path to
// decide whether a GPU synchronization is needed before reuse.
439 block->thread_id = std::this_thread::get_id();
440 block->allocated =
false;
// Merge with a free previous neighbor: extend it over this block and
// unlink 'block' from the list.
444 MemoryBlock *prev_block = block->prev;
445 if (prev_block != NULL && !prev_block->allocated) {
446 RemoveFromFreeBlocks(prev_block);
447 prev_block->end = block->end;
448 if (prev_block->thread_id != block->thread_id) {
455 prev_block->thread_id = std::thread::id();
458 prev_block->next = block->next;
460 block->next->prev = prev_block;
// Merge with a free next neighbor, symmetrically.
467 MemoryBlock *next_block = block->next;
468 if (next_block != NULL && !next_block->allocated) {
472 RemoveFromFreeBlocks(next_block);
473 block->end = next_block->end;
474 if (next_block->thread_id != block->thread_id && next_block->t > 0) {
480 block->thread_id = std::thread::id();
484 block->next = next_block->next;
486 block->next->prev = block;
489 AddToFreeBlocks(block);
490 tot_time_taken_ += tim.Elapsed();
493 void CuMemoryAllocator::AllocateNewRegion(
// cudaMalloc a large new region: a proportion (opts_.memory_proportion)
// of currently-free device memory, at least 'size', rounded up to a whole
// number of 1MB blocks.  The region is carved into contiguous subregions
// (count scaled by the region's share of total memory), covered by a
// single initial free MemoryBlock, and registered with the allocator.
// NOTE(review): elided extraction -- the 'region_size = size' clamp body,
// Timer, the cudaError_t/memory_region declarations, parts of the error
// messages and many braces (original lines 496, 499-501, 503, 505,
// 508-512, 515, 517, 524, 526, 531-536, 539, 543, 546, 548, 557-558, 561,
// 563, 565, 567-569, 575-577, 580, 582, 584-589) are missing.
size_t size) {
494 int64 free_memory, total_memory;
495 std::string mem_info = GetFreeGpuMemory(&free_memory, &total_memory);
497 size_t region_size =
static_cast<size_t>(free_memory * opts_.memory_proportion);
// Never allocate less than the request that triggered this call.
498 if (region_size < size)
// Round up to a whole number of 1MB (1048576-byte) blocks.
502 region_size = (region_size + 1048575) & ~((
size_t)1048575);
504 if (!memory_regions_.empty()) {
506 KALDI_LOG <<
"About to allocate new memory region of " << region_size
507 <<
" bytes; current memory info is: " << mem_info;
513 e = cudaMalloc(&memory_region, region_size);
514 malloc_time_taken_ += tim.Elapsed();
516 if (e != cudaSuccess) {
// Distinct failure diagnostics: on a shared (non-compute-exclusive) GPU
// suggest exclusive mode; otherwise suggest a smaller minibatch.
518 if (!CuDevice::Instantiate().IsComputeExclusive()) {
519 KALDI_ERR <<
"Failed to allocate a memory region of " << region_size
520 <<
" bytes. Possibly this is due to sharing the GPU. Try " 521 <<
"switching the GPUs to exclusive mode (nvidia-smi -c 3) and using " 522 <<
"the option --use-gpu=wait to scripts like " 523 <<
"steps/nnet3/chain/train.py. Memory info: " 525 <<
" CUDA error: '" << cudaGetErrorString(e) <<
"'";
527 KALDI_ERR <<
"Failed to allocate a memory region of " << region_size
528 <<
" bytes. Possibly smaller minibatch size would help. " 529 <<
"Memory info: " << mem_info
530 <<
" CUDA error: '" << cudaGetErrorString(e) <<
"'";
// Subregion count proportional to this region's share of total device
// memory, at least 1.
537 size_t this_num_subregions = 1 +
538 (region_size * opts_.num_subregions) / total_memory;
540 size_t memory_region_index = memory_regions_.size();
541 memory_regions_.resize(memory_region_index + 1);
542 MemoryRegion &this_region = memory_regions_.back();
544 this_region.begin =
static_cast<char*
>(memory_region);
545 this_region.end = this_region.begin + region_size;
547 size_t subregion_size = region_size / this_num_subregions;
// Carve the region into contiguous subregions linked via 'next'; the last
// one absorbs the division remainder so coverage is exact.
549 std::vector<SubRegion*> new_subregions;
550 char* subregion_begin =
static_cast<char*
>(memory_region);
551 for (
size_t i = 0; i < this_num_subregions; i++) {
552 SubRegion *subregion =
new SubRegion();
553 subregion->memory_region = memory_region_index;
554 subregion->begin = subregion_begin;
555 if (i + 1 == this_num_subregions) {
556 subregion->end = this_region.end;
559 subregion->end = subregion_begin + subregion_size;
560 subregion_begin = subregion->end;
562 subregion->next = NULL;
564 new_subregions.back()->next = subregion;
566 new_subregions.push_back(subregion);
// One free block initially spans the entire region.
570 MemoryBlock *block =
new MemoryBlock();
571 block->begin = this_region.begin;
572 block->end = this_region.end;
573 block->subregion = new_subregions.front();
574 block->allocated =
false;
578 for (
size_t i = 0; i < this_num_subregions; i++)
579 subregions_.push_back(new_subregions[i]);
581 this_region.block_begin = block;
583 AddToFreeBlocks(block);
591 void CuMemoryAllocator::SortSubregions() {
// Re-sort subregions_ by their byte offset from the start of their owning
// memory region, reassign each subregion_index to match the new order,
// and rebuild the parallel largest_free_block_ cache.  NOTE(review):
// elided extraction -- original lines 593, 600, 607 (the 'else' keyword)
// and 609-610 are missing.
592 largest_free_block_.resize(subregions_.size());
594 std::vector<std::pair<size_t, SubRegion*> > pairs;
595 for (
size_t i = 0; i < subregions_.size(); i++) {
596 SubRegion *subregion = subregions_[
i];
597 MemoryRegion &memory_region = memory_regions_[subregion->memory_region];
// Sort key: offset of the subregion within its region.
598 size_t distance = subregion->begin - memory_region.begin;
599 pairs.push_back(std::pair<size_t, SubRegion*>(distance, subregion));
601 std::sort(pairs.begin(), pairs.end());
602 for (
size_t i = 0; i < subregions_.size(); i++) {
603 subregions_[
i] = pairs[
i].second;
604 subregions_[
i]->subregion_index =
i;
// Cache the largest free-block size (rbegin of the size-ordered set), or
// 0 when the subregion has no free blocks.
605 if (subregions_[i]->free_blocks.empty())
606 largest_free_block_[i] = 0;
608 largest_free_block_[
i] = subregions_[
i]->free_blocks.rbegin()->first;
612 CuMemoryAllocator::~CuMemoryAllocator() {
// Release every device region back to CUDA and iterate the subregions'
// free-block sets (presumably to delete the MemoryBlock and SubRegion
// objects -- the delete statements, original lines 613-614, 616, 618 and
// 623-627, are elided from this extraction; confirm against the full
// source).
615 for (
size_t i = 0; i < memory_regions_.size(); i++) {
// Return the whole region to the CUDA runtime (destructor context; the
// return status is not visible here).
617 cudaFree(memory_regions_[i].begin);
619 for (
size_t i = 0; i < subregions_.size(); i++) {
620 SubRegion *subregion = subregions_[
i];
621 for (
auto iter = subregion->free_blocks.begin();
622 iter != subregion->free_blocks.end(); ++iter)
// Global, file-scope allocator instance.
629 CuMemoryAllocator g_cuda_allocator;
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
CuAllocatorOptions g_allocator_options
void SynchronizeGpu()
The function SynchronizeGpu(), which for convenience is defined whether or not we have compiled for C...
#define KALDI_PARANOID_ASSERT(cond)
#define KALDI_ASSERT(cond)