cu-allocator.cc
1 // cudamatrix/cu-allocator.cc
2 
3 // Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 
22 #include "cudamatrix/cu-allocator.h"
23 
24 #if HAVE_CUDA == 1
25 
26 #include <cublas_v2.h>
27 #include <cuda.h>
28 #include <cuda_runtime_api.h>
29 
30 #include <string>
31 #include <vector>
32 #include <algorithm>
33 #ifndef _MSC_VER
34 #include <dlfcn.h>
35 #endif
36 
37 #include "cudamatrix/cu-common.h"
38 #include "cudamatrix/cu-device.h"
39 #include "cudamatrix/cu-matrix.h"
40 #include "base/kaldi-error.h"
41 #include "base/kaldi-utils.h"
42 #include "util/common-utils.h"
43 
44 namespace kaldi {
45 
46 
47 void* CuMemoryAllocator::Malloc(size_t size) {
48  Timer tim;
49  if (!opts_.cache_memory) {
50  void *ans;
51  CU_SAFE_CALL(cudaMalloc(&ans, size));
52  double elapsed = tim.Elapsed();
53  tot_time_taken_ += elapsed;
54  malloc_time_taken_ += elapsed;
55  t_++;
56  return ans;
57  }
58 
59  // We could perhaps change this to KALDI_PARANOID_ASSERT to save time.
60  KALDI_ASSERT(size != 0);
61 
62  // Round up 'size' to a multiple of 256; this ensures the right kind of
63  // memory alignment.
64  size = (size + 255) & ~((size_t)255);
65  void *ans = MallocInternal(size);
66  tot_time_taken_ += tim.Elapsed();
67  return ans;
68 }
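The size rounding in Malloc() can be checked in isolation. A minimal standalone sketch (not part of cu-allocator.cc) of how (size + 255) & ~((size_t)255) behaves:

    #include <cassert>
    #include <cstddef>

    // Round up to the next multiple of 256; this works because 256 is a power
    // of two, so ~255 clears the low 8 bits after adding 255.
    static size_t RoundUp256(size_t size) {
      return (size + 255) & ~((size_t)255);
    }

    int main() {
      assert(RoundUp256(1) == 256);    // tiny requests become one 256-byte unit
      assert(RoundUp256(256) == 256);  // exact multiples are unchanged
      assert(RoundUp256(257) == 512);  // anything above moves to the next multiple
      return 0;
    }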
69 
70 
71 CuMemoryAllocator::MemoryBlock *CuMemoryAllocator::SplitBlock(
72  MemoryBlock *block, size_t size) {
73  SubRegion *subregion = block->subregion;
74  // new_block will become the right-most part of 'block', and 'block' will
75  // be the left-most part.
76  MemoryBlock *new_block = new MemoryBlock;
77  bool return_new_block;
78  char *new_begin;
79 
80 // We now decide whether to make the right part of 'block' be of size ('size')
81 // and return it (the 'if' branch of the if-else block below), or the left
82 // part (the 'else' branch). We decide this based on heuristics. Basically,
83  // we want to allocate the sub-block that's either next to the edge of the
84  // MemoryRegion, or next to something that was allocated long ago (and which,
85  // we assume won't be deallocated for a relatively long time). That is: we
86  // want to leave the un-allocated memory next to a memory block that was
87  // recently allocated (and thus is likely to be freed sooner), so that when
88  // that block is freed we can merge it with the still-unallocated piece into a
89  // larger block; this will reduce fragmentation. But if this block spans
90  // multiple sub-regions we don't want to do that, as that would be against our
91  // heuristic of, where possible, allocating memory from lower-numbered
92  // sub-regions.
93  //
94  // Bear in mind that we can assume block->next and block->prev, if they are
95  // non-NULL, are both currently allocated, since 'block' is un-allocated and
96  // we would have merged any adjacent un-allocated sub-regions.
97  if (block->next != NULL && block->prev != NULL &&
98  block->prev->t > block->next->t &&
99  block->next->subregion == subregion) {
100  // We'll allocate the right part of the block, since the left side is next
101  // to a relatively recently-allocated block.
102  return_new_block = true;
103  new_begin = block->end - size;
104  } else {
105  // We'll allocate the left part of the block.
106  return_new_block = false;
107  new_begin = block->begin + size;
108  }
109 
110  // The following code makes sure the SubRegion for 'new_block' is correct,
111  // i.e. its 'begin' is >= the 'begin' of the subregion and < the 'end' of the
112  // subregion. If the following loop segfaults, it indicates a bug somewhere
113  // else.
114  while (new_begin >= subregion->end)
115  subregion = subregion->next;
116  MemoryBlock *next_block = block->next;
117  new_block->begin = new_begin;
118  new_block->end = block->end;
119  new_block->subregion = subregion;
120  new_block->allocated = false;
121  new_block->thread_id = block->thread_id;
122  new_block->t = block->t;
123  new_block->next = next_block;
124  new_block->prev = block;
125  if (next_block)
126  next_block->prev = new_block;
127  block->next = new_block;
128  block->end = new_begin;
129 
130  // Add the split-up piece that we won't be allocating, to the
131  // 'free_blocks' member of its subregion.
132  if (return_new_block) {
133  AddToFreeBlocks(block);
134  return new_block;
135  } else {
136  AddToFreeBlocks(new_block);
137  return block;
138  }
139 }
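The left-or-right decision above can be illustrated with a simplified standalone sketch; 'Neighbor' and 'AllocateRightPart' are hypothetical names, and the real code additionally requires that the right-hand neighbour lie in the same sub-region:

    #include <cstddef>
    #include <iostream>

    // Hypothetical stand-in for a neighbouring MemoryBlock: only the
    // allocation time 't' matters for this decision.
    struct Neighbor { size_t t; };

    // Return true to hand out the right-hand part of a free block: this is the
    // case when the left neighbour was touched more recently than the right
    // one, so the free remainder stays next to the recently-used block.
    static bool AllocateRightPart(const Neighbor *prev, const Neighbor *next) {
      return prev != NULL && next != NULL && prev->t > next->t;
    }

    int main() {
      Neighbor old_block = {10}, recent_block = {500};
      std::cout << AllocateRightPart(&recent_block, &old_block) << "\n";  // 1
      std::cout << AllocateRightPart(&old_block, &recent_block) << "\n";  // 0
      return 0;
    }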
140 
141 
142 void CuMemoryAllocator::RemoveFromFreeBlocks(MemoryBlock *block) {
143  SubRegion *subregion = block->subregion;
144  size_t block_size = block->end - block->begin;
145  std::pair<size_t, MemoryBlock*> p(block_size, block);
146  size_t num_removed = subregion->free_blocks.erase(p);
147  KALDI_ASSERT(num_removed != 0);
148  // Update largest_free_block_, if needed.
149  size_t subregion_index = subregion->subregion_index;
150  if (block_size == largest_free_block_[subregion_index]) {
151  if (subregion->free_blocks.empty())
152  largest_free_block_[subregion_index] = 0;
153  else
154  largest_free_block_[subregion_index] =
155  subregion->free_blocks.rbegin()->first;
156  }
157 }
158 
159 void CuMemoryAllocator::AddToFreeBlocks(MemoryBlock *block) {
160  SubRegion *subregion = block->subregion;
161  KALDI_PARANOID_ASSERT(block->begin >= subregion->begin &&
162  block->begin < subregion->end);
163  size_t block_size = block->end - block->begin,
164  subregion_index = subregion->subregion_index;
165  // Update largest_free_block_, if needed.
166  if (block_size > largest_free_block_[subregion_index]) {
167  largest_free_block_[subregion_index] = block_size;
168  }
169  subregion->free_blocks.insert(std::pair<size_t, MemoryBlock*>(block_size, block));
170 }
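The free-block bookkeeping relies on free_blocks being ordered by (size, pointer): a best-fit lookup is a single lower_bound, and the largest block is the last element. A small standalone sketch of that idea, using int in place of MemoryBlock*:

    #include <cassert>
    #include <set>
    #include <utility>

    int main() {
      // Pairs of (block size, block id), kept sorted by size first.
      std::set<std::pair<size_t, int> > free_blocks;
      free_blocks.insert(std::make_pair((size_t)512, 1));
      free_blocks.insert(std::make_pair((size_t)1024, 2));
      free_blocks.insert(std::make_pair((size_t)4096, 3));

      // Best fit for a 600-byte request: first pair whose size is >= 600.
      std::set<std::pair<size_t, int> >::iterator it =
          free_blocks.lower_bound(std::make_pair((size_t)600, 0));
      assert(it != free_blocks.end() && it->first == 1024 && it->second == 2);

      // The largest free block is the last element, which is how
      // largest_free_block_ is refreshed in the code above.
      assert(free_blocks.rbegin()->first == 4096);
      return 0;
    }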
171 
172 
173 void* CuMemoryAllocator::MallocFromSubregion(SubRegion *subregion,
174  size_t size) {
175  // NULL is implementation defined and doesn't have to be zero so we can't
176  // guarantee that NULL will be <= a valid pointer-- so we cast to a pointer
177  // from zero instead of using NULL.
178  std::pair<size_t, MemoryBlock*> p(size, (MemoryBlock*)0);
179 
180  std::set<std::pair<size_t, MemoryBlock*> >::iterator iter =
181  subregion->free_blocks.lower_bound(p);
182  // so now 'iter' is the first member of free_blocks whose size_t value is >=
183 // size. If 'iter' was equal to the end() of that set, it would be a
184  // bug because the calling code checked that the largest free block in this
185  // region was sufficiently large. We don't check this; if it segfaults, we'll
186  // debug.
187 
188  // search for a block that we don't have to synchronize on
189  int max_iters = 20;
190  auto search_iter = iter;
191  for (int32 i = 0;
192  search_iter != subregion->free_blocks.end() && i < max_iters;
193  ++i, ++search_iter) {
194  if (search_iter->second->thread_id == std::this_thread::get_id() ||
195  search_iter->second->t <= synchronize_gpu_t_) {
196  iter = search_iter;
197  break;
198  }
199  }
200 
201  MemoryBlock *block = iter->second;
202  // Erase 'block' from its subregion's free blocks list... the next lines are
203  // similar to RemoveFromFreeBlocks(), but we code it directly as we have the
204  // iterator here, and it would be wasteful to do another lookup.
205  subregion->free_blocks.erase(iter);
206  // Update largest_free_block_, if needed. The following few lines of code also appear
207  // in RemoveFromFreeBlocks().
208  size_t block_size = block->end - block->begin,
209  subregion_index = subregion->subregion_index;
210  if (block_size == largest_free_block_[subregion_index]) {
211  if (subregion->free_blocks.empty())
212  largest_free_block_[subregion_index] = 0;
213  else
214  largest_free_block_[subregion_index] =
215  subregion->free_blocks.rbegin()->first;
216  }
217 
218  KALDI_PARANOID_ASSERT(block_size >= size && block->allocated == false);
219 
220  // the most memory we allow to be 'wasted' by failing to split a block, is the
221  // smaller of: 1/16 of the size we're allocating, or half a megabyte.
222  size_t allowed_extra_size = std::min<size_t>(size >> 4, 524288);
223  if (block_size > size + allowed_extra_size) {
224 // If the block we found is substantially larger than what was requested,
225  // split it so we don't waste memory.
226  block = SplitBlock(block, size);
227  }
228 
229  if (std::this_thread::get_id() != block->thread_id &&
230  block->t > synchronize_gpu_t_) {
231  // see NOTE ON SYNCHRONIZATION in the header.
232  SynchronizeGpu();
233  synchronize_gpu_t_ = t_;
234  num_synchronizations_++;
235  }
236  block->allocated = true;
237  block->t = t_;
238  allocated_block_map_[block->begin] = block;
239  allocated_memory_ += (block->end - block->begin);
240  if (allocated_memory_ > max_allocated_memory_)
241  max_allocated_memory_ = allocated_memory_;
242  return block->begin;
243 }
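The split threshold used above (the smaller of 1/16 of the request and half a megabyte) is easy to sanity-check numerically; a standalone sketch, not part of the allocator:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    // Mirrors the 'allowed_extra_size' expression in MallocFromSubregion().
    static size_t AllowedExtraSize(size_t size) {
      return std::min<size_t>(size >> 4, 524288);
    }

    int main() {
      // For a 1 MB request, 1/16 of the size (65536 bytes) is the binding limit.
      assert(AllowedExtraSize(1 << 20) == 65536);
      // For a 64 MB request, the half-megabyte cap (524288 bytes) kicks in.
      assert(AllowedExtraSize(64 << 20) == 524288);
      // A block is split only if its size exceeds size + AllowedExtraSize(size).
      return 0;
    }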
244 
245 // By the time MallocInternal is called, we will have ensured that 'size' is
246 // a nonzero multiple of 256 (for memory alignment reasons).
247 // inline
248 void* CuMemoryAllocator::MallocInternal(size_t size) {
249 start:
250  std::vector<size_t>::const_iterator iter = largest_free_block_.begin(),
251  end = largest_free_block_.end();
252  size_t subregion_index = 0;
253  for (; iter != end; ++iter, ++subregion_index) {
254  if (*iter > size) {
255  return MallocFromSubregion(subregions_[subregion_index], size);
256  }
257  }
258  // We dropped off the loop without finding a subregion with enough memory
259  // to satisfy the request -> allocate a new region.
260  AllocateNewRegion(size);
261  // An infinite loop shouldn't be possible because after calling
262  // AllocateNewRegion(size), there should always be a SubRegion
263  // with that size available.
264  goto start;
265 }
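The scan-then-grow control flow of MallocInternal() can be expressed without the goto; the sketch below is a standalone illustration with hypothetical helpers (TryFindSubregion, FindSubregionOrGrow), not allocator members:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // First-fit over the per-subregion maxima, as in the loop above.
    static int TryFindSubregion(const std::vector<size_t> &largest_free,
                                size_t size) {
      for (size_t i = 0; i < largest_free.size(); i++)
        if (largest_free[i] > size) return (int)i;
      return -1;  // nothing large enough
    }

    // Equivalent to the 'goto start' retry: after growing the pool there must
    // be a subregion whose largest free block can hold 'size'.
    static int FindSubregionOrGrow(std::vector<size_t> &largest_free,
                                   size_t size) {
      int index = TryFindSubregion(largest_free, size);
      if (index < 0) {
        largest_free.push_back(size * 2);  // stand-in for AllocateNewRegion(size)
        index = TryFindSubregion(largest_free, size);
      }
      return index;
    }

    int main() {
      std::vector<size_t> largest_free(1, 1024);
      assert(FindSubregionOrGrow(largest_free, 4096) == 1);  // had to grow
      assert(FindSubregionOrGrow(largest_free, 512) == 0);   // first fit
      return 0;
    }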
266 
267 // Returns max(0, floor(log_2(i))). Not tested independently.
268 static inline size_t IntegerLog2(size_t i) {
269  size_t ans = 0;
270  while (i > 256) {
271  i >>= 8;
272  ans += 8;
273  }
274  while (i > 16) {
275  i >>= 4;
276  ans += 4;
277  }
278  while (i > 1) {
279  i >>= 1;
280  ans++;
281  }
282  return ans;
283 }
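A few spot checks of IntegerLog2(); the function is copied into the sketch so it compiles on its own:

    #include <cassert>
    #include <cstddef>

    // Copy of IntegerLog2() above, for standalone checking.
    static size_t IntegerLog2(size_t i) {
      size_t ans = 0;
      while (i > 256) { i >>= 8; ans += 8; }
      while (i > 16) { i >>= 4; ans += 4; }
      while (i > 1) { i >>= 1; ans++; }
      return ans;
    }

    int main() {
      assert(IntegerLog2(1) == 0);         // floor(log2(1)) = 0
      assert(IntegerLog2(255) == 7);       // 2^7 = 128 <= 255 < 256
      assert(IntegerLog2(256) == 8);
      assert(IntegerLog2(1000000) == 19);  // 2^19 = 524288 <= 1000000 < 2^20
      return 0;
    }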
284 
285 std::string GetFreeGpuMemory(int64* free, int64* total) {
286 #ifdef _MSC_VER
287  size_t mem_free, mem_total;
288  cuMemGetInfo_v2(&mem_free, &mem_total);
289 #else
291  size_t mem_free, mem_total;
292  {
293  // we will load cuMemGetInfo_v2 dynamically from libcuda.so
294  // pre-fill ``safe'' values that will not cause problems
295  mem_free = 1; mem_total = 1;
296  // open libcuda.so
297  void* libcuda = dlopen("libcuda.so", RTLD_LAZY);
298  if (NULL == libcuda) {
299  KALDI_WARN << "cannot open libcuda.so";
300  } else {
301  // define the function signature type
302  // and get the symbol
303  typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
304  cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
305  if (NULL == dl_cuMemGetInfo) {
306  KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so";
307  } else {
308  // call the function
309  dl_cuMemGetInfo(&mem_free, &mem_total);
310  }
311  // close the library
312  dlclose(libcuda);
313  }
314  }
315 #endif
316  // copy the output values outside
317  if (NULL != free) *free = mem_free;
318  if (NULL != total) *total = mem_total;
319  // prepare the text output
320  std::ostringstream os;
321  os << "free:" << mem_free/(1024*1024) << "M, "
322  << "used:" << (mem_total-mem_free)/(1024*1024) << "M, "
323  << "total:" << mem_total/(1024*1024) << "M, "
324  << "free/total:" << mem_free/(float)mem_total;
325  return os.str();
326 }
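A short usage sketch of GetFreeGpuMemory(); it assumes the declaration is visible (here via cudamatrix/cu-allocator.h) and that CUDA support is compiled in:

    #include <iostream>
    #include <string>
    #include "cudamatrix/cu-allocator.h"  // assumed to declare GetFreeGpuMemory()

    int main() {
      kaldi::int64 free_bytes = 0, total_bytes = 0;
      // Returns a human-readable summary and, optionally, the raw byte counts.
      std::string info = kaldi::GetFreeGpuMemory(&free_bytes, &total_bytes);
      std::cout << info << "\n";
      if (total_bytes > 0)
        std::cout << "fraction free: "
                  << static_cast<double>(free_bytes) / total_bytes << "\n";
      return 0;
    }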
327 
328 void CuMemoryAllocator::PrintMemoryUsage() const {
329  if (!opts_.cache_memory) {
330  KALDI_LOG << "Not caching allocations; time taken in "
331  << "malloc/free is " << malloc_time_taken_
332  << "/" << (tot_time_taken_ - malloc_time_taken_)
333  << ", num operations is " << t_
334  << "; device memory info: "
335  << GetFreeGpuMemory(NULL, NULL);
336  return;
337  }
338 
339  size_t num_blocks_allocated = 0, num_blocks_free = 0,
340  memory_allocated = 0, memory_held = 0,
341  largest_free_block = 0, largest_allocated_block = 0;
342 
343  for (size_t i = 0; i < memory_regions_.size(); i++) {
344  MemoryBlock *m = memory_regions_[i].block_begin;
345  KALDI_ASSERT(m->begin == memory_regions_[i].begin);
346  for (; m != NULL; m = m->next) {
347  size_t size = m->end - m->begin;
348  if (m->allocated) {
349  num_blocks_allocated++;
350  memory_allocated += size;
351  if (size > largest_allocated_block)
352  largest_allocated_block = size;
353  } else {
354  num_blocks_free++;
355  if (size > largest_free_block)
356  largest_free_block = size;
357  }
358  memory_held += size;
359 // The following are just some sanity checks; this code is rarely called, so
360  // it's a reasonable place to put them.
361  if (m->next) {
362  KALDI_ASSERT(m->next->prev == m && m->end == m->next->begin);
363  } else {
364  KALDI_ASSERT(m->end == memory_regions_[m->subregion->memory_region].end);
365  }
366  }
367  }
368  KALDI_LOG << "Memory usage: " << memory_allocated << "/"
369  << memory_held << " bytes currently allocated/total-held; "
370  << num_blocks_allocated << "/" << num_blocks_free
371  << " blocks currently allocated/free; largest "
372  << "free/allocated block sizes are "
373  << largest_allocated_block << "/" << largest_free_block
374  << largest_free_block << "/" << largest_allocated_block
375  << tot_time_taken_ << "/" << malloc_time_taken_
376  << ", synchronized the GPU " << num_synchronizations_
377  << " times out of " << (t_/2) << " frees; "
378  << "device memory info: " << GetFreeGpuMemory(NULL, NULL)
379  << "; maximum allocated: " << max_allocated_memory_
380  << ", current allocated: " << allocated_memory_;
381 }
382 
383 // Note: we just initialize with the default options, but we can change it later
384 // (as long as it's before we first use the class) by calling SetOptions().
385 CuMemoryAllocator::CuMemoryAllocator():
386  opts_(CuAllocatorOptions()),
387  t_(0),
388  synchronize_gpu_t_(0),
389  num_synchronizations_(0),
390  tot_time_taken_(0.0),
391  malloc_time_taken_(0.0),
392  max_allocated_memory_(0),
393  allocated_memory_(0) {
394  // Note: we don't allocate any memory regions at the start; we wait for the user
395  // to call Malloc() or MallocPitch(), and then allocate one when needed.
396 }
397 
398 
399 void* CuMemoryAllocator::MallocPitch(size_t row_bytes,
400  size_t num_rows,
401  size_t *pitch) {
402  Timer tim;
403  if (!opts_.cache_memory) {
404  void *ans;
405  CU_SAFE_CALL(cudaMallocPitch(&ans, pitch, row_bytes, num_rows));
406  double elapsed = tim.Elapsed();
407  tot_time_taken_ += elapsed;
408  malloc_time_taken_ += elapsed;
409  return ans;
410  }
411 
412  // Round up row_bytes to a multiple of 256.
413  row_bytes = (row_bytes + 255) & ~((size_t)255);
414  *pitch = row_bytes;
415  void *ans = MallocInternal(row_bytes * num_rows);
416  tot_time_taken_ += tim.Elapsed();
417  return ans;
418 }
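The geometry handling in MallocPitch() reduces to two integer operations before the underlying allocation; a standalone numeric sketch:

    #include <cassert>
    #include <cstddef>

    int main() {
      // For a 1000-byte row, the pitch is row_bytes rounded up to a multiple
      // of 256, and the total request is pitch * num_rows.
      size_t row_bytes = 1000, num_rows = 50;
      size_t pitch = (row_bytes + 255) & ~((size_t)255);
      assert(pitch == 1024);
      assert(pitch * num_rows == 51200);  // bytes handed to MallocInternal()
      return 0;
    }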
419 
420 void CuMemoryAllocator::Free(void *ptr) {
421  Timer tim;
422  if (!opts_.cache_memory) {
423  CU_SAFE_CALL(cudaFree(ptr));
424  tot_time_taken_ += tim.Elapsed();
425  t_++;
426  return;
427  }
428  t_++;
429  unordered_map<void*, MemoryBlock*>::iterator iter =
430  allocated_block_map_.find(ptr);
431  if (iter == allocated_block_map_.end()) {
432  KALDI_ERR << "Attempt to free CUDA memory pointer that was not allocated: "
433  << ptr;
434  }
435  MemoryBlock *block = iter->second;
436  allocated_memory_ -= (block->end - block->begin);
437  allocated_block_map_.erase(iter);
438  block->t = t_;
439  block->thread_id = std::this_thread::get_id();
440  block->allocated = false;
441 
442  // If this is not the first block of the memory region and the previous block
443  // is not allocated, merge this block into the previous block.
444  MemoryBlock *prev_block = block->prev;
445  if (prev_block != NULL && !prev_block->allocated) {
446  RemoveFromFreeBlocks(prev_block);
447  prev_block->end = block->end;
448  if (prev_block->thread_id != block->thread_id) {
449  // the two blocks we're merging were freed by different threads, so we
450  // give the 'nonexistent thread' as their thread, which means that
451  // whichever thread requests that block, we force synchronization. We can
452  // assume that prev_block was previously allocated (prev_block->t > 0)
453  // because we always start from the left when allocating blocks, and we
454  // know that this block was previously allocated.
455  prev_block->thread_id = std::thread::id();
456  }
457  prev_block->t = t_;
458  prev_block->next = block->next;
459  if (block->next)
460  block->next->prev = prev_block;
461  delete block;
462  block = prev_block;
463  }
464 
465  // If this is not the last block of the memory region and the next block is
466  // not allocated, merge the next block into this block.
467  MemoryBlock *next_block = block->next;
468  if (next_block != NULL && !next_block->allocated) {
469  // merge next_block into 'block', deleting 'next_block'. Note: at this
470  // point, if we merged with the previous block, the variable 'block' may now
471  // be pointing to that previous block, so it would be a 3-way merge.
472  RemoveFromFreeBlocks(next_block);
473  block->end = next_block->end;
474  if (next_block->thread_id != block->thread_id && next_block->t > 0) {
475  // the two blocks we're merging were freed by different threads, so we
476  // give the 'nonexistent thread' as their thread, which means that
477  // whichever thread requests that block, we force synchronization. there
478  // is no need to do this if next_block->t == 0, which would mean it had
479  // never been allocated.
480  block->thread_id = std::thread::id();
481  }
482  // We don't need to inspect the 't' value of next_block; it can't be
483  // larger than t_ because t_ is now.
484  block->next = next_block->next;
485  if (block->next)
486  block->next->prev = block;
487  delete next_block;
488  }
489  AddToFreeBlocks(block);
490  tot_time_taken_ += tim.Elapsed();
491 }
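The merge step in Free() (absorb an un-allocated neighbour and splice it out of the doubly-linked list) can be sketched on a simplified node type; Node and MergeWithNext are hypothetical stand-ins, and the real code also propagates the thread-id and 't' bookkeeping:

    #include <cassert>
    #include <cstddef>

    // Simplified block: an address range plus list links.
    struct Node {
      size_t begin, end;
      bool allocated;
      Node *prev, *next;
    };

    // Merge 'node->next' into 'node' if that neighbour exists and is free,
    // mirroring the second merge in CuMemoryAllocator::Free().
    static void MergeWithNext(Node *node) {
      Node *next = node->next;
      if (next == NULL || next->allocated) return;
      node->end = next->end;    // absorb the neighbour's range
      node->next = next->next;  // splice it out of the list
      if (node->next) node->next->prev = node;
      delete next;
    }

    int main() {
      Node *a = new Node{0, 100, false, NULL, NULL};
      Node *b = new Node{100, 300, false, a, NULL};
      a->next = b;
      MergeWithNext(a);
      assert(a->begin == 0 && a->end == 300 && a->next == NULL);
      delete a;
      return 0;
    }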
492 
493 void CuMemoryAllocator::AllocateNewRegion(size_t size) {
494  int64 free_memory, total_memory;
495  std::string mem_info = GetFreeGpuMemory(&free_memory, &total_memory);
496  opts_.Check();
497  size_t region_size = static_cast<size_t>(free_memory * opts_.memory_proportion);
498  if (region_size < size)
499  region_size = size;
500  // Round up region_size to an exact multiple of 1M (note: we expect it will
501  // be much larger than that). 1048575 is 2^20 - 1.
502  region_size = (region_size + 1048575) & ~((size_t)1048575);
503 
504  if (!memory_regions_.empty()) {
505  // If this is not the first region allocated, print some information.
506  KALDI_LOG << "About to allocate new memory region of " << region_size
507  << " bytes; current memory info is: " << mem_info;
508  }
509  void *memory_region;
510  cudaError_t e;
511  {
512  Timer tim;
513  e = cudaMalloc(&memory_region, region_size);
514  malloc_time_taken_ += tim.Elapsed();
515  }
516  if (e != cudaSuccess) {
517  PrintMemoryUsage();
518  if (!CuDevice::Instantiate().IsComputeExclusive()) {
519  KALDI_ERR << "Failed to allocate a memory region of " << region_size
520  << " bytes. Possibly this is due to sharing the GPU. Try "
521  << "switching the GPUs to exclusive mode (nvidia-smi -c 3) and using "
522  << "the option --use-gpu=wait to scripts like "
523  << "steps/nnet3/chain/train.py. Memory info: "
524  << mem_info
525  << " CUDA error: '" << cudaGetErrorString(e) << "'";
526  } else {
527  KALDI_ERR << "Failed to allocate a memory region of " << region_size
528  << " bytes. Possibly smaller minibatch size would help. "
529  << "Memory info: " << mem_info
530  << " CUDA error: '" << cudaGetErrorString(e) << "'";
531  }
532  }
533  // this_num_subregions would be approximately 'opts_.num_subregions' if
534  // 'region_size' was all the device's memory. (We add one to round up).
535  // We're aiming to get a number of sub-regions approximately equal to
536  // opts_.num_subregions by the time we allocate all the device's memory.
537  size_t this_num_subregions = 1 +
538  (region_size * opts_.num_subregions) / total_memory;
539 
540  size_t memory_region_index = memory_regions_.size();
541  memory_regions_.resize(memory_region_index + 1);
542  MemoryRegion &this_region = memory_regions_.back();
543 
544  this_region.begin = static_cast<char*>(memory_region);
545  this_region.end = this_region.begin + region_size;
546  // subregion_size will be hundreds of megabytes.
547  size_t subregion_size = region_size / this_num_subregions;
548 
549  std::vector<SubRegion*> new_subregions;
550  char* subregion_begin = static_cast<char*>(memory_region);
551  for (size_t i = 0; i < this_num_subregions; i++) {
552  SubRegion *subregion = new SubRegion();
553  subregion->memory_region = memory_region_index;
554  subregion->begin = subregion_begin;
555  if (i + 1 == this_num_subregions) {
556  subregion->end = this_region.end;
557  KALDI_ASSERT(subregion->end > subregion->begin);
558  } else {
559  subregion->end = subregion_begin + subregion_size;
560  subregion_begin = subregion->end;
561  }
562  subregion->next = NULL;
563  if (i > 0) {
564  new_subregions.back()->next = subregion;
565  }
566  new_subregions.push_back(subregion);
567  }
568  // Initially the memory is in a single block, owned by
569  // the first subregion. It will be split up gradually.
570  MemoryBlock *block = new MemoryBlock();
571  block->begin = this_region.begin;
572  block->end = this_region.end;
573  block->subregion = new_subregions.front();
574  block->allocated = false;
575  block->t = 0; // was never allocated.
576  block->next = NULL;
577  block->prev = NULL;
578  for (size_t i = 0; i < this_num_subregions; i++)
579  subregions_.push_back(new_subregions[i]);
580  SortSubregions();
581  this_region.block_begin = block;
582 
583  AddToFreeBlocks(block);
584 }
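The region-sizing arithmetic above can be checked with made-up numbers; a standalone sketch (the 1.5 GB figure and num_subregions = 20 are assumptions for illustration, not quoted defaults; assumes 64-bit size_t):

    #include <cassert>
    #include <cstddef>

    int main() {
      // Suppose the proportion of free memory we want works out to roughly
      // 1.5 GB with a ragged remainder.
      size_t region_size = ((size_t)1536 << 20) + 12345;

      // Round up to an exact multiple of 1M (2^20); 1048575 is 2^20 - 1.
      region_size = (region_size + 1048575) & ~((size_t)1048575);
      assert(region_size % (1 << 20) == 0);
      assert(region_size == (size_t)1537 << 20);

      // The number of sub-regions scales with the fraction of the device this
      // region covers, plus one to round up (8 GB device assumed here).
      size_t total_memory = (size_t)8 << 30, num_subregions = 20;
      size_t this_num_subregions =
          1 + (region_size * num_subregions) / total_memory;
      assert(this_num_subregions == 4);
      return 0;
    }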
585 
586 // We sort the sub-regions according to the distance between the start of the
587 // MemoryRegion of which they are a part, and the start of the SubRegion. This
588 // will generally mean that the highest-numbered SubRegion-- the one we keep
589 // free at all costs-- will be the end of the first block which we allocated
590 // (which under most situations will be the largest block).
591 void CuMemoryAllocator::SortSubregions() {
592  largest_free_block_.resize(subregions_.size());
593 
594  std::vector<std::pair<size_t, SubRegion*> > pairs;
595  for (size_t i = 0; i < subregions_.size(); i++) {
596  SubRegion *subregion = subregions_[i];
597  MemoryRegion &memory_region = memory_regions_[subregion->memory_region];
598  size_t distance = subregion->begin - memory_region.begin;
599  pairs.push_back(std::pair<size_t, SubRegion*>(distance, subregion));
600  }
601  std::sort(pairs.begin(), pairs.end());
602  for (size_t i = 0; i < subregions_.size(); i++) {
603  subregions_[i] = pairs[i].second;
604  subregions_[i]->subregion_index = i;
605  if (subregions_[i]->free_blocks.empty())
606  largest_free_block_[i] = 0;
607  else
608  largest_free_block_[i] = subregions_[i]->free_blocks.rbegin()->first;
609  }
610 }
611 
612 CuMemoryAllocator::~CuMemoryAllocator() {
613  // We mainly free these blocks of memory so that cuda-memcheck doesn't report
614  // spurious errors.
615  for (size_t i = 0; i < memory_regions_.size(); i++) {
616  // No need to check the return status here-- the program is exiting anyway.
617  cudaFree(memory_regions_[i].begin);
618  }
619  for (size_t i = 0; i < subregions_.size(); i++) {
620  SubRegion *subregion = subregions_[i];
621  for (auto iter = subregion->free_blocks.begin();
622  iter != subregion->free_blocks.end(); ++iter)
623  delete iter->second;
624  delete subregion;
625  }
626 }
627 
628 
629 CuMemoryAllocator g_cuda_allocator;
630 
631 
632 } // namespace kaldi
633 
634 
635 #endif // HAVE_CUDA
636 
637 
638 namespace kaldi {
639 
640 // Define/initialize this global variable. It was declared in cu-allocator.h.
641 // This has to be done outside of the ifdef, because we register the options
642 // whether or not CUDA is compiled in (so that the binaries accept the same
643 // options).
644 CuAllocatorOptions g_allocator_options;
645 
646 }