cu-allocator.h
Go to the documentation of this file.
1 // cudamatrix/cu-allocator.h
2 
3 // Copyright 2015 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 
22 #ifndef KALDI_CUDAMATRIX_CU_ALLOCATOR_H_
23 #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_
24 
25 #if HAVE_CUDA == 1
26 #include <cublas_v2.h>
27 #include <cuda.h>
28 #include <cuda_runtime_api.h>
29 #endif
30 
31 #include <map>
32 #include <set>
33 #include <mutex>
34 #include <list>
35 #include <queue>
36 #include <thread>
37 #include <iostream>
38 #include "base/kaldi-common.h"
39 #include "util/stl-utils.h"
40 #include "itf/options-itf.h"
41 
42 namespace kaldi {
43 
44 
// For now we don't give the user a way to modify these from the command line
// or the code; this just documents what the default options are.  To change
// the options, you have to do it in the code.
49  // True if we are going to actually cache memory allocations on this device.
50  // You'd normally set it to false only if you wanted to debug a possible
51  // memory problem using cuda-memcheck or cuda-gdb. It will be slower, but
52  // using CUDA's native allocator allows those tools to detect out-of-region
53  // memory accesses.
55 
56  // The proportion of the device's memory that the CuAllocator allocates to
57  // start with; by default this is 0.5, although if you want to share the
58  // device (not recommended!) you should set this lower.
60 
 // The target number of subregions of the entire CUDA device memory (we'll
 // start with a smaller number if memory_proportion is << 1).  Kind of
 // a tuning knob.. more regions will make it more aggressively consolidate
 // memory in low addresses.
66 
68  cache_memory(true), memory_proportion(0.5), num_subregions(20) { }
69 
70  void Register(OptionsItf *po) {
71  po->Register("cuda-cache-memory", &cache_memory, "True if you want "
72  "to use the caching allocator. Set this to false only if you "
73  "want to use cuda-memcheck or cuda-gdb; it will be slower.");
74  po->Register("cuda-memory-proportion", &memory_proportion,
75  "Proportion of the GPU device memory that the allocator "
76  "should allocate at the start");
77  }
78 
79  void Check() {
80  // don't let it get too close to 1;
81  KALDI_ASSERT(memory_proportion >= 0.05 && memory_proportion < 0.99);
82  }
83 };
84 
86 
88  g_allocator_options.Register(po);
89 }
90 
91 
92 } // namespace kaldi
93 
94 
95 #if HAVE_CUDA == 1
96 namespace kaldi {
97 
153 class CuMemoryAllocator {
154  public:
157  void* Malloc(size_t size);
158 
160  void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch);
161 
163  void Free(void *ptr);
164 
166  inline void* MallocLocking(size_t size) {
167  std::unique_lock<std::mutex> lock(mutex_);
168  return Malloc(size);
169  }
171  inline void* MallocPitchLocking(size_t row_bytes, size_t num_rows, size_t *pitch) {
172  std::unique_lock<std::mutex> lock(mutex_);
173  return MallocPitch(row_bytes, num_rows, pitch);
174  }
176  void FreeLocking(void *ptr) {
177  std::unique_lock<std::mutex> lock(mutex_);
178  Free(ptr);
179  }
180 
181  void PrintMemoryUsage() const;
182 
183  // returns the current memory allocated within the cache
184  size_t GetAllocatedMemory() { return allocated_memory_; }
185 
186  // returns the maximum memory used within the cache during current execution
187  size_t GetMaxAllocatedMemory() { return max_allocated_memory_; }
188 
189  CuMemoryAllocator();
190 
191  // Allows you to set options: must be called before any Malloc function is
192  // called on this class. It's done this way so the options can be changed
193  // by the user (c.f. RegisterCuAllocatorOptions()) before the options are read.
194  void SetOptions(const CuAllocatorOptions &opts) { opts_ = opts; }
195 
196  ~CuMemoryAllocator();
197 
198  private:
199 
200  struct SubRegion;
201 
202  struct MemoryBlock {
203  char *begin; // The beginning of the block (in CUDA memory)
204  char *end; // the end of the block (in CUDA memory)
205  SubRegion *subregion; // Pointer to the SubRegion to which this memory
206  // block belongs.
207  bool allocated; // True if this MemoryBlock has currently been given to the
208  // user; false if not.
209 
210  size_t t; // Zero if this memory block was never given to the user;
211  // otherwise, the time value (t_ in the CuAllocator class)
212  // when it was most recently either allocated to the user
213  // or freed by the user.
214 
215  std::thread::id thread_id; // If allocated == false and t > 0 (i.e. this
216  // memory block was released by the user), the
217  // thread-id of the user thread that freed this
218  // block, or the invalid thread-id as created by
219  // the constructor of std::thread::id if this
220  // block was created by merging blocks from
221  // different threads. Required for
222  // synchronization; and note that we assume
223  // there is one CUDA stream per CPU thread.
224 
225  MemoryBlock *next; // The next MemoryBlock within this MemoryRegion (or
226  // NULL if this is the last one); its 'begin' would be
227  // the same as the 'end' of this block.
228  MemoryBlock *prev; // The previous MemoryBlock within this MemoryRegion (or
229  // NULL if this is the first one); its 'end' would be the
230  // same as the 'begin' of this block.
231 
232  };
233 
234  // a MemoryRegion is a large piece of memory that we allocated via CudaMalloc.
235  // there normally won't be more than about 3 or 4 of these.
236  // We'll identify MemoryRegions by a size_t (e.g 0, 1, 2, 3... ) which is an
237  // index into the memory_regions_ vector.
238  struct MemoryRegion {
239  char *begin; // 'begin' is the start of the memory region.
240  char *end; // 'end' is the end of the memory region.
241  SubRegion *subregion_begin; // The first SubRegion that belongs to this
242  // MemoryRegion.
243  MemoryBlock *block_begin; // The first MemoryBlock that belongs to this
244  // MemoryRegion.
245  };
246 
247  // a SubRegion is a smaller zone of memory within a MemoryRegion. For
248  // example, we divide the first MemoryRegion we allocate into 10 blocks, and
249  // if we allocate blocks of memory later on, we'll sub-divide them into blocks
250  // of about the same size. A SubRegion is just a largish bin into which we
251  // put any blocks of memory that happen to start within that SubRegion;
252  // actually, memory blocks may cross over the boundaries of SubRegions. The
253  // motivation for dividing up MemoryRegions into SubRegions is that it allos
254  // us an efficient mechanism to segregate smaller memory blocks into higher
255  // memory and larger ones into lower memory: for each allocation, we allocate
256  // it from the highest-numbered SubRegion that is able to allocate something of
257  // that size. Over time, this will lead to smaller memory blocks being
258  // concentrated in higher-numbered SubRegions.
259  struct SubRegion {
260  size_t memory_region; // This is an index into the memory_regions_ vector
261  // which identifies which MemoryRegion this SubRegion
262  // is a part of.
263  size_t subregion_index; // The index of this SubRegion within the
264  // subregions_ vector; this can change when we
265  // allocate more MemoryRegions.
266  char *begin; // 'begin' is the start of the memory in this SubRegion.
267  char *end; // 'end' is the end of the memory in this SubRegion.
268 
269  // Contains the free MemoryBlocks starting within this SubRegion.
270  std::set<std::pair<size_t, MemoryBlock*> > free_blocks;
271 
272  // Pointer to the next SubRegion within this MemoryRegion (i.e. the SubRegion
273  // whose begin equals this one's end), or NULL if this is the last one.
274  SubRegion *next;
275  };
276 
277  // Tries to allocate CUDA memory of the given size; will crash if it was not
278  // able to.
279  inline void* MallocInternal(size_t size);
280 
281  // Allocates from a given SubRegion, after we have determined that it
282  // can satisfy this request. Broken out of MallocInternal for clarity.
283  inline void* MallocFromSubregion(SubRegion *subregion, size_t size);
284 
285 
286  // Splits the given MemoryBlock so that one piece is of size 'size', and
287  // returns the piece which is of size 'size'. The caller guarantees that
288  // 'size' is less than the current size of the memory block, that 'block' is
289  // not currently allocated (i.e. block->allocated == false). This function
290  // assumes that, at entry, 'block' is not present in its subregion's
291  // 'free_blocks' (because the caller has removed it), and it takes
292  // responsibility for entering the 'unused' part (the part we're not
293  // returning) into its subregion's 'free_blocks' by calling AddToFreeBlocks().
294  inline MemoryBlock *SplitBlock(MemoryBlock *block, size_t size);
295 
296  // Removes this block from the 'free_blocks' set of the SubRegion to which
297  // it belongs. This is called when allocating a block, and from other places.
298  void RemoveFromFreeBlocks(MemoryBlock *block);
299 
300  // Adds this block to the 'free_blocks' set of the SubRegion to which it
301  // belongs. This is called when freeing a block, and from other places.
302  void AddToFreeBlocks(MemoryBlock *block);
303 
304  // This function is called when an allocation failed and we need to try to
305  // allocate more memory from the evice. The 'size' is the size of the
306  // requested memory block whose allocation failed-- it's provided so that
307  // we can be sure to allocate a new region of at least this size.
308  void AllocateNewRegion(size_t size);
309 
310  // Called from AllocateNewRegion(), this ensures that the subregions are
311  // sorted as we want (which is a kind of heuristic that will be discussed in
312  // the code), and it also recomputes the largest_free_block_ array.
313  void SortSubregions();
314 
315 
316 
317  CuAllocatorOptions opts_;
318 
319  std::vector<MemoryRegion> memory_regions_;
320 
321  std::vector<SubRegion*> subregions_;
322 
323  // For each SubRegion in sub_regions_, this vector gives us the size of the
324  // largest free block present in that SubRegion, which is equal to
325  // sub_regions_[i]->free_blocks.begin()->first. It allows us to fairly
326  // efficiently find the lowest-numbered SubRegion which can handle a
327  // particular request for memory.
328  std::vector<size_t> largest_free_block_;
329 
330  size_t t_; // time counter, incremented with each call.
331  size_t synchronize_gpu_t_; // value of t_ at the last time we called
332  // SynchronizeGpu().
333  size_t num_synchronizations_; // number of times we called SynchronizeGpu()
334  double tot_time_taken_; // Total time taken in calls to this object.
335  double malloc_time_taken_; // Total time we spent calling cudaMalloc().
336 
337  // This is a map from memory locations currently owned by the user, to the
338  // MemoryBlock which stores the information about that location.
339  std::unordered_map<void*, MemoryBlock*> allocated_block_map_;
340 
341  // this is only locked by the '*Locking' versions of the functions (necessary only
342  // in multi-threaded applications).
343  std::mutex mutex_;
344 
345  // Keep track of the memory usage from the cache to track the maximum memory used by
346  // the application
347  size_t max_allocated_memory_;
348  size_t allocated_memory_;
349 };
350 
351 
352 // This function returns some printable information about the memory used
353 // as a string: an example showing the format is:
354 // "free: 10M, used: 490M, total: 500M: free/total: 0.02"
355 // In addition, if the pointers 'free' and 'total' are non-NULL, it will
356 // output to them the free memory and the total memory of the device.
357 std::string GetFreeGpuMemory(int64* free, int64* total);
358 
359 extern CuMemoryAllocator g_cuda_allocator;
360 
361 } // namespace kaldi
362 
363 #endif // HAVE_CUDA
364 
365 
366 #endif
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
CuAllocatorOptions g_allocator_options
kaldi::int32 int32
void Register(OptionsItf *po)
Definition: cu-allocator.h:70
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void RegisterCuAllocatorOptions(OptionsItf *po)
Definition: cu-allocator.h:87