nnet-optimize.cc
Go to the documentation of this file.
1 // nnet3/nnet-optimize.cc
2 
3 // Copyright 2015 Johns Hopkins University (author: Daniel Povey)
4 // 2015 Xiaohui Zhang
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include <iomanip>
22 #include "nnet3/nnet-optimize.h"
24 #include "nnet3/nnet-utils.h"
25 #include "base/timer.h"
26 
27 namespace kaldi {
28 namespace nnet3 {
29 
30 void NnetOptimizeOptions::Read(std::istream &is, bool binary) {
31  ExpectToken(is, binary, "<NnetOptimizeOptions>");
32  ExpectToken(is, binary, "<Optimize>");
33  ReadBasicType(is, binary, &optimize);
34  ExpectToken(is, binary, "<ConsolidateModelUpdate>");
36  ExpectToken(is, binary, "<PropagateInPlace>");
37  ReadBasicType(is, binary, &propagate_in_place);
38  ExpectToken(is, binary, "<BackpropInPlace>");
39  ReadBasicType(is, binary, &backprop_in_place);
40  if (PeekToken(is, binary) == 'O') {
41  ExpectToken(is, binary, "<OptimizeRowOps>");
42  ReadBasicType(is, binary, &optimize_row_ops);
43  }
44  if (PeekToken(is, binary) == 'S') {
45  ExpectToken(is, binary, "<SplitRowOps>");
46  ReadBasicType(is, binary, &split_row_ops);
47  }
48  if (PeekToken(is, binary) == 'E') {
49  ExpectToken(is, binary, "<ExtendMatrices>");
50  ReadBasicType(is, binary, &extend_matrices);
51  }
52  ExpectToken(is, binary, "<ConvertAddition>");
53  ReadBasicType(is, binary, &convert_addition);
54  ExpectToken(is, binary, "<RemoveAssignments>");
55  ReadBasicType(is, binary, &remove_assignments);
56  ExpectToken(is, binary, "<AllowLeftMerge>");
57  ReadBasicType(is, binary, &allow_left_merge);
58  ExpectToken(is, binary, "<AllowRightMerge>");
59  ReadBasicType(is, binary, &allow_right_merge);
60  ExpectToken(is, binary, "<InitializeUndefined>");
61  ReadBasicType(is, binary, &initialize_undefined);
62  ExpectToken(is, binary, "<MoveSizingCommands>");
63  ReadBasicType(is, binary, &move_sizing_commands);
64  ExpectToken(is, binary, "<AllocateFromOther>");
65  ReadBasicType(is, binary, &allocate_from_other);
66  ExpectToken(is, binary, "<MinDerivTime>");
67  ReadBasicType(is, binary, &min_deriv_time);
68  ExpectToken(is, binary, "<MaxDerivTime>");
69  ReadBasicType(is, binary, &max_deriv_time);
70  if (PeekToken(is, binary) == 'M') {
71  ExpectToken(is, binary, "<MaxDerivTimeRelative>");
73  }
74  if (PeekToken(is, binary) == 'S') {
75  ExpectToken(is, binary, "<SnipRowOps>");
76  ReadBasicType(is, binary, &snip_row_ops);
77  }
78  if (PeekToken(is, binary) == 'M') {
79  ExpectToken(is, binary, "<MemoryCompressionLevel>");
81  }
82  ExpectToken(is, binary, "</NnetOptimizeOptions>");
83 }
84 
85 void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const {
86  WriteToken(os, binary, "<NnetOptimizeOptions>");
87  WriteToken(os, binary, "<Optimize>");
88  WriteBasicType(os, binary, optimize);
89  WriteToken(os, binary, "<ConsolidateModelUpdate>");
91  WriteToken(os, binary, "<PropagateInPlace>");
93  WriteToken(os, binary, "<BackpropInPlace>");
94  WriteBasicType(os, binary, backprop_in_place);
95  WriteToken(os, binary, "<OptimizeRowOps>");
96  WriteBasicType(os, binary, optimize_row_ops);
97  WriteToken(os, binary, "<SplitRowOps>");
98  WriteBasicType(os, binary, split_row_ops);
99  WriteToken(os, binary, "<ExtendMatrices>");
100  WriteBasicType(os, binary, extend_matrices);
101  WriteToken(os, binary, "<ConvertAddition>");
102  WriteBasicType(os, binary, convert_addition);
103  WriteToken(os, binary, "<RemoveAssignments>");
104  WriteBasicType(os, binary, remove_assignments);
105  WriteToken(os, binary, "<AllowLeftMerge>");
106  WriteBasicType(os, binary, allow_left_merge);
107  WriteToken(os, binary, "<AllowRightMerge>");
108  WriteBasicType(os, binary, allow_right_merge);
109  WriteToken(os, binary, "<InitializeUndefined>");
111  WriteToken(os, binary, "<MoveSizingCommands>");
113  WriteToken(os, binary, "<AllocateFromOther>");
114  WriteBasicType(os, binary, allocate_from_other);
115  WriteToken(os, binary, "<MinDerivTime>");
116  WriteBasicType(os, binary, min_deriv_time);
117  WriteToken(os, binary, "<MaxDerivTime>");
118  WriteBasicType(os, binary, max_deriv_time);
119  WriteToken(os, binary, "<MaxDerivTimeRelative>");
121  WriteToken(os, binary, "<SnipRowOps>");
122  WriteBasicType(os, binary, snip_row_ops);
123  WriteToken(os, binary, "<MemoryCompressionLevel>");
125  WriteToken(os, binary, "</NnetOptimizeOptions>");
126 }
127 
129  return (other.optimize == optimize &&
134  other.split_row_ops == split_row_ops &&
142  other.min_deriv_time == min_deriv_time &&
143  other.max_deriv_time == max_deriv_time &&
145  other.snip_row_ops == snip_row_ops &&
147 }
148 
149 // move commands that resize and zero matrices to as late/early as possible.
150 // (however, keep input and output commands where they were; it creates other
151 // headaches if we move those).
152 void MoveSizingCommands(const Nnet &nnet, NnetComputation *computation) {
153  ComputationVariables variables;
154  variables.Init(*computation);
155  std::vector<CommandAttributes> attributes;
156  ComputeCommandAttributes(nnet, *computation, variables, &attributes);
157  std::vector<std::vector<Access> > variable_accesses;
158  ComputeVariableAccesses(variables, attributes, &variable_accesses);
159  std::vector<MatrixAccesses> matrix_accesses;
160  ComputeMatrixAccesses(nnet, *computation, variables, attributes,
161  &matrix_accesses);
162 
163  // The way we will renumber the commands is, we will first set this vector up
164  // with pairs (command-index * 3, pointer-to-command), and we will then modify
165  // the command-indexes in this vector to the numbers that we want, and sort
166  // it. The reason for the * 3 is so that we can number commands "just-after"
167  // existing indexes (by adding 1) and "just-before" (by subtracting 1).
168  int32 num_commands = computation->commands.size(),
169  num_matrices = matrix_accesses.size();
170 
171  // Matrix allocation commands tend to be followed by a command that zeroes the
172  // matrix. We want to treat the two commands as a single unit for purposes of
173  // reordering. is_command_pair[c] will be true if command c is the first
174  // element of such a pair.
175  std::vector<bool> is_command_pair(num_commands, false);
176  for (int32 c = 0; c + 1 < num_commands; c++) {
177  if (computation->commands[c].command_type == kAllocMatrix &&
178  computation->commands[c+1].command_type == kSetConst &&
179  computation->commands[c].arg1 == computation->commands[c+1].arg1 &&
180  computation->commands[c+1].alpha == 0.0) {
181  is_command_pair[c] = true;
182  }
183  }
184 
185  // 'command_reordering' contains (new-number, old-number) of commands.
186  // the new-number is multiplied by 3 for reasons explained above.
187  std::vector<std::pair<int32,int32> >
188  command_reordering(num_commands);
189  // Note: for now we include the second-elements-of-pairs (i.e. the zeroing
190  // commands that follow allocation commands) here; we'll ignore them later.
191  for (int32 c = 0; c < num_commands; c++) {
192  command_reordering[c].first = c * 3;
193  command_reordering[c].second = c;
194  }
195  for (int32 m = 1; m < num_matrices; m++) {
196  const MatrixAccesses &ma = matrix_accesses[m];
197  // The following if-block relates to reordering of allocation (and,
198  // implicitly, zeroing) commands.
199  if (ma.allocate_command != -1 &&
200  computation->commands[ma.allocate_command].command_type == kAllocMatrix) {
201  // first_access_command will be index of first access, except for the
202  // zeroing command that immediately follows the initialization command.
203  int32 first_access_command = -1;
204  // this block sets 'first_access_command'.
205  if (!ma.accesses.empty()) {
206  first_access_command = ma.accesses[0].command_index;
207  if (first_access_command == ma.allocate_command + 1 &&
208  is_command_pair[ma.allocate_command]) {
209  if (ma.accesses.size() > 1)
210  first_access_command = ma.accesses[1].command_index;
211  else
212  first_access_command = -1;
213  }
214  }
215  if (first_access_command != -1) {
216  KALDI_ASSERT(first_access_command > ma.allocate_command);
217  // move the initialization command to just before the first access.
218  command_reordering[ma.allocate_command].first =
219  first_access_command * 3 - 1;
220  }
221  }
222  // The following if-block relates to reordering of deallocation
223  // commands.
224  if (ma.deallocate_command != -1 && !ma.accesses.empty() &&
225  computation->commands[ma.deallocate_command].command_type ==
226  kDeallocMatrix) {
227  int32 last_access_command = ma.accesses.back().command_index;
228  // move the deallocation command to just after the last access.
229  command_reordering[ma.deallocate_command].first =
230  last_access_command * 3 + 1;
231  }
232  }
233  std::sort(command_reordering.begin(), command_reordering.end());
234  std::vector<NnetComputation::Command> reordered_commands;
235  reordered_commands.reserve(num_commands);
236  for (int32 c = 0; c < num_commands; c++) {
237  int32 old_index = command_reordering[c].second;
238  NnetComputation::Command &old_command = computation->commands[old_index];
239  // the following assert is because this optimization is not allowed
240  // after looped optimization.
241  KALDI_ASSERT(old_command.command_type != kGotoLabel);
242  if (old_index > 0 && is_command_pair[old_index - 1]) {
243  // If the old command-index was a zeroing command that follows
244  // an allocation command, ignore it; it will be reordered to
245  // right after wherever the allocation command went, and we'll
246  // deal with it when we deal with the first element of the pair.
247  continue;
248  } else {
249  reordered_commands.push_back(computation->commands[old_index]);
250  if (is_command_pair[old_index]) {
251  // if this command is the first member of an (allocation, zeroing)
252  // pair then we need to deal with the zeroing command as well.
253  reordered_commands.push_back(computation->commands[old_index + 1]);
254  }
255  }
256  }
257  computation->commands = reordered_commands;
258 }
259 
260 // This function removes commands of type kSetConst (with alpha=0.0), where
261 // possible.
263  NnetComputation *computation) {
264  Analyzer a;
265  a.Init(nnet, *computation);
266 
267  // OK, now we'll work out which matrices have all their pieces (i.e. all the
268  // variables belonging to that matrix) written to as the first instruction
269  // apart from the initial zeroing. These matrices can have the initial
270  // zeroing replaced by a sizing operation that leaves the data undefined.
271  int32 num_matrices = a.matrix_accesses.size();
272  for (int32 matrix_index = 0; matrix_index < num_matrices; matrix_index++) {
273  const MatrixAccesses &accesses = a.matrix_accesses[matrix_index];
274  if (accesses.accesses.empty())
275  continue;
276  int32 zeroing_command_index = accesses.accesses[0].command_index;
277  NnetComputation::Command *command =
278  &(computation->commands[zeroing_command_index]);
279  if (!(command->command_type == kSetConst &&
280  command->alpha == 0.0)) {
281  continue; // First command is not a zeroing command
282  }
283  // OK, the first command that accesses this matrix is a zeroing command;
284  // we're going to figure out whether it was necessary.
285  std::vector<int32> variables_for_matrix;
286  a.variables.AppendVariablesForMatrix(matrix_index, &variables_for_matrix);
287  bool all_variables_ok = true; // if this stays true, it means we don't need
288  // the initial zeroing.
289  for (size_t i = 0; i < variables_for_matrix.size(); i++) {
290  int32 variable_index = variables_for_matrix[i];
291  const std::vector<Access> &v_accesses =
292  a.variable_accesses[variable_index];
293  if (v_accesses.size() > 1 &&
294  v_accesses[1].access_type != kWriteAccess) {
295  all_variables_ok = false; // first access after zeroing was not a write
296  break;
297  }
298  if (v_accesses.size() == 1 &&
299  accesses.is_output) {
300  // the only command that touches this variable is the allocation, and it
301  // is an output variable. (this is unusual, but can happen e.g. if it's
302  // a derivative, but due to min_deriv_time and max_deriv_time it ends up
303  // always being zero.
304  all_variables_ok = false;
305  break;
306  }
307  }
308  if (all_variables_ok) {
309  // Here is where the change actually happens.
310  // Remove the zeroing command.
311  command->command_type = kNoOperation;
312  }
313  }
314 }
315 
316 /*
317  This function is called from RemoveUnnecessaryAllocation. The input is two
318  sorted, unique lists, of (deallocation-commands, allocation-commands)
319  e.g. (d1, d2, d3.. ), (a1, a2, a3..); and to the output is *appended* a list
320  of pairs (d, a). Each output pair must satisfy the property that d < a, and
321  no member of the input lists may appear more than once in the output pairs
322  (although it's OK for input a and d values not to appear in any output pairs).
323 
324  The goal of the implementation is to output as many pairs as possible, and
325  secondarily for the pairs to be as close as possible to each other (to avoid
326  wasting too much memory). I'm not sure if this implementation achieves that.
327 */
329  const std::pair<std::vector<int32>, std::vector<int32> > &lists,
330  std::vector<std::pair<int32,int32> > *pairs) {
331  std::vector<int32> d_list = lists.first;
332 
333  std::set<int32> a_set;
334  CopyVectorToSet(lists.second, &a_set);
335 
336  std::vector<int32>::reverse_iterator iter = d_list.rbegin(),
337  end = d_list.rend();
338 
339  // from the latest to the earliest deallocation command...
340  for (; iter != end; ++iter) {
341  int32 d = *iter;
342  std::set<int32>::iterator a_iter = a_set.upper_bound(d);
343  // a_iter is an iterator to the first element a of the set 'a_set' such
344  // that a > d, or a_set.end() if no such element exists.
345  if (a_iter == a_set.end())
346  continue; // we will output no pair for this d.
347  int32 a = *a_iter;
348  KALDI_PARANOID_ASSERT(a > d); // or code error
349  a_set.erase(a_iter); // remove this a from 'a_set' so it doesn't get used
350  // twice
351  pairs->push_back(std::pair<int32,int32>(d, a));
352  }
353 }
354 
356  NnetComputation *computation) {
357  // For each size of matrix and stride-type, represented as a pair<int32,int32>
358  // (the num-rows, and the num-cols * (stride-type == kDefaultStride ? 1 : -1), we
359  // accumulate a list of indexes of deallocation commands that
360  // are for that size, and a list of indexes of allocation commands
361  // for that size.
362  // For each distinct matrix size, we then call ComputeCommandPairs on those
363  // two lists, to get pairs of (deallocation, allocation) command-indexes that
364  // we can optimize out to a single command.
365 
366  // The map is from a (num-rows,num-columns) to two lists, of
367  // (deallocation-commands, allocation-commands). The order may seem
368  // backwards, but that's the order of the pairs we are looking for.
369  typedef unordered_map<std::pair<int32,int32>,
370  std::pair<std::vector<int32>,std::vector<int32> >,
371  PairHasher<int32> > MapType;
372  MapType pair_map;
373  int32 num_commands = computation->commands.size();
374  for (int32 command_index = 0; command_index < num_commands; command_index++) {
375  NnetComputation::Command &command = computation->commands[command_index];
376  if (command.command_type == kAllocMatrix ||
377  command.command_type == kDeallocMatrix) {
378  int32 s = command.arg1, m = computation->submatrices[s].matrix_index,
379  num_rows = computation->matrices[m].num_rows,
380  num_cols = computation->matrices[m].num_cols,
381  num_cols_mod = num_cols * (
382  computation->matrices[m].stride_type == kDefaultStride ? 1 : -1);
383  std::pair<int32,int32> p(num_rows, num_cols_mod);
384  std::pair<std::vector<int32>,std::vector<int32> > &lists = pair_map[p];
385  if (command.command_type == kDeallocMatrix)
386  lists.first.push_back(command_index);
387  else
388  lists.second.push_back(command_index);
389  }
390  }
391 
392  MapType::const_iterator iter = pair_map.begin(), end = pair_map.end();
393  std::vector<std::pair<int32,int32> > command_pairs;
394  for (; iter != end; ++iter)
395  ComputeCommandPairs(iter->second, &command_pairs);
396 
397  for (size_t i = 0; i < command_pairs.size(); i++) {
398  int32 dealloc_index = command_pairs[i].first,
399  alloc_index = command_pairs[i].second;
401  &dealloc_command = computation->commands[dealloc_index],
402  &alloc_command = computation->commands[alloc_index];
403  KALDI_ASSERT(dealloc_command.command_type ==
405  KALDI_ASSERT(alloc_command.command_type ==
406  kAllocMatrix);
407  // remove the deallocation command.
408  dealloc_command.command_type = kNoOperation;
409  alloc_command.arg2 = dealloc_command.arg1;
410  alloc_command.command_type = kSwapMatrix;
411  }
412  RemoveNoOps(computation);
413  FixGotoLabel(computation);
414 }
415 
416 
418  const Nnet &nnet,
419  NnetComputation *computation) {
420  bool changed = true;
421  while (changed) {
422  changed = false;
423  VariableMergingOptimizer opt(config, nnet, computation);
424  if (opt.MergeVariables())
425  changed = true;
426  }
427 }
428 
429 
431  NnetComputation *computation) {
432  Analyzer analyzer;
433  analyzer.Init(nnet, *computation);
434  ComputationAnalysis analysis(*computation, analyzer);
435  int32 num_commands = computation->commands.size();
436  for (int32 command = 0; command < num_commands; command++) {
437  NnetComputation::Command &c = computation->commands[command];
438  switch (c.command_type) {
439  case kMatrixAdd: case kAddRows: case kAddRowsMulti:
440  case kAddToRowsMulti: {
441  const std::vector<int32> &submatrices_written =
442  analyzer.command_attributes[command].submatrices_written;
443  KALDI_ASSERT(!submatrices_written.empty());
444  std::vector<int32>::const_iterator iter = submatrices_written.begin(),
445  end = submatrices_written.end();
446  bool can_convert = true;
447  for (; iter != end; ++iter) {
448  int32 submatrix_written = *iter;
449  int32 first_access_command = analysis.FirstNontrivialAccess(
450  submatrix_written);
451  // first_access_command is first command other than zeroing and
452  // allocation that accesses this submatrix. It can be assumed to be a
453  // write command, since it makes no sense to read a variable before
454  // it's written to. If it's before this command then we need to add
455  // rather than copy; we can't do the conversion to a copy command.
456  if (first_access_command != command) {
457  can_convert = false;
458  break;
459  }
460  }
461  if (can_convert) { // convert to a copy command.
462  switch (c.command_type) {
464  break;
465  case kAddRows: c.command_type = kCopyRows;
466  break;
468  break;
469  // note: kCopyToRowsMulti does not currently support alpha != 1.0.
470  case kAddToRowsMulti: if (c.alpha == 1.0) c.command_type = kCopyToRowsMulti;
471  break;
472  default: KALDI_ERR << "Unexpected command type.";
473  }
474  }
475  break;
476  }
477  default:
478  break;
479  }
480  }
481 }
482 
483 
485  int32 ans = std::numeric_limits<int32>::min();
486  for (size_t i = 0; i < request.outputs.size(); i++) {
487  const std::vector<Index> &indexes (request.outputs[i].indexes);
488  std::vector<Index>::const_iterator iter = indexes.begin(),
489  end = indexes.end();
490  for (; iter != end; ++iter)
491  if (iter->t > ans)
492  ans = iter->t;
493  }
494  if (ans == std::numeric_limits<int32>::min()) {
495  KALDI_ERR << "Failed to find any output indexes in computation request.";
496  }
497  return ans;
498 }
499 
500 
501 void Optimize(const NnetOptimizeOptions &config,
502  const Nnet &nnet,
503  int32 max_output_time_in_request,
504  NnetComputation *computation) {
505  if (GetVerboseLevel() >= 3) {
506  CheckComputation(nnet, *computation, true);
507  KALDI_LOG << "Before optimization, max memory use (bytes) = "
508  << GetMaxMemoryUse(*computation);
509  }
510 
511  { // Call LimitDerivativeTimes(); it's important that this
512  // should come before other optimizations (search for "insist" in
513  // nnet-optimize-utils.cc for the reasons).
514  // this will do nothing unless --min-deriv-time or --max-deriv-time
515  // or --max-deriv-time-relative was set.
517  if (config.max_deriv_time_relative != std::numeric_limits<int32>::max())
518  max_deriv_time = config.max_deriv_time_relative +
519  max_output_time_in_request;
520  if (config.min_deriv_time != std::numeric_limits<int32>::min() ||
521  max_deriv_time != std::numeric_limits<int32>::max())
523  max_deriv_time, computation);
524  }
525 
526  if (GetVerboseLevel() >= 3)
527  CheckComputation(nnet, *computation, true);
528 
529  if (config.optimize && config.consolidate_model_update) {
530  ConsolidateModelUpdate(nnet, computation);
531 
532  if (GetVerboseLevel() >= 3)
533  CheckComputation(nnet, *computation, true);
534  }
535 
536  if (config.optimize && config.convert_addition) {
537  ConvertAdditionToAssignment(nnet, computation);
538  if (GetVerboseLevel() >= 3)
539  CheckComputation(nnet, *computation, true);
540  }
541 
542 
543  if (config.optimize && (config.snip_row_ops || config.optimize_row_ops ||
544  config.split_row_ops)) {
545  bool must_renumber = false;
546  if (config.snip_row_ops && SnipRowOps(computation))
547  must_renumber = true;
548  if (config.split_row_ops && SplitRowOps(computation))
549  must_renumber = true;
550  if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation))
551  must_renumber = true;
552 
553  if (must_renumber) {
554  RenumberComputation(computation);
555  if (GetVerboseLevel() >= 3)
556  CheckComputation(nnet, *computation, false);
557  }
558  }
559 
560  if (config.optimize && config.extend_matrices &&
561  !config.optimize_looped_computation) {
562  ExtendMatrices(computation);
563  if (GetVerboseLevel() >= 3)
564  CheckComputation(nnet, *computation, false);
565  }
566 
567 
568  if (config.optimize &&
569  (config.remove_assignments || config.backprop_in_place ||
570  config.propagate_in_place)) {
571  VariableMergingOptimization(config, nnet, computation);
572  if (GetVerboseLevel() >= 3)
573  CheckComputation(nnet, *computation, false);
574  }
575 
576  if (config.optimize && config.initialize_undefined) {
577  RemoveUnnecessaryZeroing(nnet, computation);
578  if (GetVerboseLevel() >= 3)
579  CheckComputation(nnet, *computation, false);
580  }
581 
582 
583  if ((config.optimize && config.move_sizing_commands) ||
585  MoveSizingCommands(nnet, computation);
586  if (GetVerboseLevel() >= 3)
587  CheckComputation(nnet, *computation, false);
588  }
589 
590  // the looped computation optimization has to go before
591  // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize'
592  // because it's necessary for looped computation to run.
593  if (config.optimize_looped_computation) {
594  OptimizeLoopedComputation(nnet, computation);
595  if (GetVerboseLevel() >= 3)
596  CheckComputation(nnet, *computation, false);
597  }
598 
599  if (config.optimize && config.allocate_from_other &&
600  !config.optimize_looped_computation) {
601  // Don't do this if it's an looped computation because we're not sure if it
602  // would be correct in that case, as written. In any case the performance
603  // benefit is tiny.
604  RemoveUnnecessaryAllocation(nnet, computation);
605  if (GetVerboseLevel() >= 3)
606  CheckComputation(nnet, *computation, false);
607  }
608 
609  // The following is not configurable because it is necessary for
610  // the computation to run correctly (we do it after compilation too,
611  // but the operations may have been put out of order by
612  // other optimizations.)
613  ConsolidateIoOperations(nnet, computation);
614 
615  if (config.optimize_looped_computation)
616  FixGotoLabel(computation);
617 
618 
619  if (config.memory_compression_level > 0 &&
620  !config.optimize_looped_computation) {
622  computation);
623  if (GetVerboseLevel() >= 3)
624  CheckComputation(nnet, *computation, false);
625  }
626 
627  if (GetVerboseLevel() >= 3) {
628  CheckComputation(nnet, *computation, false);
629  KALDI_LOG << "After optimization, max memory use (bytes) = "
630  << GetMaxMemoryUse(*computation);
631  }
632 }
633 
634 
636  const Nnet &nnet,
637  const CachingOptimizingCompilerOptions config):
638  nnet_(nnet), config_(config),
639  seconds_taken_total_(0.0), seconds_taken_compile_(0.0),
640  seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0),
641  seconds_taken_check_(0.0), seconds_taken_indexes_(0.0),
642  seconds_taken_io_(0.0), cache_(config.cache_capacity),
643  nnet_left_context_(-1), nnet_right_context_(-1) { }
644 
646  const Nnet &nnet,
647  const NnetOptimizeOptions &opt_config,
648  const CachingOptimizingCompilerOptions config):
649  nnet_(nnet), config_(config), opt_config_(opt_config),
653  seconds_taken_io_(0.0), cache_(config.cache_capacity),
655 
657  int32 *nnet_left_context, int32 *nnet_right_context) {
658  if (nnet_left_context_ == -1) {
661  }
662  *nnet_left_context = nnet_left_context_;
663  *nnet_right_context = nnet_right_context_;
664 }
665 
666 void CachingOptimizingCompiler::ReadCache(std::istream &is, bool binary) {
667  {
668  Timer timer;
669  NnetOptimizeOptions opt_config_cached;
670  opt_config_cached.Read(is, binary);
671  // we won't read cached computations if any optimize option has been changed.
672  if (!(opt_config_ == opt_config_cached))
673  return;
674  cache_.Read(is, binary);
675  seconds_taken_io_ += timer.Elapsed();
676  }
677  if (GetVerboseLevel() >= 2) {
678  Timer timer;
679  cache_.Check(nnet_);
680  seconds_taken_check_ += timer.Elapsed();
681  // we consider the check time part of the total time... this is very
682  // arbitrary but it only affects printed times-taken.
683  seconds_taken_total_ += timer.Elapsed();
684  }
685 
686 }
687 
688 void CachingOptimizingCompiler::WriteCache(std::ostream &os, bool binary) {
689  Timer timer;
690  opt_config_.Write(os, binary);
691  cache_.Write(os, binary);
692  seconds_taken_io_ += timer.Elapsed();
693 }
694 
696  if (seconds_taken_total_ > 0.0 || seconds_taken_io_ > 0.0) {
697  std::ostringstream os;
698  double seconds_taken_misc = seconds_taken_total_ - seconds_taken_compile_
701  os << std::setprecision(3) << seconds_taken_total_
702  << " seconds taken in nnet3 compilation total (breakdown: "
703  << seconds_taken_compile_ << " compilation, "
704  << seconds_taken_optimize_ << " optimization, "
705  << seconds_taken_expand_ << " shortcut expansion, "
706  << seconds_taken_check_ << " checking, "
707  << seconds_taken_indexes_ << " computing indexes, "
708  << seconds_taken_misc << " misc.) + "
709  << seconds_taken_io_ << " I/O.";
710  KALDI_LOG << os.str();
711  // note: the leftover amount is misc things like hashing and == comparisons on
712  // computation-requests, and calling RequestIsDecomposable().
713  }
714 }
715 
716 std::shared_ptr<const NnetComputation> CachingOptimizingCompiler::Compile(
717  const ComputationRequest &in_request) {
718  Timer timer;
719  std::shared_ptr<const NnetComputation> ans = CompileInternal(in_request);
720  seconds_taken_total_ += timer.Elapsed();
721  return ans;
722 }
723 
724 std::shared_ptr<const NnetComputation> CachingOptimizingCompiler::CompileInternal(
725  const ComputationRequest &request) {
726  std::shared_ptr<const NnetComputation> ans = cache_.Find(request);
727  if (ans != NULL) {
728  return ans;
729  } else {
730  const NnetComputation *computation = NULL;
731  if (config_.use_shortcut)
732  computation = CompileViaShortcut(request);
733  if (computation == NULL)
734  computation = CompileNoShortcut(request);
735  KALDI_ASSERT(computation != NULL);
736  return cache_.Insert(request, computation);
737  }
738 }
739 
740 
742  const ComputationRequest &request) {
743 
744  Compiler compiler(request, nnet_);
745  // note: 'opts' only contains 'output_debug_info', which is true by default.
746  // There may be situations where we'd prefer not to keep it, for speed.
747  CompilerOptions opts;
748  NnetComputation *computation = new NnetComputation;
749 
750  {
751  Timer timer;
752  compiler.CreateComputation(opts, computation);
753  seconds_taken_compile_ += timer.Elapsed();
754  }
755 
756  int32 verbose_cutoff = 4;
757  if (GetVerboseLevel() >= verbose_cutoff) {
758  std::ostringstream os1;
759  request.Print(os1);
760  KALDI_LOG << "Computation request is " << os1.str();
761  std::ostringstream os2;
762  computation->Print(os2, nnet_);
763  KALDI_LOG << "Generated computation is: " << os2.str();
764  }
765 
766  { // some checking. Note: there may come a time when we might
767  // prefer to disable this checking.
768  Timer timer;
769  CheckComputationOptions check_config;
770  // we can do the rewrite check since it's before optimization.
771  check_config.check_rewrite = true;
772  ComputationChecker checker(check_config, nnet_, *computation);
773  checker.Check();
774  seconds_taken_check_ += timer.Elapsed();
775  }
776 
777  {
778  Timer timer;
780  MaxOutputTimeInRequest(request),
781  computation);
782  seconds_taken_optimize_ += timer.Elapsed();
783  }
784 
785  if (GetVerboseLevel() >= verbose_cutoff) {
786  std::ostringstream os;
787  computation->Print(os, nnet_);
788  KALDI_LOG << "Optimized computation is: " << os.str();
789  }
790 
791  { // check the computation again.
792  Timer timer;
793  CheckComputationOptions check_config;
794  ComputationChecker checker(check_config, nnet_, *computation);
795  checker.Check();
796  seconds_taken_check_ += timer.Elapsed();
797  }
798 
799  {
800  Timer timer;
801  computation->ComputeCudaIndexes();
802  seconds_taken_indexes_ += timer.Elapsed();
803  }
804  return computation;
805 }
806 
807 
809  const ComputationRequest &request) {
810  int32 num_n_values;
811  ComputationRequest mini_request;
812  if (!RequestIsDecomposable(request, &mini_request, &num_n_values))
813  return NULL;
814 
815  // By invoking CompileInternal() on the mini request, we go through the same
816  // caching process as for any externally requested computation.
817  std::shared_ptr<const NnetComputation> mini_computation =
818  CompileInternal(mini_request);
819 
820  // note: by default we always create debug_info, even in regular compilation.
821  // (e.g. it defaults to true in CompilerOptions). If it really seems to be a
822  // significant overhead, we can revisit this at some point in future.
823  bool need_debug_info = true;
824 
825 
826  NnetComputation *ans = new NnetComputation();
827 
828  {
829  Timer timer;
830  ExpandComputation(nnet_, request.misc_info, *mini_computation,
831  need_debug_info, num_n_values, ans);
832  seconds_taken_expand_ += timer.Elapsed();
833  }
834  if (GetVerboseLevel() >= 3) {
835  CheckComputation(nnet_, *ans, false);
836  }
837 
838  {
839  Timer timer;
840  ans->ComputeCudaIndexes();
841  seconds_taken_indexes_ += timer.Elapsed();
842  }
843  return ans;
844 }
845 
846 
847 
853  const NnetComputation &computation,
854  std::vector<std::pair<int32, int32> > *segments) {
855 
856  int32 num_commands = computation.commands.size();
857  segments->clear();
858  int32 cur_start = 0;
859  for (int32 c = 0; c < num_commands; c++) {
860  if (computation.commands[c].command_type == kNoOperationMarker) {
861  segments->push_back(std::pair<int32, int32>(cur_start, c));
862  cur_start = c + 1;
863  }
864  }
865  segments->push_back(std::pair<int32, int32>(cur_start, num_commands));
866 }
867 
868 
869 void ConsolidateIoOperations(const Nnet &nnet,
870  NnetComputation *computation) {
871  // These segments, represented as (start-index, end-index),
872  // are segments of the computation separated by kNoOperationMarker.
873  std::vector<std::pair<int32, int32> > segments;
874  SplitComputationIntoSegments(*computation, &segments);
875 
876  int32 num_commands = computation->commands.size();
877  std::vector<NnetComputation::Command> reordered_commands(num_commands);
878  // put kNoOperationMarker between all segments in the reordered commands.
879  for (size_t s = 0; s + 1 < segments.size(); s++)
880  reordered_commands[segments[s].second].command_type = kNoOperationMarker;
881 
882  // for each segment we'll divide the commands up into those that must appear
883  // at the left of the segment (kAcceptInput for inputs and output-derivs), those
884  // that must appear in the middle (most commands), those that must appear
885  // on the right (kProvideOutput for output nodes and input derivatives).
886  std::vector<int32> left_commands, middle_commands, right_commands;
887 
888  for (size_t s = 0; s < segments.size(); s++) {
889  int32 segment_start = segments[s].first,
890  segment_end = segments[s].second;
891  left_commands.clear();
892  middle_commands.clear();
893  right_commands.clear();
894  for (int32 c = segment_start; c < segment_end; c++) {
895  if (computation->commands[c].command_type == kProvideOutput) {
896  right_commands.push_back(c);
897  } else if (computation->commands[c].command_type == kAcceptInput) {
898  left_commands.push_back(c);
899  } else {
900  middle_commands.push_back(c);
901  }
902  }
903  std::vector<int32>::const_iterator iter = left_commands.begin(),
904  end = left_commands.end();
905  int32 c = segment_start;
906  for (; iter != end; ++iter, ++c)
907  reordered_commands[c] = computation->commands[*iter];
908  iter = middle_commands.begin();
909  end = middle_commands.end();
910  for (; iter != end; ++iter, ++c)
911  reordered_commands[c] = computation->commands[*iter];
912  iter = right_commands.begin();
913  end = right_commands.end();
914  for (; iter != end; ++iter, ++c)
915  reordered_commands[c] = computation->commands[*iter];
916  KALDI_ASSERT(c == segment_end);
917  }
918  computation->commands.swap(reordered_commands);
919 }
920 
921 
922 
923 
924 } // namespace nnet3
925 } // namespace kaldi
void Init(const NnetComputation &computation)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
This class is responsible for merging matrices, although you probably want to access it via the f...
void AppendVariablesForMatrix(int32 matrix_index, std::vector< int32 > *variable_indexes) const
Appends to variables_indexes the sorted list of variables corresponding to a matrix index...
int32 FirstNontrivialAccess(int32 s) const
Returns the first command (read or write) that accesses any part of 's' except for zeroing it (i...
void Read(std::istream &is, bool binary)
const NnetComputation * CompileNoShortcut(const ComputationRequest &request)
bool SplitRowOps(NnetComputation *computation)
This function detects cases where commands of type kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti use indexes that correspond to at most two submatrices, in two distinct ranges without gaps filled by -1's, and could be converted to at most two commands of type kMatrixAdd, kMatrixCopy, kAddRows or kCopyRows.
void OptimizeLoopedComputation(const Nnet &nnet, NnetComputation *computation)
This function tries to optimize computation 'computation' for a 'looped' computation.
const NnetComputation * CompileViaShortcut(const ComputationRequest &request)
void ConsolidateIoOperations(const Nnet &nnet, NnetComputation *computation)
This optimization puts the input operations (kAcceptInput) and output operations (kProvideOutput) at ...
MiscComputationInfo misc_info
misc_info is for extensibility to things that don't easily fit into the framework.
void Write(std::ostream &os, bool binary) const
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55
static void ComputeCommandPairs(const std::pair< std::vector< int32 >, std::vector< int32 > > &lists, std::vector< std::pair< int32, int32 > > *pairs)
void RenumberComputation(NnetComputation *computation)
This function detects submatrices and matrices that are never used (e.g.
int32 GetVerboseLevel()
Get verbosity level, usually set via command line '--verbose=' switch.
Definition: kaldi-error.h:60
bool is_output
true if this matrix is an output of the computation (i.e.
Definition: nnet-analyze.h:270
void Print(std::ostream &os, const Nnet &nnet) const
void VariableMergingOptimization(const NnetOptimizeOptions &config, const Nnet &nnet, NnetComputation *computation)
This wraps class VariableMergingOptimizer in a simplified interface.
bool RequestIsDecomposable(const ComputationRequest &request, ComputationRequest *mini_request, int32 *num_n_values)
This function, used in &#39;shortcut&#39; compilation where we first compile a smaller computation with the s...
void ConvertAdditionToAssignment(const Nnet &nnet, NnetComputation *computation)
This converts addition operations (things with Add in their names) to copy operations (things with Co...
kaldi::int32 int32
std::vector< MatrixInfo > matrices
void NnetComputation(const Nnet &nnet, const CuMatrixBase< BaseFloat > &input, bool pad_input, CuMatrixBase< BaseFloat > *output)
Does the basic neural net computation, on a sequence of data (e.g.
void CopyVectorToSet(const std::vector< A > &v, std::set< A > *s)
Copies the contents of a vector to a set.
Definition: stl-utils.h:172
void ComputeCommandAttributes(const Nnet &nnet, const NnetComputation &computation, const ComputationVariables &vars, std::vector< CommandAttributes > *attributes)
void ExtendMatrices(NnetComputation *computation)
This is not really an optimization in itself but it can make things easier for class VariableMergingO...
std::vector< Command > commands
std::shared_ptr< const NnetComputation > Find(const ComputationRequest &request)
void LimitDerivativeTimes(const Nnet &nnet, int32 min_deriv_time, int32 max_deriv_time, NnetComputation *computation)
void Write(std::ostream &os, bool binary) const
std::vector< CommandAttributes > command_attributes
Definition: nnet-analyze.h:296
This file contains some miscellaneous functions dealing with class Nnet.
void OptimizeMemoryCompression(const Nnet &nnet, int32 memory_compression_level, NnetComputation *computation)
Performs optimization to reduce memory usage where possible, making use of the kCompressMatrix and kD...
std::vector< Access > accesses
Records the indexes of commands that access the matrix, and the type (read, read/write, write).
Definition: nnet-analyze.h:264
std::shared_ptr< const NnetComputation > Insert(const ComputationRequest &request, const NnetComputation *computation)
void MoveSizingCommands(const Nnet &nnet, NnetComputation *computation)
This optimization moves commands that allocate and zero matrices to as late as possible, and moves commands that deallocate matrices to as early as possible.
bool operator==(const NnetOptimizeOptions &other) const
int64 GetMaxMemoryUse(const NnetComputation &computation)
This class relates the matrices and sub-matrices in the computation to imaginary "variables", such that we can think of the operations as operating on sets of individual variables, and we can then do analysis that lets us do optimization.
Definition: nnet-analyze.h:121
void Init(const Nnet &nnet, const NnetComputation &computation)
static void ExpectToken(const std::string &token, const std::string &what_we_are_parsing, const std::string **next_token)
void Read(std::istream &is, bool binary)
void ComputeSimpleNnetContext(const Nnet &nnet, int32 *left_context, int32 *right_context)
ComputeSimpleNnetContext computes the left-context and right-context of a nnet.
Definition: nnet-utils.cc:146
int32 MaxOutputTimeInRequest(const ComputationRequest &request)
void RemoveUnnecessaryAllocation(const Nnet &nnet, NnetComputation *computation)
This optimization detects cases where we deallocate a matrix, and then later allocate another matrix ...
void RemoveUnnecessaryZeroing(const Nnet &nnet, NnetComputation *computation)
This optimization function removes, where possible, commands of type type kSetConst.
void ComputeVariableAccesses(const ComputationVariables &variables, const std::vector< CommandAttributes > &command_attributes, std::vector< std::vector< Access > > *variable_accesses)
After the command-level attributes have been computed, this function organizes them per variable (see...
std::vector< SubMatrixInfo > submatrices
void Check(const Nnet &nnet) const
bool ReplaceRowWithMatrixOps(NnetComputation *computation)
This function detects cases where commands of type kCopyRows, kAddRows or kAddToRows can be converted...
#define KALDI_ERR
Definition: kaldi-error.h:147
CachingOptimizingCompiler(const Nnet &nnet, const CachingOptimizingCompilerOptions config=CachingOptimizingCompilerOptions())
#define KALDI_PARANOID_ASSERT(cond)
Definition: kaldi-error.h:206
std::vector< std::vector< Access > > variable_accesses
Definition: nnet-analyze.h:297
int32 deallocate_command
Index of the command that deallocates the matrix (which will be of type kDeallocMatrix or kSwapMatrix...
Definition: nnet-analyze.h:257
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134
void GetSimpleNnetContext(int32 *nnet_left_context, int32 *nnet_right_context)
void FixGotoLabel(NnetComputation *computation)
This function ensures that the arg1 of a final command of type kGotoLabel is the same as the command ...
int PeekToken(std::istream &is, bool binary)
PeekToken will return the first character of the next token, or -1 if end of file.
Definition: io-funcs.cc:170
void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, int32 max_output_time_in_request, NnetComputation *computation)
This is the top-level function for optimizing a computation.
std::shared_ptr< const NnetComputation > Compile(const ComputationRequest &request)
Does the compilation and returns a const pointer to the result, which is owned by this class...
void ExpandComputation(const Nnet &nnet, const MiscComputationInfo &misc_info, const NnetComputation &computation, bool need_debug_info, int32 num_n_values, NnetComputation *expanded_computation)
This function is used in &#39;shortcut&#39; compilation to expand a computation that has been compiled for ex...
void ComputeMatrixAccesses(const Nnet &nnet, const NnetComputation &computation, const ComputationVariables &variables, const std::vector< CommandAttributes > &command_attributes, std::vector< MatrixAccesses > *matrix_accesses)
This function organizes information in the CommandAttributes in a way that is convenient to access pe...
void ReadCache(std::istream &is, bool binary)
void ConsolidateModelUpdate(const Nnet &nnet, NnetComputation *computation)
This optimization consolidates the model-update part of backprop commands, for components in (e...
std::vector< MatrixAccesses > matrix_accesses
Definition: nnet-analyze.h:298
void WriteCache(std::ostream &os, bool binary)
void CheckComputation(const Nnet &nnet, const NnetComputation &computation, bool check_rewrite)
This is a convenience interface for class ComputationChecker.
void CreateComputation(const CompilerOptions &opts, NnetComputation *computation)
Definition: nnet-compile.cc:50
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
std::vector< IoSpecification > outputs
void RemoveNoOps(NnetComputation *computation)
Removes commands of type kNoOperation in the computation.
This class creates an initial version of the NnetComputation, without any optimization or sharing of ...
Definition: nnet-compile.h:44
int32 allocate_command
Index of the command that allocates the matrix (which will be of type kAllocMatrix or kSwapMatrix)...
Definition: nnet-analyze.h:253
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:34
ComputationVariables variables
Definition: nnet-analyze.h:295
bool SnipRowOps(NnetComputation *computation)
This function detects cases where commands of type kCopyRows, kAddRows, kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti or kAddRowRanges use indexes that start or end with -1's or equivalents, and replace them with similar commands that act on a sub-matrix of the matrices they are currently acting on.
void Print(std::ostream &os) const
This function is for printing info about the computation request in a human-readable way...
#define KALDI_LOG
Definition: kaldi-error.h:153
double Elapsed() const
Returns time in seconds.
Definition: timer.h:74
This struct exists to set up various pieces of analysis; it helps avoid the repetition of code where ...
Definition: nnet-analyze.h:294
CachingOptimizingCompilerOptions config_
std::shared_ptr< const NnetComputation > CompileInternal(const ComputationRequest &request)
static void SplitComputationIntoSegments(const NnetComputation &computation, std::vector< std::pair< int32, int32 > > *segments)
Split the computation up into segments bounded by kNoOperationMarker.
A hashing function-object for pairs of ints.
Definition: stl-utils.h:235
This class performs various kinds of specific analysis on top of what class Analyzer gives you immedi...
Definition: nnet-analyze.h:308