nnet-optimize-utils.cc
Go to the documentation of this file.
1 // nnet3/nnet-optimize-utils.cc
2 
3 // Copyright 2015 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABILITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #include <map>
22 #include "nnet3/nnet-optimize.h"
23 
24 namespace kaldi {
25 namespace nnet3 {
26 
27 
29  std::vector<int32*> *submatrix_args) {
30  submatrix_args->clear();
31  switch (c->command_type) {
32  case kAllocMatrix:
33  case kDeallocMatrix:
34  case kSetConst:
35  submatrix_args->push_back(&c->arg1);
36  break;
37  case kSwapMatrix:
38  submatrix_args->push_back(&c->arg1);
39  submatrix_args->push_back(&c->arg2);
40  break;
41  case kPropagate:
42  submatrix_args->push_back(&c->arg3);
43  submatrix_args->push_back(&c->arg4);
44  break;
45  case kBackprop:
47  submatrix_args->push_back(&c->arg3);
48  submatrix_args->push_back(&c->arg4);
49  submatrix_args->push_back(&c->arg5);
50  submatrix_args->push_back(&c->arg6);
51  break;
52  case kMatrixCopy:
53  case kMatrixAdd:
54  case kAddRows:
55  case kCopyRows:
56  case kAddRowRanges:
57  submatrix_args->push_back(&c->arg1);
58  submatrix_args->push_back(&c->arg2);
59  break;
60  case kAddRowsMulti:
61  case kCopyRowsMulti:
62  case kAddToRowsMulti:
63  case kCopyToRowsMulti:
64  submatrix_args->push_back(&c->arg1);
65  break;
66  case kAcceptInput: case kProvideOutput:
67  submatrix_args->push_back(&c->arg1);
68  break;
69  case kNoOperation:
71  case kNoOperationMarker:
72  case kNoOperationLabel:
73  case kGotoLabel:
74  break;
75  default:
76  KALDI_ERR << "Unknown command type.";
77  }
78 }
79 
80 void IdentifySubmatrixArgs(std::vector<NnetComputation::Command> *commands,
81  std::vector<int32*> *submatrix_args) {
82  submatrix_args->clear();
83  std::vector<NnetComputation::Command>::iterator iter = commands->begin(),
84  end = commands->end();
85  std::vector<int32*> this_submatrix_args;
86  for (; iter != end; ++iter) {
87  IdentifySubmatrixArgs(&(*iter), &this_submatrix_args);
88  submatrix_args->insert(submatrix_args->end(),
89  this_submatrix_args.begin(),
90  this_submatrix_args.end());
91  }
92 }
93 
94 
95 
97  std::vector<int32*> *matrix_args) {
98  int32 num_submatrices = computation->submatrices.size();
99  matrix_args->reserve(computation->submatrices.size());
100  for (int32 s = 1; s < num_submatrices; s++)
101  matrix_args->push_back(&(computation->submatrices[s].matrix_index));
102 }
103 
104 
105 void IdentifyIndexesMultiArgs(std::vector<NnetComputation::Command> *commands,
106  std::vector<int32*> *indexes_multi_args) {
107  indexes_multi_args->clear();
108  std::vector<NnetComputation::Command>::iterator iter = commands->begin(),
109  end = commands->end();
110  for (; iter != end; ++iter) {
111  NnetComputation::Command &command = *iter;
112  if (command.command_type == kAddRowsMulti ||
113  command.command_type == kAddToRowsMulti ||
114  command.command_type == kCopyRowsMulti ||
115  command.command_type == kCopyToRowsMulti)
116  indexes_multi_args->push_back(&(command.arg2));
117  }
118 }
119 
120 
121 void IdentifyIndexesRangesArgs(std::vector<NnetComputation::Command> *commands,
122  std::vector<int32*> *indexes_ranges_args) {
123  indexes_ranges_args->clear();
124  std::vector<NnetComputation::Command>::iterator iter = commands->begin(),
125  end = commands->end();
126  for (; iter != end; ++iter) {
127  NnetComputation::Command &command = *iter;
128  if (command.command_type == kAddRowRanges)
129  indexes_ranges_args->push_back(&(command.arg3));
130  }
131 }
132 
133 void IdentifyIndexesArgs(std::vector<NnetComputation::Command> *commands,
134  std::vector<int32*> *indexes_args) {
135  indexes_args->clear();
136  std::vector<NnetComputation::Command>::iterator iter = commands->begin(),
137  end = commands->end();
138  for (; iter != end; ++iter) {
139  NnetComputation::Command &command = *iter;
140  if (command.command_type == kCopyRows ||
141  command.command_type == kAddRows)
142  indexes_args->push_back(&(command.arg3));
143  }
144 }
145 
146 // We declare this class in the .cc file, we don't need to export it.
147 // It's used inside RenumberComputation.
149  public:
151  computation_(computation) { }
152 
153  void Renumber();
154  private:
155  // this function removes unused vectors within the indexes_multi_ array, i.e.
156  // ones that are not referenced in the computation.
158  // this function computes the submatrix_is_used_ vector, saying whether each
159  // of the original submatrices is referenced somewhere.
160  void ComputeSubmatrixIsUsed();
161  // this function computes the matrix_is_used_ vector (from the
162  // submatrix_is_used_ vector, from computation_->input_output_info, and from
163  // computation_->commands, saying whether each of the original matrices is
164  // referenced somewhere, directly or indirectly.
165  void ComputeMatrixIsUsed();
166  // This function sets up mappings from old to new matrix and submatrix indexes,
167  // writing to num_{,sub}matrices_new_ and old_to_new_{,sub}matrix_.
168  void SetUpMappings();
169  // This function renumbers submatrix indexes appearing within commands and
170  // indexes_multi_, and then removes unused submatrices from the list of
171  // submatrices while leaving the matrix-indexes at their old values (they will
172  // be mapped by RenumberMatrices()).
173  void RenumberSubmatrices();
174  // renumber matrix indexes appearing within 'commmands', within 'submatrices'
175  // and 'input_output_info'; renumber 'matrices' and if applicable
176  // 'debug_info'.
177  void RenumberMatrices();
178  // removes duplicates within the indexes_multi array itself.
180  // removes unused elements and duplicates within 'computation->indexes'
181  void RenumberIndexes();
182  // removes unused elements and duplicates within 'computation->indexes_ranges'
183  void RenumberIndexesRanges();
184  // renumbers memos, removing any gaps between memo indexes.
185  void RenumberMemos();
186 
189  size_t operator () (const NnetComputation::SubMatrixInfo &submat) const noexcept {
190  // these numbers are arbitrarily chosen primes.
191  return submat.matrix_index +
192  19553 * submat.row_offset +
193  29297 * submat.num_rows +
194  42209 * submat.col_offset +
195  56527 * submat.num_cols;
196  }
197  };
198 
199 
200  // Here, T will be int32 or std::pair<int32,int32>
201  template <class T>
202  struct PointerCompare {
203  // This provides an operator < on two vectors of ints or pairs of ints. It
204  // is designed to provide a total order on the vectors while accessing as
205  // small a portion of the vectors' data as possible. It's used in removing
206  // duplicates from computation_->indexes_multi and computation_->indexes.
207  // First it compares the length, then it does lexicographical compare.
208  bool operator ()(const std::vector<T> *ptr1,
209  const std::vector<T> *ptr2) const {
210  size_t size1 = ptr1->size(), size2 = ptr2->size();
211  if (size1 < size2) return true;
212  else if (size1 > size2) return false;
213  else return (*ptr1 < *ptr2); // use the std::vector operator <, which is
214  // lexicographical comparison.
215  }
216  };
217 
221  static void CreateRenumbering(int32 old_num_elements,
222  const std::vector<int32> &to_remove,
223  std::vector<int32> *renumbering);
224 
229  static int32 CreateRenumbering(const std::vector<bool> &used,
230  std::vector<int32> *renumbering);
231 
232  // vector of bool indexed by original submatrix-index, that is true if a
233  // submatrix-index is used somewhere in the computation (always true for
234  // the zeroth element).
235  std::vector<bool> submatrix_is_used_;
236  // vector of bool indexed by original submatrix-index, that is true if a
237  // submatrix-index will be kept; this is like submatrix_is_used_; but for
238  // duplicate submatrices, all but the first duplicate will be marked false).
239  std::vector<bool> submatrix_is_kept_;
240  // vector of bool indexed by original-matrix-index > 0, that is true if a
241  // matrix-index is used somewhere in the computation, directly or indirectly.
242  // always true for the zeroth element.
243  std::vector<bool> matrix_is_used_;
247  std::vector<int32> old_to_new_matrix_; // numbered by orig-matrix-index, gives
248  // new-matrix-index. -1 for removed
249  // ones.
250  std::vector<int32> old_to_new_submatrix_; // numbered by orig-submatrix-index,
251  // gives new-submatrix-index. -1
252  // for removed ones.
253 };
254 
255 // static
257  const std::vector<bool> &used,
258  std::vector<int32> *renumbering) {
259  renumbering->clear();
260  renumbering->reserve(used.size());
261  std::vector<bool>::const_iterator iter = used.begin(), end = used.end();
262  int32 cur_index = 0;
263  for (; iter != end; ++iter) {
264  if (*iter) renumbering->push_back(cur_index++);
265  else renumbering->push_back(-1);
266  }
267  return cur_index;
268 }
269 
270 // static
272  int32 old_num_elements,
273  const std::vector<int32> &to_remove,
274  std::vector<int32> *renumbering) {
275  KALDI_ASSERT(IsSortedAndUniq(to_remove) && old_num_elements > 0);
276  renumbering->clear();
277  renumbering->resize(old_num_elements, 0);
278  int32 num_remove = to_remove.size();
279  for (int32 r = 0; r < num_remove; r++) {
280  int32 this_remove = to_remove[r];
281  // the "> 0" would be ">= 0" in a more generic context, but zero is
282  // not valid in this particular application.
283  KALDI_ASSERT(this_remove > 0 && this_remove < old_num_elements);
284  (*renumbering)[this_remove] = -1;
285  }
286  int32 cur_number = 0;
287  for (int32 i = 0; i < old_num_elements; i++) {
288  if ((*renumbering)[i] != -1)
289  (*renumbering)[i] = cur_number++;
290  }
291  KALDI_ASSERT(cur_number == old_num_elements -
292  static_cast<int32>(to_remove.size()));
293 }
294 
295 
297  // this is indexed by memo-index, and maps to the
298  // (propagate, backprop) commands that use that memo-index, or
299  // (-1, -1) if there are no such commands.
300  std::vector<std::pair<int32, int32> > memo_to_commands;
301  std::vector<int32> memo_indexes_used;
302  std::pair<int32, int32> blank(-1, -1);
303  int32 num_commands = computation_->commands.size();
304  for (int32 c = 0; c < num_commands; c++) {
306  if (command.command_type == kPropagate) {
307  int32 memo_index = command.arg5;
308  if (memo_index > 0) {
309  if (memo_to_commands.size() <= static_cast<size_t>(memo_index))
310  memo_to_commands.resize(memo_index + 1, blank);
311  KALDI_ASSERT(memo_to_commands[memo_index].first == -1);
312  memo_to_commands[memo_index].first = c;
313  memo_indexes_used.push_back(memo_index);
314  }
315  } else if (command.command_type == kBackprop) {
316  int32 memo_index = command.arg7;
317  if (memo_index > 0) {
318  if (memo_to_commands.size() <= static_cast<size_t>(memo_index))
319  memo_to_commands.resize(memo_index + 1, blank);
320  KALDI_ASSERT(memo_to_commands[memo_index].first > 0 &&
321  memo_to_commands[memo_index].second == -1);
322  memo_to_commands[memo_index].second = c;
323  }
324  }
325  }
326  int32 new_memo_index = 1;
327  for (std::vector<int32>::iterator iter = memo_indexes_used.begin();
328  iter != memo_indexes_used.end(); ++iter) {
329  int32 memo_index = *iter;
330  int32 propagate_command = memo_to_commands[memo_index].first,
331  backprop_command = memo_to_commands[memo_index].second;
332  KALDI_ASSERT(backprop_command > 0 &&
333  "Propagate generates memo but backprop doesn't use it.");
334  computation_->commands[propagate_command].arg5 = new_memo_index;
335  computation_->commands[backprop_command].arg7 = new_memo_index;
336  new_memo_index++;
337  }
338 }
339 
341  std::vector<int32*> *submatrix_args) {
342  IdentifySubmatrixArgs(&(computation->commands), submatrix_args);
343 
344  size_t extra_size = 0;
345  for (size_t i = 0; i < computation->indexes_multi.size(); i++)
346  extra_size += computation->indexes_multi[i].size();
347  submatrix_args->reserve(submatrix_args->size() + extra_size);
348 
349  for (size_t i = 0; i < computation->indexes_multi.size(); i++) {
350  std::vector<std::pair<int32, int32> > &indexes_multi =
351  computation->indexes_multi[i];
352  std::vector<std::pair<int32, int32> >::iterator
353  iter = indexes_multi.begin(), end = indexes_multi.end();
354  for (; iter != end; ++iter)
355  if (iter->first != -1)
356  submatrix_args->push_back(&(iter->first));
357  }
358 }
359 
360 
362  int32 num_submatrices = computation_->submatrices.size();
363  submatrix_is_used_.clear();
364  submatrix_is_used_.resize(num_submatrices, false);
365  submatrix_is_used_[0] = true;
366  // the zeroth element of the array is 'special', it refers to the
367  // zero submatrix, and we don't want to renumber it.
368  std::vector<int32*> submatrix_args;
370  std::vector<int32*>::iterator iter = submatrix_args.begin(),
371  end = submatrix_args.end();
372  int32 cur_submatrix_index = -1; // an optimization to avoid too many
373  // indexings of the bool vector
374  // submatrix_is_used_.
375  for (; iter != end; ++iter) {
376  int32 submatrix_index = **iter;
377  if (submatrix_index > 0 && submatrix_index != cur_submatrix_index) {
378  cur_submatrix_index = submatrix_index;
379  KALDI_ASSERT(submatrix_index < num_submatrices);
380  submatrix_is_used_[submatrix_index] = true;
381  }
382  }
383 }
384 
386  matrix_is_used_.clear();
387  matrix_is_used_.resize(computation_->matrices.size(), false);
388  matrix_is_used_[0] = true;
389  // We also need to take into account when matrices are used indirectly via
390  // submatrices (which is actually the main way they are accessed).
391  int32 num_submatrices = computation_->submatrices.size();
392  for (int32 s = 1; s < num_submatrices; s++) {
393  int32 matrix_index = computation_->submatrices[s].matrix_index;
394  if (submatrix_is_used_[s])
395  matrix_is_used_[matrix_index] = true;
396  }
397 }
398 
399 
400 
403 
404  unordered_map<NnetComputation::SubMatrixInfo, int32,
405  SubMatrixHasher> submat_map;
406  int32 cur_index = 1, num_submatrices_orig =
407  computation_->submatrices.size();
408  // the old_to_new_submatrix_ map will remove duplicates.
409  // -1's will appear wherever a particular submatrix was never used.
411  old_to_new_submatrix_.resize(num_submatrices_orig, -1);
412  old_to_new_submatrix_[0] = 0;
413  for (int32 s = 1; s < num_submatrices_orig; s++) {
414  if (submatrix_is_used_[s]) {
415  const NnetComputation::SubMatrixInfo &info =
417  if (submat_map.count(info) > 0) { // a duplicate...
418  old_to_new_submatrix_[s] = submat_map[info];
419  submatrix_is_kept_[s] = false;
420  } else {
421  old_to_new_submatrix_[s] = (submat_map[info] = cur_index++);
422  }
423  }
424  }
425  num_submatrices_new_ = cur_index;
426 }
427 
429  std::vector<int32*> submatrix_args;
431  std::vector<int32*>::iterator iter = submatrix_args.begin(),
432  end = submatrix_args.end();
433  for (; iter != end; ++iter) {
434  if (**iter > 0) {
435  int32 new_submatrix_index = old_to_new_submatrix_[**iter];
436  // old_to_new_submatrix_[s] for s > 0 is only <= 0 (actually, -1) for
437  // submatrices that are never accessed, and these should never appear
438  // in this list.
439  KALDI_ASSERT(new_submatrix_index > 0);
440  **iter = new_submatrix_index;
441  }
442  }
443  std::vector<NnetComputation::SubMatrixInfo> new_submatrices;
444  int32 num_submatrices_old = computation_->submatrices.size();
445  new_submatrices.reserve(num_submatrices_old);
446  for (int32 s = 0; s < num_submatrices_old; s++)
447  if (submatrix_is_kept_[s])
448  new_submatrices.push_back(computation_->submatrices[s]);
449  computation_->submatrices.swap(new_submatrices);
450  // We'll map the matrix indexes inside computation_->submatrices
451  // when we call RenumberMatrices().
452 }
453 
455  std::vector<int32*> matrix_args;
456  int32 num_submatrices = computation_->submatrices.size();
457  for (int32 s = 1; s < num_submatrices; s++) {
458  int32 *matrix_index = &(computation_->submatrices[s].matrix_index);
459  // old_to_new_matrix_[s] for s > 0 is only <= 0 (actually, -1) for
460  // submatrices that are never accessed, and these should never appear
461  // in this list. (presumably because we renumber the submatrices first).
462  int32 new_matrix_index = old_to_new_matrix_[*matrix_index];
463  KALDI_ASSERT(new_matrix_index > 0);
464  *matrix_index = new_matrix_index;
465  }
466 
467  std::vector<NnetComputation::MatrixInfo> new_matrices;
468  int32 num_matrices_old = computation_->matrices.size();
469  new_matrices.reserve(num_matrices_old);
470  for (int32 m = 0; m < num_matrices_old; m++)
471  if (matrix_is_used_[m])
472  new_matrices.push_back(computation_->matrices[m]);
473  computation_->matrices.swap(new_matrices);
474 
475  std::vector<NnetComputation::MatrixDebugInfo> new_debug_info;
476  int32 debug_info_size = computation_->matrix_debug_info.size();
477  KALDI_ASSERT(debug_info_size == 0 || debug_info_size == num_matrices_old);
478  new_debug_info.reserve(debug_info_size);
479  for (int32 m = 0; m < debug_info_size; m++) {
480  if (matrix_is_used_[m]) {
481  new_debug_info.push_back(NnetComputation::MatrixDebugInfo());
482  new_debug_info.back().Swap(&(computation_->matrix_debug_info[m]));
483  }
484  }
485  computation_->matrix_debug_info.swap(new_debug_info);
486 }
487 
488 
493  SetUpMappings();
497  RenumberIndexes();
499  RenumberMemos();
500 }
501 
503  int32 num_indexes_multi = computation_->indexes_multi.size();
504  if (num_indexes_multi == 0)
505  return; // Nothing to do. An optimization.
506  std::vector<bool> indexes_multi_used(num_indexes_multi, false);
507  std::vector<int32*> indexes_multi_args;
508  IdentifyIndexesMultiArgs(&(computation_->commands), &indexes_multi_args);
509  std::vector<int32*>::iterator iter = indexes_multi_args.begin(),
510  end = indexes_multi_args.end();
511  for (; iter != end; ++iter) {
512  int32 indexes_multi_index = **iter;
513  KALDI_ASSERT(indexes_multi_index >= 0 &&
514  indexes_multi_index < num_indexes_multi);
515  indexes_multi_used[indexes_multi_index] = 1;
516  }
517  // old->new mapping for the indexes_multi arrays. will remain -1 for
518  // ones that are unused.
519  std::vector<int32> old_to_new(num_indexes_multi, -1);
520  int32 new_num_indexes_multi = CreateRenumbering(indexes_multi_used,
521  &old_to_new);
522  if (new_num_indexes_multi == num_indexes_multi)
523  return; // Nothing to do. An optimization.
524  std::vector<std::vector<std::pair<int32, int32> > >
525  new_indexes_multi(new_num_indexes_multi);
526  for (int32 i = 0; i < num_indexes_multi; i++) {
527  if (old_to_new[i] != -1)
528  new_indexes_multi[old_to_new[i]].swap(computation_->indexes_multi[i]);
529  }
530  computation_->indexes_multi.swap(new_indexes_multi);
531  // renumber within the commands.
532  for (iter = indexes_multi_args.begin(); iter != end; ++iter)
533  **iter = old_to_new[**iter];
534 }
535 
536 
537 // removes duplicates within the indexes_multi_ array itself.
539  int32 cur_index = 0,
540  old_indexes_multi_size = computation_->indexes_multi.size();
541  if (old_indexes_multi_size == 0)
542  return;
543  // create index mapping from old to new. the use of map is generally not that
544  // efficient, but the idea here is that we can do most of the comparisons just
545  // based on the size of the vectors, and avoid even visiting most of their
546  // contents.
547  std::vector<int32> indexes_multi_old_to_new(old_indexes_multi_size);
548  typedef std::vector<std::pair<int32,int32> > PairVectorType;
549  typedef std::map<const PairVectorType*, int32,
551  MapType indexes_multi_map;
552  for (int32 i = 0; i < computation_->indexes_multi.size(); i++) {
553  std::pair<MapType::iterator, bool> p =
554  indexes_multi_map.insert(std::pair<const PairVectorType*, int32>(
555  &(computation_->indexes_multi[i]), cur_index));
556  if (p.second) { // was inserted-- was not there already.
557  indexes_multi_old_to_new[i] = cur_index++;
558  } else {
559  int32 index_from_map = p.first->second;
560  indexes_multi_old_to_new[i] = index_from_map;
561  }
562  }
563  if (cur_index == old_indexes_multi_size)
564  return; // An optimization. No duplicates were found.
565  std::vector<PairVectorType> new_indexes_multi(cur_index);
566  for (int32 i = 0; i < old_indexes_multi_size; i++) {
567  int32 new_index = indexes_multi_old_to_new[i];
568  computation_->indexes_multi[i].swap(new_indexes_multi[new_index]);
569  }
570  computation_->indexes_multi.swap(new_indexes_multi);
571 
572  std::vector<int32*> indexes_multi_args;
573  IdentifyIndexesMultiArgs(&(computation_->commands), &indexes_multi_args);
574  std::vector<int32*>::const_iterator iter = indexes_multi_args.begin(),
575  end = indexes_multi_args.end();
576  for (; iter != end; ++iter)
577  **iter = indexes_multi_old_to_new[**iter];
578 }
579 
580 
582  int32 old_num_indexes = computation_->indexes.size();
583  if (old_num_indexes == 0)
584  return;
585  std::vector<int32*> indexes_args;
586  IdentifyIndexesArgs(&(computation_->commands), &indexes_args);
587 
588  std::vector<bool> indexes_seen(old_num_indexes, false);
589  std::vector<int32*>::const_iterator iter = indexes_args.begin(),
590  end = indexes_args.end();
591  for (; iter != end; ++iter)
592  indexes_seen[**iter] = true;
593 
594  std::vector<int32> old_to_new_index(old_num_indexes);
595  typedef std::map<const std::vector<int32>*, int32,
596  PointerCompare<int32> > MapType;
597  MapType indexes_map;
598 
599  int32 cur_index = 0;
600  for (int32 i = 0; i < old_num_indexes; i++) {
601  if (!indexes_seen[i]) {
602  old_to_new_index[i] = -1;
603  } else {
604  std::pair<MapType::iterator, bool> p =
605  indexes_map.insert(std::pair<const std::vector<int32>*, int32>(
606  &(computation_->indexes[i]), cur_index));
607  if (p.second) { // was inserted-- was not there already.
608  old_to_new_index[i] = cur_index++;
609  } else {
610  int32 index_from_map = p.first->second;
611  old_to_new_index[i] = index_from_map;
612  }
613  }
614  }
615  if (cur_index == old_num_indexes)
616  return; // An optimization. No changes to the numbering are made.
617  std::vector<std::vector<int32> > new_indexes(cur_index);
618  for (int32 i = 0; i < old_num_indexes; i++) {
619  int32 new_index = old_to_new_index[i];
620  if (new_index != -1)
621  computation_->indexes[i].swap(new_indexes[new_index]);
622  }
623  computation_->indexes.swap(new_indexes);
624 
625  // renumber the indexes inside the commmands.
626  for (iter = indexes_args.begin(); iter != end; ++iter) {
627  int32 old_index = **iter;
628  KALDI_ASSERT(old_index >= 0 && old_index < old_num_indexes);
629  int32 new_index = old_to_new_index[old_index];
630  KALDI_ASSERT(new_index >= 0);
631  **iter = new_index;
632  }
633 }
634 
636  int32 old_num_indexes_ranges = computation_->indexes_ranges.size();
637  if (old_num_indexes_ranges == 0)
638  return;
639  std::vector<int32*> indexes_ranges_args;
640  IdentifyIndexesRangesArgs(&(computation_->commands), &indexes_ranges_args);
641 
642  std::vector<bool> is_seen(old_num_indexes_ranges, false);
643  std::vector<int32*>::const_iterator iter = indexes_ranges_args.begin(),
644  end = indexes_ranges_args.end();
645  for (; iter != end; ++iter)
646  is_seen[**iter] = true;
647 
648  std::vector<int32> old_to_new_index(old_num_indexes_ranges);
649  typedef std::map<const std::vector<std::pair<int32, int32> >*, int32,
651  MapType indexes_map;
652  int32 cur_index = 0;
653  for (int32 i = 0; i < old_num_indexes_ranges; i++) {
654  if (!is_seen[i]) {
655  old_to_new_index[i] = -1;
656  } else {
657  std::pair<MapType::iterator, bool> p =
658  indexes_map.insert(
659  std::pair<const std::vector<std::pair<int32, int32> >*, int32>(
660  &(computation_->indexes_ranges[i]), cur_index));
661  if (p.second) { // was inserted-- was not there already.
662  old_to_new_index[i] = cur_index++;
663  } else {
664  int32 index_from_map = p.first->second;
665  old_to_new_index[i] = index_from_map;
666  }
667  }
668  }
669  if (cur_index == old_num_indexes_ranges)
670  return; // An optimization. No changes to the numbering are made.
671  std::vector<std::vector<std::pair<int32, int32> > > new_indexes_ranges(
672  cur_index);
673  for (int32 i = 0; i < old_num_indexes_ranges; i++) {
674  int32 new_index = old_to_new_index[i];
675  if (new_index != -1)
676  computation_->indexes_ranges[i].swap(new_indexes_ranges[new_index]);
677  }
678  computation_->indexes_ranges.swap(new_indexes_ranges);
679 
680  // renumber the indexes inside the commmands.
681  for (iter = indexes_ranges_args.begin(); iter != end; ++iter) {
682  int32 old_index = **iter;
683  KALDI_ASSERT(old_index >= 0 && old_index < old_num_indexes_ranges);
684  int32 new_index = old_to_new_index[old_index];
685  KALDI_ASSERT(new_index >= 0);
686  **iter = new_index;
687  }
688 }
689 
690 
691 
692 
694  ComputationRenumberer renumberer(computation);
695  renumberer.Renumber();
696 }
697 
698 
699 static bool IsNoop(const NnetComputation::Command &command) {
700  return command.command_type == kNoOperation;
701 }
702 
703 void RemoveNoOps(NnetComputation *computation) {
704  computation->commands.erase(
705  std::remove_if(computation->commands.begin(),
706  computation->commands.end(),
707  IsNoop), computation->commands.end());
708 }
709 
710 
712  const NnetOptimizeOptions &config,
713  const Nnet &nnet,
714  NnetComputation *computation):
715  config_(config), nnet_(nnet),
716  computation_(computation),
717  already_called_merge_variables_(false) {
718  analyzer_.Init(nnet, *computation);
721 }
722 
726  if (!config_.optimize)
727  return false;
728  bool merged = false;
729  int32 num_commands = computation_->commands.size();
730  for (int32 command_index = 0; command_index < num_commands;
731  command_index++) {
732  // This loop looks for pairs of sub-matrix indexes s1,s2 that we could
733  // potentially merge into a single variable.
734  const NnetComputation::Command &c = computation_->commands[command_index];
735  int32 s1 = -1, s2 = -1;
736  if (c.command_type == kMatrixCopy &&
738  s2 = c.arg1; // s2 is the written-to matrix.
739  s1 = c.arg2;
740  } else if (c.command_type == kPropagate &&
742  const Component *component = nnet_.GetComponent(c.arg1);
743  if (component->Properties() & kPropagateInPlace) {
744  s1 = c.arg3;
745  s2 = c.arg4; // s2 is the written-to matrix.
746  }
747  } else if ((c.command_type == kBackprop ||
750  const Component *component = nnet_.GetComponent(c.arg1);
751  if (component->Properties() & kBackpropInPlace) {
752  s1 = c.arg5;
753  s2 = c.arg6; // s2 is the written-to matrix.
754  if (s1 == c.arg3 || s2 == c.arg3 || s1 == c.arg4 || s2 == c.arg4) {
755  // we don't think this should ever happen, but just out of an
756  // abundance of caution: if either of these submatrix indexes are the
757  // input-value or output-value args to Backprop, don't do the optimization.
758  s1 = -1;
759  s2 = -1;
760  }
761  }
762  }
763  if (s1 > 0 && s2 > 0) {
764  std::pair<bool,bool> p = MayBeMerged(command_index, s1, s2);
765  if (p.first) {
766  DoMerge(command_index, s1, s2);
767  merged = true;
768  } else if (p.second) {
769  DoMerge(command_index, s2, s1);
770  merged = true;
771  }
772  }
773  }
774  if (merged) {
777  }
778  return merged;
779 }
780 
790  const NnetComputation &computation, int32 submat_a, int32 submat_b) {
791  KALDI_ASSERT(static_cast<size_t>(submat_a) < computation.submatrices.size());
792  KALDI_ASSERT(static_cast<size_t>(submat_b) < computation.submatrices.size());
793  const NnetComputation::SubMatrixInfo &a = computation.submatrices[submat_a],
794  &b = computation.submatrices[submat_b];
795  const NnetComputation::MatrixInfo &a_mat =
796  computation.matrices[a.matrix_index];
797  KALDI_ASSERT(a_mat.num_rows == b.num_rows && a_mat.num_cols == b.num_cols);
799  ans.matrix_index = b.matrix_index;
800  ans.row_offset = a.row_offset + b.row_offset;
801  ans.num_rows = a.num_rows;
802  ans.col_offset = a.col_offset + b.col_offset;
803  ans.num_cols = a.num_cols;
804  return ans;
805 }
806 
808  std::vector<int32> variable_indexes;
809  analyzer_.variables.AppendVariablesForSubmatrix(s, &variable_indexes);
810  std::vector<int32>::const_iterator iter = variable_indexes.begin(),
811  end = variable_indexes.end();
812  for (; iter != end; ++iter) {
813  int32 v = *iter;
814  KALDI_ASSERT(static_cast<size_t>(v) < variable_dirty_.size());
815  variable_dirty_[v] = true;
816  }
817 }
818 
820  int32 s_to_keep,
821  int32 s_to_discard) {
822  // Prevent further optimizations touching either submatrix (we can try again
823  // in a later round of optimization, with a new instance of this class).
824  MarkAsDirty(s_to_keep);
825  MarkAsDirty(s_to_discard);
826 
827  int32 m_to_keep = computation_->submatrices[s_to_keep].matrix_index,
828  m_to_discard = computation_->submatrices[s_to_discard].matrix_index;
829  KALDI_ASSERT(m_to_keep != m_to_discard && m_to_keep > 0 && m_to_discard > 0);
830 
831  { // modify submatrices of m_to_discard to effectively be sub-matrices of
832  // s_to_keep instead (they will refer to m_to_keep as the matrix_index).
833  std::vector<int32>::const_iterator iter =
834  matrix_to_submatrix_[m_to_discard].begin(),
835  end = matrix_to_submatrix_[m_to_discard].end();
836  for (; iter != end; ++iter) {
837  int32 submatrix_index = *iter;
838  KALDI_ASSERT(computation_->submatrices[submatrix_index].matrix_index
839  == m_to_discard);
840  computation_->submatrices[submatrix_index] =
841  GetSubMatrixOfSubMatrix(*computation_, submatrix_index,
842  s_to_keep);
843  }
844  }
845 
847  NnetComputation::Command &c = computation_->commands[command_index];
848  const std::vector<MatrixAccesses> &matrix_accesses =
850 
851  // - If it was a matrix-copy (assignment) with scale 1.0, replace the
852  // assignment command with a no-op.
853  // If it was matrix-copy with a scale, leave the command there;
854  // it will have the effect of scaling the matrix (it will be
855  // mapped so that arg1 == arg2, but that is OK).
856  if (c.command_type == kMatrixCopy && c.alpha == 1.0) {
857  // remove the command.
859  c.arg1 = -1;
860  c.arg2 = -1;
861  }
862 
863  // We want to ensure that there is only one deallocation command.
864  // If neither matrix is an output, then there will be 2 deallocation
865  // commands and we keep the one for m_to_keep (which, if the sizes
866  // differ, will be the larger of the two, so it's the one whose
867  // submatrix index refers to the entirety of the matrix).
868  // If one of them is an output, then remove the deallocation command
869  // of whichever one is not an output.
870  // As a simplification to the logic above: if the 'discard' matrix
871  // has a deallocation command (i.e. if that matrix was not an output)
872  // then remove it; otherwise remove the deallocation command of
873  // the 'keep' matrix.
874 
875  int32 dealloc_keep = matrix_accesses[m_to_keep].deallocate_command,
876  dealloc_discard = matrix_accesses[m_to_discard].deallocate_command;
877  if (dealloc_discard != -1) {
878  computation_->commands[dealloc_discard].command_type = kNoOperation;
879  } else {
880  KALDI_ASSERT(dealloc_keep != -1);
881  computation_->commands[dealloc_keep].command_type = kNoOperation;
882  }
883 
884  {
885  // - Both m_to_keep and m_to_discard will have commands that allocate
886  // them, as all matrices do (note, kAcceptInput counts as an allocation
887  // command). If one of them is kAcceptInput, then delete the other one,
888  // because the position of the kAcceptInput commands is important.
889  // Otherwise delete the "discard" one. As a simplification of the logic
890  // of the previous sentence: if the "discard" allocate command is
891  // kAcceptInput then delete the "keep" allocate command, else delete
892  // the "discard" allocate command.
893  // Note: after we renumber the submatrices, they both refer to the
894  // same underlying matrix, but we need to refer to them using a
895  // submatrix that refers to the entire matrix. The one we keep will
896  // always refer to the entire matrix. (In the case where one of
897  // them is an input, both submatrices are guaranteed to refer to the
898  // entire matrix, this is guaranteed by the logic we use to decide
899  // which matrices we can merge).
900  int32 alloc_keep = matrix_accesses[m_to_keep].allocate_command,
901  alloc_discard = matrix_accesses[m_to_discard].allocate_command;
902 
903  KALDI_ASSERT(alloc_keep != -1 && alloc_discard != -1);
904  KALDI_ASSERT(analysis.FirstNontrivialMatrixAccess(m_to_discard) >
905  alloc_keep);
906 
908  &keep_alloc_command = computation_->commands[alloc_keep],
909  &discard_alloc_command = computation_->commands[alloc_discard];
910  int32 matrix_whose_zeroing_to_discard;
911  if (discard_alloc_command.command_type == kAcceptInput) {
912  keep_alloc_command.command_type = kNoOperation;
913  matrix_whose_zeroing_to_discard = m_to_keep;
914  } else {
915  discard_alloc_command.command_type = kNoOperation;
916  matrix_whose_zeroing_to_discard = m_to_discard;
917  }
918  // Now remove the command that zeroed one of the matrices
919  // (the one whose allocation command we just discarded).
920  int32 zeroing_command_to_discard =
921  matrix_accesses[matrix_whose_zeroing_to_discard].accesses[0].command_index;
922  NnetComputation::Command &zeroing_command =
923  computation_->commands[zeroing_command_to_discard];
924  if (zeroing_command.command_type == kSetConst &&
925  zeroing_command.alpha == 0.0) {
926  // if 'zeroing_command' actually *was* a zeroing command, then remove it.
927  zeroing_command.command_type = kNoOperation;
928  }
929  }
930 
931  // If the matrix to discard had stride_type == kStrideEqualNumCols, set the
932  // stride type of the matrix we're keeping to kStrideEqualNumCols.
933  if (computation_->matrices[m_to_discard].stride_type == kStrideEqualNumCols) {
934  computation_->matrices[m_to_keep].stride_type = kStrideEqualNumCols;
935  // ... and perform an additional check.
936  KALDI_ASSERT(computation_->matrices[m_to_discard].num_rows ==
937  computation_->matrices[m_to_keep].num_rows &&
938  computation_->matrices[m_to_discard].num_cols ==
939  computation_->matrices[m_to_keep].num_cols);
940  }
941 }
942 
943 
944 
946  int32 command_index, int32 s1, int32 s2) const {
947  KALDI_ASSERT(s1 > 0 && s2 > 0 && static_cast<size_t>(command_index) <
948  computation_->commands.size());
950  return std::pair<bool,bool>(false,false);
951  int32 m1 = computation_->submatrices[s1].matrix_index,
952  m2 = computation_->submatrices[s2].matrix_index;
953  // we can't merge two different submatrices of the same matrix.
954  if (m1 == m2) return std::pair<bool,bool>(false,false);
955  std::vector<int32> variable_indexes;
956  analyzer_.variables.AppendVariablesForSubmatrix(s1, &variable_indexes);
957  analyzer_.variables.AppendVariablesForSubmatrix(s2, &variable_indexes);
958  std::vector<int32>::iterator iter = variable_indexes.begin(),
959  end = variable_indexes.end();
960  // condition c5:
961  for (; iter != end; ++iter)
962  if (variable_dirty_[*iter])
963  return std::pair<bool,bool>(false,false);
964  const std::vector<MatrixAccesses> &matrix_accesses = analyzer_.matrix_accesses;
965  const MatrixAccesses &m1_access = matrix_accesses[m1],
966  &m2_access = matrix_accesses[m2];
967  // condition c1:
968  if ((m1_access.is_input && m2_access.is_input) ||
969  (m1_access.is_output && m2_access.is_output))
970  return std::pair<bool,bool>(false,false);
971  // condition c2:
972  if ((m1_access.is_input || m1_access.is_output ||
973  m2_access.is_input || m2_access.is_output) &&
974  (!computation_->IsWholeMatrix(s1) ||
976  return std::pair<bool,bool>(false,false);
977  bool left = config_.allow_left_merge,
978  right = config_.allow_right_merge;
979  // condition c3:
980  if (!computation_->IsWholeMatrix(s2)) left = false;
981  // condition c4:
982  if (!computation_->IsWholeMatrix(s1)) right = false;
983  // condition c6:
984  if (computation_->matrices[m2].stride_type == kStrideEqualNumCols &&
985  !computation_->IsWholeMatrix(s1)) left = false;
986  // condition c7:
987  if (computation_->matrices[m1].stride_type == kStrideEqualNumCols &&
988  !computation_->IsWholeMatrix(s2)) right = false;
989 
990 
991  if (!left && !right) // save some time.
992  return std::pair<bool,bool>(false,false);
993  bool is_assignment = (computation_->commands[command_index].command_type ==
994  kMatrixCopy &&
995  computation_->commands[command_index].alpha == 1.0);
997  if (is_assignment) {
998  if (analysis.FirstNontrivialAccess(s2) == command_index &&
999  analysis.LastWriteAccess(s1) < command_index &&
1000  analysis.LastAccess(s1) <
1001  analysis.DataInvalidatedCommand(command_index, s2)) {
1002  return std::pair<bool,bool>(left, right); // possible success.
1003  }
1004  } else {
1005  if (analysis.FirstNontrivialAccess(s2) == command_index &&
1006  analysis.LastAccess(s1) == command_index) {
1007  return std::pair<bool,bool>(left, right); // possible success.
1008  }
1009  }
1010  // failure.
1011  return std::pair<bool,bool>(false,false);
1012 }
1013 
1014 
1015 // This class is used inside the function
1016 // `void ExtendMatrices(NnetComputation *computation)`;
1017 // see that function's declaration in nnet-optimize-utils.h for
1018 // a summary of what this class does.
1020  public:
1023 
1024  MatrixExtender(NnetComputation *computation);
1025 
1026  void ExtendMatrices();
1027 
1028  private:
1029  // This function returns true if a copy command from 'src_submatrix'
1030  // to 'dest_submatrix' has the properties we need to be able to
1031  // extend its rows to cover all of the source matrix.
1032  bool CanBeExtended(int32 dest_submatrix_index,
1033  int32 src_submatrix_index);
1034 
1035  // This actually extends the matrices... it's called only if CanBeExtended()
1036  // with the same args returned true. It modifies 'dest_submatrix_index'
1037  // and 'src_submatrix_index'.
1038  void Extend(int32 *dest_submatrix_index, int32 *src_submatrix_index);
1039 
1040  // This function modifies the computation to fix certain problems
1041  // that might have been introduced by Extend()... allocation, deallocation,
1042  void FixComputation();
1043 
1044  // This function modifies the computation to fix the debug info; if needed,
1045  // it's called from FixComputation().
1046  void FixDebugInfo();
1047 
1048  // don't extend a destination matrix if it wasn't already
1049  // at least 'min_proportion' (80%) big enough to store the source.
1051 
1053 
1054  // Indexed by matrix-index m, orig_num_rows_[m] is the value of
1055  // computation_->matrices[m].num_rows when this class was initialized,
1056  // i.e. before we changed anything.
1057  std::vector<int32> orig_num_rows_;
1058 
1059  // Indexed by matrix-index m, this vector contains true if matrix
1060  // m is involved in any AcceptInput() or ProvideOutput() operations.
1061  std::vector<bool> is_input_or_output_;
1062 };
1063 
1064 // note: the initializer for min_proportion_ below needs to be kept in sync with
1065 // the min_proportion variable in
1066 // ComputationChecker::CheckComputationUndefined() in nnet-analyze.cc.
1068  min_proportion_(0.8),
1069  computation_(computation) {
1070  int32 num_matrices = computation_->matrices.size();
1071 
1072  { // set up orig_num_rows_.
1073  orig_num_rows_.resize(num_matrices);
1074  // matrix 0 is not a real matrix so skip that index.
1075  for (int32 m = 1; m < num_matrices; m++)
1076  orig_num_rows_[m] = computation_->matrices[m].num_rows;
1077  }
1078  { // set up is_input_or_output_.
1079  is_input_or_output_.resize(num_matrices, false);
1080  std::vector<NnetComputation::Command>::iterator
1081  command_iter = computation_->commands.begin(),
1082  command_end = computation_->commands.end();
1083  for (; command_iter != command_end; ++command_iter) {
1084  const NnetComputation::Command &command = *command_iter;
1085  // make sure there are no kSwapMatrix commands; they should not be present
1086  // at this stage of optimization.
1088  if (command.command_type == kProvideOutput ||
1089  command.command_type == kAcceptInput) {
1090  int32 s = command.arg1,
1091  m = computation_->submatrices[s].matrix_index;
1092  is_input_or_output_[m] = true;
1093  }
1094  }
1095  }
1096 }
1097 
1098 
1099 bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index,
1100  int32 src_submatrix_index) {
1101  const SubMatrixInfo
1102  &src_submatrix = computation_->submatrices[src_submatrix_index],
1103  &dest_submatrix = computation_->submatrices[dest_submatrix_index];
1104  if (src_submatrix.matrix_index == dest_submatrix.matrix_index)
1105  return false;
1106 
1107  // we can't resize the destination matrix if it's involved in input or output.
1108  if (is_input_or_output_[dest_submatrix.matrix_index])
1109  return false;
1110 
1111  const MatrixInfo
1112  &src_matrix = computation_->matrices[src_submatrix.matrix_index];
1113 
1114  int32 dest_matrix_orig_num_rows = orig_num_rows_[dest_submatrix.matrix_index],
1115  src_matrix_orig_num_rows = orig_num_rows_[src_submatrix.matrix_index];
1116 
1117  if (src_submatrix.num_rows < min_proportion_ * src_matrix_orig_num_rows)
1118  return false;
1119 
1120  // The following checks that the source submatrix covers be all of the
1121  // source matrix except a few final rows, and the destination submatrix goes
1122  // to the final row of its matrix.
1123  return (src_submatrix.col_offset == 0 &&
1124  src_submatrix.num_cols == src_matrix.num_cols &&
1125  src_submatrix.row_offset == 0 &&
1126  src_submatrix.num_rows < src_matrix.num_rows &&
1127  dest_submatrix.row_offset + dest_submatrix.num_rows ==
1128  dest_matrix_orig_num_rows);
1129 }
1130 
1131 
1132 void MatrixExtender::Extend(int32 *dest_submatrix_index,
1133  int32 *src_submatrix_index) {
1134  // copy the SubMatrixInfo to avoid iterator invalidation.
1136  src_submatrix = computation_->submatrices[*src_submatrix_index],
1137  dest_submatrix = computation_->submatrices[*dest_submatrix_index];
1138 
1139  MatrixInfo &src_matrix = computation_->matrices[src_submatrix.matrix_index],
1140  &dest_matrix = computation_->matrices[dest_submatrix.matrix_index];
1141 
1142  int32 new_dest_num_rows = dest_submatrix.row_offset + src_matrix.num_rows;
1143 
1144  // extend the destination matrix so it has enough rows to fit the entire
1145  // source matrix. Note: doing this will break certain invariances in the
1146  // computation, principally with allocation and deallocation commands, which
1147  // we'll later fix up by calling FixComputation().
1148  if (new_dest_num_rows > dest_matrix.num_rows) {
1149  dest_matrix.num_rows = new_dest_num_rows;
1150  // make sure there's a submatrix index covering the whole of the dest matrix.
1151  computation_->submatrices.push_back(
1152  SubMatrixInfo(dest_submatrix.matrix_index, 0, new_dest_num_rows,
1153  0, dest_matrix.num_cols));
1154  }
1155 
1156  // The following 3 statements create a new submatrix that will be
1157  // the destination submatrix; it's the same as the original destination
1158  // submatrix, but with a few extra rows.
1159  *dest_submatrix_index = computation_->submatrices.size();
1160  dest_submatrix.num_rows = src_matrix.num_rows;
1161  computation_->submatrices.push_back(
1162  SubMatrixInfo(dest_submatrix));
1163 
1164  // The following 3 statements create a new submatrix that will be
1165  // the source submatrix; it's the same as the original source
1166  // submatrix, but with a few extra rows, and actually will cover
1167  // the entire source matrix.
1168  *src_submatrix_index = computation_->submatrices.size();
1169  computation_->submatrices.push_back(
1170  SubMatrixInfo(src_submatrix.matrix_index, 0, src_matrix.num_rows,
1171  0, src_matrix.num_cols));
1172 }
1173 
1175  std::vector<NnetComputation::Command>::iterator
1176  command_iter = computation_->commands.begin(),
1177  command_end = computation_->commands.end();
1178  bool changed = false;
1179  for (; command_iter != command_end; ++command_iter) {
1180  NnetComputation::Command &command = *command_iter;
1181  if (command.command_type == kMatrixCopy &&
1182  command.alpha == 1.0) {
1183  int32 dest_submatrix_index = command.arg1,
1184  src_submatrix_index = command.arg2;
1185  if (CanBeExtended(dest_submatrix_index, src_submatrix_index)) {
1186  Extend(&command.arg1, &command.arg2);
1187  changed = true;
1188  }
1189  }
1190  }
1191  if (changed)
1192  FixComputation();
1193 }
1194 
1196  // make sure that allocation and deallocation commands
1197  // operate on whole matrix.
1198  std::vector<NnetComputation::Command>::iterator
1199  command_iter = computation_->commands.begin(),
1200  command_end = computation_->commands.end();
1201  std::vector<int32> whole_submatrices;
1202  computation_->GetWholeSubmatrices(&whole_submatrices);
1203  for (; command_iter != command_end; ++command_iter) {
1204  NnetComputation::Command &command = *command_iter;
1205  if (command.command_type == kAllocMatrix ||
1206  command.command_type == kDeallocMatrix) {
1207  int32 s = command.arg1,
1208  m = computation_->submatrices[s].matrix_index,
1209  new_s = whole_submatrices[m];
1210  if (new_s != s) {
1211  KALDI_ASSERT(
1213  orig_num_rows_[m] != computation_->matrices[m].num_rows);
1214  command.arg1 = new_s;
1215  }
1216  }
1217  if (command.command_type == kSetConst && command.alpha == 0.0) {
1218  int32 s = command.arg1,
1219  m = computation_->submatrices[s].matrix_index,
1220  new_s = whole_submatrices[m];
1221  if (new_s != s) {
1222  {
1224  command.arg1];
1226  info.matrix_index];
1227  // If this command wasn't zeroing the the entirety of a matrix,
1228  // (before we extended the matrix), we don't need to extend it.
1229  if (!(info.row_offset == 0 && info.col_offset == 0 &&
1230  info.num_cols == mat_info.num_cols &&
1231  info.num_rows == orig_num_rows_[info.matrix_index]))
1232  continue;
1233  // I know doing this via 'continue' is odd, but it's done this way to
1234  // avoid invalid iterators still being in scope; I think some runtimes
1235  // check for it.
1236  }
1237  command.arg1 = new_s;
1238  }
1239  }
1240  }
1241  if (!computation_->matrix_debug_info.empty())
1242  FixDebugInfo();
1244 }
1245 
1247  int32 num_matrices = computation_->matrices.size();
1248  // matrix zero is not a 'real' matrix.
1249  for (int32 m = 1; m < num_matrices; m++) {
1250  NnetComputation::MatrixDebugInfo &debug_info =
1252  int32 new_num_rows = computation_->matrices[m].num_rows,
1253  old_num_rows = debug_info.cindexes.size();
1254  if (new_num_rows != old_num_rows) {
1255  debug_info.cindexes.resize(new_num_rows);
1256  int32 num_extra_rows = new_num_rows - old_num_rows;
1257  // the following should be true because min_proportion_ > 0.5.
1258  KALDI_ASSERT(num_extra_rows <= old_num_rows);
1259  for (int32 r = old_num_rows; r < new_num_rows; r++) {
1260  Cindex cindex = debug_info.cindexes[r - num_extra_rows];
1261  // set the 't' value to kNoTime which indicates that it's not a 'real'
1262  // time step, and may avoid errors in checking code.
1263  cindex.second.t = kNoTime;
1264  debug_info.cindexes[r] = cindex;
1265  }
1266  }
1267  }
1268 }
1269 
1270 void ExtendMatrices(NnetComputation *computation) {
1271  MatrixExtender ext(computation);
1272  ext.ExtendMatrices();
1273 }
1274 
1275 
1276 
1283  public:
1284  ModelUpdateConsolidator(const Nnet &nnet,
1285  NnetComputation *computation);
1286  void ConsolidateModelUpdate();
1287  private:
1288  void ConsolidateUpdateForComponent(
1289  int32 component,
1290  const std::vector<int32> &backprop_commands);
1291 
1296  void AddCommandsToComputation();
1297 
1313  int32 ConsolidateSubmatrices(
1314  const std::vector<int32> &commands,
1315  const std::vector<int32> &submatrices);
1316 
1324  void AppendDebugInfoForSubmatrix(
1325  int32 submatrix_index,
1326  NnetComputation::MatrixDebugInfo *debug_info) const;
1327 
1328  const Nnet &nnet_;
1330 
1331  // Indexed by the original command index in *computation_ (and sized to the
1332  // original number of commands in *computation_ before we added anything),
1333  // extra_commands_[c] contains a list of commands that need to be inserted
1334  // just before command c in the previously existing computation.
1335  std::vector<std::vector<NnetComputation::Command> > extra_commands_;
1336 
1337  // This is as list of kBackprop commands that will be placed after the
1338  // commands in 'computation_->commands' and 'extra_commands_', but before
1339  // the 'final_deallocate_commands_'.
1340  std::vector<NnetComputation::Command> final_commands_;
1341  // This is a list of commands to deallocate our 'consolidated' matrices; the
1342  // commands will be placed after the commands in 'final_commands_'.
1343  std::vector<NnetComputation::Command> final_deallocate_commands_;
1344 };
1345 
1346 
1348  int32 submatrix_index,
1349  NnetComputation::MatrixDebugInfo *debug_info) const {
1351  KALDI_ASSERT(static_cast<size_t>(submatrix_index) <
1352  computation_->submatrices.size());
1353  NnetComputation::SubMatrixInfo submatrix_info =
1354  computation_->submatrices[submatrix_index];
1355  int32 matrix_index = submatrix_info.matrix_index;
1356  KALDI_ASSERT(matrix_index > 0 && static_cast<size_t>(matrix_index) <
1358  const NnetComputation::MatrixDebugInfo &src_info =
1359  computation_->matrix_debug_info[matrix_index];
1360  debug_info->is_deriv = src_info.is_deriv;
1361  KALDI_ASSERT(src_info.cindexes.size() ==
1362  computation_->matrices[matrix_index].num_rows);
1363  int32 row_begin = submatrix_info.row_offset,
1364  row_end = row_begin + submatrix_info.num_rows;
1365  debug_info->cindexes.insert(debug_info->cindexes.end(),
1366  src_info.cindexes.begin() + row_begin,
1367  src_info.cindexes.begin() + row_end);
1368 }
1369 
1370 // see comment by declaration in header.
1372  const std::vector<int32> &commands,
1373  const std::vector<int32> &submatrices) {
1374  int32 num_submatrices = submatrices.size();
1375  KALDI_ASSERT(num_submatrices > 1 && commands.size() == submatrices.size());
1376  int32 first_submatrix = submatrices[0];
1377  int32 num_cols = computation_->submatrices[first_submatrix].num_cols,
1378  num_rows = 0;
1379  MatrixStrideType stride_type = kDefaultStride;
1381  for (int32 i = 0; i < num_submatrices; i++) {
1382  int32 submatrix = submatrices[i];
1383  num_rows += computation_->submatrices[submatrix].num_rows;
1384  KALDI_ASSERT(computation_->submatrices[submatrix].num_cols == num_cols);
1385  if (!computation_->matrix_debug_info.empty())
1386  AppendDebugInfoForSubmatrix(submatrix, &debug_info);
1387  if (computation_->IsWholeMatrix(submatrix)) {
1388  int32 matrix = computation_->submatrices[submatrix].matrix_index;
1389  if (computation_->matrices[matrix].stride_type == kStrideEqualNumCols)
1390  stride_type = kStrideEqualNumCols;
1391  }
1392  }
1393  // new_whole_submatrix is a new submatrix index corresponding to the whole
1394  // of a new matrix that we are creating.
1395  int32 new_whole_submatrix = computation_->NewMatrix(num_rows, num_cols,
1396  stride_type);
1397  // Add commands at the very start, to initialize and then zero this new
1398  // matrix. we can later on remove the zeroing if it is not necessary.
1399  extra_commands_[0].push_back(
1400  NnetComputation::Command(kAllocMatrix, new_whole_submatrix));
1401  extra_commands_[0].push_back(
1402  NnetComputation::Command(0.0, kSetConst, new_whole_submatrix));
1403 
1404  final_deallocate_commands_.push_back(
1405  NnetComputation::Command(kDeallocMatrix, new_whole_submatrix));
1406  int32 new_matrix_index =
1407  computation_->submatrices[new_whole_submatrix].matrix_index;
1408  if (!computation_->matrix_debug_info.empty())
1409  computation_->matrix_debug_info[new_matrix_index].Swap(&debug_info);
1410 
1411  int32 row_offset = 0;
1412  for (int32 i = 0; i < num_submatrices; i++) {
1413  int32 submatrix_index = submatrices[i];
1414  int32 this_num_rows = computation_->submatrices[submatrix_index].num_rows;
1415  // submatrix corresponding to the part of the new matrix corresponding
1416  // to 'submatrices[i]'.
1417  int32 new_submatrix = computation_->NewSubMatrix(new_whole_submatrix,
1418  row_offset, this_num_rows,
1419  0, num_cols);
1420  // Just before command 'commands[i]', add a command that assigns to the
1421  // submatrix numbered 'new_submatrix' the contents of the submatrix numbered
1422  // 'submatrices[i]'. Note: we hope that a later pass of optimization
1423  // (VariableMergingOptimization) will remove this redundant copy by
1424  // having the operation that created it write directly to the location
1425  // we want it to be.
1426  NnetComputation::Command c(kMatrixCopy, new_submatrix, submatrices[i]);
1427  extra_commands_[commands[i]].push_back(c);
1428  row_offset += this_num_rows;
1429  }
1430  KALDI_ASSERT(row_offset == num_rows);
1431  return new_whole_submatrix;
1432 }
1433 
1435  KALDI_ASSERT(computation_->commands.size() == extra_commands_.size());
1436  int32 old_num_commands = computation_->commands.size(),
1437  new_num_commands = old_num_commands +
1438  static_cast<int32>(final_commands_.size() +
1439  final_deallocate_commands_.size());
1440  for (size_t i = 0; i < extra_commands_.size(); i++)
1441  new_num_commands += static_cast<int32>(extra_commands_[i].size());
1442  std::vector<NnetComputation::Command> new_commands;
1443  new_commands.reserve(new_num_commands);
1444  for (int32 c = 0; c < old_num_commands; c++) {
1445  new_commands.insert(new_commands.end(),
1446  extra_commands_[c].begin(), extra_commands_[c].end());
1447  new_commands.push_back(computation_->commands[c]);
1448  }
1449  new_commands.insert(new_commands.end(),
1450  final_commands_.begin(), final_commands_.end());
1451  new_commands.insert(new_commands.end(),
1452  final_deallocate_commands_.begin(),
1453  final_deallocate_commands_.end());
1454  computation_->commands.swap(new_commands);
1455 }
1456 
1461  int32 component_index,
1462  const std::vector<int32> &backprop_commands) {
1463  const Component *component = nnet_.GetComponent(component_index);
1464  int32 num_backprop_commands = backprop_commands.size();
1465 
1466  bool need_input = (component->Properties() & kBackpropNeedsInput) != 0,
1467  need_output = (component->Properties() & kBackpropNeedsOutput) != 0;
1468 
1469  std::vector<int32> input_submatrices(num_backprop_commands),
1470  output_submatrices(num_backprop_commands),
1471  output_deriv_submatrices(num_backprop_commands);
1472 
1473  for (int32 i = 0; i < num_backprop_commands; i++) {
1474  int32 command_index = backprop_commands[i];
1475  NnetComputation::Command &command =
1476  computation_->commands[command_index];
1477  // arg2 must be 0 because simple components don't use precomputed indexes.
1478  KALDI_ASSERT(command.command_type == kBackprop && command.arg2 == 0);
1480  int32 input_submatrix = command.arg3,
1481  output_submatrix = command.arg4,
1482  output_deriv_submatrix = command.arg5;
1483  KALDI_ASSERT((input_submatrix != 0) == need_input &&
1484  (output_submatrix != 0) == need_output);
1485  input_submatrices[i] = input_submatrix;
1486  output_submatrices[i] = output_submatrix;
1487  output_deriv_submatrices[i] = output_deriv_submatrix;
1488  }
1489  // Get the sub-matrix indexes of whichever of the consolidated matrices we
1490  // need (will usually be input_submatrix and output_deriv_submatrix).
1491  int32 input_submatrix = (need_input ?
1492  ConsolidateSubmatrices(backprop_commands,
1493  input_submatrices) : 0),
1494  output_submatrix = (need_output ?
1495  ConsolidateSubmatrices(backprop_commands,
1496  output_submatrices) : 0),
1497  output_deriv_submatrix = ConsolidateSubmatrices(backprop_commands,
1498  output_deriv_submatrices);
1499  int32 precomputed_indexes_index = 0, // unused since simple component
1500  input_deriv_submatrix = 0, // we don't need the input-deriv.
1501  memo_index = 0; // we checked that no memos were used.
1502  NnetComputation::Command c(kBackprop, component_index, precomputed_indexes_index,
1503  input_submatrix, output_submatrix,
1504  output_deriv_submatrix, input_deriv_submatrix,
1505  memo_index);
1506  final_commands_.push_back(c);
1507 }
1508 
1510  const Nnet &nnet,
1511  NnetComputation *computation):
1512  nnet_(nnet), computation_(computation),
1513  extra_commands_(computation->commands.size()) { }
1514 
1516  int32 num_components = nnet_.NumComponents(),
1517  num_commands = computation_->commands.size();
1518  // 'backprop_commands' is a list, for each component (but nonempty only for
1519  // updatable simple components), of the command indexes for the backprop
1520  // commands.
1521  std::vector<std::vector<int32> > backprop_commands(num_components);
1522  for (int32 command_index = 0;
1523  command_index < num_commands; command_index++) {
1524  const NnetComputation::Command &c = computation_->commands[command_index];
1525  if (c.command_type == kBackprop) {
1526  int32 component_index = c.arg1;
1527  const Component *component = nnet_.GetComponent(component_index);
1528  int32 properties = component->Properties();
1529  if ((properties & kUpdatableComponent) &&
1530  (properties & kSimpleComponent) &&
1531  !(properties & kUsesMemo))
1532  backprop_commands[component_index].push_back(command_index);
1533  }
1534  }
1535  bool consolidated = false;
1536  for (int32 component = 0; component < num_components; component++) {
1537  if (backprop_commands[component].size() > 1) {
1539  backprop_commands[component]);
1540  consolidated = true;
1541  }
1542  }
1543  if (!consolidated) // This is an optimization to avoid redundant computation
1544  return; // if there is nothing to do.
1545  // the following function call commits all the commands we stored in member
1546  // variables, to computation_->commands.
1548 }
1549 
1550 
1551 void ConsolidateModelUpdate(const Nnet &nnet,
1552  NnetComputation *computation) {
1553  // This following if-statement is an optimization: if the computation
1554  // request(s) had need_model_derivative == false, there would be nothing to
1555  // optimize, so don't bother trying.
1556  if (!computation->need_model_derivative)
1557  return;
1558  ModelUpdateConsolidator consolidator(nnet, computation);
1559  consolidator.ConsolidateModelUpdate();
1560 }
1561 
1562 
1563 // inline
1565  int32 new_submatrix,
1566  int32 *left_prune,
1567  int32 *right_prune) const {
1568  KALDI_ASSERT(initial_submatrix > 0 && new_submatrix > 0);
1570  initial_info = computation_->submatrices[initial_submatrix],
1571  new_info = computation_->submatrices[new_submatrix];
1572  KALDI_ASSERT(initial_info.matrix_index == new_info.matrix_index);
1573  *left_prune = new_info.row_offset - initial_info.row_offset;
1574  if (right_prune != NULL) {
1575  *right_prune = initial_info.num_rows - new_info.num_rows - *left_prune;
1576  KALDI_ASSERT(*left_prune >= 0 && *right_prune >= 0);
1577  }
1578 }
1579 
1581  int32 submatrix,
1582  int32 row_index) const {
1583  KALDI_ASSERT(submatrix > 0 && submatrix < computation_->submatrices.size());
1584  const NnetComputation::SubMatrixInfo &info =
1585  computation_->submatrices[submatrix];
1586  KALDI_ASSERT(row_index >= 0 &&
1587  row_index < computation_->submatrices[submatrix].num_rows);
1588  int32 matrix_index = info.matrix_index;
1590  &debug_info = computation_->matrix_debug_info[matrix_index];
1591  if (!debug_info.is_deriv) {
1592  // the derivative time limitation doesn't apply to things that aren't
1593  // derivatives.
1594  return true;
1595  }
1596  int32 t = debug_info.cindexes[row_index + info.row_offset].second.t;
1597  return (t >= min_deriv_time_ && t <= max_deriv_time_);
1598 }
1599 
1600 
// modify commands to take into account the fact that some matrices are zero or
// partially zero.  Allocation commands and sizes of underlying matrices are not
// affected-- we'll work out later on, what to do with them.
// NOTE(review): the function-signature line was dropped by the rendering this
// was recovered from; it is presumably
// `void DerivativeTimeLimiter::ModifyCommand(NnetComputation::Command *command) {`
// -- verify against the upstream file.
  CommandType command_type = command->command_type;
  switch (command_type) {
    case kAllocMatrix:
    case kDeallocMatrix:
    case kSetConst:
    case kSwapMatrix:
      break; // we'll deal with allocation and deallocation later on.
    case kPropagate:
      // Propagate commands are unchanged, except that if the output of the
      // propagate is completely outside the accepted time-range (only likely if
      // we're inside a recurrency), then we don't store stats; this is not
      // really important, and is mostly done to minimize the difference from an
      // older version of the code, to reduce the need for testing.
      if (submatrix_map_[command->arg4] == 0)
        command->arg6 = 0;
      break;
    case kBackpropNoModelUpdate: // we actually don't expect to encounter this,
                                 // but it's trivial to support as it's the
                                 // same as backprop.
    case kBackprop: {
      const Component *component = nnet_.GetComponent(command->arg1);
      int32 properties = component->Properties();
      if (!(properties & kSimpleComponent)) {
        // we don't (yet) do this optimization for non-simple Components...
        // it would be a bit more complicated as we'd have to recompute the
        // PrecomputedIndexes.
        break;
      }
      // the four submatrix args of a backprop command, and the versions
      // they map to after limiting the derivative time range.
      int32 input_submatrix = command->arg3,
          output_submatrix = command->arg4,
          output_deriv_submatrix = command->arg5,
          input_deriv_submatrix = command->arg6;
      int32 mapped_input_submatrix = submatrix_map_[input_submatrix],
          mapped_output_submatrix = submatrix_map_[output_submatrix],
          mapped_output_deriv_submatrix = submatrix_map_[output_deriv_submatrix],
          mapped_input_deriv_submatrix = submatrix_map_[input_deriv_submatrix];

      if (mapped_output_deriv_submatrix == 0) {
        // completely outside range..
        KALDI_ASSERT(mapped_input_deriv_submatrix == 0 &&
                     mapped_input_submatrix == 0 &&
                     mapped_output_submatrix == 0);
        // just delete the command.
        command->command_type = kNoOperation;
        // arg7 is the memo index; if one was allocated, remember to delete it.
        if (command->arg7 > 0)
          memos_to_delete_.insert(command->arg7);
      } else if (mapped_output_deriv_submatrix !=
                 output_deriv_submatrix &&
                 !(properties & kUsesMemo)) {
        // we're operating on a range of the input or output.
        // we can't do this type of mapping if the component uses
        // a memo, though.
        command->arg3 = mapped_input_submatrix;
        command->arg4 = mapped_output_submatrix;
        command->arg5 = mapped_output_deriv_submatrix;
        command->arg6 = mapped_input_deriv_submatrix;
      }
      break;
    }
    case kMatrixCopy: case kMatrixAdd:
      MapSimpleMatrixCommand(command);
      break;
    case kCopyRows: case kAddRows:
      MapIndexesCommand(command);
      break;
    case kCopyRowsMulti: case kCopyToRowsMulti:
    case kAddRowsMulti: case kAddToRowsMulti:
      MapIndexesMultiCommand(command);
      break;
    case kAddRowRanges: {
      MapAddRowRangesCommand(command);
      break;
    }
    case kAcceptInput: case kProvideOutput:
      // NOTE(review): a line was dropped here by the rendering (probably
      // additional no-op case labels such as kNoOperation); verify upstream.
      break;
    default:
      KALDI_ERR << "Un-handled command type.";
  }
}
1685 
// NOTE(review): the doxygen extraction dropped the signature line (doxygen
// line 1686) here; from the body this is evidently
// DerivativeTimeLimiter::MapSimpleMatrixCommand(NnetComputation::Command *c),
// which remaps the two submatrix args of a whole-matrix command (e.g.
// kMatrixCopy / kMatrixAdd) to the derivative-time-limited versions --
// confirm against upstream Kaldi.
1687  int32 submatrix1 = c->arg1,
1688  submatrix2 = c->arg2;
// Map each submatrix to the sub-range lying inside the allowed deriv-time
// range; a mapped value of 0 means "completely outside the range".
1689  int32 submatrix1_mapped = submatrix_map_if_deriv_[submatrix1],
1690  submatrix2_mapped = submatrix_map_if_deriv_[submatrix2];
1691  if (submatrix1_mapped == submatrix1 &&
1692  submatrix2_mapped == submatrix2) {
1693  // nothing to do.
1694  return;
1695  }
1696  if (submatrix1_mapped == 0 || submatrix2_mapped == 0) {
1697  // remove the operation-- it has nothing to do.
// NOTE(review): doxygen line 1698 is missing from this extraction; upstream
// presumably sets c->command_type = kNoOperation here -- verify.
1699  return;
1700  }
1701  // left_prune1 is the number of rows pruned away on the left for submatrix1.
1702  int32 orig_num_rows = computation_->submatrices[submatrix1].num_rows,
1703  left_prune1, left_prune2, right_prune1, right_prune2;
1704  GetPruneValues(submatrix1, submatrix1_mapped, &left_prune1, &right_prune1);
1705  GetPruneValues(submatrix2, submatrix2_mapped, &left_prune2, &right_prune2);
1706  if (left_prune1 == left_prune2 && right_prune1 == right_prune2) {
1707  // we took the same number of rows away from the left and right for
1708  // both arguments; the normal mapped values will work in this case
1709  c->arg1 = submatrix1_mapped;
1710  c->arg2 = submatrix2_mapped;
1711  } else {
1712  // there is some kind of mismatch- we'll prune back to what remains
1713  // after applying the maximum pruning on the left and right.
1714  int32 left_prune = std::max(left_prune1, left_prune2),
1715  right_prune = std::max(right_prune1, right_prune2);
1716  if (left_prune + right_prune >= orig_num_rows) {
1717  // everything was pruned away; remove the operation.
// NOTE(review): doxygen line 1718 is missing here; upstream presumably sets
// c->command_type = kNoOperation before returning -- verify.
1719  return;
1720  } else {
1721  int32 num_rows = orig_num_rows - left_prune - right_prune;
1722  // note: the call NewSubMatrix effectively gives us a sub-matrix of a
1723  // sub-matrix.
1724  c->arg1 = computation_->NewSubMatrix(submatrix1,
1725  left_prune, num_rows, 0, -1);
1726  c->arg2 = computation_->NewSubMatrix(submatrix2,
1727  left_prune, num_rows, 0, -1);
1728  }
1729  }
1730 }
1731 
1732 // does the processing for a command of type kCopyRows or kAddRows, where
1733 // 1st and 2nd args are submatrix indexes and the 3rd arg is a vector of
1734 // row-indexes.
// NOTE(review): the signature line (doxygen 1735) was dropped by the
// extraction; this is presumably
// DerivativeTimeLimiter::MapIndexesCommand(NnetComputation::Command *c).
1736  int32 output_submatrix = c->arg1,
1737  input_submatrix = c->arg2;
1738  int32 input_submatrix_mapped = submatrix_map_if_deriv_[input_submatrix],
1739  output_submatrix_mapped = submatrix_map_if_deriv_[output_submatrix];
1740  // input_submatrix_mapped and output_submatrix_mapped map both submatrices to
1741  // just the portion that we are treating as nonzero.
1742 
1743  if (input_submatrix_mapped == 0 ||
1744  output_submatrix_mapped == 0) {
1745  // Either input or output is all zeros; make the command a no-op.
1746  // It may not be obvious that in the case of kCopyRows it would
1747  // be valid to make this a no-op (because what if the existing
1748  // contents were nonzero?), but we insist that this optimization
1749  // come before optimizations, and we know that the originally
1750  // generated computation would not overwrite a nonzero value
1751  // (and there are no undefined values because we make sure to
1752  // initialize everything with zeros; uninitialized values are
1753  // allowed only at a later optimization stage.
// NOTE(review): doxygen line 1754 is missing; upstream presumably sets
// c->command_type = kNoOperation here -- verify.
1755  return;
1756  }
1757  const std::vector<int32> &old_indexes = computation_->indexes[c->arg3];
1758 
1759  int32 left_prune_input, left_prune_output;
1760  GetPruneValues(input_submatrix, input_submatrix_mapped,
1761  &left_prune_input, NULL);
1762  GetPruneValues(output_submatrix, output_submatrix_mapped,
1763  &left_prune_output, NULL);
1764  int32 new_num_input_rows =
1765  computation_->submatrices[input_submatrix_mapped].num_rows,
1766  new_num_output_rows =
1767  computation_->submatrices[output_submatrix_mapped].num_rows;
1768  std::vector<int32> new_indexes(new_num_output_rows);
1769  bool must_keep_command = false;
1770  for (int32 i = 0; i < new_num_output_rows; i++) {
1771  // the index into the 'new_indexes' vector is the row of the output
1772  // submatrix; the value is the row of the input submatrix.
1773  int32 orig_index = old_indexes[i + left_prune_output];
1774  if (orig_index == -1 ||
1775  !RowIsKept(input_submatrix, orig_index) ||
1776  !RowIsKept(output_submatrix_mapped, i)) {
1777  new_indexes[i] = -1;
1778  } else {
1779  int32 mapped_index = orig_index - left_prune_input;
1780  // we can do the following assert because the RowIsKept command
1781  // would have turned it into a -1 if not.
1782  KALDI_ASSERT(mapped_index >= 0 && mapped_index < new_num_input_rows);
1783  new_indexes[i] = mapped_index;
1784  must_keep_command = true;
1785  }
1786  }
1787  if (!must_keep_command) {
// NOTE(review): doxygen line 1788 is missing; upstream presumably sets
// c->command_type = kNoOperation here -- verify.
1789  return;
1790  }
1791  int32 new_indexes_index = computation_->indexes.size();
1792  computation_->indexes.push_back(new_indexes);
1793  c->arg1 = output_submatrix_mapped;
1794  c->arg2 = input_submatrix_mapped;
1795  c->arg3 = new_indexes_index;
1796 }
1797 
// NOTE(review): the signature line (doxygen 1798) was dropped by the
// extraction; this is presumably
// DerivativeTimeLimiter::MapIndexesMultiCommand(NnetComputation::Command *c),
// handling kCopyRowsMulti / kAddRowsMulti / kCopyToRowsMulti /
// kAddToRowsMulti commands.
1799  int32 dest_submatrix = c->arg1,
1800  indexes_multi_arg = c->arg2;
1801  int32 dest_submatrix_mapped = submatrix_map_if_deriv_[dest_submatrix];
1802  if (dest_submatrix_mapped == 0) {
1803  // The destination matrix is completely outside the allowed time range.
// NOTE(review): doxygen line 1804 is missing; upstream presumably sets
// c->command_type = kNoOperation here -- verify.
1805  return;
1806  }
1807  int32 left_prune;
1808  GetPruneValues(dest_submatrix, dest_submatrix_mapped, &left_prune, NULL);
1809  int32 new_num_rows = computation_->submatrices[dest_submatrix_mapped].num_rows;
1810  const std::vector<std::pair<int32, int32> > &old_indexes_multi(
1811  computation_->indexes_multi[indexes_multi_arg]);
1812  std::vector<std::pair<int32, int32> > new_indexes_multi(new_num_rows);
1813  bool must_keep_command = false;
1814  for (int32 i = 0; i < new_num_rows; i++) {
1815  std::pair<int32,int32> &this_pair = new_indexes_multi[i];
1816  this_pair = old_indexes_multi[i + left_prune];
1817  // note: 'this_submatrix' is the source submatrix, from where we copy or add
1818  // the data; 'this_row' is the source row.
1819  int32 this_submatrix = this_pair.first,
1820  this_row = this_pair.second;
1821  if (this_submatrix == -1) // don't map the (-1, -1) pairs.
1822  continue;
1823  if (!RowIsKept(this_submatrix, this_row) ||
1824  !RowIsKept(dest_submatrix_mapped, i)) {
1825  this_pair.first = -1;
1826  this_pair.second = -1;
1827  continue;
1828  }
1829  int32 this_submatrix_mapped = submatrix_map_if_deriv_[this_submatrix];
1830 
1831  // Reason for the assert below: if this_submatrix_mapped was 0, then all the
1832  // values in it should be not-kept, but RowIsKept above returned true, so
1833  // this would be a code error.
1834  KALDI_ASSERT(this_submatrix_mapped != 0);
1835 
1836  int32 this_left_prune, this_num_rows =
1837  computation_->submatrices[this_submatrix_mapped].num_rows;
1838  GetPruneValues(this_submatrix, this_submatrix_mapped,
1839  &this_left_prune, NULL);
1840  int32 this_row_mapped = this_row - this_left_prune;
1841  // the assert below is there because if it was going to be outside the
1842  // kept range, RowIsKept should have returned false above.
1843  KALDI_ASSERT(this_row_mapped >= 0 && this_row_mapped < this_num_rows);
1844  this_pair.first = this_submatrix_mapped;
1845  this_pair.second = this_row_mapped;
1846  must_keep_command = true;
1847  }
1848  if (!must_keep_command) {
// NOTE(review): doxygen line 1849 is missing; upstream presumably sets
// c->command_type = kNoOperation here -- verify.
1850  return;
1851  }
1852  if (dest_submatrix_mapped == dest_submatrix &&
1853  new_indexes_multi == old_indexes_multi) // nothing changed.
1854  return;
1855  c->arg1 = dest_submatrix_mapped;
1856  c->arg2 = computation_->indexes_multi.size();
1857  computation_->indexes_multi.push_back(new_indexes_multi);
1858 }
1859 
// NOTE(review): the signature lines (doxygen 1860-1861) were dropped by the
// extraction; this is presumably
// DerivativeTimeLimiter::MapAddRowRangesCommand(NnetComputation::Command *c),
// handling kAddRowRanges commands.
1862  int32 dest_submatrix = c->arg1,
1863  src_submatrix = c->arg2,
1864  indexes_ranges_index = c->arg3;
1865  int32 dest_submatrix_mapped = submatrix_map_if_deriv_[dest_submatrix],
1866  src_submatrix_mapped = submatrix_map_if_deriv_[src_submatrix];
1867  if (dest_submatrix_mapped == dest_submatrix &&
1868  src_submatrix_mapped == src_submatrix)
1869  return;
1870  if (dest_submatrix_mapped == 0 || src_submatrix_mapped == 0) {
// NOTE(review): doxygen line 1871 is missing; upstream presumably sets
// c->command_type = kNoOperation here -- verify.
1872  return;
1873  }
1874  int32 dest_num_rows = computation_->submatrices[dest_submatrix_mapped].num_rows,
1875  src_num_rows = computation_->submatrices[src_submatrix_mapped].num_rows,
1876  src_left_prune, dest_left_prune;
1877  GetPruneValues(dest_submatrix, dest_submatrix_mapped,
1878  &dest_left_prune, NULL);
1879  GetPruneValues(src_submatrix, src_submatrix_mapped,
1880  &src_left_prune, NULL);
1881  const std::vector<std::pair<int32,int32> > &old_indexes_ranges(
1882  computation_->indexes_ranges[indexes_ranges_index]);
1883  std::vector<std::pair<int32,int32> > new_indexes_ranges(dest_num_rows);
1884 
1885  bool must_keep_command = false;
1886  for (int32 i = 0; i < dest_num_rows; i++) {
1887  std::pair<int32, int32> &this_pair = new_indexes_ranges[i];
1888  this_pair = old_indexes_ranges[i + dest_left_prune];
1889 
// each pair is a [start, end) row range in the source submatrix.
1890  int32 start = this_pair.first, end = this_pair.second;
1891  if (!RowIsKept(dest_submatrix_mapped, i)) {
1892  start = -1;
1893  end = -1;
1894  } else if (start >= 0) {
1895  // no need to change start, end if they are (-1, -1).
1896  // Note: this code is not optimally efficient, as RowIsKept
1897  // has a bunch of statements that we could cache some variables
1898  // for, but this command is pretty rare so not worth to optimize
1899  // at this point.
// shrink the range from both ends to cover only kept source rows.
1900  while (start < end && !RowIsKept(src_submatrix, start))
1901  start++;
1902  while (end > start && !RowIsKept(src_submatrix, end - 1))
1903  end--;
1904  if (start == end) {
1905  start = -1;
1906  end = -1;
1907  } else {
1908  start -= src_left_prune;
1909  end -= src_left_prune;
1910  must_keep_command = true;
1911  // the next assert is because if we were outside the 'kept' part of the
1912  // submatrix, RowIsKept() should have instructed us to modify the value.
1913  KALDI_ASSERT(start >= 0 && end <= src_num_rows && start < end);
1914  }
1915  }
1916  this_pair.first = start;
1917  this_pair.second = end;
1918  }
1919  if (must_keep_command) {
1920  c->arg1 = dest_submatrix_mapped;
1921  c->arg2 = src_submatrix_mapped;
1922  c->arg3 = computation_->indexes_ranges.size();
1923  computation_->indexes_ranges.push_back(new_indexes_ranges);
1924  } else {
// NOTE(review): doxygen line 1925 is missing; upstream presumably sets
// c->command_type = kNoOperation here -- verify.
1926  }
1927 }
1928 
1929 
// NOTE(review): the first line of this constructor (doxygen 1930, presumably
// "DerivativeTimeLimiter::DerivativeTimeLimiter(const Nnet &nnet,") was
// dropped by the extraction.  The constructor just stores its arguments;
// all real work happens in LimitDerivTimes().
1931  int32 min_deriv_time,
1932  int32 max_deriv_time,
1933  NnetComputation *computation):
1934  nnet_(nnet),
1935  min_deriv_time_(min_deriv_time),
1936  max_deriv_time_(max_deriv_time),
1937  computation_(computation) { }
1938 
// NOTE(review): the signature line (doxygen 1940, presumably
// "void DerivativeTimeLimiter::LimitDerivTimes() {") was dropped by the
// extraction.  This is the top-level driver of the pass.
1941  if (min_deriv_time_ == std::numeric_limits<int32>::min() &&
1942  max_deriv_time_ == std::numeric_limits<int32>::max())
1943  return; // nothing to do.
1944 
// NOTE(review): doxygen lines 1945-1947 are missing here; upstream presumably
// does setup work (e.g. computing whole_submatrices_, matrix prune info and
// submatrix maps) before ModifyCommands() -- verify against upstream.
1948  ModifyCommands();
1949  PruneMatrices();
// NOTE(review): doxygen lines 1950-1952 are missing here; upstream presumably
// performs cleanup (e.g. removing unused memos / no-ops, renumbering the
// computation) -- verify against upstream.
1953 }
1954 
// NOTE(review): the signature line (doxygen 1955, presumably
// "void DerivativeTimeLimiter::RemoveUnusedMemos() {") was dropped by the
// extraction.  Clears the memo argument (arg5) of kPropagate commands whose
// memo index was recorded in memos_to_delete_ by ModifyCommand (because the
// matching backprop command was deleted).
1956  if (memos_to_delete_.empty())
1957  return;
1958  size_t num_commands = computation_->commands.size(),
1959  num_memos_removed = 0;
1960  for (size_t command_index = 0; command_index < num_commands;
1961  command_index++) {
1962  NnetComputation::Command &c = computation_->commands[command_index];
1963  if (c.command_type == kPropagate &&
1964  memos_to_delete_.count(c.arg5) != 0) {
// arg5 == 0 means "no memo"; the component will no longer create one.
1965  c.arg5 = 0;
1966  num_memos_removed++;
1967  }
1968  }
// every deleted memo must have had exactly one propagate that produced it.
1969  KALDI_ASSERT(num_memos_removed == memos_to_delete_.size());
1970 }
1971 
// NOTE(review): doxygen lines 1972-1973 are missing here -- presumably the
// signature "void DerivativeTimeLimiter::ComputeMatrixPruneInfo() {" and the
// start of a KALDI_ASSERT comparing matrix_debug_info.size() with
// matrices.size(); the two lines below are the tail of that assert.  Verify
// against upstream Kaldi.
1974  computation_->matrices.size() &&
1975  "Limiting derivative times requires debug info.");
1976  const int32 num_matrices = computation_->matrices.size(),
1977  min_deriv_time = min_deriv_time_,
1978  max_deriv_time = max_deriv_time_;
1979  matrix_prune_info_.resize(num_matrices);
1980  // matrix_prune_info_[0] will remain undefined.
1981  for (int32 matrix_index = 1; matrix_index < num_matrices; matrix_index++) {
1982  NnetComputation::MatrixDebugInfo &debug_info =
1983  computation_->matrix_debug_info[matrix_index];
1984  MatrixPruneInfo &prune_info = matrix_prune_info_[matrix_index];
1985  const std::vector<Cindex> &cindexes = debug_info.cindexes;
1986  int32 num_rows = computation_->matrices[matrix_index].num_rows;
1987  KALDI_ASSERT(num_rows == static_cast<int32>(cindexes.size()));
// find the first and last row whose time index t falls inside
// [min_deriv_time, max_deriv_time].
1988  int32 first_row_within_range = num_rows,
1989  last_row_within_range = -1;
1990  for (int32 i = 0; i < num_rows; i++) {
1991  int32 t = cindexes[i].second.t;
1992  if (t >= min_deriv_time && t <= max_deriv_time) {
1993  if (i < first_row_within_range) first_row_within_range = i;
1994  if (i > last_row_within_range) last_row_within_range = i;
1995  }
1996  }
1997  if (last_row_within_range == -1) {
// no row is inside the range: matrix is fully outside.
1998  prune_info.fully_inside_range = false;
1999  prune_info.partly_inside_range = false;
2000  } else if (last_row_within_range == num_rows - 1 &&
2001  first_row_within_range == 0) {
2002  prune_info.fully_inside_range = true;
2003  prune_info.partly_inside_range = false;
2004  } else {
// partial overlap: remember the kept row range [row_begin, row_end).
2005  prune_info.fully_inside_range = false;
2006  prune_info.partly_inside_range = true;
2007  prune_info.row_begin = first_row_within_range;
2008  prune_info.row_end = last_row_within_range + 1;
2009  }
2010  }
2011 }
2012 
// NOTE(review): the signature line (doxygen 2013, presumably
// "void DerivativeTimeLimiter::ComputeSubmatrixMaps() {") was dropped by the
// extraction.  Fills submatrix_map_ (mapping each submatrix to the part of it
// inside the deriv-time range, or 0 if none) and submatrix_map_if_deriv_
// (the same map, but identity for non-derivative matrices).
2014  int32 num_submatrices = computation_->submatrices.size();
2015  submatrix_map_.resize(num_submatrices);
2016  submatrix_map_if_deriv_.resize(num_submatrices);
2017  // index zero is for the empty submatrix.
2018  submatrix_map_[0] = 0;
2019  submatrix_map_if_deriv_[0] = 0;
2020  for (int32 s = 1; s < num_submatrices; s++) {
// NOTE(review): doxygen line 2021 is missing here; upstream presumably
// declares a reference 'submatrix_info' to computation_->submatrices[s],
// which the lines below read -- verify against upstream.
2022  int32 matrix_index = submatrix_info.matrix_index;
2023  int32 row_offset = submatrix_info.row_offset,
2024  num_rows = submatrix_info.num_rows;
2025  const MatrixPruneInfo &matrix_prune_info = matrix_prune_info_[matrix_index];
2026  if (matrix_prune_info.fully_inside_range) {
2027  submatrix_map_[s] = s;
2028  } else if (!matrix_prune_info.partly_inside_range) {
2029  // completely outside time range.
2030  submatrix_map_[s] = 0;
2031  } else {
2032  // the matrix is partly inside the time range.
2033  int32 pruned_row_begin = std::max(matrix_prune_info.row_begin,
2034  row_offset),
2035  pruned_row_end = std::min(matrix_prune_info.row_end,
2036  row_offset + num_rows);
2037  if (pruned_row_end <= pruned_row_begin) {
2038  // there was no overlap between the submatrix and the part
2039  // of the matrix that was inside the time range.
2040  submatrix_map_[s] = 0;
2041  } else {
2042  // caution: this invalidates the reference 'submatrix_info'.
2043  int32 row_offset_within_submatrix =
2044  pruned_row_begin - row_offset,
2045  new_num_rows = pruned_row_end - pruned_row_begin;
2046  submatrix_map_[s] =
2047  computation_->NewSubMatrix(s, row_offset_within_submatrix,
2048  new_num_rows, 0, -1);
2049  }
2050  }
2051  bool is_deriv = computation_->matrix_debug_info[matrix_index].is_deriv;
2052  submatrix_map_if_deriv_[s] = (is_deriv ?
2053  submatrix_map_[s] : s);
2054  }
2055 }
2056 
// NOTE(review): the signature line (doxygen 2057, presumably
// "void DerivativeTimeLimiter::ModifyCommands() {") was dropped by the
// extraction.  Applies ModifyCommand() to every command of the computation.
2058  std::vector<NnetComputation::Command>::iterator
2059  iter = computation_->commands.begin(),
2060  end = computation_->commands.end();
2061  for (; iter != end; ++iter)
2062  ModifyCommand(&(*iter));
2063 }
2064 
2065 // called from PruneMatrices only for matrices that are derivatives,
2066 // not inputs or outputs of the computation, and which are partly
2067 // inside the time range, this function returns true if we can
2068 // limit the size of the matrix (because variables outside the
2069 // desired range are never accessed), and false otherwise.
// NOTE(review): the first line of the signature (doxygen 2070, presumably
// "bool DerivativeTimeLimiter::CanLimitMatrix(const Analyzer &analyzer,")
// was dropped by the extraction; the line below is its continuation.
2071  int32 m) const {
2072  int32 s_whole = whole_submatrices_[m]; // submatrix consisting of
2073  // all of the matrix m
2074  int32 s_mapped = submatrix_map_[s_whole]; // submatrix consisting of the time
2075  // range of the matrix m that we
2076  // plan to limit it to.
2077  KALDI_ASSERT(s_mapped != 0 && s_mapped != s_whole);
2078  std::vector<int32> whole_variables, mapped_variables;
2079  analyzer.variables.AppendVariablesForSubmatrix(s_whole,
2080  &whole_variables);
2081  analyzer.variables.AppendVariablesForSubmatrix(s_mapped,
2082  &mapped_variables);
2083  KALDI_ASSERT(whole_variables.size() > mapped_variables.size());
// the variables in the whole matrix but not in the kept time range; these
// are the ones that must never be (nontrivially) accessed.
2084  std::vector<int32> excluded_variables(whole_variables.size() -
2085  mapped_variables.size());
2086  std::vector<int32>::iterator end_iter =
2087  std::set_difference(whole_variables.begin(), whole_variables.end(),
2088  mapped_variables.begin(), mapped_variables.end(),
2089  excluded_variables.begin());
2090  KALDI_ASSERT(end_iter == excluded_variables.end());
2091  // We want to make sure that none of the excluded variables are ever accessed,
2092  // except possibly for zeroing or setting to other constant value. If they
2093  // are, we cannot prune the matrix.
2094  for (std::vector<int32>::iterator iter = excluded_variables.begin();
2095  iter != end_iter; ++iter) {
2096  int32 variable_index = *iter;
2097  const std::vector<Access> &variable_accesses =
2098  analyzer.variable_accesses[variable_index];
2099  std::vector<Access>::const_iterator viter = variable_accesses.begin(),
2100  vend = variable_accesses.end();
2101  for (; viter != vend; ++viter) {
2102  // if a variable outside the pruned range of the matrix is ever accessed
2103  // apart from on allocation, we cannot prune.
2104  int32 command_index = viter->command_index;
2105  NnetComputation::Command &command = computation_->commands[command_index];
2106  if (command.command_type != kSetConst) {
2107  // we may one day want to look at this.. it's not really expected.
2108  KALDI_VLOG(3) << "Cannot prune matrix " << m;
2109  return false;
2110  }
2111  }
2112  }
2113  return true;
2114 }
2115 
// Shrinks each matrix m with will_limit[m] == true to the row range
// [row_begin, row_end) recorded in matrix_prune_info_[m], fixing up all
// submatrices that point into it (and the debug-info cindexes).
2116 void DerivativeTimeLimiter::LimitMatrices(const std::vector<bool> &will_limit) {
2117  // first modify 'submatrices'.
2118  int32 num_submatrices = computation_->submatrices.size(),
2119  num_matrices = computation_->matrices.size();
2120  for (int32 s = 1; s < num_submatrices; s++) {
// NOTE(review): doxygen line 2121 is missing here; upstream presumably
// declares a reference 'submat_info' to computation_->submatrices[s],
// which the lines below read and write -- verify against upstream.
2122  int32 m = submat_info.matrix_index;
2123  if (will_limit[m]) {
2124  // we need to do something...
2125  const MatrixPruneInfo &prune_info = matrix_prune_info_[m];
2126  int32 matrix_num_rows = prune_info.row_end - prune_info.row_begin;
2127  KALDI_ASSERT(matrix_num_rows > 0 &&
2128  matrix_num_rows < computation_->matrices[m].num_rows);
2129  KALDI_ASSERT(prune_info.partly_inside_range);
2130  int32 new_row_begin = submat_info.row_offset - prune_info.row_begin;
2131  if (new_row_begin >= 0 &&
2132  submat_info.num_rows + new_row_begin <= matrix_num_rows) {
2133  // If this submatrix is entirely inside the limited range of the matrix,
2134  // then we modify its row_offset to account for the truncation of
2135  // rows to the left.
2136  submat_info.row_offset = new_row_begin;
2137  } else {
2138  // This submatrix is not entirely inside the kept range of the matrix.
2139  // We assume that this submatrix is never accessed directly except (if
2140  // it was the whole matrix) for in allocation and deallocation commands,
2141  // since when we modified the computation we ensured this.
2142  if (computation_->IsWholeMatrix(s)) {
2143  // If it was the whole matrix then it may be used in allocation and
2144  // deallocation commands, so we should modify it to be the whole of the
2145  // new matrix, which will have fewer rows than before.
2146  submat_info.num_rows = matrix_num_rows;
2147  } else {
2148  // We believe this matrix should never be used. We give it a valid
2149  // but stupid size of num-rows=1, num-cols=1, so that if it ever does
2150  // get accessed it should produce an error.
2151  submat_info.row_offset = 0;
2152  submat_info.num_rows = 1;
2153  submat_info.col_offset = 0;
2154  submat_info.num_cols = 1;
2155  }
2156  }
2157  }
2158  }
2159  // next modify 'matrices'
2160  for (int32 m = 1; m < num_matrices; m++) {
2161  if (will_limit[m]) {
2162  const MatrixPruneInfo &prune_info = matrix_prune_info_[m];
// NOTE(review): doxygen line 2163 is missing here; upstream presumably
// declares a reference 'matrix_info' to computation_->matrices[m], which is
// used below -- verify against upstream.
2164  if (!computation_->matrix_debug_info.empty()) {
2165  NnetComputation::MatrixDebugInfo &debug_info =
// NOTE(review): doxygen line 2166 is missing here; it presumably completes
// the initializer above with computation_->matrix_debug_info[m]; -- verify.
2167  std::vector<Cindex> &cindexes = debug_info.cindexes;
2168  KALDI_ASSERT(cindexes.size() == static_cast<size_t>(matrix_info.num_rows));
// erase the tail first so the begin-based offsets stay valid.
2169  cindexes.erase(cindexes.begin() + prune_info.row_end, cindexes.end());
2170  cindexes.erase(cindexes.begin(),
2171  cindexes.begin() + prune_info.row_begin);
2172  }
2173  matrix_info.num_rows = prune_info.row_end - prune_info.row_begin;
2174  // num_cols stays the same.
2175  }
2176  }
2177 }
2178 
// NOTE(review): the signature line (doxygen 2179, presumably
// "void DerivativeTimeLimiter::PruneMatrices() {") was dropped by the
// extraction.  Decides, for each derivative matrix, whether it can be
// shrunk to the in-range rows, and calls LimitMatrices() if any can.
2180  Analyzer analyzer;
2181  analyzer.Init(nnet_, *computation_);
// NOTE(review): doxygen line 2182 is missing here -- verify against upstream
// what statement belongs between Init() and the declaration below.
2183  int32 num_matrices = computation_->matrices.size();
2184  std::vector<bool> will_limit(num_matrices, false);
2185  bool will_limit_at_least_one = false;
2186  for (int32 m = 1; m < num_matrices; m++) {
2187  const MatrixAccesses &accesses = analyzer.matrix_accesses[m];
2188  const MatrixPruneInfo &matrix_prune_info = matrix_prune_info_[m];
2189  if (matrix_prune_info.fully_inside_range ||
2190  accesses.is_input || accesses.is_output ||
2191  !computation_->matrix_debug_info[m].is_deriv)
2192  continue; // nothing to do: it's inside the time-range or not a
2193  // derivative.
2194  // if we got here it's not completely inside the time range, not an input or
2195  // an output, and it's a derivative.
2196  if (!matrix_prune_info.partly_inside_range) {
2197  // completely outside time range. we can prune the matrix if it is not an
2198  // input or output, and is never accessed apart from allocation.
2199  if (MatrixIsUnused(analyzer, *computation_, m))
// NOTE(review): doxygen line 2200 (the statement executed when
// MatrixIsUnused() returns true, presumably removing the commands for the
// unused matrix) is missing from this extraction -- verify against upstream.
2201  } else {
2202  // the matrix is partly inside the time range, it's a derivative, and not
2203  // an input or an output.
2204  if (CanLimitMatrix(analyzer, m)) {
2205  will_limit[m] = true;
2206  will_limit_at_least_one = true;
2207  }
2208  }
2209  }
2210  if (will_limit_at_least_one)
2211  LimitMatrices(will_limit);
2212 }
2213 
2214 
2215 void LimitDerivativeTimes(const Nnet &nnet,
2216  int32 min_deriv_time,
2217  int32 max_deriv_time,
2218  NnetComputation *computation) {
2219  DerivativeTimeLimiter limiter(nnet, min_deriv_time, max_deriv_time,
2220  computation);
2221  limiter.LimitDerivTimes();
2222 }
2223 
2224 
2225 /*
2226  This helper function, used in ReplaceRowWithMatrixOps, detects
2227  when the vector 'indexes' has a 'special structure'. The special structure
2228  is:
2229  zero or more -1's, then
2230  a consecutive nonempty sequence of nonnegative numbers, e.g. 6 7 8 9 10, then
2231  zero or more -1's.
2232 
2233  Note: this function assumes that any negative elements of 'indexes' are -1.
2234  If there are elements less than -1, then it is an error, but this function
2235  does not thoroughly check for that. 'indexes' is required to be a nonempty
2236  vector.
2237 
2238  If 'indexes' has the special structure then this function returns true
2239  and sets the following values, which will explain with the following
2240  example in mind: 'indexes = [ -1, -1, 5 6 7 8, -1 ]'.
2241  - '*first_nonnegative_pos' is set to the number of initial -1's (and also
2242  the location of the first nonnegative element): 2 in this case.
2243  - '*first_nonnegative_value' is set to the value of the first nonnegative
2244  element (5 in this case)
2245  - '*num_nonnegative_indexes' is set to the number of nonnegative values in
2246  the sequence (4 in this case).
2247  If 'indexes' does not have this special structure, then this function returns
2248  false, and the values of '*first_nonnegative_pos',
2249  '*first_nonnegative_value' and '*num_nonnegative_indexes' on exit are
2250  undefined.
2251 */
2252 static bool IndexesHaveSpecialStructure(const std::vector<int32> &indexes,
2253  int32 *first_nonnegative_pos,
2254  int32 *first_nonnegative_value,
2255  int32 *num_nonnegative_indexes) {
2256  KALDI_ASSERT(!indexes.empty());
2257  const int32 *indexes_ptr = &(indexes[0]);
2258  size_t pos = 0, size = indexes.size();
2259 
2260  // Find the first nonnegative element of 'indexes'.
2261  for (; pos < size; ++pos)
2262  if (indexes_ptr[pos] >= 0)
2263  break;
2264  if (pos == size)
2265  return false; // all -1's... should not happen, but not our problem.
2266  *first_nonnegative_pos = static_cast<int32>(pos);
2267  int32 n = indexes_ptr[pos];
2268  *first_nonnegative_value = n;
2269  // Find the first element after '*first_nonnegative_index' that isn't
2270  // consecutive.
2271  for (; pos < size; ++pos,++n)
2272  if (indexes_ptr[pos] != n)
2273  break;
2274 
2275  *num_nonnegative_indexes = n - *first_nonnegative_value;
2276 
2277  // Check that the remaining values are all <0 (assumed equal to -1, but
2278  // checking <0 may be faster as just one instruction).
2279  for (; pos < size; ++pos)
2280  if (indexes_ptr[pos] >= 0)
2281  return false; // does not have the special structure.
2282 
2283  return true;
2284 }
2285 
2286 
2287 
// NOTE(review): the signature line (doxygen 2288, presumably
// "bool ReplaceRowWithMatrixOps(NnetComputation *computation) {") was dropped
// by the extraction.  Converts kCopyRows/kAddRows commands whose index
// vectors have the 'special structure' into plain whole-matrix
// kMatrixCopy/kMatrixAdd commands on sub-matrices; returns true if any
// command was changed.
2289  bool ans = false;
2290  int32 num_commands = computation->commands.size(),
2291  num_indexes = computation->indexes.size();
2292  for (int32 command_index = 0; command_index < num_commands;
2293  command_index++) {
2294  // non-const because we'll be changing it.
2295  NnetComputation::Command &c = computation->commands[command_index];
2296 
2297  int32 first_nonnegative_pos,
2298  first_nonnegative_value,
2299  num_nonnegative_indexes;
2300  switch (c.command_type) {
2301  case kCopyRows: case kAddRows: {
2302  int32 indexes_index = c.arg3;
2303  KALDI_ASSERT(indexes_index < num_indexes);
2304  const std::vector<int32> &indexes = computation->indexes[indexes_index];
2305  if (IndexesHaveSpecialStructure(indexes,
2306  &first_nonnegative_pos,
2307  &first_nonnegative_value,
2308  &num_nonnegative_indexes)) {
2309  ans = true;
// restrict output (arg1) and input (arg2) to the consecutive row ranges.
2310  c.arg1 = computation->NewSubMatrix(c.arg1, first_nonnegative_pos,
2311  num_nonnegative_indexes,
2312  0, -1);
2313  c.arg2 = computation->NewSubMatrix(c.arg2, first_nonnegative_value,
2314  num_nonnegative_indexes,
2315  0, -1);
// NOTE(review): doxygen line 2316 is missing from this extraction; upstream
// presumably assigns c.command_type = (c.command_type == kCopyRows ?
// kMatrixCopy : ...), completed by the line below -- verify.
2317  kMatrixAdd);
2318  }
2319  break;
2320  }
2321  default:
2322  break;
2323  }
2324  }
2325  return ans;
2326 }
2327 
2328 
2329 
2330 /*
2331  This function, used in SnipSingleRowOp(),
2332  finds the number of leading, and trailing, negative numbers
2333  in a vector of integers. For instance, if vec is
2334  [ -1 -1 2 3 -1 4 5 -1 ]
2335  then '*num_leading_negatives' will be set to 2 and '*num_trailing_negatives'
2336  will be set to 1. If all the numbers in 'vec' are negative, or 'vec' is
2337  empty, it is an error and this function will trigger an assertion failure.
2338 */
2339 static void FindNumLeadingAndTrailingNegatives(const std::vector<int32> &vec,
2340  int32 *num_leading_negatives,
2341  int32 *num_trailing_negatives) {
2342  KALDI_ASSERT(!vec.empty());
2343  const int32 *begin = &(vec[0]), *ptr = begin, *end = ptr + vec.size();
2344  while (ptr != end && *ptr < 0)
2345  ptr++;
2346  // note regarding error message: we assume all negative numbers are -1, due to
2347  // the way this is called, but it only affects how we describe the error.
2348  KALDI_ASSERT(ptr != end && "Vector consists entirely of -1's.");
2349  *num_leading_negatives = ptr - begin;
2350  const int32 *ptr2 = end - 1;
2351  // the following while loop should terminate before falling off the vector,
2352  // because we've established above (in the assertion) that the vector contains
2353  // at least one nonnegative number.
2354  while (*ptr2 < 0)
2355  ptr2--;
2356  KALDI_ASSERT(ptr2 >= begin); // or would be code error.
2357  *num_trailing_negatives = end - 1 - ptr2;
2358 }
2359 
2360 // This function, called from SnipRowOps, is called when it encounters commands
2361 // of type kAddRows; it modifies such commands when the indexes have leading or
2362 // trailing -1's, to make them operate on a smaller submatrix. It returns
2363 // true if it made a change, and false otherwise.
// Trims leading/trailing -1 entries off the index vector of a kAddRows-style
// command, shrinking the destination submatrix (arg1) correspondingly.
// Returns true if it changed anything.
2364 static bool SnipSingleRowOp(NnetComputation *computation,
2365  int32 command_index) {
2366  NnetComputation::Command &c = computation->commands[command_index];
2367  KALDI_ASSERT(static_cast<size_t>(c.arg3) < computation->indexes.size());
2368  const std::vector<int32> &indexes = computation->indexes[c.arg3];
2369  int32 num_leading_negatives, num_trailing_negatives;
// NOTE(review): doxygen line 2370 is missing from this extraction; it is
// presumably the call "FindNumLeadingAndTrailingNegatives(indexes,"
// completed by the two argument lines below -- verify against upstream.
2371  &num_leading_negatives,
2372  &num_trailing_negatives);
2373  if (num_leading_negatives == 0 && num_trailing_negatives == 0)
2374  return false;
2375 
2376  int32 new_num_rows = static_cast<int32>(indexes.size()) -
2377  num_leading_negatives - num_trailing_negatives;
2378  KALDI_ASSERT(new_num_rows > 0);
// copy out the kept middle section before push_back can invalidate 'indexes'.
2379  std::vector<int32> new_indexes(indexes.begin() + num_leading_negatives,
2380  indexes.begin() + num_leading_negatives +
2381  new_num_rows);
2382  c.arg3 = computation->indexes.size();
2383  computation->indexes.push_back(std::vector<int32>());
2384  computation->indexes.back().swap(new_indexes);
2385  c.arg1 = computation->NewSubMatrix(c.arg1,
2386  num_leading_negatives, new_num_rows,
2387  0, -1);
2388  return true; // made a change.
2389 }
2390 
2391 
2392 
2393 /*
2394  This function, used in SnipMultiRowOp(), finds the number of leading, and
2395  trailing, negative values in a vector of pairs of integers. In particular,
2396  it finds the number of leading and trailing pairs whose .first value is negative
2397  (in practice we'll only encounter either (-1,-1) pairs, or pairs of both
2398  nonnegative values).
2399 
2400  For instance, if vec is
2401  [ (-1,-1) (-1,-1) (80,2) (81,3) (-1,-1) (80,4) (81,5) (-1,-1) ]
2402  then '*num_leading_negatives' will be set to 2 and '*num_trailing_negatives'
2403  will be set to 1. If all the .first numbers in 'vec' are all negative, or
2404  'vec' is empty, it is an error and this function will invoke KALDI_ERR.
2405 */
// NOTE(review): the first line of the signature (doxygen 2406, presumably
// "static void FindNumLeadingAndTrailingNegatives(") was dropped by the
// extraction; this is the pair overload, counting leading/trailing pairs
// whose .first member is negative.
2407  const std::vector<std::pair<int32, int32> > &vec,
2408  int32 *num_leading_negatives,
2409  int32 *num_trailing_negatives) {
2410  KALDI_ASSERT(!vec.empty());
2411  const std::pair<int32, int32> *begin = &(vec[0]), *ptr = begin,
2412  *end = ptr + vec.size();
2413  while (ptr != end && ptr->first < 0)
2414  ptr++;
2415  // note regarding error message: we assume all negative numbers are -1, due to
2416  // the way this is called, but it only affects how we describe the error.
2417  KALDI_ASSERT(ptr != end && "Vector consists entirely of -1's.");
2418  *num_leading_negatives = ptr - begin;
2419  const std::pair<int32, int32> *ptr2 = end - 1;
2420  // the following while loop should terminate before falling off the vector,
2421  // because we've established above (in the assertion) that the vector contains
2422  // at least one nonnegative number.
2423  while (ptr2->first < 0)
2424  ptr2--;
2425  KALDI_ASSERT(ptr2 >= begin); // would be code error.
2426  *num_trailing_negatives = end - 1 - ptr2;
2427 }
2428 
2429 
2430 // This function, called from SnipRowOps, is called when it encounters commands
2431 // of type kAddRowsMulti, kAddToRowsMulti, or kCopyToRowsMulti that have leading or
2432 // trailing (-1,-1) pairs, to make them operate on a smaller submatrix. It
2433 // returns true if it made a change, and false otherwise.
2434 static bool SnipMultiRowOp(NnetComputation *computation,
2435  int32 command_index) {
2436  NnetComputation::Command &c = computation->commands[command_index];
2437  KALDI_ASSERT(static_cast<size_t>(c.arg2) < computation->indexes_multi.size());
2438  const std::vector<std::pair<int32, int32> > &indexes_multi =
2439  computation->indexes_multi[c.arg2];
2440  int32 num_leading_negatives, num_trailing_negatives;
2441  FindNumLeadingAndTrailingNegatives(indexes_multi,
2442  &num_leading_negatives,
2443  &num_trailing_negatives);
2444  if (num_leading_negatives == 0 && num_trailing_negatives == 0)
2445  return false;
2446 
2447  int32 new_num_rows = static_cast<int32>(indexes_multi.size()) -
2448  num_leading_negatives - num_trailing_negatives;
2449  KALDI_ASSERT(new_num_rows > 0);
2450  std::vector<std::pair<int32, int32> > new_indexes_multi(
2451  indexes_multi.begin() + num_leading_negatives,
2452  indexes_multi.begin() + num_leading_negatives + new_num_rows);
2453  c.arg2 = computation->indexes_multi.size();
2454  computation->indexes_multi.push_back(std::vector<std::pair<int32, int32> >());
2455  computation->indexes_multi.back().swap(new_indexes_multi);
2456  c.arg1 = computation->NewSubMatrix(c.arg1,
2457  num_leading_negatives, new_num_rows,
2458  0, -1);
2459  return true; // made a change.
2460 }
2461 
2462 
2463 
2464 /*
2465  This function, used in SnipRangeRowOp(), finds the number of leading and
2466  trailing values in a vector of pairs of integers, that are the same (i.e.
2467  pairs of the form (x, x) for any x. [This is how we represent an empty
2468  range, which is a kind of no-op, in commands of kCopyRowRanges or
2469  kAddRowRanges.
2470 
2471  For instance, if vec is
2472  [ (0,0) (0,0) (4,5) (6,8) (0,0) (10,12) (14,20) (0,0) ]
2473  then '*num_leading_identicals' will be set to 2 and '*num_trailing_identicals'
2474  will be set to 1. If all pairs in 'vec' are identical, or 'vec' is empty, it
2475  is an error and this function will invoke KALDI_ERR.
2476 */
2478  const std::vector<std::pair<int32, int32> > &vec,
2479  int32 *num_leading_identicals,
2480  int32 *num_trailing_identicals) {
2481  KALDI_ASSERT(!vec.empty());
2482  const std::pair<int32, int32> *begin = &(vec[0]), *ptr = begin,
2483  *end = ptr + vec.size();
2484  while (ptr != end && ptr->first == ptr->second)
2485  ptr++;
2486  // note regarding error message: we assume all pairs of identical numbers are
2487  // -1, due to the way this is called, but it only affects how we describe the
2488  // error.
2489  KALDI_ASSERT(ptr != end && "Vector consists entirely of -1's.");
2490  *num_leading_identicals = ptr - begin;
2491  const std::pair<int32, int32> *ptr2 = end - 1;
2492  // the following while loop should terminate before falling off the vector,
2493  // because we've established above (in the assertion) that the vector contains
2494  // at least one nonnegative number.
2495  while (ptr2->first == ptr2->second)
2496  ptr2--;
2497  KALDI_ASSERT(ptr2 >= begin); // would be code error.
2498  *num_trailing_identicals = end - 1 - ptr2;
2499 }
2500 
2501 
2502 // This function, called from SnipRowOps, is called when it encounters commands
2503 // of type kAddRowRanges that have leading or trailing (x, x) pairs [i.e. pairs
2504 // of identical values; these are how we represent empty ranges], to make them
2505 // operate on a smaller submatrix. It returns true if it made a change, and
2506 // false otherwise.
2507 static bool SnipRangesRowOp(NnetComputation *computation,
2508  int32 command_index) {
2509  NnetComputation::Command &c = computation->commands[command_index];
2510  KALDI_ASSERT(static_cast<size_t>(c.arg3) < computation->indexes_ranges.size());
2511  const std::vector<std::pair<int32, int32> > &indexes_ranges =
2512  computation->indexes_ranges[c.arg3];
2513  int32 num_leading_identicals, num_trailing_identicals;
2514  FindNumLeadingAndTrailingIdenticals(indexes_ranges,
2515  &num_leading_identicals,
2516  &num_trailing_identicals);
2517  if (num_leading_identicals == 0 && num_trailing_identicals == 0)
2518  return false;
2519 
2520  int32 new_num_rows = static_cast<int32>(indexes_ranges.size()) -
2521  num_leading_identicals - num_trailing_identicals;
2522  KALDI_ASSERT(new_num_rows > 0);
2523  std::vector<std::pair<int32, int32> > new_indexes_ranges(
2524  indexes_ranges.begin() + num_leading_identicals,
2525  indexes_ranges.begin() + num_leading_identicals + new_num_rows);
2526  c.arg3 = computation->indexes_ranges.size();
2527  computation->indexes_ranges.push_back(std::vector<std::pair<int32, int32> >());
2528  computation->indexes_ranges.back().swap(new_indexes_ranges);
2529  c.arg1 = computation->NewSubMatrix(c.arg1,
2530  num_leading_identicals, new_num_rows,
2531  0, -1);
2532  return true; // made a change.
2533 }
2534 
2535 
2536 
2537 bool SnipRowOps(NnetComputation *computation) {
2538  bool ans = false;
2539  int32 num_commands = computation->commands.size();
2540  for (int32 command_index = 0; command_index < num_commands;
2541  command_index++) {
2542  // non-const because we'll be changing it.
2543  NnetComputation::Command &c = computation->commands[command_index];
2544 
2545  // note: we can't do the snipping for commands of type case kCopyRows and case
2546  // kCopyRowsMulti, because the -1's aren't a pure no-op; they have the
2547  // meaning of setting the destination value to zero, so we can't prune
2548  // them away.
2549 
2550  switch (c.command_type) {
2551  case kAddRows: {
2552  if (SnipSingleRowOp(computation, command_index))
2553  ans = true;
2554  break;
2555  }
2556  case kAddRowsMulti: case kAddToRowsMulti:
2557  case kCopyToRowsMulti: {
2558  if (SnipMultiRowOp(computation, command_index))
2559  ans = true;
2560  break;
2561  }
2562  case kAddRowRanges: {
2563  if (SnipRangesRowOp(computation, command_index))
2564  ans = true;
2565  break;
2566  }
2567  default:
2568  break;
2569  }
2570  }
2571  return ans;
2572 }
2573 
2574 
2575 
2576 // This class implements the internals of the function SplitRowOps() which is
2577 // declared in nnet-optimize-utils.h.
2579  public:
2580  RowOpsSplitter(NnetComputation *computation): computation_(computation) { }
2581 
2582  // Attempts to perform the optimization. Returns true if it made any change
2583  // to the computation.
2584  bool Split() {
2585  return SplitIndexes() && SplitCommands();
2586  }
2587 
2588  private:
2589 
2590  // This function sets up split_info_, which describes how we can split up
2591  // the vectors that are elements of computation_->indexes_multi.
2592  // It will return true if it successfully split at least one of those
2593  // vectors, and false otherwise.
2594  bool SplitIndexes();
2595 
2596  // This function modifies the commands in the computation. It returns
2597  // true if it made any change.
2598  bool SplitCommands();
2599 
2600 
2601  // This function attempts to optimize the command in
2602  // computation_->commands[command_index]. It returns true if it made any
2603  // change. If we are going to have to insert an extra command into the
2604  // computation, this function will append an element to new_commands_.
2605  bool SplitCommand(int32 command_index);
2606 
2607  // Below, define a multi-index as an element of NnetComputation::indexes_multi,
2608  // for example,
2609  // const std::vector<std::pair<int32,int32> > &multi_index = computation_->indexes_multi[1];
2610  // It is a list of pairs.
2611 
2612  // This struct appears as an element of the list inside MultiIndexSplitInfo.
2613  // It helps us describe how we can split up a multi-index (a list of pairs)
2614  // into a sequence of ranges where the .first value is constant across the
2615  // range.
2617  // 'offset' is the index into the vector of pairs that forms the
2618  // start of this range. In the example where we are splitting up
2619  // ((10,2), (10,3), (10,4), (15,3), (15,5), (15,7))
2620  // there would be two instances of struct SingleSplitInfo, with
2621  // offset = 0 and offset = 3.
2623  // 'size' is the number of pairs in this range; in the example
2624  // above, both 'size' elements would be 3.
2626  // first_value is the value of the .first index throughout this range; in
2627  // the example above, it would be 10 and 15 respectively. It represents a
2628  // submatrix index.
2630 
2631  // initial_second_value is the minimum value of .second for any element in
2632  // this range: it would be 2 and 3 respectively in the example above.
2634 
2635  // second_value_range is the highest value of .second for any element in
2636  // this range, plus one, minus min_second_value. (It's the number of rows
2637  // in the other submatrix of the operation).
2639 
2640  // If the .second values in the range are consecutive then
2641  // 'second_value_offsets' will be empty. Otherwise it will
2642  // be a vector of size 'size', containing numbers in the
2643  // range 0 ... second_value_range - 1, such that
2644  // min_second_value + second_value_offsets[i] gives
2645  // the .second value at the corresponding position in the range.
2646  // In the second range of the example above, the range
2647  // consisting of ((15,3), (15,5), (15,7)), 'second_value_offsets
2648  // would be the vector (0, 2, 4).
2649  std::vector<int32> second_value_offsets;
2650  };
2651 
2652  // An instance of the struct MultiIndexSplitInfo will be created for each multi-index,
2653  // i.e. for each element of NnetComputation::indexes_multi.
2655  // If we can split this multi-index into at most two ranges, this
2656  // vector will be nonempty; otherwise it will be empty.
2657  std::vector<SingleSplitInfo> splits;
2658  };
2659 
2660  // GetSplitInfo() attempts to take a range of a
2661  // std::vector<std::pair<int32, int32> >, as represented by begin and end
2662  // iterators, and to extract its information into an object of type
2663  // SingleSplitInfo. (all except for the .offset member, which will have
2664  // been set by calling code).
2665  // It return true if successful, and false otherwise. The only reasons that
2666  // it might return false are that the range contains -1's or does not contain
2667  // all-identical .first members).
2668  bool GetSplitInfo(std::vector<std::pair<int32, int32> >::const_iterator begin,
2669  std::vector<std::pair<int32, int32> >::const_iterator end,
2670  SingleSplitInfo *info);
2671 
2672  // computation_ is the computation that we are modifying.
2674  // split_info_ will contain information about how we can split up the members
2675  // of computation_->indexes_multi into ranges.
2676  std::vector<MultiIndexSplitInfo> split_info_;
2677  // The following is a list of additional commands that we are going to insert
2678  // into computation_, of the form (command-index, command) where command-index
2679  // is a command index just before which we will insert the new command.
2680  // (this is the format accepted by the function InsertCommands()).
2681  std::vector<std::pair<int32, NnetComputation::Command> > new_commands_;
2682 
2683 };
2684 
2685 
2687  std::vector<std::pair<int32, int32> >::const_iterator begin,
2688  std::vector<std::pair<int32, int32> >::const_iterator end,
2689  SingleSplitInfo *info) {
2690  // max_size_ratio must be > 1.0, and could in principle be a float. It is
2691  // there to prevent us from making changes to the computation which would end
2692  // up wastefully launching too many kernels that would do nothing.
2693  const int32 max_size_ratio = 2;
2694 
2695  int32 size = end - begin;
2696  KALDI_ASSERT(size != 0);
2697  int32 first = begin->first;
2698  if (first < 0)
2699  return false;
2700  info->size = size;
2701  info->first_value = first;
2702  int32 initial_second_value = begin->second,
2703  min_second_value = initial_second_value,
2704  max_second_value = initial_second_value;
2705  info->second_value_offsets.resize(size);
2706  bool is_consecutive = true;
2707  for (int32 i = 0; i < size; i++) {
2708  int32 second = begin[i].second;
2709  if (begin[i].first != first || second < 0) return false;
2710  info->second_value_offsets[i] = second;
2711  if (second != initial_second_value + i)
2712  is_consecutive = false;
2713  if (second < min_second_value) min_second_value = second;
2714  if (second > max_second_value) max_second_value = second;
2715  }
2716  info->min_second_value = min_second_value;
2717  info->second_value_range = max_second_value + 1 - min_second_value;
2718  if (info->second_value_range > size * max_size_ratio)
2719  return false;
2720  if (is_consecutive) {
2721  info->second_value_offsets.clear();
2722  } else {
2723  for (int32 i = 0; i < size; i++)
2724  info->second_value_offsets[i] -= min_second_value;
2725  }
2726  return true;
2727 }
2728 
2729 
2731  bool ans = false;
2732  int32 num_indexes_multi = computation_->indexes_multi.size();
2733  split_info_.resize(num_indexes_multi);
2734  for (int32 i = 0; i < num_indexes_multi; i++) {
2735  const std::vector<std::pair<int32,int32> > &multi_index =
2737  MultiIndexSplitInfo &split_info = split_info_[i];
2738 
2739  int32 num_pairs = multi_index.size();
2740  KALDI_ASSERT(num_pairs > 0);
2741  // 'split_point' will be set to the first index j for which
2742  // multi_index[j-1].first != multi_index[j].first, or -1
2743  // if no such j exists.
2744  int32 split_point = -1, initial_first = multi_index[0].first;
2745  for (int32 j = 1; j < num_pairs; j++) {
2746  if (multi_index[j].first != initial_first) {
2747  split_point = j;
2748  break;
2749  }
2750  }
2751  if (split_point == -1) {
2752  split_info.splits.resize(1);
2753  split_info.splits[0].offset = 0;
2754  if (!GetSplitInfo(multi_index.begin(), multi_index.end(),
2755  &(split_info.splits[0]))) {
2756  split_info.splits.clear();
2757  } else {
2758  ans = true;
2759  }
2760  } else {
2761  split_info.splits.resize(2);
2762  split_info.splits[0].offset = 0;
2763  split_info.splits[1].offset = split_point;
2764 
2765  std::vector<std::pair<int32,int32> >::const_iterator mid_iter =
2766  multi_index.begin() + split_point;
2767  if (!GetSplitInfo(multi_index.begin(), mid_iter,
2768  &(split_info.splits[0])) ||
2769  !GetSplitInfo(mid_iter, multi_index.end(),
2770  &(split_info.splits[1]))) {
2771  split_info.splits.clear();
2772  } else {
2773  ans = true;
2774  }
2775  }
2776  }
2777  return ans;
2778 }
2779 
2782  CommandType command_type = command.command_type;
2783  // For commands that are not of the following four types, return false: we
2784  // won't be changing these commands.
2785  switch (command_type) {
2786  case kAddRowsMulti: case kCopyRowsMulti:
2787  case kAddToRowsMulti: case kCopyToRowsMulti: break;
2788  default: return false;
2789  }
2790  int32 indexes_multi_index = command.arg2;
2791  KALDI_ASSERT(indexes_multi_index <
2792  static_cast<int32>(split_info_.size()));
2793  const MultiIndexSplitInfo &split_info = split_info_[indexes_multi_index];
2794  if (split_info.splits.empty())
2795  return false; // these indexes couldn't be split: e.g. they contained more
2796  // than two distinct .first elements, or there were other
2797  // reasons.
2798 
2799  // we'll be splitting the command into either one or two pieces.
2800  std::vector<NnetComputation::Command> split_commands(
2801  split_info.splits.size());
2802  for (size_t i = 0; i < split_info.splits.size(); i++) {
2803  const SingleSplitInfo &split = split_info.splits[i];
2804  NnetComputation::Command &command_out = split_commands[i];
2805  command_out.alpha = command.alpha;
2806  command_out.arg1 = computation_->NewSubMatrix(
2807  command.arg1, split.offset, split.size, 0, -1);
2808  command_out.arg2 = computation_->NewSubMatrix(
2809  split.first_value, split.min_second_value,
2810  split.second_value_range, 0, -1);
2811 
2812  if (split.second_value_offsets.empty()) {
2813  // The .second elements are consecutive.
2814  switch (command_type) {
2815  case kAddRowsMulti:
2816  command_out.command_type = kMatrixAdd;
2817  break;
2818  case kCopyRowsMulti:
2819  command_out.command_type = kMatrixCopy;
2820  break;
2821  case kAddToRowsMulti:
2822  command_out.command_type = kMatrixAdd;
2823  std::swap(command_out.arg1, command_out.arg2);
2824  break;
2825  case kCopyToRowsMulti:
2826  command_out.command_type = kMatrixCopy;
2827  std::swap(command_out.arg1, command_out.arg2);
2828  break;
2829  default: // will never be reached.
2830  break;
2831  }
2832  } else {
2833  // Indexes are not consecutive: it needs to be a kAddRows or kCopyRows
2834  // command.
2835  command_out.arg3 = computation_->indexes.size();
2836  switch (command_type) {
2837  case kAddRowsMulti: case kCopyRowsMulti: {
2838  command_out.command_type = (command_type == kAddRowsMulti ?
2839  kAddRows : kCopyRows);
2840  computation_->indexes.push_back(split.second_value_offsets);
2841  break;
2842  }
2843  case kCopyToRowsMulti: {
2844  // We can't operate on this command because of what would happen
2845  // with values of 'indexes' (see the variable in the block for
2846  // kAddToRowsMulti) which were -1. Rows of the output would be
2847  // set to zero, which is not the behavior we want here; we'd want
2848  // them to be unaffected.
2849  return false;
2850  }
2851  case kAddToRowsMulti: {
2852  command_out.command_type = kAddRows;
2853  std::swap(command_out.arg1, command_out.arg2);
2854  // invert the indexes.
2855  std::vector<int32> indexes(split.second_value_range, -1);
2856  for (int32 i = 0; i < split.size; i++) {
2857  // the following assert should always succeed because the
2858  // AddToRowsMulti and CopyToRowsMulti should never have
2859  // duplicate destinations in their indexes.
2860  KALDI_ASSERT(indexes[split.second_value_offsets[i]] >= 0);
2861  indexes[split.second_value_offsets[i]] = i;
2862  }
2863  computation_->indexes.push_back(indexes);
2864  break;
2865  }
2866  default:
2867  KALDI_ERR << "Code error: un-handled case.";
2868  }
2869  }
2870  }
2871  command = split_commands[0];
2872  // note: for now, split_commands.size() will be 1 or 2.
2873  for (size_t i = 1; i < split_commands.size(); i++) {
2874  new_commands_.resize(new_commands_.size() + 1);
2875  // we'll want to insert this command right after command c,
2876  // which is the same as just before command c + 1.
2877  new_commands_.back().first = c + 1;
2878  new_commands_.back().second = split_commands[i];
2879  }
2880  return true; // We made a change.
2881 }
2882 
2884  bool ans = false;
2885  int32 num_commands = computation_->commands.size();
2886  for (int32 c = 0; c < num_commands; c++)
2887  if (SplitCommand(c))
2888  ans = true;
2889  if (!new_commands_.empty())
2890  InsertCommands(&new_commands_, computation_);
2891  return ans;
2892 }
2893 
2894 bool SplitRowOps(NnetComputation *computation) {
2895  RowOpsSplitter splitter(computation);
2896  return splitter.Split();
2897 }
2898 
2899 
2900 /*
2901  This function finds and returns the 'n-stride' of the vector of Indexes, or
2902  returns 0 if it does not exist because the Indexes lack the required regular
2903  structure. This function relates to 'shortcut' compilation and is used in
2904  class IoSpecificationIsDecomposable(). There is an overloaded version of
2905  this function that works with Cindex input, that has almost exactly
2906  the same code.
2907 
2908  It is used to discover regular structure in vectors of indexes. We are
2909  interested in the structure on the 'n' index; in particular, the stride on
2910  the 'n' index. We expect the vector 'indexes' to contain 'n' values of the
2911  form 0, 1, ... N-1 (where the value of N can be obtained easily by looking at
2912  the .n value of the last element of 'indexes'). And we expect the 'n' values
2913  of Indexes that are otherwise the same to be separated by a fixed stride,
2914  which we will return.
2915 
2916  If the stride is inconsistent or one of our other requirements (see below) is
2917  not fulfilled, we will return 0. If it's always consistent and our
2918  requirements are fulfilled we'll return the stride. If 'full_check' is true
2919  we do an exhaustive check for consistency; otherwise we do a randomized
2920  check.
2921 
2922  The full definition of 'consistency' is as follows:
2923 
2924  For some n_stride >= 1 (which we'll return), and with N as the number of
2925  'n' values (which should be numbered 0, 1, ... N-1):
2926 
2927  - For any Index with n < N-1 located at position i, an Index with one
2928  greater 'n' but otherwise the same must exist at position i + n_stride
2929  - For any Index with n > 0 located at position i, an Index with one
2930  smaller 'n' but otherwise the same must exist at position i - n_stride.
2931  - The input must be arranged in blocks of size block_size = n_stride * N,
2932  which these strides never cross. "Strides never cross" is an informal
2933  definition: we can formalize this by saying that for an Index with n == 0
2934  at position i, we must have (i / block_size) == ((i + n_stride*(N-1)) /
2935  block_size), with integer division.
2936  The above conditions imply that the size of the input must be a multiple
2937  of the n-stride.
2938 
2939  Reminder: we return 0 if the regular structure is not found, and the n-stride
2940  if the regular structure is found.
2941 */
static int32 FindNStride(const std::vector<Index> &indexes,
                         bool full_check) {
  // First find candidate stride.  Later we'll check for consistency.
  int32 size = indexes.size();
  KALDI_ASSERT(size > 0);
  // N is the number of distinct 'n' values, inferred from the last element
  // (the Indexes are expected to end with n == N-1; this is verified by the
  // checking phase below).
  int32 N = indexes[size-1].n + 1,
      n_stride = -1;
  if (N <= 1) {
    // we wouldn't be able to determine the stride if N <= 1.
    return 0;
  }
  Index index(indexes[0]);
  if (index.n != 0 || size % N != 0) {
    // for the n stride to be positive, we must start with an index with n == 0.
    // if indexes.size() is not divisible by N, we have no hope of finding the
    // regular structure.
    return 0;
  }
  index.n = 1;
  // First check the two most common strides, which are 1
  // and size / N.
  if (indexes[1] == index) {
    n_stride = 1;
  } else if (indexes[size / N] == index) {
    n_stride = size / N;
  } else {
    int32 stride;
    // try the other possible strides one by one (for subsampling
    // layers of convnets, we might see strides of 2, for instance).
    for (stride = 2; stride < size / N; stride++) {
      if (size % stride == 0 && indexes[stride] == index) {
        n_stride = stride;
        break;
      }
    }
    if (n_stride == -1) {
      // if we fell off the loop then we found no candidates, which is strange
      // and means we did not find the expected structure; we'll return 0 as we
      // failed.
      return 0;
    }
  }
  // Now is the checking phase.

  // to understand block_size, see the comment above this function.
  int32 block_size = n_stride * N;

  // Build the list of positions at which to verify the candidate stride:
  // either every position (full_check) or a small random sample.
  std::vector<int32> indexes_to_check;
  if (full_check) {
    indexes_to_check.resize(size);
    for (int32 i = 0; i < size; i++)
      indexes_to_check[i] = i;
  } else {
    int32 num_to_check = std::min<int32>(5, size);
    indexes_to_check.resize(num_to_check);
    for (int32 j = 0; j < num_to_check; j++)
      indexes_to_check[j] = RandInt(0, size - 1);
    SortAndUniq(&indexes_to_check);
  }
  for (std::vector<int32>::iterator iter = indexes_to_check.begin();
       iter != indexes_to_check.end(); ++iter) {
    int32 i = *iter;
    Index index = indexes[i];
    int32 n = index.n;
    if (n < N - 1) {
      // the same Index with n incremented must appear n_stride positions later.
      index.n = n + 1;
      if (i + n_stride >= size || indexes[i + n_stride] != index)
        return 0;
    }
    if (n == 0) {
      if (i / block_size != (i + n_stride * (N-1)) / block_size) {
        // this is a check that the input divides into blocks of size n_stride *
        // N and the N different versions of the same Index are always within a
        // block (i.e. that the n stride never crosses over the block; having
        // the same Index repeated within different blocks actually would not
        // matter).
        return 0;
      }
    } else { // n > 0
      // the same Index with n decremented must appear n_stride positions
      // earlier.
      index.n = n - 1;
      if (i - n_stride < 0 || indexes[i - n_stride] != index)
        return 0;
    }
  }
  return n_stride;
}
3028 
3029 
// This is almost exactly the same as the version of FindNStride declared above
// that takes a vector of Indexes as input.  Comments have been removed from
// this version; see the other version for documentation.  (The only
// differences are that the 'n' field is accessed via .second, and the
// candidate-search failure is detected via 'stride' rather than 'n_stride').
static int32 FindNStride(const std::vector<Cindex> &cindexes,
                         bool full_check) {
  int32 size = cindexes.size();
  KALDI_ASSERT(size > 0);
  int32 N = cindexes[size-1].second.n + 1,
      n_stride = 0;
  if (N <= 1)
    return 0;
  Cindex cindex(cindexes[0]);
  if (cindex.second.n != 0 || size % N != 0)
    return 0;
  cindex.second.n = 1;
  if (cindexes[1] == cindex) {
    n_stride = 1;
  } else if (cindexes[size / N] == cindex) {
    n_stride = size / N;
  } else {
    int32 stride;
    for (stride = 2; stride < size / N; stride++) {
      if (size % stride == 0 && cindexes[stride] == cindex) {
        n_stride = stride;
        break;
      }
    }
    // if the loop ran to completion, no candidate stride was found.
    if (stride == size / N)
      return 0;
  }
  int32 block_size = n_stride * N;
  std::vector<int32> indexes_to_check;
  if (full_check) {
    indexes_to_check.resize(size);
    for (int32 i = 0; i < size; i++)
      indexes_to_check[i] = i;
  } else {
    int32 num_to_check = std::min<int32>(5, size);
    indexes_to_check.resize(num_to_check);
    for (int32 j = 0; j < num_to_check; j++)
      indexes_to_check[j] = RandInt(0, size - 1);
    SortAndUniq(&indexes_to_check);
  }
  for (std::vector<int32>::iterator iter = indexes_to_check.begin();
       iter != indexes_to_check.end(); ++iter) {
    int32 i = *iter;
    Cindex cindex = cindexes[i];
    int32 n = cindex.second.n;
    if (n < N - 1) {
      cindex.second.n = n + 1;
      if (i + n_stride >= size || cindexes[i + n_stride] != cindex)
        return 0;
    }
    if (n == 0) {
      if (i / block_size != (i + n_stride * (N-1)) / block_size)
        return 0;
    } else {
      cindex.second.n = n - 1;
      if (i - n_stride < 0 || cindexes[i - n_stride] != cindex)
        return 0;
    }
  }
  return n_stride;
}
3094 
3095 
3096 /*
3097  This function, used in shortcut compilation, converts a vector of Indexes
3098  having a range of 'n' values (0, 1, ... old_N - 1), to a vector of
3099  Indexes that's otherwise the same, but has a different range of 'n' values
3100  (0, 1, ... new_N - 1).
3101 
3102  The input vector is expected to have a stride 'n_stride > 0', as
3103  would be returned by FindNStride, and the output vector will be given the
3104  same n-stride.
3105  */
3106 static void ConvertNumNValues(int32 n_stride, int32 old_N, int32 new_N,
3107  const std::vector<Index> &indexes_in,
3108  std::vector<Index> *indexes_out) {
3109  int32 size_in = indexes_in.size();
3110  KALDI_ASSERT(size_in > 0 && indexes_in[size_in - 1].n == old_N - 1);
3111  int32 block_size_in = n_stride * old_N,
3112  block_size_out = n_stride * new_N;
3113 
3114  indexes_out->resize((size_in / old_N) * new_N);
3115  for (int32 i_in = 0; i_in < size_in; i_in++) {
3116  if (indexes_in[i_in].n != 0)
3117  continue;
3118  Index index(indexes_in[i_in]);
3119  int32 block_index = i_in / block_size_in,
3120  offset_within_block = i_in % block_size_in;
3121 
3122 
3123  int32 i_out = block_index * block_size_out +
3124  offset_within_block;
3125  for (int32 n = 0; n < new_N; n++, i_out += n_stride) {
3126  index.n = n;
3127  (*indexes_out)[i_out] = index;
3128  }
3129  }
3130 }
3131 
3132 
3133 
3134 // This class implements the internals of the ExpandComputation() function (used
3135 // in shortcut compilation); see comment by the declaration of
3136 // ExpandComputation() in nnet-optimize-utils.h for overview. (It relates to
3137 // shortcut compilation).
3139  public:
3141  const MiscComputationInfo &misc_info,
3142  const NnetComputation &computation,
3143  bool need_debug_info,
3144  int32 num_n_values,
3145  NnetComputation *expanded_computation):
3146  nnet_(nnet), misc_info_(misc_info),
3147  computation_(computation),
3148  need_debug_info_(need_debug_info),
3149  num_n_values_(num_n_values),
3150  expanded_computation_(expanded_computation) {
3151  KALDI_ASSERT(num_n_values > 2);
3152  }
3153 
3154  // This function call implements the functionality of the class,
3155  // expanding the computation.
3156  void Expand();
3157 
3158  private:
3159  // This function sets up and computes the 'n_stride_' vector (see comment
3160  // by the declaration of 'n_stride_' for what this is.
3161  void InitStrideInfo();
3162 
3163  // This function sets up the 'matrices' vector in 'expanded_computation_'.
3164  // It's quite simple: it just multiplies all the num-rows by num_n_values_ and
3165  // divides by 2, and leaves the num-cols the same.
3166  void ComputeMatrixInfo();
3167 
3168  // This function, only called if need_debug_info_ is true, sets up
3169  // the 'matrix_debug_info' vector in 'expanded_computation_'.
3170  void ComputeDebugInfo();
3171 
3172  // This function sets up the 'submatrices' vector in 'expanded_computation_'.
3173  // Column ranges always stay the same, but for row ranges it's a little
3174  // more complicated.
3175  void ComputeSubmatrixInfo();
3176 
3177  // Expands a command of type kCopyRows or kAddRows; involves adding a new
3178  // element of 'indexes' to expanded_computation_.
3179  void ExpandRowsCommand(const NnetComputation::Command &c_in,
3180  NnetComputation::Command *c_out);
3181 
3182  // Expands a command of type kCopyRowsMulti or kAddRowsMulti, kCopyToRowsMulti
3183  // or kAddToRowsMulti; involves adding a new element of 'indexes_multi' to
3184  // expanded_computation_.
3185  void ExpandRowsMultiCommand(const NnetComputation::Command &c_in,
3186  NnetComputation::Command *c_out);
3187 
3188 
3189  // Expands a command of type kAddRowRanges; involves adding a new element of
3190  // 'indexes_ranges' to expanded_computation_.
3191  void ExpandRowRangesCommand(const NnetComputation::Command &c_in,
3192  NnetComputation::Command *c_out);
3193 
3194 
3195  // This function computes all the PrecomputedIndexes in the
3196  // 'component_precomputed_indexes' member of 'expanded_computation_'.
3197  // They are all generated from scratch, by using the Component::PrecomputedIndexes()
3198  // member function. The 'input_indexes' and 'output_indexes' arguments are worked
3199  // out from the 'debug_info' [if we're not generating debug_info we specially generate
3200  // it for the specific matrices in question], and the 'need_backprop'
3201  // argument is worked out by seeing whether there is a call to Backprop() with
3202  // the same precomputed-indexes element.
3203  void ComputePrecomputedIndexes();
3204 
3205  // Computes the 'commands' member of the output. This function also adds as
3206  // needed to 'indexes', 'indexes_multi' and 'indexes_ranges' in the output.
3207  // Later on we can call RenumberComputation() to remove any duplicates that
3208  // might result from this.
3209  void ComputeCommands();
3210 
3211 
3212  // This command ensure that the debug-info in expanded_computation_ for the
3213  // matrix underlying the submatrix with index 'submatrix_index', exists and is
3214  // set up. In some cases we need the debug info for some matrices in order to
3215  // do the expansion, even if debug info is not requested for the output; in
3216  // those cases we set it up temporarily and clear it before we finish.
3217  void EnsureDebugInfoExists(int32 submatrix_index);
3218 
3219 
3220 
3221  // This function is used in mapping row-indexes into sub-matrices from the
3222  // old to the new computation. It is mostly a wrapper for
3223  // GetNewMatrixLocationInfo, but designed to give row indexes into
3224  // submatrices rather than matrices; see the documentation for
3225  // GetNewMatrixLocationinfo() for details and an explanation of the
3226  // interface.
3227  // This function assumes that ComputeSubmatrixInfo() has already
3228  // been called.
3229  // Note: it returns true if the index 'old_row_index' into submatrix
3230  // indexed 'submat_index' corresponds to an Index with n=0; otherwise
3231  // it returns false and does not set the output values.
3232  // If it returns true, it will set '*new_row_index' to be the row-index
3233  // into the new submatrix, that corresponds to the same Cindex that
3234  // 'old_row_index' points to in the old computation; and it will set
3235  // '*n_stride' to the n stride of the corresponding matrix (this is the
3236  // same in the old and new computations).
3237  bool GetNewSubmatLocationInfo(int32 submat_index,
3238  int32 old_row_index,
3239  int32 *new_row_index,
3240  int32 *n_stride) const;
3241 
3242 
3258  int32 GetNewMatrixLocationInfo(int32 old_matrix_index,
3259  int32 old_row_index) const;
3260 
3261 
3262  // This function 'expands' a set of indexes; it's called from
3263  // ComputePrecomputedIndexes(). The indexes are expected to
3264  // have the normal kind of regularity.
3265  void ExpandIndexes(const std::vector<Index> &indexes,
3266  std::vector<Index> *indexes_expanded) const;
3267 
3268 
3269 
3270  // This 'n_stride_' vector is indexed by the matrix-index in the computation,
3271  // i.e. the same value that you would use to index computation_.matrix_info and
3272  // expanded_computation_->matrix_info. For each matrix-index m > 0 it
3273  // contains the stride of the 'n' values in the Indexes. This is worked out
3274  // from the debug_info of the input computation. For example, if
3275  // the n stride is 3, and we have an Index (n, t, x) = (0, 50, 88) at the
3276  // 11'th row of the matrix, then we would expect to have an Index
3277  // (n, t, x) = (1, 50, 88) at the 11 + 3 = 14'th row of the matrix.
3278  // The input and output computations will always have the same n-stride, so
3279  // there is only one variable.
3280  //
3281  // Let's define num-n-in = 2, and num-n-out = num_n_values_, and suppose
3282  // we're dealing with a matrix that has an n stride of "n-stride".
3283  // We expect the (input, output) matrices to be arranged in blocks of num-rows
3284  // (n-stride * num-n-in), (n-stride * num-n-out) respectively, where
3285  // the n-stride never crosses the block boundaries. We check this.
3286  std::vector<int32> n_stride_;
3287 
3288  const Nnet &nnet_;
3294 };
3295 
3296 
3297 
// Expands a kCopyRows / kAddRows command from the old (n in {0,1}) computation
// to the new computation with num_n_values_ distinct 'n' values.
// NOTE(review): the opening line of this definition (presumably
// "void ComputationExpander::ExpandRowsCommand(" at original line 3298) was
// lost in extraction — confirm against the original source.
3299  const NnetComputation::Command &c_in,
3300  NnetComputation::Command *c_out) {
3301  // we need to expand the row-indexes in c_in.arg3, and put the index of the
3302  // resulting vector<int> in expanded_computation_->indexes, in 'c_out->arg3'.
3303 
3304  int32 s1 = c_in.arg1, s2 = c_in.arg2;
3305 
3306  // The command that gets called is something like
3307  // submat1.AddRows(submat2, indexes) if submat1 is the submatrix referred to in
3308  // 's1' and submat2 is the submatrix referred to in 's2'.
3309  // 'indexes' has the same size as the num-rows of submat1, and the values
3310  // in the vector are row-indexes into s2.
3311  // Note: reading c_out->arg3 here is deliberate; ComputeCommands() copies the
3311  // old command into *c_out before calling this, so c_out->arg3 still holds
3311  // the old index into computation_.indexes.
3311  int32 old_arg3 = c_out->arg3;
3312  c_out->arg3 = expanded_computation_->indexes.size();
3313  c_out->alpha = c_in.alpha;
3314  expanded_computation_->indexes.push_back(std::vector<int32>());
3315  std::vector<int32> &new_indexes = expanded_computation_->indexes.back();
3316  const std::vector<int32> &old_indexes = computation_.indexes[old_arg3];
3317 
3318  int32 old_size = old_indexes.size(),
3319  num_n_values = num_n_values_,
3320  new_s1_size = expanded_computation_->submatrices[s1].num_rows,
3321  new_s2_size = expanded_computation_->submatrices[s2].num_rows;
3322 
3323  KALDI_ASSERT(old_size == computation_.submatrices[s1].num_rows);
3324 
3325  // -1 means "leave this destination row alone"; rows not reached below stay -1.
3325  new_indexes.resize(new_s1_size, -1);
3326 
3327 
3328  // A note on the variable names: i1 and i2 are indexes into the destination
3329  // submatrix and the source submatrix respectively, of the CopyRows or AddRows
3330  // command.
3331  // "n0" in the variable name means that this corresponds to an Index with n==0.
3332  // things without "new" in the name refer to the old computation; things with
3333  // "new" in the name refer to the computation that we are generating.
3334  for (int32 i1 = 0; i1 < old_size; i1++) {
3335  int32 new_i1_n0, n_stride1;
3336  if (GetNewSubmatLocationInfo(s1, i1, &new_i1_n0, &n_stride1)) {
3337  // GetNewSubmatLocationInfo() returns true if this corresponds to
3338  // a Cindex with n == 0.
3339  int32 i2 = old_indexes[i1]; // note: i2 is the row index into submatrix s2.
3340  int32 new_i2_n0, n_stride2;
3341  if (i2 < 0) { // if i2 is -1, we'll just leave any relevant positions in
3342  // 'new_indexes' with -1's in them.
3343  continue;
3344  } else {
3345  bool ans = GetNewSubmatLocationInfo(s2, i2, &new_i2_n0, &n_stride2);
3346  KALDI_ASSERT(ans); // source should also be for n==0, because we don't
3347  // (or at least shouldn't) create computations that
3348  // mix up the 'n' values
3349 
3350  int32 new_i1 = new_i1_n0, new_i2 = new_i2_n0;
3351  // Replicate the (n==0) mapping once per 'n' value, stepping by each
3351  // submatrix's own n-stride.
3351  for (int32 n = 0; n < num_n_values;
3352  ++n, new_i1 += n_stride1, new_i2 += n_stride2) {
3353  KALDI_ASSERT(new_i1 < new_s1_size && new_i2 < new_s2_size);
3354  new_indexes[new_i1] = new_i2;
3355  }
3356  }
3357  }
3358  }
3359 }
3360 
// Expands a kCopyRowsMulti / kAddRowsMulti / kCopyToRowsMulti /
// kAddToRowsMulti command to the new number of 'n' values.
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ExpandRowsMultiCommand(" at original line 3361)
// was lost in extraction — confirm against the original source.
3362  const NnetComputation::Command &c_in,
3363  NnetComputation::Command *c_out) {
3364  // we need to expand the (submatrix,row)-index pairs in c_in.arg2, and put the
3365  // index of the resulting vector<int> in expanded_computation_->indexes_multi,
3366  // in 'c_out->arg2'.
3367 
3368  int32 s1 = c_in.arg1,
3369  num_rows_old = computation_.submatrices[s1].num_rows,
3370  num_rows_new = expanded_computation_->submatrices[s1].num_rows;
3371 
3372  // The old computation has exactly two 'n' values (0 and 1), so the number of
3372  // rows must be divisible by 2.
3372  KALDI_ASSERT(num_rows_old % 2 == 0);
3373  int32 num_n_values = num_n_values_;
3374 
3375  // c_out is a copy of the old command at this point, so c_out->arg2 is the
3375  // old index into computation_.indexes_multi.
3375  int32 old_arg2 = c_out->arg2;
3376  c_out->arg2 = expanded_computation_->indexes_multi.size();
3377  expanded_computation_->indexes_multi.push_back(
3378  std::vector<std::pair<int32, int32> >());
3379  std::vector<std::pair<int32, int32> > &new_indexes_multi =
3380  expanded_computation_->indexes_multi.back();
3381  const std::vector<std::pair<int32, int32> > &old_indexes_multi =
3382  computation_.indexes_multi[old_arg2];
3383  // old_indexes_multi is a vector that has the same size as the num-rows
3384  // of submatrix s1. It contains pairs that are either (-1, -1), or
3385  // pairs (submatrix-index, row-index) referring to other submatrices
3386  // in the computation.
3387 
3388  KALDI_ASSERT(static_cast<int32>(old_indexes_multi.size()) == num_rows_old);
3389 
3390 
3391  new_indexes_multi.resize(num_rows_new,
3392  std::pair<int32,int32>(-1, -1));
3393 
3394  for (int32 i1 = 0; i1 < num_rows_old; i1++) {
3395  int32 new_i1_n0, n_stride1;
3396  if (GetNewSubmatLocationInfo(s1, i1, &new_i1_n0, &n_stride1)) {
3397  // GetNewSubmatLocationInfo() returns true if this corresponds to
3398  // a Cindex with n == 0.
3399  int32 s2 = old_indexes_multi[i1].first,
3400  i2 = old_indexes_multi[i1].second;
3401  int32 new_i2_n0, n_stride2;
3402  if (s2 < 0) { // if s2 is -1, we don't have to do anything... we'd have
3403  // to fill any relevant positions in 'new_indexes_multi'
3404  // with (-1,-1)'s, but it's filled with that by default.
3405  continue;
3406  } else {
3407  bool ans = GetNewSubmatLocationInfo(s2, i2, &new_i2_n0, &n_stride2);
3408  KALDI_ASSERT(ans); // source should also be for n==0, because we don't
3409  // (or at least shouldn't) create computations that
3410  // mix up the 'n' values
3411 
3412  int32 new_i1 = new_i1_n0, new_i2 = new_i2_n0;
3413 
3414  // Replicate the (n==0) pair once per 'n' value; the submatrix index s2
3414  // stays the same, only the row index shifts by n_stride2.
3414  for (int32 n = 0; n < num_n_values;
3415  n++, new_i1 += n_stride1, new_i2 += n_stride2) {
3416  new_indexes_multi[new_i1].first = s2;
3417  new_indexes_multi[new_i1].second = new_i2;
3418  }
3419  }
3420  }
3421  }
3422 }
3423 
3424 
3425 
// Expands a kAddRowRanges command to the new number of 'n' values.
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ExpandRowRangesCommand(" at original line 3426)
// was lost in extraction — confirm against the original source.
3427  const NnetComputation::Command &c_in,
3428  NnetComputation::Command *c_out) {
3429  // we need to expand the pairs of row-indexes in c_in.arg2, and put the index
3430  // of the resulting vector<int> in expanded_computation_->indexes_ranges, in
3431  // 'c_out->arg2'.
3432 
3433  int32 s1 = c_in.arg1, s2 = c_in.arg2,
3434  num_rows_old = computation_.submatrices[s1].num_rows,
3435  num_rows_new = expanded_computation_->submatrices[s1].num_rows;
3436  KALDI_ASSERT(static_cast<size_t>(c_in.arg3) <
3437  computation_.indexes_ranges.size());
3438  int32 num_n_values = num_n_values_;
3439 
3440  // c_out is a copy of the old command at this point, so c_out->arg3 is the
3440  // old index into computation_.indexes_ranges.
3440  int32 old_arg3 = c_out->arg3;
3441  c_out->arg3 = expanded_computation_->indexes_ranges.size();
3442  expanded_computation_->indexes_ranges.push_back(
3443  std::vector<std::pair<int32, int32> >());
3444  std::vector<std::pair<int32, int32> > &new_indexes_ranges =
3445  expanded_computation_->indexes_ranges.back();
3446  const std::vector<std::pair<int32, int32> > &old_indexes_ranges =
3447  computation_.indexes_ranges[old_arg3];
3448  // old_indexes_ranges is a vector that has the same size as the num-rows of
3449  // submatrix s1. It contains pairs that are either two copies of the same
3450  // value (in practice the pair (-1, -1)), or pairs (begin-row-index,
3451  // end-row-index) representing the (begin,end) of a range in submatrix s2.
3452  // Note: end-row-index is one past the end of the range, as for C++ iterators.
3453 
3454  KALDI_ASSERT(static_cast<int32>(old_indexes_ranges.size()) == num_rows_old);
3455 
3456  new_indexes_ranges.resize(num_rows_new,
3457  std::pair<int32,int32>(-1, -1));
3458 
3459  for (int32 i1 = 0; i1 < num_rows_old; i1++) {
3460  int32 new_i1_n0, n_stride1;
3461  if (GetNewSubmatLocationInfo(s1, i1, &new_i1_n0, &n_stride1)) {
3462  // GetNewSubmatLocationInfo() returns true if this corresponds to
3463  // a Cindex with n == 0.
3464  int32 i2_begin = old_indexes_ranges[i1].first,
3465  i2_end = old_indexes_ranges[i1].second;
3466  if (i2_end == i2_begin)
3467  continue; // (-1, -1) pair, meaning an empty range.
3468  // 'new_indexes_ranges' is filled with (-1, -1) pairs as a
3469  // default so we don't have to do anything for these
3470  // elements.
3471  // Map begin and last (= end - 1) separately, since only rows with n==0
3471  // can be looked up; the new end is last + 1.
3471  int32 i2_last = i2_end - 1;
3472  int32 new_i2_n0_begin, new_i2_n0_last,
3473  n_stride2; // only 1 stride variable; both calls will output
3474  // the same value.
3475 
3476  bool ans1 = GetNewSubmatLocationInfo(s2, i2_begin, &new_i2_n0_begin,
3477  &n_stride2),
3478  ans2 = GetNewSubmatLocationInfo(s2, i2_last, &new_i2_n0_last,
3479  &n_stride2);
3480  KALDI_ASSERT(ans1 && ans2 && new_i2_n0_last >= new_i2_n0_begin &&
3481  new_i2_n0_begin >= 0 && n_stride1 > 0 && n_stride2 > 0);
3482  // source should also be for n==0, because we don't (or at least
3483  // shouldn't) create computations that mix up the 'n' values
3484 
3485 
3486  int32 new_i1 = new_i1_n0,
3487  new_i2_begin = new_i2_n0_begin,
3488  new_i2_end = new_i2_n0_last + 1;
3489  for (int32 n = 0; n < num_n_values;
3490  n++, new_i1 += n_stride1, new_i2_begin += n_stride2,
3491  new_i2_end += n_stride2) {
3492  new_indexes_ranges[new_i1].first = new_i2_begin;
3493  new_indexes_ranges[new_i1].second = new_i2_end;
3494  }
3495  }
3496  }
3497 }
3498 
3499 
3500 
// Copies each command of the old computation into the expanded computation,
// rewriting the ones that reference 'indexes', 'indexes_multi' or
// 'indexes_ranges' via the ExpandRows*Command() helpers.
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ComputeCommands() {" at original line 3501) and
// some 'case' lines (original lines 3522, 3534, 3536 — likely
// kBackpropNoModelUpdate and kNoOperationMarker/kNoOperationPermanent) were
// lost in extraction — confirm against the original source.
3502  int32 num_commands = computation_.commands.size();
3503  expanded_computation_->commands.resize(num_commands);
3504  for (int32 command_index = 0; command_index < num_commands;
3505  command_index++) {
3506  const NnetComputation::Command &c = computation_.commands[command_index];
3507  NnetComputation::Command &c_out =
3508  expanded_computation_->commands[command_index];
3509  c_out = c;
3510  // Commands that only operate on submatrices, components and
3511  // precomputed-indexes do not have to be changed because we'll take care of
3512  // the expansion by suitably redefining the matrices and submatrices, and
3513  // recreating the precomputed-indexes.
3514  // However, commands that require, 'indexes', 'indexes_multi' or
3515  // 'indexes_ranges' do need to be modified.
3516  switch (c.command_type) {
3517  case kAllocMatrix:
3518  case kDeallocMatrix:
3519  case kSetConst:
3520  case kSwapMatrix:
3521  case kPropagate: case kBackprop:
3523  break;
3524  case kCopyRows: case kAddRows:
3525  ExpandRowsCommand(c, &c_out);
3526  break;
3527  case kCopyRowsMulti: case kAddRowsMulti:
3528  case kCopyToRowsMulti: case kAddToRowsMulti:
3529  ExpandRowsMultiCommand(c, &c_out);
3530  break;
3531  case kAddRowRanges:
3532  ExpandRowRangesCommand(c, &c_out);
3533  break;
3535  case kAcceptInput: case kProvideOutput: case kNoOperation:
3537  case kNoOperationLabel: case kGotoLabel:
3538  break;
3539  default:
3540  KALDI_ERR << "Un-handled command type";
3541  }
3542  }
3543 }
3544 
3545 
3546 
3547 
// Fills in n_stride_ (one entry per matrix) from the debug-info of the input
// computation, via FindNStride().
// NOTE(review): the opening line (presumably
// "void ComputationExpander::InitStrideInfo() {" at original line 3548) and
// the declaration of 'debug_info' (original line 3559, presumably a reference
// to computation_.matrix_debug_info[m]) were lost in extraction — confirm
// against the original source.
3549  // note: the zeroth matrix is not a real matrix, it's the empty matrix.
3550  int32 num_matrices = computation_.matrices.size();
3551  n_stride_.resize(num_matrices);
3552  n_stride_[0] = 0;
3553 
3554  // the input computation to class ComputationExpander is required to
3555  // have its debug info set up.
3557  for (int32 m = 1; m < num_matrices; m++) {
3558  int32 num_rows = computation_.matrices[m].num_rows;
3560  KALDI_ASSERT(debug_info.cindexes.size() == num_rows);
3561  bool full_check = true; // TODO: eventually change this to false.
3562  int32 n_stride = FindNStride(debug_info.cindexes, full_check);
3563  if (n_stride == 0) {
3564  KALDI_ERR << "Problem encountered in 'shortcut' compilation: the computation "
3565  << "does not have the expected structure. Try compiling with "
3566  << "--use-shortcut=false.";
3567  }
3568  n_stride_[m] = n_stride;
3569  }
3570 }
3571 
3572 
// Top-level driver: runs the expansion steps in dependency order (strides
// first, then matrices / debug-info / submatrices, then precomputed indexes
// and commands).
// NOTE(review): the opening line (presumably
// "void ComputationExpander::Expand() {" at original line 3573) and the
// continuation of the final assignment (original line 3585, presumably
// "computation_.need_model_derivative;") were lost in extraction — confirm
// against the original source.
3574  InitStrideInfo();
3575  ComputeMatrixInfo();
3576  if (need_debug_info_)
3577  ComputeDebugInfo();
3578  else
3579  expanded_computation_->matrix_debug_info.clear();
3580  ComputeSubmatrixInfo();
3581  ComputePrecomputedIndexes();
3582  ComputeCommands();
3583 
3584  expanded_computation_->need_model_derivative =
3586 }
3587 
// Sets up expanded_computation_->matrices: each matrix keeps its column count
// but its num-rows is rescaled from 2 'n' values to num_n_values_.
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ComputeMatrixInfo() {" at original line 3588)
// was lost in extraction — confirm against the original source.
3589  int32 num_matrices = computation_.matrices.size();
3590  expanded_computation_->matrices.resize(num_matrices);
3591  // Matrix zero is a special case; it's the empty matrix.
3592  expanded_computation_->matrices[0] = computation_.matrices[0];
3593  int32 old_num_n_values = 2,
3594  new_num_n_values = num_n_values_;
3595  for (int32 m = 1; m < num_matrices; m++) {
3596  expanded_computation_->matrices[m] = computation_.matrices[m];
3597  expanded_computation_->matrices[m].num_rows =
3598  (computation_.matrices[m].num_rows / old_num_n_values) *
3599  new_num_n_values;
3600  }
3601 }
3602 
// Builds the expanded computation's matrix_debug_info by replicating, for
// each (n==0) Cindex of the old matrices, one Cindex per new 'n' value at the
// appropriate n-stride offset.
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ComputeDebugInfo() {" at original line 3603),
// the right-hand side at original line 3609 (presumably
// computation_.matrix_debug_info[0]) and the declarations at original lines
// 3613-3614 (presumably 'info_in' bound to computation_.matrix_debug_info[m]
// and the start of the 'info_out' declaration) were lost in extraction —
// confirm against the original source.
3604  int32 num_matrices = computation_.matrices.size();
3605  KALDI_ASSERT(computation_.matrix_debug_info.size() == num_matrices);
3606  expanded_computation_->matrix_debug_info.resize(num_matrices);
3607  // Matrix zero is a special case; it's the empty matrix.
3608  expanded_computation_->matrix_debug_info[0] =
3610  int32 num_n_values = num_n_values_;
3611  for (int32 m = 1; m < num_matrices; m++) {
3612  const NnetComputation::MatrixDebugInfo &info_in =
3615  expanded_computation_->matrix_debug_info[m];
3616  info_out.is_deriv = info_in.is_deriv;
3617  int32 num_rows_in = computation_.matrices[m].num_rows,
3618  num_rows_out = expanded_computation_->matrices[m].num_rows;
3619  KALDI_ASSERT(num_rows_in == info_in.cindexes.size());
3620  info_out.cindexes.resize(num_rows_out);
3621  const Cindex *cindexes_in = &(info_in.cindexes[0]);
3622  Cindex *cindexes_out = &(info_out.cindexes[0]);
3623  for (int32 r = 0; r < num_rows_in; r++) {
3624  if (info_in.cindexes[r].second.n == 0) {
3625  int32 new_r = GetNewMatrixLocationInfo(m, r),
3626  n_stride = n_stride_[m];
3627  for (int32 n = 0; n < num_n_values; n++) {
3628  int32 r_out = new_r + n * n_stride;
3629  cindexes_out[r_out] = cindexes_in[r];
3630  // Same (name, t, x) as the source row; only the 'n' value differs.
3630  cindexes_out[r_out].second.n = n;
3631  }
3632  }
3633  }
3634  }
3635 }
3636 
// Sets up expanded_computation_->submatrices, remapping each submatrix's
// row_offset and num_rows into the expanded matrices (column info unchanged).
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ComputeSubmatrixInfo() {" at original line 3637)
// and the declarations at original lines 3643 and 3646 (presumably 'info_in'
// bound to computation_.submatrices[s] and 'debug_info_in' bound to
// computation_.matrix_debug_info[m]) were lost in extraction — confirm
// against the original source.
3638  int32 num_submatrices = computation_.submatrices.size();
3639  expanded_computation_->submatrices.resize(num_submatrices);
3640  // Sub-matrix zero is a special case; it's the empty submatrix.
3641  expanded_computation_->submatrices[0] = computation_.submatrices[0];
3642  for (int32 s = 1; s < num_submatrices; s++) {
3644  int32 m = info_in.matrix_index;
3645  const NnetComputation::MatrixDebugInfo &debug_info_in =
3647 
3648  // we may need to change the row_offset and num_rows.
3649  int32 first_row_in = info_in.row_offset,
3650  last_row_in = first_row_in + info_in.num_rows - 1;
3651  // A well-formed submatrix must start on an n==0 row and end on an n==1
3651  // row (the old computation has 'n' values {0, 1}).
3651  if (!(debug_info_in.cindexes[first_row_in].second.n == 0 &&
3652  debug_info_in.cindexes[last_row_in].second.n == 1)) {
3653  std::ostringstream computation_ss;
3654  std::vector<std::string> submat_strings;
3655  computation_.GetSubmatrixStrings(nnet_, &submat_strings);
3656  computation_.Print(computation_ss, nnet_);
3657  KALDI_ERR << "Submatrix s" << s << " = " << submat_strings[s]
3658  << " has strange dimensions. Computation is: "
3659  << computation_ss.str();
3660  }
3661 
3662  // GetNewMatrixLocationInfo() maps an old n==1 row to the last 'n' value
3662  // of the new computation, so (last_row_out + 1 - first_row_out) is the
3662  // expanded submatrix's row count.
3662  int32 first_row_out = GetNewMatrixLocationInfo(m, first_row_in),
3663  last_row_out = GetNewMatrixLocationInfo(m, last_row_in),
3664  new_num_rows = (last_row_out + 1 - first_row_out);
3665 
3666  NnetComputation::SubMatrixInfo &info_out =
3667  expanded_computation_->submatrices[s];
3668  info_out.matrix_index = m;
3669  info_out.row_offset = first_row_out;
3670  info_out.num_rows = new_num_rows;
3671  info_out.col_offset = info_in.col_offset;
3672  info_out.num_cols = info_in.num_cols;
3673  }
3674 }
3675 
// Regenerates component_precomputed_indexes for the expanded computation by
// expanding each element's input/output indexes and re-running the
// component's PrecomputeIndexes().
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ComputePrecomputedIndexes() {" at original line
// 3676) and the declarations at original lines 3714-3715 (presumably
// 'old_info' bound to computation_.component_precomputed_indexes[p] and the
// start of the 'new_info' declaration) were lost in extraction — confirm
// against the original source.
3677  // for each element of 'component_precomputed_indexes',
3678  // we will try to work out the command-index of the associated
3679  // Propagate() command and of the associated Backprop() command,
3680  // if it exists.
3681  // We expect that each such element will be associated with
3682  // exactly one Propagate() command and at most one Backprop() command.
3683  int32 num_commands = computation_.commands.size(),
3684  num_precomputed_indexes = computation_.component_precomputed_indexes.size();
3685 
3686  std::vector<bool> need_backprop(num_precomputed_indexes, false);
3687 
3688  std::vector<int32> component_index(num_precomputed_indexes, -1);
3689 
3690  // First pass over the commands: for precomputed-indexes slot arg2, record
3690  // which component (arg1) uses it, and whether backprop indexes are needed.
3690  for (int32 command_index = 0; command_index < num_commands; command_index++) {
3691  const NnetComputation::Command &c = computation_.commands[command_index];
3692 
3693  if (c.command_type == kPropagate && c.arg2 > 0) {
3694  KALDI_ASSERT(c.arg2 < num_precomputed_indexes);
3695  component_index[c.arg2] = c.arg1;
3696  }
3697  if ((c.command_type == kBackprop ||
3698  c.command_type == kBackpropNoModelUpdate) && c.arg2 > 0) {
3699  KALDI_ASSERT(c.arg2 < num_precomputed_indexes);
3700  need_backprop[c.arg2] = true;
3701  }
3702  }
3703 
3704  // Free any previously-generated precomputed indexes in the output
3704  // computation before rebuilding them (slot 0 is the empty/unused slot).
3704  for (size_t p = 1;
3705  p < expanded_computation_->component_precomputed_indexes.size();
3706  ++p)
3707  delete expanded_computation_->component_precomputed_indexes[p].data;
3708  expanded_computation_->component_precomputed_indexes.clear();
3709  expanded_computation_->component_precomputed_indexes.resize(
3710  num_precomputed_indexes);
3711 
3712  for (int32 p = 1; p < num_precomputed_indexes; ++p) {
3713  const NnetComputation::PrecomputedIndexesInfo &old_info =
3716  expanded_computation_->component_precomputed_indexes[p];
3717  KALDI_ASSERT(!old_info.input_indexes.empty() &&
3718  !old_info.output_indexes.empty() &&
3719  "Input/output indexes not present in precomputed info of "
3720  "computation to be expanded.");
3721  // note: we could place these expanded indexes into 'new_info.input_indexes'
3722  // and 'new_info.output_indexes', but we actually don't need to keep them
3723  // there, because they are only required to be kept in computations where
3724  // the n indexes consist of the set (0, 1), and the computation we're
3725  // creating has more distinct n indexes than that.
3726  std::vector<Index> input_indexes, output_indexes;
3727  ExpandIndexes(old_info.input_indexes, &input_indexes);
3728  ExpandIndexes(old_info.output_indexes, &output_indexes);
3729  KALDI_ASSERT(component_index[p] >= 0);
3730  const Component *component = nnet_.GetComponent(component_index[p]);
3731  ComponentPrecomputedIndexes *expanded_precomputed_indexes =
3732  component->PrecomputeIndexes(misc_info_, input_indexes,
3733  output_indexes, need_backprop[p]);
3734  // this object should not be null because it was not NULL the
3735  // last time we generated it from the same component, for the
3736  // same computation.
3737  KALDI_ASSERT(expanded_precomputed_indexes != NULL);
3738  new_info.data = expanded_precomputed_indexes;
3739  }
3740 }
3741 
3742 
// Submatrix-level wrapper around GetNewMatrixLocationInfo(): converts the
// submatrix row index to a matrix row index, maps it, and converts back.
// Returns false (without setting outputs) if the row's Index has n != 0.
// NOTE(review): the opening line (presumably
// "bool ComputationExpander::GetNewSubmatLocationInfo(" at original line
// 3743) was lost in extraction — confirm against the original source.
3744  int32 submat_index, int32 old_row_index,
3745  int32 *new_row_index, int32 *n_stride) const {
3746  int32 matrix_index = computation_.submatrices[submat_index].matrix_index,
3747  old_row_offset = computation_.submatrices[submat_index].row_offset,
3748  new_row_offset = expanded_computation_->submatrices[submat_index].row_offset;
3749 
3750  const NnetComputation::MatrixDebugInfo &debug_info_in =
3751  computation_.matrix_debug_info[matrix_index];
3752  if (debug_info_in.cindexes[old_row_index + old_row_offset].second.n != 0)
3753  return false;
3754  *new_row_index = (GetNewMatrixLocationInfo(matrix_index,
3755  old_row_index + old_row_offset) -
3756  new_row_offset);
3757  *n_stride = n_stride_[matrix_index];
3758  return true;
3759 }
3760 
// Maps a row index of an old matrix (with 'n' values {0,1}) to the
// corresponding row index in the expanded matrix (with num_n_values_ 'n'
// values), preserving the block / sub-block structure described below.
// NOTE(review): the opening line (presumably
// "int32 ComputationExpander::GetNewMatrixLocationInfo(" at original line
// 3761) was lost in extraction — confirm against the original source.
3762  int32 matrix_index, int32 old_row_index) const {
3763  // to understand 'block_size', read the comment for FindNStride().
3764  int32 n_stride = n_stride_[matrix_index],
3765  old_num_n_values = 2, new_num_n_values = num_n_values_,
3766  old_block_size = old_num_n_values * n_stride,
3767  new_block_size = new_num_n_values * n_stride,
3768  block_index = old_row_index / old_block_size,
3769  offset_within_block = old_row_index % old_block_size;
3770 
3771  // within each block, we can show, given our assumptions, that
3772  // we must first have a sub-block of 'n_stride' values all with
3773  // n == 0, then another sub-block of 'n_stride' values all with
3774  // n == 1, and so on. [except there is no 'and so on' for the
3775  // input computation, where we expect the 'n' values to be the
3776  // set {0, 1}.]
3777  int32 old_n_value = offset_within_block / n_stride,
3778  index_within_subblock = offset_within_block % n_stride;
3779  const std::vector<Cindex> &cindexes =
3780  computation_.matrix_debug_info[matrix_index].cindexes;
3781  KALDI_ASSERT(old_n_value == cindexes[old_row_index].second.n &&
3782  (old_n_value == 0 || old_n_value == 1));
3783  // Search for CAVEAT in the comment for this function to see what this is
3784  // about. Mapping old_n_value == 1 -> new_n_value == new_num_n_values - 1
3785  // just happens to be useful for the way we use this function... it maps the
3786  // end of an old submatrix to the end of a new submatrix.
3787  int32 new_n_value = (old_n_value == 0 ? 0 : new_num_n_values - 1);
3788 
3789  return block_index * new_block_size + index_within_subblock +
3790  new_n_value * n_stride;
3791 }
3792 
3793 
// Expands a vector of Indexes from 2 'n' values to num_n_values_, using the
// n-stride discovered by FindNStride().
// NOTE(review): the opening line (presumably
// "void ComputationExpander::ExpandIndexes(" at original line 3794) was lost
// in extraction — confirm against the original source.
3795  const std::vector<Index> &indexes,
3796  std::vector<Index> *indexes_expanded) const {
3797  bool full_check = false;
3798  int32 n_stride = FindNStride(indexes, full_check);
3799  KALDI_ASSERT(n_stride > 0);
3800  ConvertNumNValues(n_stride, 2, num_n_values_,
3801  indexes, indexes_expanded);
3802 }
3803 
3804 void ExpandComputation(const Nnet &nnet,
3805  const MiscComputationInfo &misc_info,
3806  const NnetComputation &computation,
3807  bool need_debug_info,
3808  int32 num_n_values,
3809  NnetComputation *expanded_computation) {
3810  ComputationExpander expander(nnet, misc_info, computation,
3811  need_debug_info, num_n_values,
3812  expanded_computation);
3813  expander.Expand();
3814 }
3815 
3816 
3817 
// This helper function is used in RequestIsDecomposable(); you can work out
// what it does, and why, from the documentation of RequestIsDecomposable() in
// the header. This function does basically the same thing, except
// at a lower level, for an IoSpecification rather than a ComputationRequest.
// NOTE(review): the opening line (presumably
// "static bool IoSpecificationIsDecomposable(const IoSpecification &io_spec,"
// at original line 3822) was lost in extraction — confirm against the
// original source.
3823  IoSpecification *mini_io_spec,
3824  int32 *num_n_values_out) {
3825  mini_io_spec->name = io_spec.name;
3826  mini_io_spec->has_deriv = io_spec.has_deriv;
3827  const std::vector<Index> &indexes = io_spec.indexes;
3828  KALDI_ASSERT(!indexes.empty() && "Empty Indexes in computation request");
3829 
3830  bool full_check = true; // We might eventually change this to false, for
3831  // efficiency.
3832  // The number of distinct 'n' values is taken from the last Index; assumes
3832  // 'n' values are 0 .. num_n_values - 1 (checked below via FindNStride()).
3832  int32 num_n_values = indexes.back().n + 1;
3833  if (num_n_values <= 2) {
3834  // Computations with 2 or fewer 'n' values are not decomposable, as there
3835  // would be no speed benefit in shortcut compilation (which relies on
3836  // compiling an otherwise similar computation with n == 2).
3837  return false;
3838  }
3839  *num_n_values_out = num_n_values;
3840 
3841  int32 n_stride = FindNStride(indexes, full_check);
3842 
3843  // n_stride == 0 means the indexes lack the regular structure that shortcut
3843  // compilation relies on.
3843  if (n_stride == 0)
3844  return false;
3845 
3846  // Shrink the request down to just two 'n' values (0 and 1).
3846  ConvertNumNValues(n_stride, num_n_values, 2,
3847  indexes, &(mini_io_spec->indexes));
3848 
3849  return true;
3850 }
3851 
// Decides whether 'request' can be handled by shortcut compilation and, if
// so, builds the shrunk-down request (with two 'n' values) in *mini_request
// and returns the original number of 'n' values in *num_n_values.
// All inputs and outputs must agree on the number of 'n' values.
// NOTE(review): the opening line (presumably
// "bool RequestIsDecomposable(const ComputationRequest &request," at
// original line 3852) was lost in extraction — confirm against the original
// source.
3853  ComputationRequest *mini_request,
3854  int32 *num_n_values) {
3855  size_t num_inputs = request.inputs.size(),
3856  num_outputs = request.outputs.size();
3857  mini_request->inputs.resize(num_inputs);
3858  mini_request->outputs.resize(num_outputs);
3859  mini_request->need_model_derivative = request.need_model_derivative;
3860  mini_request->store_component_stats = request.store_component_stats;
3861  mini_request->misc_info = request.misc_info;
3862 
3863  KALDI_ASSERT(num_inputs != 0 && num_outputs != 0);
3864  for (size_t i = 0; i < num_inputs; i++) {
3865  int32 this_num_n_values = 0;
3866  if (!IoSpecificationIsDecomposable(request.inputs[i],
3867  &(mini_request->inputs[i]),
3868  &this_num_n_values))
3869  return false;
3870  // The first input establishes the expected number of 'n' values; all
3870  // subsequent inputs and outputs must match it.
3870  if (i == 0) {
3871  *num_n_values = this_num_n_values;
3872  } else {
3873  if (this_num_n_values != *num_n_values)
3874  return false; // .. which would be odd.
3875  }
3876  }
3877  for (size_t i = 0; i < num_outputs; i++) {
3878  int32 this_num_n_values = 0;
3879  if (!IoSpecificationIsDecomposable(request.outputs[i],
3880  &(mini_request->outputs[i]),
3881  &this_num_n_values))
3882  return false;
3883  if (this_num_n_values != *num_n_values)
3884  return false; // .. which would be odd.
3885  }
3886  return true;
3887 }
3888 
3889 
3891  public:
3893  NnetComputation *computation):
3894  nnet_(nnet), computation_(computation) { }
3895  bool Optimize();
3896 
3897  private:
3898 
3899  // Figures out the time shift between the successive computation requests.
3900  static int32 FindTimeShift(const NnetComputation &computation);
3901 
3902  // This function creates a mapping from a matrix-index > 0,
3903  // to a pair (unique_id, time_offset) that represents the debug-info
3904  // for that matrix-id in computation.debug_info (these terms are explained
3905  // below).
3906  //
3907  // The output vector 'matrix_to_pair' is indexed by the matrix-index in the
3908  // computation (the zeroth member is not valid).
3909  //
3910  // The 'time_offset' is equal to the 't' value of the first member of the
3911  // cindexes vector for with t != kNoTime. The 'unique_id' is an integer that
3912  // uniquely identifies what we get from subtracting the 'time_offset' from
3913  // each 't' value of that 'cindexes' vector for which t != kNoTime, and then
3914  // pairing it up with the 'is_deriv' value of the DebugInfo. That is, if two
3915  // 'cindexes' vectors differ only by a time offset, and the 'is_deriv' values
3916  // are the same they will map to the same unique_id.
3917  static void CreateMatrixPairs(const NnetComputation &computation,
3918  std::vector<std::pair<int32, int32> > *matrix_to_pair);
3919 
3920  // This helper function, used in CreateMatrixPairs, find the value 't' which
3921  // is the first (*cindexes)[i].second.t that is not kNoTime; it then subtracts
3922  // that 't' value from all (*cindexes)[i].second.t that are not kNoTime. If
3923  // all the 't' values are kNoTime, which we don't expect to happen, we throw
3924  // an error.
3925  static inline int32 NormalizeCindexes(std::vector<Cindex> *cindexes);
3926 
3927 
3928  // This very simple helper function reverses the map 'matrix_to_pair' so we can
3929  // do the reverse lookup. It outputs a map from pair to matrix index m, where
3930  // 1 <= m < matrix_to_pair.size().
3931  static void GetPairToMatrixMap(
3932  std::vector<std::pair<int32, int32> > &matrix_to_pair,
3933  unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > *pair_to_matrix);
3934 
3935 
3936  // Given a vector of lists, one list for each segment, of the active matrices
3937  // at the end of that segment, this function converts those lists into a
3938  // different representation where each matrix is represented as a pair instead
3939  // of as a single int32. 'active_pairs' will have the same dimensions as
3940  // 'active_matrices'.
3941  static void ConvertListsToPairLists(
3942  const std::vector<std::vector<int32> > &active_matrices,
3943  const std::vector<std::pair<int32, int32> > &matrix_to_pair,
3944  std::vector<std::vector<std::pair<int32, int32> > > *active_pairs);
3945 
3946  // This function, used in FindFirstRepeat, tells us whether the two lists a
3947  // and b are the same except for a possible time-shift.
3948  // Each element of a or b is of the form (matrix-unique-index, time-offset).
3949  // Let's suppose we have two pairs p1=(m1, o1) and p2=(m2, o2).
3950  // For p2 to be equal to p1 except for a possible shift of value 'shift', we
3951  // require m2 == m1 and either o2 == o1 + 'shift' or o2 == o1.
3952  // This function returns true if a.size() == b.size() and for each
3953  // i, b[i].first == a[i].first and b[i].second is either
3954  // a[i].second or a[i].second + shift.
3955  static bool ListsAreEqualExceptForPossibleShift(
3956  const std::vector<std::pair<int32, int32> > &a,
3957  const std::vector<std::pair<int32, int32> > &b,
3958  int32 shift);
3959 
3960  // This function looks in the matrix 'active_pairs' for the first pair of
3961  // identical values, i.e. it is looking for i < j for which
3962  // normalized_active_pairs[i] == normalized_active_pairs[j]. (However, the
3963  // pair i,j must satisfy an extra condition, see below). If a pair
3964  // i,j exists satisfying these conditions, this function outputs them to *seg1
3965  // and *seg2, and returns true; otherwise it returns false.
3966  //
3967  // Extra condition:
3968  // It turns out that under some circumstances, we can
3969  // fine repeats that were not "really" repeats (the matrices were not time
3970  // shifted) The situation was a bit obscure (it was a non-recurrent setup with
3971  // a lot of extra-right-context, where some inputs were never used), but to
3972  // prevent it happening again we are now checking in addition to the above,
3973  // that the time-shift between the segments (i.e. time_offsets[j] -
3974  // time_offsets[i]), has the "expected value" based on the assumption that
3975  // each segment should be shifted relative to the previous segment, by
3976  // 'time_shift_per_segment'.
3977  static bool FindFirstRepeat(
3978  const std::vector<std::vector<std::pair<int32, int32> > > &active_pairs,
3979  int32 time_shift_per_segment,
3980  int32 *seg1, int32 *seg2);
3981 
3982 
3983  // 'pair_list1' is the list of active (unique-id, time-offset) pairs for one
3984  // segment of the computation and 'pair_list2' is the same list for a later
3985  // segment. The map 'pair_to_matrix' can convert these back into matrix
3986  // indexes. This function will output two lists of matrices. These will just
3987  // be 'pair_list1' and 'pair_list2' converted back into matrix indexes,
3988  // except we omit pairs which are identical (i.e. the time-offset was zero).
3989  static void GetIdentifiedMatrices(
3990  const std::vector<std::pair<int32, int32> > &pair_list1,
3991  const std::vector<std::pair<int32, int32> > &pair_list2,
3992  const unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > &pair_to_matrix,
3993  std::vector<int32> *matrix_list1,
3994  std::vector<int32> *matrix_list2);
3995 
3996 
3997  // This function just does some checking (via asserts), that
3998  // the lists of matrices 'list1' and 'list2' are of the same length,
3999  // that time_difference > 0, that each matrix with index m = list2[i] is of the
4000  // same dimension as the list1[i], with Cindexes that are the same except for
4001  // the time index being greater by 'time_difference'
4002  static void CheckIdentifiedMatrices(
4003  const NnetComputation &computation,
4004  const std::vector<int32> &list1,
4005  const std::vector<int32> &list2,
4006  int32 time_difference);
4007 
4008 
4009  // Given two command indexes command1 < command2 pointing to commands of type
4010  // kNoOperationMarker, this function modifies the computation by
4011  // removing all commands after command2, replacing command2 with a kGotoLabel
4012  // command pointing to command1 and then inserting just before command1
4013  // a marker of type kNoOperationLabel.
4014  static void FormInfiniteLoop(int32 command1, int32 command2,
4015  NnetComputation *computation);
4016 
4017  // This is to be called after FormInfiniteLoop. It inserts, just before
4018  // the final kGotoLabel command, commands that initialize
4019  // each of the matrices in list 'matrices1' from the corresponding
4020  // matrix in 'matrices2', using the kAllocMatrixFromOther command.
4021  // This effectively does, for example, matrices1[i] = matrices2[i],
4022  // and it's implemented as a shallow swap.
4023  // It does this in such an order that even if the two lists are
4024  // not disjoint, the right thing happens.
4025  static void AddMatrixSwapCommands(
4026  const std::vector<int32> &matrices1,
4027  const std::vector<int32> &matrices2,
4028  NnetComputation *computation);
4029 
4030 
4031  // Called from AddMatrixSwapCommands, this function figures out for us
4032  // an acceptable order in which to execute the kAllocMatrixFromOther
4033  // commands. This is easy to do if matrices1 and matrices2 are disjoint
4034  // sets, but has to be done more carefully if they overlap.
4035  // The output is a list of pairs where each pair (a, b) comes from
4036  // from matrices1 and matrices2 in the same position, i.e.
4037  // a = matrices1[i] and b = matrices2[i].
4038  static void GetMatrixSwapOrder(
4039  const std::vector<int32> &matrices1,
4040  const std::vector<int32> &matrices2,
4041  std::vector<std::pair<int32, int32> > *swaps);
4042 
4043 
4044 
4055  static void FindActiveMatrices(const NnetComputation &computation,
4056  const Analyzer &analyzer,
4057  const std::vector<int32> &splice_point_commands,
4058  std::vector<std::vector<int32> > *active_matrices);
4059 
4060 
4061  const Nnet &nnet_;
4064  std::vector<std::pair<int32, int32> > matrix_to_pair_;
4065 
4066  std::vector<int32> splice_point_commands_;
4067 };
4068 
4069 // static
// Returns the amount by which the time index 't' advances from one segment
// of this looped computation to the next.  It locates the first
// kProvideOutput command of segment 2 and of segment 3 (the first segment is
// skipped; see note below), and compares the 't' values of the Cindexes of
// the two output matrices, asserting that the offset is uniform over rows.
4071  const NnetComputation &computation) {
4072  std::vector<int32> segment_ends;
4073  GetCommandsOfType(computation, kNoOperationMarker, &segment_ends);
4074  KALDI_ASSERT(segment_ends.size() >= 3);
4075  // Ignore the first segment as it tends to be a special case
4076  // (it has more left context),
4077  int32 second_segment_begin = segment_ends[0],
4078  third_segment_begin = segment_ends[1],
4079  fourth_segment_begin = segment_ends[2];
4080  int32 first_output_command_seg2 = -1,
4081  first_output_command_seg3 = -1;
// Find the first kProvideOutput command in segment 2 and in segment 3.
4082  for (int32 c = second_segment_begin; c < third_segment_begin; c++)
4083  if (computation.commands[c].command_type == kProvideOutput &&
4084  first_output_command_seg2 < 0)
4085  first_output_command_seg2 = c;
4086  for (int32 c = third_segment_begin; c < fourth_segment_begin; c++)
4087  if (computation.commands[c].command_type == kProvideOutput &&
4088  first_output_command_seg3 < 0)
4089  first_output_command_seg3 = c;
4090  if (first_output_command_seg2 < 0 ||
4091  first_output_command_seg3 < 0)
4092  KALDI_ERR << "Could not locate output commands for segments 2 and 3.";
4094  &command2 = computation.commands[first_output_command_seg2],
4095  &command3 = computation.commands[first_output_command_seg3];
// Both outputs must be for the same network node, must be whole matrices,
// and must have the same number of rows.
4096  int32 seg2_node = command2.arg2, seg3_node = command3.arg2;
4097  KALDI_ASSERT(seg2_node == seg3_node);
4098  int32 seg2_submatrix = command2.arg1,
4099  seg3_submatrix = command3.arg1;
4100  KALDI_ASSERT(computation.IsWholeMatrix(seg2_submatrix) &&
4101  computation.IsWholeMatrix(seg3_submatrix));
4102  int32 seg2_matrix = computation.submatrices[seg2_submatrix].matrix_index,
4103  seg3_matrix = computation.submatrices[seg3_submatrix].matrix_index;
4104  KALDI_ASSERT(computation.matrices[seg2_matrix].num_rows ==
4105  computation.matrices[seg3_matrix].num_rows);
4106  KALDI_ASSERT(!computation.matrix_debug_info.empty());
4108  &debug_info2 = computation.matrix_debug_info[seg2_matrix],
4109  &debug_info3 = computation.matrix_debug_info[seg3_matrix];
// The shift is the 't' difference of the first row; assert it is the same
// for every row.
4110  int32 t_offset = debug_info3.cindexes[0].second.t -
4111  debug_info2.cindexes[0].second.t;
4112  int32 num_rows = debug_info2.cindexes.size();
4113  for (int32 r = 0; r < num_rows; r++) {
4114  KALDI_ASSERT(debug_info3.cindexes[r].second.t ==
4115  debug_info2.cindexes[r].second.t + t_offset);
4116  }
4117  return t_offset;
4118 }
4119 
4120 // static inline
// Subtracts a constant from all valid (!= kNoTime) time indexes in
// 'cindexes' so that the first valid 't' becomes zero, and returns that
// constant.  This puts the vector in a canonical form, so vectors that
// differ only by a time offset compare equal.
4122  std::vector<Cindex> *cindexes) {
4123  std::vector<Cindex>::iterator iter = cindexes->begin(),
4124  end = cindexes->end();
4125  int32 ans;
// Find the first valid time value; it becomes the offset to subtract.
4126  for (; iter != end; iter++) {
4127  if (iter->second.t != kNoTime) {
4128  ans = iter->second.t;
4129  break;
4130  }
4131  }
4132  if (iter == end) {
4133  // this should not happen.
4134  KALDI_ERR << "All t values are kNoTime in matrix.";
4135  }
4136  iter = cindexes->begin();
// Subtract the offset from every valid time value; kNoTime entries are
// left untouched.
4137  for (; iter != end; iter++)
4138  if (iter->second.t != kNoTime)
4139  iter->second.t -= ans;
4140  return ans;
4141 }
4142 
4143 // static
// For each matrix m > 0 of the computation, computes a pair
// (unique_id, t_offset): matrices whose Cindex vectors are identical after
// time-normalization (see NormalizeCindexes) and that share the same
// 'is_deriv' flag receive the same unique_id; t_offset is the offset removed
// by the normalization.  Requires matrix_debug_info to be present.
4145  const NnetComputation &computation,
4146  std::vector<std::pair<int32, int32> > *matrix_to_pair) {
4147  typedef unordered_map<std::vector<Cindex>, int32,
4148  CindexVectorHasher> MapType;
4149  int32 cur_vector_id = 1;
4150  // Note: cindex_map just maps the vector<Cindex> to a unique value,
4151  // and then we manually work out a unique id that takes into
4152  // account the 'is_deriv' values.
4153  MapType cindex_map;
4154  int32 num_matrices = computation.matrices.size();
4155  matrix_to_pair->resize(num_matrices);
4156  KALDI_ASSERT(computation.matrix_debug_info.size() == num_matrices);
4157  for (int32 m = 1; m < num_matrices; m++) {
4158  KALDI_ASSERT(!computation.matrix_debug_info[m].cindexes.empty());
// Work on a copy so the computation's debug info is not modified.
4159  std::vector<Cindex> cindexes = computation.matrix_debug_info[m].cindexes;
4160  int32 t_offset = NormalizeCindexes(&cindexes);
4161  MapType::const_iterator iter = cindex_map.find(cindexes);
4162  int32 vector_id;
4163  if (iter != cindex_map.end()) {
4164  vector_id = iter->second;
4165  } else {
4166  vector_id = cur_vector_id++;
4167  cindex_map[cindexes] = vector_id;
4168  }
4169  bool is_deriv = computation.matrix_debug_info[m].is_deriv;
// Fold the is_deriv flag into the id, so that a value matrix and its
// derivative matrix get distinct unique ids.
4170  int32 unique_id = 2 * vector_id + (is_deriv ? 1 : 0);
4171  (*matrix_to_pair)[m].first = unique_id;
4172  (*matrix_to_pair)[m].second = t_offset;
4173  }
4174 }
4175 
4176 // static
// Inverts 'matrix_to_pair': builds a map from each (unique_id, t_offset)
// pair back to its matrix index.  Matrix 0 is skipped since it is not a real
// matrix.  NOTE(review): if two matrices mapped to the same pair, the later
// one would silently overwrite the earlier; presumably pairs are unique at
// this point — verify against CreateMatrixPairs' construction.
4178  std::vector<std::pair<int32, int32> > &matrix_to_pair,
4179  unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > *pair_to_matrix) {
4180  int32 num_matrices = matrix_to_pair.size();
4181  // actually there are one fewer matrices than num_matrices.
4182  pair_to_matrix->clear();
4183  for (int32 m = 1; m < num_matrices; m++)
4184  (*pair_to_matrix)[matrix_to_pair[m]] = m;
4185 }
4186 
4187 
4188 // static
// Converts the per-splice-point lists of active matrix indexes into the
// (unique_id, t_offset) pair representation, by looking each matrix index up
// in 'matrix_to_pair' (as created by CreateMatrixPairs).
4190  const std::vector<std::vector<int32> > &active_matrices,
4191  const std::vector<std::pair<int32, int32> > &matrix_to_pair,
4192  std::vector<std::vector<std::pair<int32, int32> > > *active_pairs) {
4193  active_pairs->clear();
4194  active_pairs->resize(active_matrices.size());
4195  int32 num_matrices = matrix_to_pair.size();
4196  for (size_t seg = 0; seg < active_matrices.size(); seg++) {
4197  const std::vector<int32> &this_active_matrix_list = active_matrices[seg];
4198  std::vector<std::pair<int32, int32> > &this_active_pair_list =
4199  (*active_pairs)[seg];
4200  this_active_pair_list.resize(this_active_matrix_list.size());
4201  std::vector<int32>::const_iterator iter = this_active_matrix_list.begin(),
4202  end = this_active_matrix_list.end();
4203  std::vector<std::pair<int32, int32> >::iterator
4204  out_iter = this_active_pair_list.begin();
4205  for (; iter != end; ++iter, ++out_iter) {
4206  KALDI_ASSERT(*iter > 0 && *iter < num_matrices);
4207  *out_iter = matrix_to_pair[*iter];
4208  }
4209  }
4210 }
4211 
4212 // static
// Returns true if the pair-lists 'a' and 'b' have the same length, identical
// unique_ids in each position, and time offsets in 'b' that are either equal
// to the corresponding offset in 'a' or greater by exactly 'shift'.  (The
// equal-offset case presumably covers matrices whose time indexes do not
// advance between segments — TODO confirm.)
4214  const std::vector<std::pair<int32, int32> > &a,
4215  const std::vector<std::pair<int32, int32> > &b,
4216  int32 shift) {
4217  size_t size = a.size();
4218  if (b.size() != size)
4219  return false;
4220  for (size_t i = 0; i < size; i++) {
4221  const std::pair<int32, int32> &p1 = a[i],
4222  &p2 = b[i];
4223  if (p1.first != p2.first)
4224  return false;
// Allow either a shift by exactly 'shift', or no shift at all.
4225  if (p2.second != p1.second + shift && p2.second != p1.second)
4226  return false;
4227  }
4228  return true;
4229 }
4230 
4231 // static
// Searches for the first pair of splice points (s < t) whose active
// pair-lists are identical except that time offsets may be shifted by
// (t - s) * time_shift_per_segment.  On success sets *seg1, *seg2 and
// returns true; otherwise returns false.
4233  const std::vector<std::vector<std::pair<int32, int32> > > &active_pairs,
4234  int32 time_shift_per_segment,
4235  int32 *seg1, int32 *seg2) {
4236  int32 num_segments = active_pairs.size();
4237  // This algorithm may seem like it would be very slow, but the number of
4238  // segments will normally be quite small (e.g. 10), and the comparison of
4239  // elements of 'active_pairs' should be fast in cases where they
4240  // differ.
4241  KALDI_ASSERT(num_segments >= 2);
4242 
4243  for (int32 s = 0; s < num_segments; s++) {
4244  for (int32 t = s + 1; t < num_segments; t++) {
4245  if (ListsAreEqualExceptForPossibleShift(active_pairs[s],
4246  active_pairs[t],
4247  (t - s) * time_shift_per_segment)) {
4248  *seg1 = s;
4249  *seg2 = t;
4250  return true;
4251  }
4252  }
4253  }
4254  return false;
4255 }
4256 
4257 // static
// Converts the two pair-lists (for the two chosen splice points) back into
// lists of matrix indexes via 'pair_to_matrix', omitting positions whose two
// pairs have equal time offsets: those matrices are identical between the
// two splice points and need no swapping.
4259  const std::vector<std::pair<int32, int32> > &pair_list1,
4260  const std::vector<std::pair<int32, int32> > &pair_list2,
4261  const unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > &pair_to_matrix,
4262  std::vector<int32> *matrix_list1,
4263  std::vector<int32> *matrix_list2) {
4264  size_t size = pair_list1.size();
4265  KALDI_ASSERT(pair_list2.size() == size);
4266  matrix_list1->clear();
4267  matrix_list2->clear();
4268  matrix_list1->reserve(size);
4269  matrix_list2->reserve(size);
4270  std::vector<std::pair<int32, int32> >::const_iterator
4271  iter1 = pair_list1.begin(), end1 = pair_list1.end(),
4272  iter2 = pair_list2.begin();
4273  for (; iter1 != end1; ++iter1, ++iter2) {
4274  if (iter1->second == iter2->second)
4275  continue;
4276  // skip those that have no time shift, we won't have to do any swapping for
4277  // those.
4278  unordered_map<std::pair<int32, int32>, int32,
4279  PairHasher<int32> >::const_iterator
4280  map_iter1 = pair_to_matrix.find(*iter1),
4281  map_iter2 = pair_to_matrix.find(*iter2);
4282  if (map_iter1 == pair_to_matrix.end() ||
4283  map_iter2 == pair_to_matrix.end())
4284  KALDI_ERR << "Could not find pair in map (code error)";
4285  matrix_list1->push_back(map_iter1->second);
4286  matrix_list2->push_back(map_iter2->second);
4287  }
4288 }
4289 
4290 
4291 
4292 // static
// For each command index in 'splice_point_commands', outputs (in
// *active_matrices) the sorted list of matrices that are 'active' there: a
// matrix is active at a splice point if its first nontrivial access comes
// strictly before that command and its last access comes strictly after it.
4294  const NnetComputation &computation,
4295  const Analyzer &analyzer,
4296  const std::vector<int32> &splice_point_commands,
4297  std::vector<std::vector<int32> > *active_matrices) {
4298  int32 num_matrices = computation.matrices.size();
4299  int32 num_splice_points = splice_point_commands.size();
4300  active_matrices->clear();
4301  active_matrices->resize(num_splice_points);
4302  // this object just makes available some extra functions, vs. the Analyzer
4303  // object.
4304  ComputationAnalysis analysis(computation, analyzer);
4305  KALDI_ASSERT(IsSortedAndUniq(splice_point_commands));
4306 
4307  // the following vector gives us, for each matrix index, a submatrix index
4308  // that covers the whole of that matrix (needed by interface of 'analysis' object).
4309  std::vector<int32> whole_submatrices;
4310  computation.GetWholeSubmatrices(&whole_submatrices);
4311  for (int32 m = 1; m < num_matrices; m++) {
4312  // the following are command indexes, comparable with the indexes
4313  // in 'splice_point_commands'.
4314  int32 s = whole_submatrices[m], // submatrix consisting of the whole of
4315  // 'm'.
4316  first_access = analysis.FirstNontrivialAccess(s),
4317  last_access = analysis.LastAccess(s);
4318  for (int32 i = 0; i < num_splice_points; i++) {
4319  int32 splice_point = splice_point_commands[i];
4320  if (first_access < splice_point && last_access > splice_point) {
4321  // If the block of time during which the matrix is accessed, includes
4322  // this command index, then the matrix is considered 'active' at this
4323  // time.
4324  (*active_matrices)[i].push_back(m);
4325  }
4326  }
4327  }
4328 }
4329 
4330 // static
// Sanity-checks (via asserts) that list1 and list2 pair up matrices of
// identical dimension, stride type and is_deriv flag, whose Cindexes match
// exactly except that the time index of each list2 entry is greater by
// 'time_difference' (entries with t == kNoTime must match exactly).
4332  const NnetComputation &computation,
4333  const std::vector<int32> &list1,
4334  const std::vector<int32> &list2,
4335  int32 time_difference) {
4336  KALDI_ASSERT(time_difference > 0);
4337  KALDI_ASSERT(list1.size() == list2.size());
4338  KALDI_ASSERT(!computation.matrix_debug_info.empty());
4339  for (size_t i = 0; i < list1.size(); i++) {
4340  int32 m1 = list1[i], m2 = list2[i];
4342  &matrix_info1 = computation.matrices[m1],
4343  &matrix_info2 = computation.matrices[m2];
4344  KALDI_ASSERT(matrix_info1.num_rows == matrix_info2.num_rows &&
4345  matrix_info1.num_cols == matrix_info2.num_cols &&
4346  matrix_info1.stride_type == matrix_info2.stride_type);
4348  &debug_info1 = computation.matrix_debug_info[m1],
4349  &debug_info2 = computation.matrix_debug_info[m2];
4350  KALDI_ASSERT(debug_info1.is_deriv == debug_info2.is_deriv);
4351  KALDI_ASSERT(debug_info1.cindexes.size() == debug_info2.cindexes.size());
4352  std::vector<Cindex>::const_iterator iter1 = debug_info1.cindexes.begin(),
4353  end1 = debug_info1.cindexes.end(),
4354  iter2 = debug_info2.cindexes.begin();
4355  for (; iter1 != end1; iter1++,iter2++) {
4356  KALDI_ASSERT(iter2->first == iter1->first &&
4357  iter2->second.n == iter1->second.n &&
4358  ((iter1->second.t == kNoTime && iter2->second.t == kNoTime) ||
4359  iter2->second.t == iter1->second.t + time_difference) &&
4360  iter2->second.x == iter1->second.x);
4361  }
4362  }
4363 }
4364 
4365 
4366 // static
// Works out an order in which the swaps (matrices1[i], matrices2[i]) can
// safely be executed: a swap whose source m1 still appears in 'matrices2'
// (i.e. would clobber a matrix another swap has not yet consumed) is
// deferred until that other swap has been done.  Relies on 'matrices2'
// being sorted, since it searches it with std::lower_bound.
4368  const std::vector<int32> &matrices1,
4369  const std::vector<int32> &matrices2,
4370  std::vector<std::pair<int32, int32> > *swaps) {
4371  KALDI_ASSERT(matrices1.size() == matrices2.size());
4372  swaps->clear();
4373  int32 num_matrices = matrices1.size();
4374  std::vector<bool> processed(num_matrices, false);
4375  std::vector<int32> queue;
4376 
4377  // num_loops is just for infinite-loop detection.
4378  int32 num_loops = 0;
4379  for (; static_cast<int32>(swaps->size()) < num_matrices; num_loops++) {
4380  for (int32 i = 0; i < num_matrices; i++) {
4381  if (processed[i])
4382  continue;
4383  int32 m1 = matrices1[i], m2 = matrices2[i];
4384  std::vector<int32>::const_iterator iter =
4385  std::lower_bound(matrices2.begin(), matrices2.end(), m1);
4386  if (iter == matrices2.end() || *iter != m1) {
4387  // Matrix m1 does not appear in the list 'matrices2', so
4388  // we are safe to process it at any time.
4389  swaps->push_back(std::pair<int32,int32>(m1, m2));
4390  processed[i] = true;
4391  } else {
4392  int32 m1_pos_in_matrices2 = iter - matrices2.begin();
4393  if (processed[m1_pos_in_matrices2]) {
4394  // We're safe to do this swap now, because the matrix m1 has already
4395  // appeared on the RHS of a swap, and by this point has been
4396  // deallocated, in effect.
4397  swaps->push_back(std::pair<int32,int32>(m1, m2));
4398  processed[i] = true;
4399  }
4400  // else do nothing, we cannot process m1 yet because
4401  // at this point in the computation it is still allocated.
4402  }
4403  }
4404  // The following assert is to check that we don't loop infinitely. We can
4405  // prove that infinite looping won't happen, after on proving that there can
4406  // be no cycles like (m1, m2), (m2, m3), (m3, m1) (the length of 3 is chosen
4407  // arbitrarily as an example). If such a cycle existed, we can reach a
4408  // contradiction based on the time-index (t) of the first cindex in m1.
4409  // Define t1 = that time index, t2 the same for m2, t3 the same for m3. The
4410  // existence of the three pairs [as pairs like (matrices1[i], matrices2[i])]
4411  // implies that t2 > t1, t3 > t2, and t1 > t3 respectively, but this is
4412  // impossible.
4413  // This shows that all chains of dependencies must terminate.
4414  KALDI_ASSERT(num_loops <= num_matrices);
4415  }
4416 }
4417 
4418 // static
// Inserts matrix-swap commands, in the safe order computed by
// GetMatrixSwapOrder, just before the final kGotoLabel command, so that at
// the end of each loop iteration the matrices in 'matrices1' receive the
// contents of the corresponding matrices in 'matrices2' via shallow swaps.
4420  const std::vector<int32> &matrices1,
4421  const std::vector<int32> &matrices2,
4422  NnetComputation *computation) {
4423  std::vector<std::pair<int32, int32> > swaps;
4424  // Note: in 'easy' cases where matrices1 and matrices2 are disjoint,
4425  // 'swaps' will just be the vector { (matrices1[0],matrices2[0]),
4426  // (matrices1[1],matrices2[1]), ... },
4427  // but in some cases these may need to get reordered.
4428  GetMatrixSwapOrder(matrices1, matrices2, &swaps);
4429 
// Temporarily pop the final kGotoLabel command; it is re-appended after the
// swap commands below.
4430  NnetComputation::Command goto_label_command = computation->commands.back();
4431  KALDI_ASSERT(goto_label_command.command_type == kGotoLabel);
4432  computation->commands.pop_back();
4433 
4434  // the following vector gives us, for each matrix index, a submatrix index
4435  // that covers the whole of that matrix (needed because the commands
4436  // require submatrix indexes)
4437  std::vector<int32> whole_submatrices;
4438  computation->GetWholeSubmatrices(&whole_submatrices);
4439  size_t num_matrices = whole_submatrices.size();
4440 
4441  for (size_t i = 0; i < swaps.size(); i++) {
4442  int32 m1 = swaps[i].first, m2 = swaps[i].second;
4443  KALDI_ASSERT(static_cast<size_t>(m1) < num_matrices &&
4444  static_cast<size_t>(m2) < num_matrices);
4445  int32 s1 = whole_submatrices[m1], s2 = whole_submatrices[m2];
4446  computation->commands.push_back(
4448  }
4449  computation->commands.push_back(goto_label_command);
4450 }
4451 
4452 // static
// Turns the computation into an infinite loop: removes all commands after
// 'command2', replaces 'command2' with a kGotoLabel jumping to 'command1',
// and inserts a kNoOperationLabel command at position 'command1'.  Both
// commands must currently be of type kNoOperationPermanent.
4454  int32 command1, int32 command2,
4455  NnetComputation *computation) {
4456  KALDI_ASSERT(static_cast<int32>(computation->commands.size()) >=
4457  command2 + 1 && command1 < command2);
4458  KALDI_ASSERT(
4459  computation->commands[command1].command_type == kNoOperationPermanent &&
4460  computation->commands[command2].command_type == kNoOperationPermanent);
4461  // Remove any commands after 'command2'.
4462  computation->commands.resize(command2 + 1);
4463  computation->commands[command2].command_type = kGotoLabel;
4464  computation->commands[command2].arg1 = command1;
4466  computation->commands.insert(computation->commands.begin() + command1,
4467  c);
4468  // Now the kNoOperationLabel command is at position 'command1'.
4469 }
4470 
4471 
4472 
// Top-level routine of the looped-computation optimization: finds two splice
// points whose sets of active matrices are identical up to a constant time
// shift, forms an infinite loop between them, and inserts matrix-swap
// commands so each iteration starts from the correct state.  Returns true
// on success; returns false (leaving the computation unmodified) if no
// suitable repeat is found.
4474  analyzer_.Init(nnet_, *computation_);
4476  "You must request matrix debug info when compiling "
4477  "looped computations.");
4478 
4479  // get the indexes of potential splice points, one per segment of the
4480  // computation. We locate the splice points where kNoOperationPermanent is.
4481  // This is guaranteed to be after the inputs have been received, and before
4482  // the bulk of the computation in the segment, and of course before we provide
4483  // the output. It happens that by choosing this as the splice point we avoid
4484  // certain problems that would arise, for instance, if we chose the segment
4485  // boundaries (kNoOperationMarker).
4486  std::vector<int32> splice_points;
4488  &splice_points);
4489  int32 time_shift_per_segment = FindTimeShift(*computation_);
4490 
4491 
4492  std::vector<std::vector<int32> > active_matrices;
4493  // Find the list of matrices active at each of the potential splice points.
4494  FindActiveMatrices(*computation_, analyzer_, splice_points,
4495  &active_matrices);
4496 
4497  // Find a representation of the matrices of the computation as pairs
4498  // (unique_id, time_offset) that are more amenable to finding
4499  // matrices that represent lists of Cindexes that differ only by
4500  // a time offset.
4501  std::vector<std::pair<int32, int32> > matrix_to_pair;
4502  CreateMatrixPairs(*computation_, &matrix_to_pair);
4503 
4504  // Create the reverse map from pair to matrix index; we'll need it later.
4505  unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > pair_to_matrix;
4506  GetPairToMatrixMap(matrix_to_pair, &pair_to_matrix);
4507 
4508  // get lists of matrices per splice-point in the pair representation.
4509  std::vector<std::vector<std::pair<int32, int32> > > pair_lists;
4510  ConvertListsToPairLists(active_matrices, matrix_to_pair,
4511  &pair_lists);
4512 
4513  // Note: seg1 and seg2 are indexes into 'splice_points', representing
4514  // potential splice points (located near the beginnings of segments).
4515  int32 seg1, seg2;
4516  if (!FindFirstRepeat(pair_lists,
4517  time_shift_per_segment,
4518  &seg1, &seg2)) {
4519  KALDI_VLOG(2) << "Could not find repeats of variables.";
4520  return false;
4521  }
4522 
4523  std::vector<int32> seg1_matrices, seg2_matrices;
4524  GetIdentifiedMatrices(pair_lists[seg1], pair_lists[seg2],
4525  pair_to_matrix,
4526  &seg1_matrices, &seg2_matrices);
4527 
4528  int32 time_difference = time_shift_per_segment * (seg2 - seg1);
4529  CheckIdentifiedMatrices(*computation_, seg1_matrices, seg2_matrices,
4530  time_difference);
4531 
4532  FormInfiniteLoop(splice_points[seg1], splice_points[seg2], computation_);
4533 
4534  AddMatrixSwapCommands(seg1_matrices, seg2_matrices, computation_);
4535 
4537 
4539 
4540  return true;
4541 }
4542 
4543 
// Public entry point of the looped-computation optimization: constructs a
// ComputationLoopedOptimizer and runs it.  The return value of Optimize()
// is discarded; on failure the computation is simply left as it was.
4545  NnetComputation *computation) {
4546  ComputationLoopedOptimizer optimizer(nnet, computation);
4547  optimizer.Optimize();
4548 }
4549 
4550 
4551 
4552 void FixGotoLabel(NnetComputation *computation) {
4553  int32 num_commands = computation->commands.size();
4554  if (num_commands == 0)
4555  return;
4556  for (int32 c = num_commands - 1; c >= 0; c--) {
4557  if (computation->commands[c].command_type == kGotoLabel) {
4558  int32 dest_command = computation->commands[c].arg1;
4559  if (static_cast<size_t>(dest_command) < computation->commands.size() &&
4560  computation->commands[dest_command].command_type == kNoOperationLabel)
4561  return; // nothing to fix.
4562  for (int32 d = 0; d + 1 < num_commands; d++) {
4563  if (computation->commands[d].command_type == kNoOperationLabel) {
4564  computation->commands[c].arg1 = d;
4565  return;
4566  }
4567  }
4568  KALDI_ERR << "Label not found.";
4569  } else if (computation->commands[c].command_type == kProvideOutput) {
4570  // sometimes kProvideOutput commands are temporarily ordered after
4571  // the kGotoLabel command, and we need to work in that case.
4572  continue;
4573  } else {
4574  // it loks like there is no 'goto' command in this computation-
4575  // if there were, it would be right at the end, possibly followed by
4576  // kProvideOutput commands.
4577  break;
4578  }
4579  }
4580 }
4581 
4582 bool MatrixIsUnused(const Analyzer &analyzer,
4583  const NnetComputation &computation,
4584  int32 m) {
4585  const MatrixAccesses &accesses = analyzer.matrix_accesses[m];
4586  if (accesses.is_input || accesses.is_output)
4587  return false;
4588  for (size_t i = 0; i < accesses.accesses.size(); i++) {
4589  int32 command_index = accesses.accesses[i].command_index;
4590  const NnetComputation::Command &command =
4591  computation.commands[command_index];
4592  if (command.command_type != kNoOperation &&
4593  command.command_type != kSetConst) {
4594  return false;
4595  }
4596  }
4597  return true;
4598 }
4599 
// Removes the remaining commands relating to matrix 'm' (which should be
// unused in the sense of MatrixIsUnused(); the asserts below allow only
// kNoOperation and kSetConst accesses): its allocation, deallocation and
// access commands are all turned into kNoOperation.
4601  int32 m,
4602  NnetComputation *computation) {
4603  const MatrixAccesses &accesses = analyzer.matrix_accesses[m];
4604  if (accesses.allocate_command >= 0) {
4605  NnetComputation::Command &command = computation->commands[
4606  accesses.allocate_command];
4607  KALDI_ASSERT(command.command_type == kNoOperation ||
4608  command.command_type == kAllocMatrix);
4609  command.command_type = kNoOperation;
4610  }
4611  if (accesses.deallocate_command >= 0) {
4612  NnetComputation::Command &command = computation->commands[
4613  accesses.deallocate_command];
4614  KALDI_ASSERT(command.command_type == kNoOperation ||
4615  command.command_type == kDeallocMatrix);
4616  command.command_type = kNoOperation;
4617  }
4618  for (size_t i = 0; i < accesses.accesses.size(); i++) {
4619  int32 command_index = accesses.accesses[i].command_index;
4620  NnetComputation::Command &command = computation->commands[command_index];
4621  KALDI_ASSERT(command.command_type == kNoOperation ||
4622  command.command_type == kSetConst);
4623  command.command_type = kNoOperation;
4624  }
4625 }
4626 
4627 
4628 
4629 // This comparison operator is used in the function InsertCommands()
4630 // to sort a list of these pairs by the .first element.  It is used with
4630 // std::stable_sort, so pairs with equal .first values keep their original
4630 // relative order.
4632  // operator () should be viewed as a '<' operator that only looks at
4633  // the .first element, treating the .second elements as equal.
4634  bool operator () (const std::pair<int32, NnetComputation::Command> &p1,
4635  const std::pair<int32, NnetComputation::Command> &p2) const {
4636  return p1.first < p2.first;
4637  }
4638 };
4639 
// Inserts the commands in 'new_commands' into '*computation'.  Each element
// is a pair (c, command), meaning: insert 'command' immediately before the
// original command with index c (c == num-old-commands means "at the end").
// Elements with equal c are inserted in the order given.  Calls
// FixGotoLabel() afterwards, since insertion can move the kNoOperationLabel
// command that a kGotoLabel points to.
4641  std::vector<std::pair<int32, NnetComputation::Command> > *new_commands,
4642  NnetComputation *computation) {
4643  int32 num_new_commands = new_commands->size(),
4644  num_old_commands = computation->commands.size();
4645  if (num_new_commands == 0)
4646  return;
4647  CommandPairComparator comparison_operator;
4648  // use std::stable_sort so that for entries in 'new_commands' that
4649  // have the same .first value, they stay in the same order they were
4650  // in before sorting.
4651  std::stable_sort(new_commands->begin(), new_commands->end(),
4652  comparison_operator);
4653 
4654  if (RandInt(0, 3) == 0) { // check 'new_commands'
4655  for (int32 i = 0; i + 1 < num_new_commands; i++) {
4656  KALDI_ASSERT((*new_commands)[i].first <= (*new_commands)[i+1].first &&
4657  (*new_commands)[i].first >= 0 &&
4658  (*new_commands)[i+1].first <= num_old_commands);
4659  }
4660  }
4661  std::vector<NnetComputation::Command> merged_commands;
4662  merged_commands.reserve(num_old_commands + num_new_commands);
4663 
4664  std::vector<std::pair<int32, NnetComputation::Command> >::const_iterator
4665  new_commands_iter = new_commands->begin(),
4666  new_commands_end = new_commands->end();
4667 
// Merge: before emitting old command i, emit all new commands whose
// insertion index is <= i.  The loop goes one index past the end so that
// commands to be appended at the very end are also emitted.
4668  for (int32 old_command_index = 0; old_command_index <= num_old_commands;
4669  old_command_index++) {
4670  while (new_commands_iter != new_commands_end &&
4671  new_commands_iter->first <= old_command_index) {
4672  merged_commands.push_back(new_commands_iter->second);
4673  ++new_commands_iter;
4674  }
4675  if (old_command_index < num_old_commands)
4676  merged_commands.push_back(computation->commands[old_command_index]);
4677  }
4678  KALDI_ASSERT(merged_commands.size() == num_old_commands +
4679  num_new_commands);
4680  // copy to 'computation->commands' via shallow swap.
4681  computation->commands.swap(merged_commands);
4682  FixGotoLabel(computation);
4683 }
4684 
4691  public:
4692 
// Constructor.  memory_compression_level determines how aggressively we
// compress (see the uses of memory_compression_level_ in ProcessMatrix());
// middle_command is the command index separating the forward pass from the
// backward pass.
4704  int32 memory_compression_level,
4705  int32 middle_command,
4706  NnetComputation *computation):
4707  nnet_(nnet), memory_compression_level_(memory_compression_level),
4708  middle_command_(middle_command), computation_(computation) { }
4709 
4710  void Optimize();
4711  private:
4712 
4713  // This function, called from Optimize(), figures out whether we can compress
4714  // matrix m, and if so, adds an entry to compress_info_.
4715  void ProcessMatrix(int32 m);
4716 
4717  // This function modifies the commands in '*computation_', taking
4718  // as input the commands in compress_info_.
4719  void ModifyComputation();
4720 
4721  // While deciding what matrices to compress we will create a list of structs
4722  // of type MatrixCompressInfo. Later we copy-and-modify the commands in the
4723  // computation, putting the compression commands into their appropriate place.
4725  // m is the matrix-index of the matrix we're going to compress.
4727  // compression_command_index is the command-index of the command
4728  // *after* which we will place the compression command. Normally
4729  // this will be some type of propagation.
4731  // compression_command_index is the command-index of the command
4732  // *before* which we will place the uncompression command. Normally
4733  // this will be some type of backprop.
4735  // 'compression_type' (e.g. kCompressedMatrixInt8) determines the type
4736  // we compress the BaseFloats to.
4738  // 'range' determines range of values that the compressed values can
4739  // be in: for signed types they are in [-range, range], for unsigned
4740  // types, in [0, range].
4741  // As a special case, range = 0 means that the compression just stores the
4742  // sign (-1, 0 or 1) of the input, and decompresses it to -1, 0 or 1; this
4743  // is useful for ReLUs.
4745  // this is provided to the initializer of CuCompressedMatrix; it should
4746  // be true if the values being compressed are potentially outside of
4747  // the representable range.
4748  bool truncate;
4749  MatrixCompressInfo(int32 m, int32 forward_command_index,
4750  int32 backward_command_index,
4751  CuCompressedMatrixType compression_type,
4752  BaseFloat range, bool truncate):
4753  m(m), compression_command_index(forward_command_index),
4754  uncompression_command_index(backward_command_index),
4755  compression_type(compression_type), range(range),
4756  truncate(truncate) { }
4757 
4758  };
4759  std::vector<MatrixCompressInfo> compress_info_;
4760 
4761  const Nnet &nnet_;
4766 };
4767 
4768 
// Applies the decisions recorded in compress_info_: for each entry, inserts
// a compression command just after the matrix's last forward-pass use, and
// an uncompression command just before its first backward-pass use, then
// merges everything into the computation with InsertCommands().
4770  // whole_submatrices[m] is the submatrix-index of the submatrix that
4771  // represents the whole of matrix m.
4772  std::vector<int32> whole_submatrices;
4773  computation_->GetWholeSubmatrices(&whole_submatrices);
4774 
4775  // 'pairs_to_insert' will be a list of pairs (command-index, command),
4776  // meaning: (command-index just before which to insert this command; command
4777  // to insert).
4778  std::vector<std::pair<int32, NnetComputation::Command> >
4779  pairs_to_insert;
4780  pairs_to_insert.reserve(compress_info_.size() * 2);
4781  for (size_t i = 0; i < compress_info_.size(); i++) {
4782  const MatrixCompressInfo &info = compress_info_[i];
4783  int32 s = whole_submatrices[info.m];
4784  // below we use compression_command_index + 1 because we want the
4785  // compression to go after the command in 'info.compression_command_index'
4786  // (which might be, for instance, a forward propagation command).
4787  std::pair<int32, NnetComputation::Command> p1(
4788  info.compression_command_index + 1,
4790  s, static_cast<int32>(info.compression_type),
4791  info.truncate ? 1 : 0));
4792  pairs_to_insert.push_back(p1);
4793  std::pair<int32, NnetComputation::Command> p2(
4796  pairs_to_insert.push_back(p2);
4797  }
4798  InsertCommands(&pairs_to_insert,
4799  computation_);
4800 }
4801 
4802 
// Top-level routine of the memory-compression optimization: examines every
// matrix of the computation and, if any were selected for compression by
// ProcessMatrix(), modifies the computation accordingly.
4804  analyzer_.Init(nnet_, *computation_);
4805  // note: matrix zero is not really a matrix.
4806  int32 num_matrices = computation_->matrices.size();
4807  for (int32 m = 1; m < num_matrices; m++)
4808  ProcessMatrix(m);
4809  if (!compress_info_.empty())
4810  ModifyComputation();
4811 }
4812 
// Decides whether matrix m can be compressed during the interval between
// its last access in the forward pass and its first access in the backward
// pass, and if so appends an entry to compress_info_.  Matrices that are
// outputs of the computation are excluded.
4814  if (analyzer_.matrix_accesses[m].is_output) {
4815  return; // We can't do this optimization for matrices that are going to be
4816  // output to the user.
4817  }
4818 
4819  // 'accesses' list the commands that access this matrix.
4820  const std::vector<Access> &accesses = analyzer_.matrix_accesses[m].accesses;
4821  // the 'kReadAccess' below is actually a don't-care. This is just
4822  // to find the position in 'accesses' that corresponds to command-index
4823  // 'middle_command'.
4824  Access middle_access(middle_command_, kReadAccess);
4825  std::vector<Access>::const_iterator iter = std::lower_bound(accesses.begin(),
4826  accesses.end(),
4827  middle_access);
4828  // At this point, 'iter' points to the first access in 'accesses'
4829  // whose command index is >= 'middle_command_' (which separates the forward
4830  // and backward passes), or accesses.end() if this matrix was not
4831  // accessed during the backward pass.
4832  if (iter == accesses.end()) {
4833  return; // There is nothing to do: this matrix was not accessed during the
4834  // backward pass.
4835  }
4836  if (iter == accesses.begin()) {
4837  return; // There is nothing to do: this matrix was not accessed during the
4838  // forward pass.
4839  }
4840  // 'backward_access' is the first access of the matrix in the backward
4841  // pass of the computation, and
4842  // 'forward_access' is the last access of the matrix in the forward pass
4843  // of the computation.
4844  const Access &backward_access = iter[0],
4845  &forward_access = iter[-1];
4846  KALDI_ASSERT(forward_access.command_index < middle_command_ &&
4847  backward_access.command_index > middle_command_);
4848 
4849  // 'backward_access_is_last_access' is going to be set to true if
4850  // 'backward_access' is the last command to access the matrix (apart from
4851  // deallocation or matrix-swap commands, which don't show up in the list of
4852  // accesses).
4853  bool backward_access_is_last_access = (accesses.end() == iter + 1);
4854 
4855  int32 backward_command_index = backward_access.command_index,
4856  forward_command_index = forward_access.command_index;
4858  &backward_command = computation_->commands[backward_command_index];
4859 
// Level-1 special case: the matrix is read exactly once in the backward
// pass, by a kBackprop command of a RectifiedLinearComponent; then only the
// sign of each element needs to be stored.
4860  if (memory_compression_level_ >= 1 &&
4861  backward_access_is_last_access &&
4862  backward_access.access_type == kReadAccess &&
4863  backward_command.command_type == kBackprop) {
4864  int32 component_index = backward_command.arg1;
4865  const Component *component = nnet_.GetComponent(component_index);
4866  // this is potentially a candidate for our optimization for ReLU units,
4867  // where we only need to store the sign.
4868  if (component->Type() == "RectifiedLinearComponent") {
4869  compress_info_.push_back(
4870  MatrixCompressInfo(m, forward_command_index,
4871  backward_command_index,
4873  true));
4874  return;
4875  }
4876  }
4877 
4878  // If memory_compression_level >= 2 (an "intermediate" level of compression),
4879  // then we'll consider compressing quantities using 16 bits in the range
4880  // [-10, 10]. Because of the way this compression works, exact zero will
4881  // still be uncompressed as exact zero, so even if this is the output
4882  // of a ReLU, it's OK. (Having a few derivatives zero for ReLU outputs
4883  // that were very close to zero is OK.)
4884  if (memory_compression_level_ >= 2) {
4885  compress_info_.push_back(
4886  MatrixCompressInfo(m, forward_command_index,
4887  backward_command_index,
4888  kCompressedMatrixInt16, 10.0,
4889  true));
4890  return;
4891  }
4892 
4893  // TODO: later maybe implement something for memory compression level = 3.
4894 }
4895 
4896 
4897 
4898 
4900  int32 memory_compression_level,
4901  NnetComputation *computation) {
4902  if (memory_compression_level == 0 || computation->commands.empty())
4903  return;
4904  // don't apply this optimization to looped computations.
4905  if (computation->commands.back().command_type == kGotoLabel)
4906  return;
4907 
4908  // 'middle_command' will be the index of the command of type
4909  // 'kNoOperationMarker' that separates the forward and backward
4910  // passes. If it doesn't exist, it means this computation doesn't
4911  // include
4912  int32 middle_command = -1;
4913  for (size_t i = 0; i < computation->commands.size(); i++) {
4914  if (computation->commands[i].command_type == kNoOperationMarker) {
4915  if (middle_command < 0) {
4916  middle_command = static_cast<int32>(i);
4917  } else {
4918  KALDI_WARN << "Found more than one command of type kNoOperationMarker "
4919  "in non-looped computation.";
4920  // there are more than one command of this type... this wasn't expected.
4921  // return (i.e. do nothing).
4922  return;
4923  }
4924  }
4925  }
4926  if (middle_command == -1) {
4927  return; // This computation doesn't have a backprop pass.
4928  }
4929  if (memory_compression_level >= 1) {
4930  int64 bytes_used_initial, bytes_used_final;
4931  bool verbose_ge_2 = GetVerboseLevel() >= 2;
4932  if (verbose_ge_2)
4933  bytes_used_initial = GetMaxMemoryUse(*computation);
4934 
4935  MemoryCompressionOptimizer opt(nnet, memory_compression_level,
4936  middle_command, computation);
4937  opt.Optimize();
4938 
4939  if (verbose_ge_2) {
4940  bytes_used_final = GetMaxMemoryUse(*computation);
4941  if (bytes_used_final != bytes_used_initial) {
4942  KALDI_VLOG(2) << "Memory compression reduced memory use from "
4943  << bytes_used_initial << " to "
4944  << bytes_used_final << " bytes.";
4945  }
4946  }
4947  }
4948 }
4949 
4950 
4951 std::shared_ptr<const NnetComputation> ComputationCache::Find(
4952  const ComputationRequest &in_request) {
4953  std::lock_guard<std::mutex> lock(mutex_);
4954 
4955  CacheType::iterator iter = computation_cache_.find(&in_request);
4956  if (iter == computation_cache_.end()) {
4957  return NULL;
4958  } else {
4959  std::shared_ptr<const NnetComputation> ans = iter->second.first;
4960  // Update access record by moving the accessed request to the end of the
4961  // access queue, which declares that it's the most recently used.
4962  access_queue_.splice(access_queue_.end(), access_queue_,
4963  iter->second.second);
4964  return ans;
4965  }
4966 }
4967 
4968 
4970  cache_capacity_(cache_capacity) {
4971  KALDI_ASSERT(cache_capacity > 0);
4972 }
4973 
4974 std::shared_ptr<const NnetComputation> ComputationCache::Insert(
4975  const ComputationRequest &request_in,
4976  const NnetComputation *computation_in) {
4977 
4978  std::lock_guard<std::mutex> lock(mutex_);
4979  if (static_cast<int32>(computation_cache_.size()) >= cache_capacity_) {
4980  // Cache has reached capacity; purge the least-recently-accessed request
4981  const CacheType::iterator iter =
4982  computation_cache_.find(access_queue_.front());
4983  KALDI_ASSERT(iter != computation_cache_.end());
4984  const ComputationRequest *request = iter->first;
4985  computation_cache_.erase(iter);
4986  delete request;
4987  // we don't need to delete the computation in iter->second.first, as the
4988  // shared_ptr takes care of that automatically.
4989  access_queue_.pop_front();
4990  }
4991 
4992  // Now insert the thing we need to insert. We'll own the pointer 'request' in
4993  // 'computation_cache_', so we need to allocate our own version.
4994  ComputationRequest *request = new ComputationRequest(request_in);
4995  // When we construct this shared_ptr, it takes ownership of the pointer
4996  // 'computation_in'.
4997  std::shared_ptr<const NnetComputation> computation(computation_in);
4998 
4999  AqType::iterator ait = access_queue_.insert(access_queue_.end(), request);
5000 
5001  std::pair<CacheType::iterator, bool> p = computation_cache_.insert(
5002  std::make_pair(request, std::make_pair(computation, ait)));
5003  if (!p.second) {
5004  // if p.second is false, this pair was not inserted because
5005  // a computation for the same computation-request already existed in
5006  // the map. This is possible in multi-threaded operations, if two
5007  // threads try to compile the same computation at the same time (only
5008  // one of them will successfully add it).
5009  // We need to erase the access-queue element that we just added, it's
5010  // no longer going to be needed.
5011  access_queue_.erase(ait);
5012  delete request;
5013  }
5014  return computation;
5015 }
5016 
5017 
5018 void ComputationCache::Read(std::istream &is, bool binary) {
5019  // Note: the object on disk doesn't have tokens like "<ComputationCache>"
5020  // and "</ComputationCache>" for back-compatibility reasons.
5021  int32 computation_cache_size;
5022  ExpectToken(is, binary, "<ComputationCacheSize>");
5023  ReadBasicType(is, binary, &computation_cache_size);
5024  KALDI_ASSERT(computation_cache_size >= 0);
5025  computation_cache_.clear();
5026  access_queue_.clear();
5027  ExpectToken(is, binary, "<ComputationCache>");
5028  for (size_t c = 0; c < computation_cache_size; c++) {
5029  ComputationRequest request;
5030  request.Read(is, binary);
5031  NnetComputation *computation = new NnetComputation();
5032  computation->Read(is, binary);
5033  Insert(request, computation);
5034  }
5035 }
5036 
5037 void ComputationCache::Check(const Nnet &nnet) const {
5038  CacheType::const_iterator iter = computation_cache_.begin(),
5039  end = computation_cache_.end();
5040  // We only need to explicitly delete the pointer to the ComputationRequest.
5041  // The pointers to Computation are deleted automatically by std::shared_ptr
5042  // when the reference count goes to zero.
5043  for (; iter != end; ++iter) {
5044  const NnetComputation &computation = *(iter->second.first);
5045  CheckComputationOptions check_config;
5046  ComputationChecker checker(check_config, nnet, computation);
5047  checker.Check();
5048  }
5049 }
5050 
5051 void ComputationCache::Write(std::ostream &os, bool binary) const {
5052  WriteToken(os, binary, "<ComputationCacheSize>");
5053  WriteBasicType(os, binary, static_cast<int32>(computation_cache_.size()));
5054  WriteToken(os, binary, "<ComputationCache>");
5055  for (CacheType::const_iterator iter = computation_cache_.begin();
5056  iter != computation_cache_.end(); ++iter) {
5057  iter->first->Write(os, binary);
5058  iter->second.first->Write(os, binary);
5059  }
5060 }
5061 
5063  CacheType::const_iterator iter = computation_cache_.begin(),
5064  end = computation_cache_.end();
5065  // We only need to explicitly delete the pointer to the ComputationRequest.
5066  // The pointers to Computation are deleted automatically by std::shared_ptr
5067  // when the reference count goes to zero.
5068  for (; iter != end; ++iter)
5069  delete iter->first;
5070 }
5071 
5072 } // namespace nnet3
5073 } // namespace kaldi
bool MatrixIsUnused(const Analyzer &analyzer, const NnetComputation &computation, int32 m)
This function returns true if matrix 1 <= m < computation->matrices.size() is unused, defined as: it is not an input or an output, and is not accessed other than via commands of type kAllocMatrix, kDeallocMatrix, and kSetConst.
CommandType
CommandType is an enum that describes the category of the command used in the NnetComputation.
static void CreateRenumbering(int32 old_num_elements, const std::vector< int32 > &to_remove, std::vector< int32 > *renumbering)
creates a renumbering that removes the elements in "to_remove", e.g.
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
std::vector< MatrixCompressInfo > compress_info_
int32 FirstNontrivialAccess(int32 s) const
Returns the first command (read or write) that accesses any part of 's' except for zeroing it (i...
void IdentifyMatrixArgsInComputation(NnetComputation *computation, std::vector< int32 *> *matrix_args)
static void ConvertListsToPairLists(const std::vector< std::vector< int32 > > &active_matrices, const std::vector< std::pair< int32, int32 > > &matrix_to_pair, std::vector< std::vector< std::pair< int32, int32 > > > *active_pairs)
bool store_component_stats
you should set need_component_stats to true if you need the average-activation and average-derivative...
void GetCommandsOfType(const NnetComputation &computation, CommandType t, std::vector< int32 > *command_indexes)
This utility function works out from a computation, the command-indexes of the commands of the given ...
std::vector< MatrixDebugInfo > matrix_debug_info
bool SplitRowOps(NnetComputation *computation)
This function detects cases where commands of type kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti use indexes that correspond to at most two submatrices, in two distinct ranges without gaps filled by -1's, and could be converted to at most two commands of type kMatrixAdd, kMatrixCopy, kAddRows or kCopyRows.
std::vector< NnetComputation::Command > final_commands_
void OptimizeLoopedComputation(const Nnet &nnet, NnetComputation *computation)
This function tries to optimize computation &#39;computation&#39; for an &#39;looped&#39; computation.
ComputationLoopedOptimizer(const Nnet &nnet, NnetComputation *computation)
std::vector< std::vector< NnetComputation::Command > > extra_commands_
bool need_model_derivative
if need_model_derivative is true, then we&#39;ll be doing either model training or model-derivative compu...
void ExpandRowsMultiCommand(const NnetComputation::Command &c_in, NnetComputation::Command *c_out)
void IdentifySubmatrixArgs(NnetComputation::Command *c, std::vector< int32 *> *submatrix_args)
This function outputs to "submatrix_args" the addresses of a subset of arguments arg1 through arg6 in...
void Extend(int32 *dest_submatrix_index, int32 *src_submatrix_index)
MiscComputationInfo misc_info
misc_info is for extensibility to things that don&#39;t easily fit into the framework.
static void ConvertNumNValues(int32 n_stride, int32 old_N, int32 new_N, const std::vector< Index > &indexes_in, std::vector< Index > *indexes_out)
Abstract base-class for neural-net components.
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55
void RenumberComputation(NnetComputation *computation)
This function detects submatrices and matrices that are never used (e.g.