nnet-compile-utils.cc
Go to the documentation of this file.
1 // nnet3/nnet-compile-utils.cc
2 
3 // Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey)
4 // 2015 (author: Vijayaditya Peddinti)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include <iterator>
22 #include <sstream>
23 #include "util/common-utils.h"
25 
26 namespace kaldi {
27 namespace nnet3 {
28 
36  const std::vector<std::vector<std::pair<int32, int32> > > &submat_lists,
37  std::unordered_map<int32,int32> *submat_counts,
38  std::vector<int32> *submats_with_large_counts) {
39  auto iter = submat_lists.begin(), end = submat_lists.end();
40  for (; iter != end; ++iter) {
41  std::vector<std::pair<int32, int32> >::const_iterator
42  iter2 = iter->begin(), end2 = iter->end();
43  for (; iter2 != end2; ++iter2) {
44  int32 submat_index = iter2->first;
45  KALDI_ASSERT(submat_index >= 0); // We don't expect -1's in submat_lists.
46  std::unordered_map<int32,int32>::iterator
47  iter = submat_counts->find(submat_index);
48  if (iter == submat_counts->end())
49  (*submat_counts)[submat_index] = 1;
50  else
51  iter->second++;
52  }
53  }
54  auto counts_iter = submat_counts->begin(),
55  counts_end = submat_counts->end();
56  size_t cutoff = submat_lists.size() / 2;
57  for (; counts_iter != counts_end; ++counts_iter)
58  if (counts_iter->second > cutoff)
59  submats_with_large_counts->push_back(counts_iter->first);
60 }
61 
74  const std::vector<int32> &submats_to_separate,
75  const std::vector<std::vector<std::pair<int32, int32> > > &submat_lists,
76  std::vector<std::vector<std::pair<int32, int32> > > *reduced_submat_lists,
77  std::vector<std::vector<std::pair<int32, int32> > > *split_lists) {
78  KALDI_ASSERT(split_lists->empty() && !submats_to_separate.empty());
79  size_t num_to_separate = submats_to_separate.size(),
80  num_rows = submat_lists.size();
81  std::unordered_map<int32, size_t> submat_to_index;
82  reduced_submat_lists->clear();
83  reduced_submat_lists->resize(num_rows);
84  split_lists->resize(num_to_separate);
85  for (size_t i = 0; i < num_to_separate; i++) {
86  (*split_lists)[i].resize(num_rows, std::pair<int32, int32>(-1, -1));
87  int32 submat = submats_to_separate[i];
88  submat_to_index[submat] = i;
89  }
90  for (size_t row = 0; row < submat_lists.size(); row++) {
91  std::vector<std::pair<int32, int32> >::const_iterator
92  iter = submat_lists[row].begin(), end = submat_lists[row].end();
93  std::vector<std::pair<int32, int32> >
94  &reduced_list = (*reduced_submat_lists)[row];
95  // 'reduced_lists' will contain the pairs that don't make it into
96  // 'split_lists'.
97  for (; iter != end; ++iter) {
98  int32 submat_index = iter->first;
99  std::unordered_map<int32, size_t>::const_iterator map_iter =
100  submat_to_index.find(submat_index);
101  if (map_iter == submat_to_index.end()) { // not a large-count submatrix.
102  reduced_list.push_back(*iter);
103  continue;
104  }
105  size_t index = map_iter->second;
106  std::pair<int32,int32> &p = (*split_lists)[index][row];
107  if (p.first >= 0) {
108  // we'd only reach here if the same submat index repeated in the same
109  // row, which is possible but rare.
110  reduced_list.push_back(*iter);
111  continue;
112  }
113  p.first = submat_index;
114  int32 src_row_index = iter->second;
115  p.second = src_row_index;
116  }
117  }
118 }
119 
121  const std::vector<std::vector<std::pair<int32, int32> > > &submat_lists,
122  std::vector<std::vector<std::pair<int32, int32> > > *split_lists) {
123  size_t num_rows = submat_lists.size(),
124  num_output_lists = 0;
125  auto iter = submat_lists.begin(), end = submat_lists.end();
126  for (; iter != end; ++iter)
127  if (iter->size() > num_output_lists)
128  num_output_lists = iter->size();
129  split_lists->clear();
130  if (num_output_lists == 0) // Odd, but could happen, maybe
131  return;
132  else if (num_output_lists == 1) {
133  split_lists->resize(1);
134  std::vector<std::pair<int32, int32> > &list = (*split_lists)[0];
135  list.resize(num_rows, std::pair<int32, int32>(-1, -1));
136  for (size_t i = 0; i < num_rows; i++) {
137  if (!submat_lists[i].empty())
138  list[i] = submat_lists[i][0];
139  }
140  return;
141  }
142 
143  // counts for each submatrix index, of how many times it occurs.
144  std::unordered_map<int32,int32> submat_counts;
145  std::vector<int32> submats_with_large_counts;
146  GetSubmatCounts(submat_lists, &submat_counts, &submats_with_large_counts);
147  if (!submats_with_large_counts.empty()) {
148  // There are submatrices with counts over half the num-rows. We assign these
149  // their own output lists.
150 
151  std::vector<std::vector<std::pair<int32, int32> > > reduced_submat_lists;
152  SeparateSubmatsWithLargeCounts(submats_with_large_counts,
153  submat_lists,
154  &reduced_submat_lists,
155  split_lists);
156  // 'reduced_split_lists' is the result of recursing with input 'reduced_submat_lists';
157  // we'll append its result to 'split_lists'.
158  std::vector<std::vector<std::pair<int32, int32> > > reduced_split_lists;
159  SplitLocations(reduced_submat_lists, &reduced_split_lists);
160  size_t cur_num_lists = split_lists->size(),
161  num_extra_lists = reduced_split_lists.size(),
162  new_num_lists = cur_num_lists + num_extra_lists;
163  split_lists->resize(new_num_lists);
164  for (size_t i = 0; i < num_extra_lists; i++)
165  (*split_lists)[cur_num_lists + i].swap(reduced_split_lists[i]);
166  return;
167  // and we're done.
168  } else {
169  // All the counts of submatrix indexes seem to be small so we are resigned to
170  // only using AddRowsMulti commands.
171  split_lists->resize(num_output_lists);
172  for (size_t i = 0; i < num_output_lists; i++)
173  (*split_lists)[i].resize(num_rows, std::pair<int32, int32>(-1, -1));
174  for (size_t row = 0; row < num_rows; row++) {
175  const std::vector<std::pair<int32, int32> > &this_list =
176  submat_lists[row];
177  size_t this_list_size = submat_lists[row].size();
178  for (size_t i = 0; i < this_list_size; i++) {
179  (*split_lists)[i][row] = this_list[i];
180  }
181  }
182  }
183 }
184 
185 
186 /* If it is the case for some i >= 0 that all the .first elements of
187  "location_vector" are either i or -1, then output i to first_value and the
188  .second elements into "second_values", and return true. Otherwise return
189  false and the outputs are don't-cares. */
191  const std::vector<std::pair<int32, int32> > &location_vector,
192  int32 *first_value,
193  std::vector<int32> *second_values) {
194  *first_value = -1;
195  second_values->clear();
196  second_values->reserve(location_vector.size());
197  std::vector<std::pair<int32, int32> >::const_iterator iter;
198  for (iter = location_vector.begin(); iter < location_vector.end(); ++iter) {
199  if (iter->first != -1) {
200  if (*first_value == -1)
201  *first_value = iter->first;
202  if (iter->first != *first_value)
203  return false;
204  second_values->push_back(iter->second);
205  } else {
206  second_values->push_back(-1);
207  }
208  }
209  return true;
210 }
211 
212 
213 // see declaration in header for documentation
215  const std::vector<int32> &indexes,
216  std::vector<std::vector<int32> > *indexes_out) {
217  indexes_out->clear();
218  indexes_out->reserve(3);
219  if (indexes.empty()) return;
220  int32 max_value = *std::max_element(indexes.begin(), indexes.end());
221  if (max_value == -1) return;
222  std::vector<int32> num_segments_seen(max_value + 1, 0);
223  int32 dim = indexes.size(), num_output_vectors = 0;
224  for (int32 i = 0; i < dim;) {
225  // note, we increment i within the loop.
226  if (indexes[i] == -1) {
227  i++;
228  continue;
229  }
230  int32 value = indexes[i], start_index = i;
231  for (; i < dim && indexes[i] == value; i++);
232  int32 end_index = i; // one past the end.
233  // the input 'indexes' contains a sequence of possibly-repeated instances of
234  // the value 'value', starting at index 'start_index', with 'end_index' as
235  // one past the end.
236  int32 this_num_segments_seen = num_segments_seen[value]++;
237  if (this_num_segments_seen >= num_output_vectors) { // we have nowhere to
238  // put it.
239  indexes_out->resize(++num_output_vectors);
240  indexes_out->back().resize(dim, -1); // fill newly added vector with -1's.
241  }
242  std::vector<int32> &this_out_vec((*indexes_out)[this_num_segments_seen]);
243  std::vector<int32>::iterator iter = this_out_vec.begin() + start_index,
244  end = this_out_vec.begin() + end_index;
245  // Fill the appropriate range of the output vector with 'value'
246  for (; iter != end; ++iter) *iter = value;
247  }
248 }
249 
250 
251 
280 void SplitPairList(std::vector<std::pair<int32, int32> >& list,
281  std::vector<std::vector<std::pair<int32, int32> > >* split_lists) {
282  split_lists->clear();
283  typedef unordered_map<std::pair<int32, int32>,
284  int32, PairHasher<int32> > MapType;
285  // this maps a pair not equal to -1,-1, to the number of times we've already seen it.
286  MapType pair_to_count;
287  int32 cur_num_lists = 0;
288 
289  for (int32 i = 0; i < list.size(); i++) {
290  if (list[i].first == -1)
291  continue;
292  MapType::iterator iter = pair_to_count.find(list[i]);
293  int32 this_count;
294  if (iter == pair_to_count.end())
295  pair_to_count[list[i]] = this_count = 1;
296  else
297  this_count = (++iter->second);
298  if (this_count > cur_num_lists) {
299  KALDI_ASSERT(this_count == cur_num_lists + 1);
300  split_lists->resize(this_count);
301  split_lists->back().resize(list.size(),
302  std::pair<int32, int32>(-1, -1));
303  cur_num_lists++;
304  }
305  (*split_lists)[this_count-1][i] = list[i];
306  }
307  if (split_lists->size() == 0)
308  KALDI_ERR << "Input list has just dummy pairs";
309 }
310 
312  const std::vector<std::vector<std::pair<int32, int32> > > &submat_lists,
313  std::vector<std::vector<std::pair<int32, int32> > > *split_lists) {
314  std::vector<std::vector<std::pair<int32, int32> > > split_lists_intermediate;
315  // Split the submat_lists
316  SplitLocations(submat_lists, &split_lists_intermediate);
317  for (size_t i = 0; i < split_lists_intermediate.size(); i++) {
318  int32 first_value;
319  std::vector<int32> second_values;
320  if (ConvertToIndexes(split_lists_intermediate[i],
321  &first_value, &second_values)) {
322  // the .first values in split_lists_intermediate[i] are all the same (or
323  // equal to -1).
324  if (first_value == -1) {
325  // all the .first values were equal to -1. this is like a NULL marker.
326  continue;
327  }
328  std::vector<std::vector<int32> > second_values_split;
329  EnsureContiguousProperty(second_values, &second_values_split);
330  if (second_values_split.size() == 1) {
331  // this branch is an optimization for speed.
332  split_lists->push_back(split_lists_intermediate[i]);
333  } else {
334  for (size_t j = 0; j < second_values_split.size(); j++) {
335  split_lists->resize(split_lists->size() + 1);
336  const std::vector<int32> &input_list = second_values_split[j];
337  std::vector<std::pair<int32, int32> > &output_list =
338  split_lists->back();
339  output_list.resize(input_list.size());
340  int32 size = input_list.size();
341  for (int32 k = 0; k < size; k++) {
342  int32 row = input_list[k];
343  if (row == -1) output_list[k].first = -1;
344  else output_list[k].first = first_value;
345  output_list[k].second = row;
346  }
347  }
348  }
349  } else {
350  // the .first values are not the same
351  // splitting the list of pairs to ensure unique pairs, unless it is
352  // (-1,-1)
353  std::vector<std::vector<std::pair<int32, int32> > > new_split_lists;
354  SplitPairList(split_lists_intermediate[i],
355  &new_split_lists);
356  for (int32 j = 0; j < new_split_lists.size(); j++) {
357  split_lists->push_back(new_split_lists[j]);
358  }
359  }
360  }
361 }
362 
363 // This function returns true if for each integer i != -1, all the indexes j at
364 // which indexes[j] == i are consecutive with no gaps (more formally: if j1 < j2
365 // < j3 and indexes[j1] == indexes[j3], then indexes[j1] == indexes[j2]). If
366 // so, it also outputs to "reverse_indexes" the begin and end of these ranges,
367 // so that indexes[j] == i for all j such that (*reverse_indexes)[i].first <= j
368 // && j < (*reverse_indexes)[i].second.
370  const std::vector<int32> &indexes,
371  std::vector<std::pair<int32, int32> > *reverse_indexes) {
372  reverse_indexes->clear();
373  int32 num_indexes = indexes.size();
374  if (num_indexes == 0)
375  return true;
376  int32 num_input_indexes =
377  *std::max_element(indexes.begin(), indexes.end()) + 1;
378  KALDI_ASSERT(num_input_indexes >= 0);
379  if (num_input_indexes == 0) {
380  // we don't really expect this input, filled with -1's.
381  KALDI_WARN << "HasContiguousProperty called on vector of -1's.";
382  return true;
383  }
384  reverse_indexes->resize(num_input_indexes,
385  std::pair<int32,int32>(-1, -1));
386  // set each pair's "first" to the min index of all elements
387  // of "indexes" with that value, and the "second" to the
388  // max plus one.
389  for (int32 i = 0; i < num_indexes; i++) {
390  int32 j = indexes[i];
391  if (j == -1) continue;
392  KALDI_ASSERT(j >= 0);
393  std::pair<int32, int32> &pair = (*reverse_indexes)[j];
394  if (pair.first == -1) {
395  pair.first = i;
396  pair.second = i + 1;
397  } else {
398  pair.first = std::min(pair.first, i);
399  pair.second = std::max(pair.second, i + 1);
400  }
401  }
402  // check that the contiguous property holds.
403  for (int32 i = 0; i < num_input_indexes; i++) {
404  std::pair<int32, int32> pair = (*reverse_indexes)[i];
405  if (pair.first != -1) {
406  for (int32 j = pair.first; j < pair.second; j++)
407  if (indexes[j] != i)
408  return false;
409  }
410  }
411  return true;
412 }
413 
414 
415 // see comment in header.
416 void GetNxList(const std::vector<Index> &indexes,
417  std::vector<std::pair<int32, int32> > *pairs) {
418  // set of (n,x) pairs
419  std::unordered_set<std::pair<int32, int32>, PairHasher<int32> > n_x_set;
420 
421  for (std::vector<Index>::const_iterator iter = indexes.begin();
422  iter != indexes.end(); ++iter)
423  n_x_set.insert(std::pair<int32, int32>(iter->n, iter->x));
424  pairs->clear();
425  pairs->reserve(n_x_set.size());
426  for (std::unordered_set<std::pair<int32, int32>, PairHasher<int32> >::iterator
427  iter = n_x_set.begin(); iter != n_x_set.end(); ++iter)
428  pairs->push_back(*iter);
429  std::sort(pairs->begin(), pairs->end());
430 }
431 
432 
433 // see comment in header.
434 void GetTList(const std::vector<Index> &indexes,
435  std::vector<int32> *t_values) {
436  // set of t values
437  std::unordered_set<int32> t_set;
438 
439  for (std::vector<Index>::const_iterator iter = indexes.begin();
440  iter != indexes.end(); ++iter)
441  if (iter->t != kNoTime)
442  t_set.insert(iter->t);
443  t_values->clear();
444  t_values->reserve(t_set.size());
445  for (std::unordered_set<int32>::iterator iter = t_set.begin();
446  iter != t_set.end(); ++iter)
447  t_values->push_back(*iter);
448  std::sort(t_values->begin(), t_values->end());
449 }
450 
451 
452 
453 } // namespace nnet3
454 } // namespace kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void GetTList(const std::vector< Index > &indexes, std::vector< int32 > *t_values)
This function outputs a sorted, unique list of the &#39;t&#39; values that are encountered in the provided li...
bool HasContiguousProperty(const std::vector< int32 > &indexes, std::vector< std::pair< int32, int32 > > *reverse_indexes)
This function returns true if for each integer i != -1, all the indexes j at which indexes[j] == i ar...
bool ConvertToIndexes(const std::vector< std::pair< int32, int32 > > &location_vector, int32 *first_value, std::vector< int32 > *second_values)
If it is the case for some i >= 0 that all the .first elements of "location_vector" are either i or -...
kaldi::int32 int32
void SeparateSubmatsWithLargeCounts(const std::vector< int32 > &submats_to_separate, const std::vector< std::vector< std::pair< int32, int32 > > > &submat_lists, std::vector< std::vector< std::pair< int32, int32 > > > *reduced_submat_lists, std::vector< std::vector< std::pair< int32, int32 > > > *split_lists)
This function, used in SplitLocations(), is used to make separate &#39;split lists&#39; for certain high-coun...
void GetSubmatCounts(const std::vector< std::vector< std::pair< int32, int32 > > > &submat_lists, std::unordered_map< int32, int32 > *submat_counts, std::vector< int32 > *submats_with_large_counts)
Gets counts of submatrices (the 1st members of pairs) in submat_lists.
void SplitLocations(const std::vector< std::vector< std::pair< int32, int32 > > > &submat_lists, std::vector< std::vector< std::pair< int32, int32 > > > *split_lists)
The input to this function is a vector (indexed by matrix-row-index) of lists of pairs (submat_index...
void SplitPairList(std::vector< std::pair< int32, int32 > > &list, std::vector< std::vector< std::pair< int32, int32 > > > *split_lists)
This function splits a vector of pairs into a list of vectors of pairs.
void EnsureContiguousProperty(const std::vector< int32 > &indexes, std::vector< std::vector< int32 > > *indexes_out)
This function takes a vector of indexes and splits it up into as separate vectors of the same size...
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
void SplitLocationsBackward(const std::vector< std::vector< std::pair< int32, int32 > > > &submat_lists, std::vector< std::vector< std::pair< int32, int32 > > > *split_lists)
This function has the same interface as SplitLocations(); however, it ensures certain additional prop...
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
const int kNoTime
Definition: nnet-common.cc:573
void GetNxList(const std::vector< Index > &indexes, std::vector< std::pair< int32, int32 > > *pairs)
This function outputs a unique, lexicographically sorted list of the pairs of (n, x) values that are ...
A hashing function-object for pairs of ints.
Definition: stl-utils.h:235