nnet-example-utils.cc
Go to the documentation of this file.
1 // nnet3/nnet-example-utils.cc
2 
3 // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey)
4 // 2014 Vimal Manohar
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
#include "lat/lattice-functions.h"
#include "hmm/posterior.h"
#include "util/text-utils.h"
#include <algorithm>
#include <iomanip>
#include <limits>
#include <numeric>
#include <random>
27 
28 namespace kaldi {
29 namespace nnet3 {
30 
31 
32 // get a sorted list of all NnetIo names in all examples in the list (will
33 // normally be just the strings "input" and "output", but maybe also "ivector").
34 static void GetIoNames(const std::vector<NnetExample> &src,
35  std::vector<std::string> *names_vec) {
36  std::set<std::string> names;
37  std::vector<NnetExample>::const_iterator iter = src.begin(), end = src.end();
38  for (; iter != end; ++iter) {
39  std::vector<NnetIo>::const_iterator iter2 = iter->io.begin(),
40  end2 = iter->io.end();
41  for (; iter2 != end2; ++iter2)
42  names.insert(iter2->name);
43  }
44  CopySetToVector(names, names_vec);
45 }
46 
47 // Get feature "sizes" for each NnetIo name, which are the total number of
48 // Indexes for that NnetIo (needed to correctly size the output matrix). Also
49 // make sure the dimensions are consistent for each name.
50 static void GetIoSizes(const std::vector<NnetExample> &src,
51  const std::vector<std::string> &names,
52  std::vector<int32> *sizes) {
53  std::vector<int32> dims(names.size(), -1); // just for consistency checking.
54  sizes->clear();
55  sizes->resize(names.size(), 0);
56  std::vector<std::string>::const_iterator names_begin = names.begin(),
57  names_end = names.end();
58  std::vector<NnetExample>::const_iterator iter = src.begin(), end = src.end();
59  for (; iter != end; ++iter) {
60  std::vector<NnetIo>::const_iterator iter2 = iter->io.begin(),
61  end2 = iter->io.end();
62  for (; iter2 != end2; ++iter2) {
63  const NnetIo &io = *iter2;
64  std::vector<std::string>::const_iterator names_iter =
65  std::lower_bound(names_begin, names_end, io.name);
66  KALDI_ASSERT(*names_iter == io.name);
67  int32 i = names_iter - names_begin;
68  int32 this_dim = io.features.NumCols();
69  if (dims[i] == -1) {
70  dims[i] = this_dim;
71  } else if (dims[i] != this_dim) {
72  KALDI_ERR << "Merging examples with inconsistent feature dims: "
73  << dims[i] << " vs. " << this_dim << " for '"
74  << io.name << "'.";
75  }
76  KALDI_ASSERT(io.features.NumRows() == io.indexes.size());
77  int32 this_size = io.indexes.size();
78  (*sizes)[i] += this_size;
79  }
80  }
81 }
82 
83 
84 
85 
86 // Do the final merging of NnetIo, once we have obtained the names, dims and
87 // sizes for each feature/supervision type.
88 static void MergeIo(const std::vector<NnetExample> &src,
89  const std::vector<std::string> &names,
90  const std::vector<int32> &sizes,
91  bool compress,
92  NnetExample *merged_eg) {
93  // The total number of Indexes we have across all examples.
94  int32 num_feats = names.size();
95 
96  std::vector<int32> cur_size(num_feats, 0);
97 
98  // The features in the different NnetIo in the Indexes across all examples
99  std::vector<std::vector<GeneralMatrix const*> > output_lists(num_feats);
100 
101  // Initialize the merged_eg
102  merged_eg->io.clear();
103  merged_eg->io.resize(num_feats);
104  for (int32 f = 0; f < num_feats; f++) {
105  NnetIo &io = merged_eg->io[f];
106  int32 size = sizes[f];
107  KALDI_ASSERT(size > 0);
108  io.name = names[f];
109  io.indexes.resize(size);
110  }
111 
112  std::vector<std::string>::const_iterator names_begin = names.begin(),
113  names_end = names.end();
114  std::vector<NnetExample>::const_iterator eg_iter = src.begin(),
115  eg_end = src.end();
116  for (int32 n = 0; eg_iter != eg_end; ++eg_iter, ++n) {
117  std::vector<NnetIo>::const_iterator io_iter = eg_iter->io.begin(),
118  io_end = eg_iter->io.end();
119  for (; io_iter != io_end; ++io_iter) {
120  const NnetIo &io = *io_iter;
121  std::vector<std::string>::const_iterator names_iter =
122  std::lower_bound(names_begin, names_end, io.name);
123  KALDI_ASSERT(*names_iter == io.name);
124 
125  int32 f = names_iter - names_begin;
126  int32 this_size = io.indexes.size();
127  int32 &this_offset = cur_size[f];
128  KALDI_ASSERT(this_size + this_offset <= sizes[f]);
129 
130  // Add f'th Io's features
131  output_lists[f].push_back(&(io.features));
132 
133  // Work on the Indexes for the f^th Io in merged_eg
134  NnetIo &output_io = merged_eg->io[f];
135  std::copy(io.indexes.begin(), io.indexes.end(),
136  output_io.indexes.begin() + this_offset);
137  std::vector<Index>::iterator output_iter = output_io.indexes.begin();
138  // Set the n index to be different for each of the original examples.
139  for (int32 i = this_offset; i < this_offset + this_size; i++) {
140  // we could easily support merging already-merged egs, but I don't see a
141  // need for it right now.
142  KALDI_ASSERT(output_iter[i].n == 0 &&
143  "Merging already-merged egs? Not currentlysupported.");
144  output_iter[i].n = n;
145  }
146  this_offset += this_size; // note: this_offset is a reference.
147  }
148  }
149  KALDI_ASSERT(cur_size == sizes);
150  for (int32 f = 0; f < num_feats; f++) {
151  AppendGeneralMatrixRows(output_lists[f],
152  &(merged_eg->io[f].features));
153  if (compress) {
154  // the following won't do anything if the features were sparse.
155  merged_eg->io[f].features.Compress();
156  }
157  }
158 }
159 
160 
161 
162 void MergeExamples(const std::vector<NnetExample> &src,
163  bool compress,
164  NnetExample *merged_eg) {
165  KALDI_ASSERT(!src.empty());
166  std::vector<std::string> io_names;
167  GetIoNames(src, &io_names);
168  // the sizes are the total number of Indexes we have across all examples.
169  std::vector<int32> io_sizes;
170  GetIoSizes(src, io_names, &io_sizes);
171  MergeIo(src, io_names, io_sizes, compress, merged_eg);
172 }
173 
174 void ShiftExampleTimes(int32 t_offset,
175  const std::vector<std::string> &exclude_names,
176  NnetExample *eg) {
177  if (t_offset == 0)
178  return;
179  std::vector<NnetIo>::iterator iter = eg->io.begin(),
180  end = eg->io.end();
181  for (; iter != end; iter++) {
182  bool name_is_excluded = false;
183  std::vector<std::string>::const_iterator
184  exclude_iter = exclude_names.begin(),
185  exclude_end = exclude_names.end();
186  for (; exclude_iter != exclude_end; ++exclude_iter) {
187  if (iter->name == *exclude_iter) {
188  name_is_excluded = true;
189  break;
190  }
191  }
192  if (!name_is_excluded) {
193  // name is not something like "ivector" that we exclude from shifting.
194  std::vector<Index>::iterator index_iter = iter->indexes.begin(),
195  index_end = iter->indexes.end();
196  for (; index_iter != index_end; ++index_iter)
197  index_iter->t += t_offset;
198  }
199  }
200 }
201 
202 void GetComputationRequest(const Nnet &nnet,
203  const NnetExample &eg,
204  bool need_model_derivative,
205  bool store_component_stats,
206  ComputationRequest *request) {
207  request->inputs.clear();
208  request->inputs.reserve(eg.io.size());
209  request->outputs.clear();
210  request->outputs.reserve(eg.io.size());
211  request->need_model_derivative = need_model_derivative;
212  request->store_component_stats = store_component_stats;
213  for (size_t i = 0; i < eg.io.size(); i++) {
214  const NnetIo &io = eg.io[i];
215  const std::string &name = io.name;
216  int32 node_index = nnet.GetNodeIndex(name);
217  if (node_index == -1 ||
218  (!nnet.IsInputNode(node_index) && !nnet.IsOutputNode(node_index)))
219  KALDI_ERR << "Nnet example has input or output named '" << name
220  << "', but no such input or output node is in the network.";
221 
222  std::vector<IoSpecification> &dest =
223  nnet.IsInputNode(node_index) ? request->inputs : request->outputs;
224  dest.resize(dest.size() + 1);
225  IoSpecification &io_spec = dest.back();
226  io_spec.name = name;
227  io_spec.indexes = io.indexes;
228  io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative;
229  }
230  // check to see if something went wrong.
231  if (request->inputs.empty())
232  KALDI_ERR << "No inputs in computation request.";
233  if (request->outputs.empty())
234  KALDI_ERR << "No outputs in computation request.";
235 }
236 
237 void WriteVectorAsChar(std::ostream &os,
238  bool binary,
239  const VectorBase<BaseFloat> &vec) {
240  if (binary) {
241  int32 dim = vec.Dim();
242  std::vector<unsigned char> char_vec(dim);
243  const BaseFloat *data = vec.Data();
244  for (int32 i = 0; i < dim; i++) {
245  BaseFloat value = data[i];
246  KALDI_ASSERT(value >= 0.0 && value <= 1.0);
247  // below, the adding 0.5 is done so that we round to the closest integer
248  // rather than rounding down (since static_cast will round down).
249  char_vec[i] = static_cast<unsigned char>(255.0 * value + 0.5);
250  }
251  WriteIntegerVector(os, binary, char_vec);
252  } else {
253  // the regular floating-point format will be more readable for text mode.
254  vec.Write(os, binary);
255  }
256 }
257 
258 void ReadVectorAsChar(std::istream &is,
259  bool binary,
260  Vector<BaseFloat> *vec) {
261  if (binary) {
262  BaseFloat scale = 1.0 / 255.0;
263  std::vector<unsigned char> char_vec;
264  ReadIntegerVector(is, binary, &char_vec);
265  int32 dim = char_vec.size();
266  vec->Resize(dim, kUndefined);
267  BaseFloat *data = vec->Data();
268  for (int32 i = 0; i < dim; i++)
269  data[i] = scale * char_vec[i];
270  } else {
271  vec->Read(is, binary);
272  }
273 }
274 
275 void RoundUpNumFrames(int32 frame_subsampling_factor,
276  int32 *num_frames,
277  int32 *num_frames_overlap) {
278  if (*num_frames % frame_subsampling_factor != 0) {
279  int32 new_num_frames = frame_subsampling_factor *
280  (*num_frames / frame_subsampling_factor + 1);
281  KALDI_LOG << "Rounding up --num-frames=" << (*num_frames)
282  << " to a multiple of --frame-subsampling-factor="
283  << frame_subsampling_factor
284  << ", now --num-frames=" << new_num_frames;
285  *num_frames = new_num_frames;
286  }
287  if (*num_frames_overlap % frame_subsampling_factor != 0) {
288  int32 new_num_frames_overlap = frame_subsampling_factor *
289  (*num_frames_overlap / frame_subsampling_factor + 1);
290  KALDI_LOG << "Rounding up --num-frames-overlap=" << (*num_frames_overlap)
291  << " to a multiple of --frame-subsampling-factor="
292  << frame_subsampling_factor
293  << ", now --num-frames-overlap=" << new_num_frames_overlap;
294  *num_frames_overlap = new_num_frames_overlap;
295  }
296  if (*num_frames_overlap < 0 || *num_frames_overlap >= *num_frames) {
297  KALDI_ERR << "--num-frames-overlap=" << (*num_frames_overlap) << " < "
298  << "--num-frames=" << (*num_frames);
299  }
300 }
301 
303  if (num_frames_str == "-1") {
304  return;
305  }
306  if (!SplitStringToIntegers(num_frames_str, ",", false, &num_frames) ||
307  num_frames.empty()) {
308  KALDI_ERR << "Invalid option (expected comma-separated list of integers): "
309  << "--num-frames=" << num_frames_str;
310  }
311 
313  if (m < 1) {
314  KALDI_ERR << "Invalid value --frame-subsampling-factor=" << m;
315  }
316  bool changed = false;
317  for (size_t i = 0; i < num_frames.size(); i++) {
318  int32 value = num_frames[i];
319  if (value <= 0) {
320  KALDI_ERR << "Invalid option --num-frames=" << num_frames_str;
321  }
322  if (value % m != 0) {
323  value = m * ((value / m) + 1);
324  changed = true;
325  }
326  num_frames[i] = value;
327  }
328  if (changed) {
329  std::ostringstream rounded_num_frames_str;
330  for (size_t i = 0; i < num_frames.size(); i++) {
331  if (i > 0)
332  rounded_num_frames_str << ',';
333  rounded_num_frames_str << num_frames[i];
334  }
335  KALDI_LOG << "Rounding up --num-frames=" << num_frames_str
336  << " to multiples of --frame-subsampling-factor=" << m
337  << ", to: " << rounded_num_frames_str.str();
338  }
339 }
340 
341 
343  config_(config),
344  total_num_utterances_(0), total_input_frames_(0),
345  total_frames_overlap_(0), total_num_chunks_(0),
346  total_frames_in_chunks_(0) {
347  if (config.num_frames_str != "-1") {
348  if (config.num_frames.empty()) {
349  KALDI_ERR << "You need to call ComputeDerived() on the "
350  "ExampleGenerationConfig().";
351  }
353  }
354 }
355 
357  KALDI_LOG << "Split " << total_num_utterances_ << " utts, with "
358  << "total length " << total_input_frames_ << " frames ("
359  << (total_input_frames_ / 360000.0) << " hours assuming "
360  << "100 frames per second)";
361  float average_chunk_length = total_frames_in_chunks_ * 1.0 / total_num_chunks_,
362  overlap_percent = total_frames_overlap_ * 100.0 / total_input_frames_,
363  output_percent = total_frames_in_chunks_ * 100.0 / total_input_frames_,
364  output_percent_no_overlap = output_percent - overlap_percent;
365 
366  KALDI_LOG << "Average chunk length was " << average_chunk_length
367  << " frames; overlap between adjacent chunks was "
368  << overlap_percent << "% of input length; length of output was "
369  << output_percent << "% of input length (minus overlap = "
370  << output_percent_no_overlap << "%).";
371  if (chunk_size_to_count_.size() > 1) {
372  std::ostringstream os;
373  os << std::setprecision(4);
374  for (std::map<int32, int32>::iterator iter = chunk_size_to_count_.begin();
375  iter != chunk_size_to_count_.end(); ++iter) {
376  int32 chunk_size = iter->first,
377  num_frames = chunk_size * iter->second;
378  float percent_of_total = num_frames * 100.0 / total_frames_in_chunks_;
379  if (iter != chunk_size_to_count_.begin()) os << ", ";
380  os << chunk_size << " = " << percent_of_total << "%";
381  }
382  KALDI_LOG << "Output frames are distributed among chunk-sizes as follows: "
383  << os.str();
384  }
385 
386 }
387 
389  const std::vector<int32> &split) const {
390  if (split.empty()) // not a valid split, but useful to handle this case.
391  return 0.0;
392  float principal_num_frames = config_.num_frames[0],
393  num_frames_overlap = config_.num_frames_overlap;
394  KALDI_ASSERT(num_frames_overlap < principal_num_frames &&
395  "--num-frames-overlap value is too high");
396  float overlap_proportion = num_frames_overlap / principal_num_frames;
397  float ans = std::accumulate(split.begin(), split.end(), int32(0));
398  for (size_t i = 0; i + 1 < split.size(); i++) {
399  float min_adjacent_chunk_length = std::min(split[i], split[i + 1]),
400  overlap = overlap_proportion * min_adjacent_chunk_length;
401  ans -= overlap;
402  }
403  KALDI_ASSERT(ans > 0.0);
404  return ans;
405 }
406 
407 /*
408  This comment describes the idea behind what InitChunkSize() is supposed to do,
409  and how it relates to the purpose of class UtteranceSplitter.
410 
411  Class UtteranceSplitter is supposed to tell us, for a given utterance length,
412  what chunk sizes to use. The chunk sizes it may choose are:
413  - zero or more chunks of the 'principal' size (the first-listed value in
414  --num-frames option)
415  - at most two chunks of 'alternative' num-frames (meaning, any but the
416  first-listed choice in the --num-frames option).
417 
418  (note: an empty list of chunks is not allowed as a split). A split is
419  a list of chunk-sizes in increasing order (when we actually split the
420  utterance into chunks, we may, at random, reverse the order).
421 
422  The choice of split to use for a given utterance-length is determined as
423  follows. Firstly, for each split we compute a 'default duration' (see
424  DefaultDurationOfSplit()... if --num-frames-overlap is zero, this is just the
425  sum of the chunk sizes). We then use a cost-function that depends on
426  default-duration and the length of the utterance: the idea is that these two
427  should be as close as possible, but penalizing the default-duration being
428  larger than the utterance-length (which in the normal case of
429  --num-frames-overlap=0 would lead to gaps between the segments), twice as much
430  as the other sign of difference.
431 
432  Specifically:
433  cost(default_duration, utt_length) = (default_duration > utt_length ?
434  default_duration - utt_length :
435  2.0 * (utt_length - default_duration))
436  [but as a special case, set c to infinity if the largest chunk size in the
437  split is longer than the utterance length; we couldn't, in that case, use
438  this split for this utterance].
439 
440  We want to make sure a good variety of combinations of chunk sizes are chosen
441  in case there are ties from the cost function. For each utterance length
442  we store the set of splits, whose costs are within 2
443  of the best cost available for that utterance length. When asked to find
444  chunks for a particular utterance of that length, we will choose randomly
445  from that pool of splits.
446  */
448  int32 max_utterance_length = MaxUtteranceLength();
449 
450  // The 'splits' vector is a list of possible splits (a split being
451  // a sorted vector of chunk-sizes).
452  // The vector 'splits' is itself sorted.
453  std::vector<std::vector<int32> > splits;
454  InitSplits(&splits);
455 
456 
457  // Define a split-index 0 <= s < splits.size() as index into the 'splits'
458  // vector, and let a cost c >= 0 represent the mismatch between an
459  // utterance length and the total length of the chunk sizes in a split:
460 
461  // c(default_duration, utt_length) = (default_duration > utt_length ?
462  // default_duration - utt_length :
463  // 2.0 * (utt_length - default_duration))
464  // [but as a special case, set c to infinity if the largest chunk size in the
465  // split is longer than the utterance length; we couldn't, in that case, use
466  // this split for this utterance].
467 
468  // 'costs_for_length[u][s]', indexed by utterance-length u and then split,
469  // contains the cost for utterance-length u and split s.
470 
471  std::vector<std::vector<float> > costs_for_length(
472  max_utterance_length + 1);
473  int32 num_splits = splits.size();
474 
475  for (int32 u = 0; u <= max_utterance_length; u++)
476  costs_for_length[u].reserve(num_splits);
477 
478  for (int32 s = 0; s < num_splits; s++) {
479  const std::vector<int32> &split = splits[s];
480  float default_duration = DefaultDurationOfSplit(split);
481  int32 max_chunk_size = *std::max_element(split.begin(), split.end());
482  for (int32 u = 0; u <= max_utterance_length; u++) {
483  // c is the cost for this utterance length and this split. We penalize
484  // gaps twice as strongly as overlaps, based on the intuition that
485  // completely throwing out frames of data is worse than counting them
486  // twice.
487  float c = (default_duration > float(u) ? default_duration - float(u) :
488  2.0 * (u - default_duration));
489  if (u < max_chunk_size) // can't fit the largest of the chunks in this
490  // utterance
491  c = std::numeric_limits<float>::max();
492  KALDI_ASSERT(c >= 0);
493  costs_for_length[u].push_back(c);
494  }
495  }
496 
497 
498  splits_for_length_.resize(max_utterance_length + 1);
499 
500  for (int32 u = 0; u <= max_utterance_length; u++) {
501  const std::vector<float> &costs = costs_for_length[u];
502  float min_cost = *std::min_element(costs.begin(), costs.end());
503  if (min_cost == std::numeric_limits<float>::max()) {
504  // All costs were infinity, because this utterance-length u is shorter
505  // than the smallest chunk-size. Leave splits_for_length_[u] as empty
506  // for this utterance-length, meaning we will not be able to choose any
507  // split, and such utterances will be discarded.
508  continue;
509  }
510  float cost_threshold = 1.9999; // We will choose pseudo-randomly from splits
511  // that are within this distance from the
512  // best cost. Make the threshold just
513  // slightly less than 2... this will
514  // hopefully make the behavior more
515  // deterministic for ties.
516  std::vector<int32> possible_splits;
517  std::vector<float>::const_iterator iter = costs.begin(), end = costs.end();
518  int32 s = 0;
519  for (; iter != end; ++iter,++s)
520  if (*iter < min_cost + cost_threshold)
521  splits_for_length_[u].push_back(splits[s]);
522  }
523 
524  if (GetVerboseLevel() >= 3) {
525  std::ostringstream os;
526  for (int32 u = 0; u <= max_utterance_length; u++) {
527  if (!splits_for_length_[u].empty()) {
528  os << u << "=(";
529  std::vector<std::vector<int32 > >::const_iterator
530  iter1 = splits_for_length_[u].begin(),
531  end1 = splits_for_length_[u].end();
532 
533  while (iter1 != end1) {
534  std::vector<int32>::const_iterator iter2 = iter1->begin(),
535  end2 = iter1->end();
536  while (iter2 != end2) {
537  os << *iter2;
538  ++iter2;
539  if (iter2 != end2) os << ",";
540  }
541  ++iter1;
542  if (iter1 != end1) os << "/";
543  }
544  os << ")";
545  if (u < max_utterance_length) os << ", ";
546  }
547  }
548  KALDI_VLOG(3) << "Utterance-length-to-splits map is: " << os.str();
549  }
550 }
551 
552 
553 bool UtteranceSplitter::LengthsMatch(const std::string &utt,
554  int32 utterance_length,
555  int32 supervision_length,
556  int32 length_tolerance) const {
558  expected_supervision_length = (utterance_length + sf - 1) / sf;
559  if (std::abs(supervision_length - expected_supervision_length)
560  <= length_tolerance) {
561  return true;
562  } else {
563  if (sf == 1) {
564  KALDI_WARN << "Supervision does not have expected length for utterance "
565  << utt << ": expected length = " << utterance_length
566  << ", got " << supervision_length;
567  } else {
568  KALDI_WARN << "Supervision does not have expected length for utterance "
569  << utt << ": expected length = (" << utterance_length
570  << " + " << sf << " - 1) / " << sf << " = "
571  << expected_supervision_length
572  << ", got: " << supervision_length
573  << " (note: --frame-subsampling-factor=" << sf << ")";
574  }
575  return false;
576  }
577 }
578 
579 
581  int32 utterance_length, std::vector<int32> *chunk_sizes) const {
583  // 'primary_length' is the first-specified num-frames.
584  // It's the only chunk that may be repeated an arbitrary number
585  // of times.
586  int32 primary_length = config_.num_frames[0],
587  num_frames_overlap = config_.num_frames_overlap,
588  max_tabulated_length = splits_for_length_.size() - 1,
589  num_primary_length_repeats = 0;
590  KALDI_ASSERT(primary_length - num_frames_overlap > 0);
591  KALDI_ASSERT(utterance_length >= 0);
592  while (utterance_length > max_tabulated_length) {
593  utterance_length -= (primary_length - num_frames_overlap);
594  num_primary_length_repeats++;
595  }
596  KALDI_ASSERT(utterance_length >= 0);
597  const std::vector<std::vector<int32> > &possible_splits =
598  splits_for_length_[utterance_length];
599  if (possible_splits.empty()) {
600  chunk_sizes->clear();
601  return;
602  }
603  int32 num_possible_splits = possible_splits.size(),
604  randomly_chosen_split = RandInt(0, num_possible_splits - 1);
605  *chunk_sizes = possible_splits[randomly_chosen_split];
606  for (int32 i = 0; i < num_primary_length_repeats; i++)
607  chunk_sizes->push_back(primary_length);
608 
609  std::sort(chunk_sizes->begin(), chunk_sizes->end());
610  if (RandInt(0, 1) == 0) {
611  std::reverse(chunk_sizes->begin(), chunk_sizes->end());
612  }
613 }
614 
615 
617  int32 num_lengths = config_.num_frames.size();
618  KALDI_ASSERT(num_lengths > 0);
619  // 'primary_length' is the first-specified num-frames.
620  // It's the only chunk that may be repeated an arbitrary number
621  // of times.
622  int32 primary_length = config_.num_frames[0],
623  max_length = primary_length;
624  for (int32 i = 0; i < num_lengths; i++) {
626  max_length = std::max(config_.num_frames[i], max_length);
627  }
628  return 2 * max_length + primary_length;
629 }
630 
631 void UtteranceSplitter::InitSplits(std::vector<std::vector<int32> > *splits) const {
632  // we consider splits whose default duration (as returned by
633  // DefaultDurationOfSplit()) is up to MaxUtteranceLength() + primary_length.
634  // We can be confident without doing a lot of math, that splits above this
635  // length will never be chosen for any utterance-length up to
636  // MaxUtteranceLength() (which is the maximum we use).
637  int32 primary_length = config_.num_frames[0],
638  default_duration_ceiling = MaxUtteranceLength() + primary_length;
639 
640  typedef unordered_set<std::vector<int32>, VectorHasher<int32> > SetType;
641 
642  SetType splits_set;
643 
644  int32 num_lengths = config_.num_frames.size();
645 
646  // The splits we are allow are: zero to two 'alternate' lengths, plus
647  // an arbitrary number of repeats of the 'primary' length. The repeats
648  // of the 'primary' length are handled by the inner loop over n.
649  // The zero to two 'alternate' lengths are handled by the loops over
650  // i and j. i == 0 and j == 0 are special cases; they mean, no
651  // alternate is chosen.
652  for (int32 i = 0; i < num_lengths; i++) {
653  for (int32 j = 0; j < num_lengths; j++) {
654  std::vector<int32> vec;
655  if (i > 0)
656  vec.push_back(config_.num_frames[i]);
657  if (j > 0)
658  vec.push_back(config_.num_frames[j]);
659  int32 n = 0;
660  while (DefaultDurationOfSplit(vec) <= default_duration_ceiling) {
661  if (!vec.empty()) // Don't allow the empty vector as a split.
662  splits_set.insert(vec);
663  n++;
664  vec.push_back(primary_length);
665  std::sort(vec.begin(), vec.end());
666  }
667  }
668  }
669  for (SetType::const_iterator iter = splits_set.begin();
670  iter != splits_set.end(); ++iter)
671  splits->push_back(*iter);
672  std::sort(splits->begin(), splits->end()); // make the order deterministic,
673  // for consistency of output
674  // between runs and C libraries.
675 }
676 
677 
678 // static
679 void UtteranceSplitter::DistributeRandomlyUniform(int32 n, std::vector<int32> *vec) {
680  KALDI_ASSERT(!vec->empty());
681  int32 size = vec->size();
682  if (n < 0) {
683  DistributeRandomlyUniform(-n, vec);
684  for (int32 i = 0; i < size; i++)
685  (*vec)[i] *= -1;
686  return;
687  }
688  // from this point we know n >= 0.
689  int32 common_part = n / size,
690  remainder = n % size, i;
691  for (i = 0; i < remainder; i++) {
692  (*vec)[i] = common_part + 1;
693  }
694  for (; i < size; i++) {
695  (*vec)[i] = common_part;
696  }
697  std::random_shuffle(vec->begin(), vec->end());
698  KALDI_ASSERT(std::accumulate(vec->begin(), vec->end(), int32(0)) == n);
699 }
700 
701 
702 // static
704  const std::vector<int32> &magnitudes,
705  std::vector<int32> *vec) {
706  KALDI_ASSERT(!vec->empty() && vec->size() == magnitudes.size());
707  int32 size = vec->size();
708  if (n < 0) {
709  DistributeRandomly(-n, magnitudes, vec);
710  for (int32 i = 0; i < size; i++)
711  (*vec)[i] *= -1;
712  return;
713  }
714  float total_magnitude = std::accumulate(magnitudes.begin(), magnitudes.end(),
715  int32(0));
716  KALDI_ASSERT(total_magnitude > 0);
717  // note: 'partial_counts' contains the negative of the partial counts, so
718  // when we sort the larger partial counts come first.
719  std::vector<std::pair<float, int32> > partial_counts;
720  int32 total_count = 0;
721  for (int32 i = 0; i < size; i++) {
722  float this_count = n * float(magnitudes[i]) / total_magnitude;
723  // note: cast of float to int32 rounds towards zero (down, in this
724  // case, since this_count >= 0).
725  int32 this_whole_count = static_cast<int32>(this_count),
726  this_partial_count = this_count - this_whole_count;
727  (*vec)[i] = this_whole_count;
728  total_count += this_whole_count;
729  partial_counts.push_back(std::pair<float, int32>(-this_partial_count, i));
730  }
731  KALDI_ASSERT(total_count <= n && total_count + size >= n);
732  std::sort(partial_counts.begin(), partial_counts.end());
733  int32 i = 0;
734  // Increment by one the elements of the vector that has the largest partial
735  // count, then the next largest partial count, and so on... until we reach the
736  // desired total-count 'n'.
737  for(; total_count < n; i++,total_count++) {
738  (*vec)[partial_counts[i].second]++;
739  }
740  KALDI_ASSERT(std::accumulate(vec->begin(), vec->end(), int32(0)) == n);
741 }
742 
743 
744 void UtteranceSplitter::GetGapSizes(int32 utterance_length,
745  bool enforce_subsampling_factor,
746  const std::vector<int32> &chunk_sizes,
747  std::vector<int32> *gap_sizes) const {
748  if (chunk_sizes.empty()) {
749  gap_sizes->clear();
750  return;
751  }
752  if (enforce_subsampling_factor && config_.frame_subsampling_factor > 1) {
753  int32 sf = config_.frame_subsampling_factor, size = chunk_sizes.size();
754  int32 utterance_length_reduced = (utterance_length + (sf - 1)) / sf;
755  std::vector<int32> chunk_sizes_reduced(chunk_sizes);
756  for (int32 i = 0; i < size; i++) {
757  KALDI_ASSERT(chunk_sizes[i] % config_.frame_subsampling_factor == 0);
758  chunk_sizes_reduced[i] /= config_.frame_subsampling_factor;
759  }
760  GetGapSizes(utterance_length_reduced, false,
761  chunk_sizes_reduced, gap_sizes);
762  KALDI_ASSERT(gap_sizes->size() == static_cast<size_t>(size));
763  for (int32 i = 0; i < size; i++)
764  (*gap_sizes)[i] *= config_.frame_subsampling_factor;
765  return;
766  }
767  int32 num_chunks = chunk_sizes.size(),
768  total_of_chunk_sizes = std::accumulate(chunk_sizes.begin(),
769  chunk_sizes.end(),
770  int32(0)),
771  total_gap = utterance_length - total_of_chunk_sizes;
772  gap_sizes->resize(num_chunks);
773 
774  if (total_gap < 0) {
775  // there is an overlap. Overlaps can only go between chunks, not at the
776  // beginning or end of the utterance. Also, we try to make the length of
777  // overlap proportional to the size of the smaller of the two chunks
778  // that the overlap is between.
779  if (num_chunks == 1) {
780  // there needs to be an overlap, but there is only one chunk... this means
781  // the chunk-size exceeds the utterance length, which is not allowed.
782  KALDI_ERR << "Chunk size is " << chunk_sizes[0]
783  << " but utterance length is only "
784  << utterance_length;
785  }
786 
787  // note the elements of 'overlaps' will be <= 0.
788  std::vector<int32> magnitudes(num_chunks - 1),
789  overlaps(num_chunks - 1);
790  // the 'magnitudes' vector will contain the minimum of the lengths of the
791  // two adjacent chunks between which are are going to consider having an
792  // overlap. These will be used to assign the overlap proportional to that
793  // size.
794  for (int32 i = 0; i + 1 < num_chunks; i++) {
795  magnitudes[i] = std::min<int32>(chunk_sizes[i], chunk_sizes[i + 1]);
796  }
797  DistributeRandomly(total_gap, magnitudes, &overlaps);
798  for (int32 i = 0; i + 1 < num_chunks; i++) {
799  // If the following condition does not hold, it's possible we
800  // could get chunk start-times less than zero. I don't believe
801  // it's possible for this condition to fail, but we're checking
802  // for it at this level to make debugging easier, just in case.
803  KALDI_ASSERT(overlaps[i] <= magnitudes[i]);
804  }
805 
806  (*gap_sizes)[0] = 0; // no gap before 1st chunk.
807  for (int32 i = 1; i < num_chunks; i++)
808  (*gap_sizes)[i] = overlaps[i-1];
809  } else {
810  // There may be a gap. Gaps can go at the start or end of the utterance, or
811  // between segments. We try to distribute the gaps evenly.
812  std::vector<int32> gaps(num_chunks + 1);
813  DistributeRandomlyUniform(total_gap, &gaps);
814  // the last element of 'gaps', the one at the end of the utterance, is
815  // implicit and doesn't have to be written to the output.
816  for (int32 i = 0; i < num_chunks; i++)
817  (*gap_sizes)[i] = gaps[i];
818  }
819 }
820 
821 
823  int32 utterance_length,
824  std::vector<ChunkTimeInfo> *chunk_info) {
825  int32 t = 0;
826  if (config_.num_frames_str == "-1" ) {
827  ChunkTimeInfo *info;
828  info = new ChunkTimeInfo;
829  info->first_frame = 0;
830  info->num_frames = utterance_length;
835  (*chunk_info).push_back(*info);
836  } else {
837  std::vector<int32> chunk_sizes;
838  GetChunkSizesForUtterance(utterance_length, &chunk_sizes);
839  std::vector<int32> gaps(chunk_sizes.size());
840  GetGapSizes(utterance_length, true, chunk_sizes, &gaps);
841  int32 num_chunks = chunk_sizes.size();
842  chunk_info->resize(num_chunks);
843  for (int32 i = 0; i < num_chunks; i++) {
844  t += gaps[i];
845  ChunkTimeInfo &info = (*chunk_info)[i];
846  info.first_frame = t;
847  info.num_frames = chunk_sizes[i];
848  info.left_context = (i == 0 && config_.left_context_initial >= 0 ?
850  info.right_context = (i == num_chunks - 1 && config_.right_context_final >= 0 ?
852  t += chunk_sizes[i];
853  }
854  }
855  SetOutputWeights(utterance_length, chunk_info);
856  AccStatsForUtterance(utterance_length, *chunk_info);
857  // check that the end of the last chunk doesn't go more than
858  // 'config_.frame_subsampling_factor - 1' frames past the end
859  // of the utterance. That amount, we treat as rounding error.
860  KALDI_ASSERT(t - utterance_length < config_.frame_subsampling_factor);
861 }
862 
864  int32 utterance_length,
865  const std::vector<ChunkTimeInfo> &chunk_info) {
867  total_input_frames_ += utterance_length;
868 
869  for (size_t c = 0; c < chunk_info.size(); c++) {
870  int32 chunk_size = chunk_info[c].num_frames;
871  if (c > 0) {
872  int32 last_chunk_end = chunk_info[c-1].first_frame +
873  chunk_info[c-1].num_frames;
874  if (last_chunk_end > chunk_info[c].first_frame)
875  total_frames_overlap_ += last_chunk_end - chunk_info[c].first_frame;
876  }
877  std::map<int32, int32>::iterator iter = chunk_size_to_count_.find(
878  chunk_size);
879  if (iter == chunk_size_to_count_.end())
880  chunk_size_to_count_[chunk_size] = 1;
881  else
882  iter->second++;
883  total_num_chunks_ += 1;
884  total_frames_in_chunks_ += chunk_size;
885  }
886 }
887 
888 
890  int32 utterance_length,
891  std::vector<ChunkTimeInfo> *chunk_info) const {
893  int32 num_output_frames = (utterance_length + sf - 1) / sf;
894  // num_output_frames is the number of frames of supervision. 'count[t]' will
895  // be the number of chunks that this output-frame t appears in. Note: the
896  // 'first_frame' and 'num_frames' members of ChunkTimeInfo will always be
897  // multiples of frame_subsampling_factor.
898  std::vector<int32> count(num_output_frames, 0);
899  int32 num_chunks = chunk_info->size();
900  for (int32 i = 0; i < num_chunks; i++) {
901  ChunkTimeInfo &chunk = (*chunk_info)[i];
902  for (int32 t = chunk.first_frame / sf;
903  t < (chunk.first_frame + chunk.num_frames) / sf;
904  t++)
905  count[t]++;
906  }
907  for (int32 i = 0; i < num_chunks; i++) {
908  ChunkTimeInfo &chunk = (*chunk_info)[i];
909  chunk.output_weights.resize(chunk.num_frames / sf);
910  int32 t_start = chunk.first_frame / sf;
911  for (int32 t = t_start;
912  t < (chunk.first_frame + chunk.num_frames) / sf;
913  t++)
914  chunk.output_weights[t - t_start] = 1.0 / count[t];
915  }
916 }
917 
919  KALDI_ASSERT(!ranges.empty());
920  int32 ans = 0, num_ranges = ranges.size();
921  for (int32 i = 0; i < num_ranges; i++) {
922  int32 possible_ans = 0;
923  if (max_value >= ranges[i].first) {
924  if (max_value >= ranges[i].second)
925  possible_ans = ranges[i].second;
926  else
927  possible_ans = max_value;
928  }
929  if (possible_ans > ans)
930  ans = possible_ans;
931  }
932  return ans;
933 }
934 
935 // static
936 bool ExampleMergingConfig::ParseIntSet(const std::string &str,
937  ExampleMergingConfig::IntSet *int_set) {
938  std::vector<std::string> split_str;
939  SplitStringToVector(str, ",", false, &split_str);
940  if (split_str.empty())
941  return false;
942  int_set->largest_size = 0;
943  int_set->ranges.resize(split_str.size());
944  for (size_t i = 0; i < split_str.size(); i++) {
945  std::vector<int32> split_range;
946  SplitStringToIntegers(split_str[i], ":", false, &split_range);
947  if (split_range.size() < 1 || split_range.size() > 2 ||
948  split_range[0] > split_range.back() || split_range[0] <= 0)
949  return false;
950  int_set->ranges[i].first = split_range[0];
951  int_set->ranges[i].second = split_range.back();
952  int_set->largest_size = std::max<int32>(int_set->largest_size,
953  split_range.back());
954  }
955  return true;
956 }
957 
959  if (measure_output_frames != "deprecated") {
960  KALDI_WARN << "The --measure-output-frames option is deprecated "
961  "and will be ignored.";
962  }
963  if (discard_partial_minibatches != "deprecated") {
964  KALDI_WARN << "The --discard-partial-minibatches option is deprecated "
965  "and will be ignored.";
966  }
967  std::vector<std::string> minibatch_size_split;
968  SplitStringToVector(minibatch_size, "/", false, &minibatch_size_split);
969  if (minibatch_size_split.empty()) {
970  KALDI_ERR << "Invalid option --minibatch-size=" << minibatch_size;
971  }
972 
973  rules.resize(minibatch_size_split.size());
974  for (size_t i = 0; i < minibatch_size_split.size(); i++) {
975  int32 &eg_size = rules[i].first;
976  IntSet &int_set = rules[i].second;
977  // 'this_rule' will be either something like "256" or like "64-128,256"
978  // (but these two only if minibatch_size_split.size() == 1, or something with
979  // an example-size specified, like "256=64-128,256"
980  std::string &this_rule = minibatch_size_split[i];
981  if (this_rule.find('=') != std::string::npos) {
982  std::vector<std::string> rule_split; // split on '='
983  SplitStringToVector(this_rule, "=", false, &rule_split);
984  if (rule_split.size() != 2) {
985  KALDI_ERR << "Could not parse option --minibatch-size="
986  << minibatch_size;
987  }
988  if (!ConvertStringToInteger(rule_split[0], &eg_size) ||
989  !ParseIntSet(rule_split[1], &int_set))
990  KALDI_ERR << "Could not parse option --minibatch-size="
991  << minibatch_size;
992 
993  } else {
994  if (minibatch_size_split.size() != 1) {
995  KALDI_ERR << "Could not parse option --minibatch-size="
996  << minibatch_size << " (all rules must have "
997  << "eg-size specified if >1 rule)";
998  }
999  if (!ParseIntSet(this_rule, &int_set))
1000  KALDI_ERR << "Could not parse option --minibatch-size="
1001  << minibatch_size;
1002  }
1003  }
1004  {
1005  // check that no size is repeated.
1006  std::vector<int32> all_sizes(minibatch_size_split.size());
1007  for (size_t i = 0; i < minibatch_size_split.size(); i++)
1008  all_sizes[i] = rules[i].first;
1009  std::sort(all_sizes.begin(), all_sizes.end());
1010  if (!IsSortedAndUniq(all_sizes)) {
1011  KALDI_ERR << "Invalid --minibatch-size=" << minibatch_size
1012  << " (repeated example-sizes)";
1013  }
1014  }
1015 }
1016 
1018  int32 num_available_egs,
1019  bool input_ended) const {
1020  KALDI_ASSERT(num_available_egs > 0 && size_of_eg > 0);
1021  int32 num_rules = rules.size();
1022  if (num_rules == 0)
1023  KALDI_ERR << "You need to call ComputeDerived() before calling "
1024  "MinibatchSize().";
1025  int32 min_distance = std::numeric_limits<int32>::max(),
1026  closest_rule_index = 0;
1027  for (int32 i = 0; i < num_rules; i++) {
1028  int32 distance = std::abs(size_of_eg - rules[i].first);
1029  if (distance < min_distance) {
1030  min_distance = distance;
1031  closest_rule_index = i;
1032  }
1033  }
1034  if (!input_ended) {
1035  // until the input ends, we can only use the largest available
1036  // minibatch-size (otherwise, we could expect more later).
1037  int32 largest_size = rules[closest_rule_index].second.largest_size;
1038  if (largest_size <= num_available_egs)
1039  return largest_size;
1040  else
1041  return 0;
1042  } else {
1043  int32 s = rules[closest_rule_index].second.LargestValueInRange(
1044  num_available_egs);
1045  KALDI_ASSERT(s <= num_available_egs);
1046  return s;
1047  }
1048 }
1049 
1050 
1052  size_t structure_hash,
1053  int32 minibatch_size) {
1054  std::pair<int32, size_t> p(example_size, structure_hash);
1055 
1056 
1057  unordered_map<int32, int32> &h = stats_[p].minibatch_to_num_written;
1058  unordered_map<int32, int32>::iterator iter = h.find(minibatch_size);
1059  if (iter == h.end())
1060  h[minibatch_size] = 1;
1061  else
1062  iter->second += 1;
1063 }
1064 
1066  size_t structure_hash,
1067  int32 num_discarded) {
1068  std::pair<int32, size_t> p(example_size, structure_hash);
1069  stats_[p].num_discarded += num_discarded;
1070 }
1071 
1072 
1074  PrintSpecificStats();
1075  PrintAggregateStats();
1076 }
1077 
1079  // First print some aggregate stats.
1080  int64 num_distinct_egs_types = 0, // number of distinct types of input egs
1081  // (differing in size or structure).
1082  total_discarded_egs = 0, // total number of discarded egs.
1083  total_discarded_egs_size = 0, // total number of discarded egs each multiplied by size
1084  // of that eg
1085  total_non_discarded_egs = 0, // total over all minibatches written, of
1086  // minibatch-size, equals number of input egs
1087  // that were not discarded.
1088  total_non_discarded_egs_size = 0, // total over all minibatches of size-of-eg
1089  // * minibatch-size.
1090  num_minibatches = 0, // total number of minibatches
1091  num_distinct_minibatch_types = 0; // total number of combination of
1092  // (type-of-eg, number of distinct
1093  // minibatch-sizes for that eg-type)-
1094  // reflects the number of time we have
1095  // to compile.
1096 
1097  StatsType::const_iterator eg_iter = stats_.begin(), eg_end = stats_.end();
1098 
1099  for (; eg_iter != eg_end; ++eg_iter) {
1100  int32 eg_size = eg_iter->first.first;
1101  const StatsForExampleSize &stats = eg_iter->second;
1102  num_distinct_egs_types++;
1103  total_discarded_egs += stats.num_discarded;
1104  total_discarded_egs_size += stats.num_discarded * eg_size;
1105 
1106  unordered_map<int32, int32>::const_iterator
1107  mb_iter = stats.minibatch_to_num_written.begin(),
1108  mb_end = stats.minibatch_to_num_written.end();
1109  for (; mb_iter != mb_end; ++mb_iter) {
1110  int32 mb_size = mb_iter->first,
1111  num_written = mb_iter->second;
1112  num_distinct_minibatch_types++;
1113  num_minibatches += num_written;
1114  total_non_discarded_egs += num_written * mb_size;
1115  total_non_discarded_egs_size += num_written * mb_size * eg_size;
1116  }
1117  }
1118  // the averages are written as integers- we don't really need more precision
1119  // than that.
1120  int64 total_input_egs = total_discarded_egs + total_non_discarded_egs,
1121  total_input_egs_size =
1122  total_discarded_egs_size + total_non_discarded_egs_size;
1123 
1124  float avg_input_egs_size = total_input_egs_size * 1.0 / total_input_egs;
1125  float percent_discarded = total_discarded_egs * 100.0 / total_input_egs;
1126  // note: by minibatch size we mean the number of egs per minibatch, it
1127  // does not take note of the size of the input egs.
1128  float avg_minibatch_size = total_non_discarded_egs * 1.0 / num_minibatches;
1129 
1130  std::ostringstream os;
1131  os << std::setprecision(4);
1132  os << "Processed " << total_input_egs
1133  << " egs of avg. size " << avg_input_egs_size
1134  << " into " << num_minibatches << " minibatches, discarding "
1135  << percent_discarded << "% of egs. Avg minibatch size was "
1136  << avg_minibatch_size << ", #distinct types of egs/minibatches "
1137  << "was " << num_distinct_egs_types << "/"
1138  << num_distinct_minibatch_types;
1139  KALDI_LOG << os.str();
1140 }
1141 
1143  KALDI_LOG << "Merged specific eg types as follows [format: <eg-size1>="
1144  "{<mb-size1>-><num-minibatches1>,<mbsize2>-><num-minibatches2>.../d=<num-discarded>}"
1145  ",<egs-size2>={...},... (note,egs-size == number of input "
1146  "frames including context).";
1147  std::ostringstream os;
1148 
1149  // copy from unordered map to map to get sorting, for consistent output.
1150  typedef std::map<std::pair<int32, size_t>, StatsForExampleSize> SortedMapType;
1151 
1152  SortedMapType stats;
1153  stats.insert(stats_.begin(), stats_.end());
1154  SortedMapType::const_iterator eg_iter = stats.begin(), eg_end = stats.end();
1155  for (; eg_iter != eg_end; ++eg_iter) {
1156  int32 eg_size = eg_iter->first.first;
1157  if (eg_iter != stats.begin())
1158  os << ",";
1159  os << eg_size << "={";
1160  const StatsForExampleSize &stats = eg_iter->second;
1161  unordered_map<int32, int32>::const_iterator
1162  mb_iter = stats.minibatch_to_num_written.begin(),
1163  mb_end = stats.minibatch_to_num_written.end();
1164  for (; mb_iter != mb_end; ++mb_iter) {
1165  int32 mb_size = mb_iter->first,
1166  num_written = mb_iter->second;
1167  if (mb_iter != stats.minibatch_to_num_written.begin())
1168  os << ",";
1169  os << mb_size << "->" << num_written;
1170  }
1171  os << ",d=" << stats.num_discarded << "}";
1172  }
1173  KALDI_LOG << os.str();
1174 }
1175 
1176 
1177 
1179  int32 ans = 0;
1180  for (size_t i = 0; i < a.io.size(); i++) {
1181  int32 s = a.io[i].indexes.size();
1182  if (s > ans)
1183  ans = s;
1184  }
1185  return ans;
1186 }
1187 
1189  NnetExampleWriter *writer):
1190  finished_(false), num_egs_written_(0),
1191  config_(config), writer_(writer) { }
1192 
1193 
1196  // If an eg with the same structure as 'eg' is already a key in the
1197  // map, it won't be replaced, but if it's new it will be made
1198  // the key. Also we remove the key before making the vector empty.
1199  // This way we ensure that the eg in the key is always the first
1200  // element of the vector.
1201  std::vector<NnetExample*> &vec = eg_to_egs_[eg];
1202  vec.push_back(eg);
1203  int32 eg_size = GetNnetExampleSize(*eg),
1204  num_available = vec.size();
1205  bool input_ended = false;
1206  int32 minibatch_size = config_.MinibatchSize(eg_size, num_available,
1207  input_ended);
1208  if (minibatch_size != 0) { // we need to write out a merged eg.
1209  KALDI_ASSERT(minibatch_size == num_available);
1210 
1211  std::vector<NnetExample*> vec_copy(vec);
1212  eg_to_egs_.erase(eg);
1213 
1214  // MergeExamples() expects a vector of NnetExample, not of pointers,
1215  // so use swap to create that without doing any real work.
1216  std::vector<NnetExample> egs_to_merge(minibatch_size);
1217  for (int32 i = 0; i < minibatch_size; i++) {
1218  egs_to_merge[i].Swap(vec_copy[i]);
1219  delete vec_copy[i]; // we owned those pointers.
1220  }
1221  WriteMinibatch(egs_to_merge);
1222  }
1223 }
1224 
1225 void ExampleMerger::WriteMinibatch(const std::vector<NnetExample> &egs) {
1226  KALDI_ASSERT(!egs.empty());
1227  int32 eg_size = GetNnetExampleSize(egs[0]);
1228  NnetExampleStructureHasher eg_hasher;
1229  size_t structure_hash = eg_hasher(egs[0]);
1230  int32 minibatch_size = egs.size();
1231  stats_.WroteExample(eg_size, structure_hash, minibatch_size);
1232  NnetExample merged_eg;
1233  MergeExamples(egs, config_.compress, &merged_eg);
1234  std::ostringstream key;
1235  key << "merged-" << (num_egs_written_++) << "-" << minibatch_size;
1236  writer_->Write(key.str(), merged_eg);
1237 }
1238 
1240  if (finished_) return; // already finished.
1241  finished_ = true;
1242 
1243  // we'll convert the map eg_to_egs_ to a vector of vectors to avoid
1244  // iterator invalidation problems.
1245  std::vector<std::vector<NnetExample*> > all_egs;
1246  all_egs.reserve(eg_to_egs_.size());
1247 
1248  MapType::iterator iter = eg_to_egs_.begin(), end = eg_to_egs_.end();
1249  for (; iter != end; ++iter)
1250  all_egs.push_back(iter->second);
1251  eg_to_egs_.clear();
1252 
1253  for (size_t i = 0; i < all_egs.size(); i++) {
1254  int32 minibatch_size;
1255  std::vector<NnetExample*> &vec = all_egs[i];
1256  KALDI_ASSERT(!vec.empty());
1257  int32 eg_size = GetNnetExampleSize(*(vec[0]));
1258  bool input_ended = true;
1259  while (!vec.empty() &&
1260  (minibatch_size = config_.MinibatchSize(eg_size, vec.size(),
1261  input_ended)) != 0) {
1262  // MergeExamples() expects a vector of NnetExample, not of pointers,
1263  // so use swap to create that without doing any real work.
1264  std::vector<NnetExample> egs_to_merge(minibatch_size);
1265  for (int32 i = 0; i < minibatch_size; i++) {
1266  egs_to_merge[i].Swap(vec[i]);
1267  delete vec[i]; // we owned those pointers.
1268  }
1269  vec.erase(vec.begin(), vec.begin() + minibatch_size);
1270  WriteMinibatch(egs_to_merge);
1271  }
1272  if (!vec.empty()) {
1273  int32 eg_size = GetNnetExampleSize(*(vec[0]));
1274  NnetExampleStructureHasher eg_hasher;
1275  size_t structure_hash = eg_hasher(*(vec[0]));
1276  int32 num_discarded = vec.size();
1277  stats_.DiscardedExamples(eg_size, structure_hash, num_discarded);
1278  for (int32 i = 0; i < num_discarded; i++)
1279  delete vec[i];
1280  vec.clear();
1281  }
1282  }
1283  stats_.PrintStats();
1284 }
1285 
1286 } // namespace nnet3
1287 } // namespace kaldi
NnetExample is the input data and corresponding label (or labels) for one or more frames of input...
Definition: nnet-example.h:111
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
Definition: text-utils.h:118
void AcceptExample(NnetExample *a)
void CopySetToVector(const std::set< T > &s, std::vector< T > *v)
Copies the elements of a set to a vector.
Definition: stl-utils.h:86
void DiscardedExamples(int32 example_size, size_t structure_hash, int32 num_discarded)
Users call this function to inform this class that after processing all the data, for examples of ori...
void WriteMinibatch(const std::vector< NnetExample > &egs)
bool store_component_stats
you should set need_component_stats to true if you need the average-activation and average-derivative...
A hashing function-object for vectors.
Definition: stl-utils.h:216
bool LengthsMatch(const std::string &utt, int32 utterance_length, int32 supervision_length, int32 length_tolerance=0) const
float DefaultDurationOfSplit(const std::vector< int32 > &split) const
bool need_model_derivative
if need_model_derivative is true, then we'll be doing either model training or model-derivative compu...
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
Definition: text-utils.h:68
int32 GetVerboseLevel()
Get verbosity level, usually set via command line '--verbose=' switch.
Definition: kaldi-error.h:60
void Write(std::ostream &Out, bool binary) const
Writes to C++ stream (option to write in binary).
static void GetIoNames(const std::vector< NnetExample > &src, std::vector< std::string > *names_vec)
bool IsInputNode(int32 node) const
Returns true if this is an output node, meaning that it is of type kInput.
Definition: nnet-nnet.cc:120
int32 MinibatchSize(int32 size_of_eg, int32 num_available_egs, bool input_ended) const
This function tells you what minibatch size should be used for this eg.
static void DistributeRandomlyUniform(int32 n, std::vector< int32 > *vec)
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
GeneralMatrix features
The features or labels.
Definition: nnet-example.h:46
std::vector< std::vector< std::vector< int32 > > > splits_for_length_
void ShiftExampleTimes(int32 t_offset, const std::vector< std::string > &exclude_names, NnetExample *eg)
Shifts the time-index t of everything in the "eg" by adding "t_offset" to all "t" values...
std::vector< IoSpecification > inputs
std::vector< Index > indexes
"indexes" is a vector the same length as features.NumRows(), explaining the meaning of each row of th...
Definition: nnet-example.h:42
void GetGapSizes(int32 utterance_length, bool enforce_subsampling_factor, const std::vector< int32 > &chunk_sizes, std::vector< int32 > *gap_sizes) const
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
const ExampleMergingConfig & config_
int32 LargestValueInRange(int32 max_value) const
void Write(const std::string &key, const T &value) const
This hashing object hashes just the structural aspects of the NnetExample without looking at the valu...
Definition: nnet-example.h:145
void SetOutputWeights(int32 utterance_length, std::vector< ChunkTimeInfo > *chunk_info) const
const size_t count
void PrintStats() const
Calling this will cause a log message with information about the examples to be printed.
MatrixIndexT NumCols() const
void WriteVectorAsChar(std::ostream &os, bool binary, const VectorBase< BaseFloat > &vec)
float BaseFloat
Definition: kaldi-types.h:29
static bool ParseIntSet(const std::string &str, IntSet *int_set)
void GetChunkSizesForUtterance(int32 utterance_length, std::vector< int32 > *chunk_sizes) const
bool IsOutputNode(int32 node) const
Returns true if this is an output node, meaning that it is of type kDescriptor and is not directly fo...
Definition: nnet-nnet.cc:112
void ReadIntegerVector(std::istream &is, bool binary, std::vector< T > *v)
Function for reading STL vector of integer types.
Definition: io-funcs-inl.h:232
void RoundUpNumFrames(int32 frame_subsampling_factor, int32 *num_frames, int32 *num_frames_overlap)
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
struct rnnlm::@11::@12 n
static void GetIoSizes(const std::vector< NnetExample > &src, const std::vector< std::string > &names, std::vector< int32 > *sizes)
void AccStatsForUtterance(int32 utterance_length, const std::vector< ChunkTimeInfo > &chunk_info)
#define KALDI_ERR
Definition: kaldi-error.h:147
int32 GetNnetExampleSize(const NnetExample &a)
This function returns the 'size' of a nnet-example as defined for purposes of merging egs...
#define KALDI_WARN
Definition: kaldi-error.h:150
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: kaldi-vector.h:70
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
const ExampleGenerationConfig & config_
void WroteExample(int32 example_size, size_t structure_hash, int32 minibatch_size)
Users call this function to inform this class that one minibatch has been written aggregating &#39;miniba...
void AppendGeneralMatrixRows(const std::vector< const GeneralMatrix *> &src, GeneralMatrix *mat)
Appends all the matrix rows of a list of GeneralMatrixes, to get a single GeneralMatrix.
std::map< int32, int32 > chunk_size_to_count_
static void DistributeRandomly(int32 n, const std::vector< int32 > &magnitudes, std::vector< int32 > *vec)
std::vector< Index > indexes
A class representing a vector.
Definition: kaldi-vector.h:406
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
std::vector< IoSpecification > outputs
MatrixIndexT NumRows() const
void InitSplits(std::vector< std::vector< int32 > > *splits) const
static void MergeIo(const std::vector< NnetExample > &src, const std::vector< std::string > &names, const std::vector< int32 > &sizes, bool compress, NnetExample *merged_eg)
void ReadVectorAsChar(std::istream &is, bool binary, Vector< BaseFloat > *vec)
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
ExampleMerger(const ExampleMergingConfig &config, NnetExampleWriter *writer)
void GetChunksForUtterance(int32 utterance_length, std::vector< ChunkTimeInfo > *chunk_info)
void WriteIntegerVector(std::ostream &os, bool binary, const std::vector< T > &v)
Function for writing STL vectors of integer types.
Definition: io-funcs-inl.h:198
std::string name
the name of the input in the neural net; in simple setups it will just be "input".
Definition: nnet-example.h:36
struct ChunkTimeInfo is used by class UtteranceSplitter to output information about how we split an u...
int32 GetNodeIndex(const std::string &node_name) const
returns index associated with this node name, or -1 if no such index.
Definition: nnet-nnet.cc:466
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
std::vector< NnetIo > io
"io" contains the input and output.
Definition: nnet-example.h:116
std::vector< std::pair< int32, int32 > > ranges
bool IsSortedAndUniq(const std::vector< T > &vec)
Returns true if the vector is sorted and contains each element only once.
Definition: stl-utils.h:63
#define KALDI_LOG
Definition: kaldi-error.h:153
std::vector< BaseFloat > output_weights
void Read(std::istream &in, bool binary, bool add=false)
Read function using C++ streams.
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)
Definition: kaldi-math.cc:95
void ComputeDerived()
This function decodes &#39;num_frames_str&#39; into &#39;num_frames&#39;, and ensures that the members of &#39;num_frames...
void GetComputationRequest(const Nnet &nnet, const NnetExample &eg, bool need_model_derivative, bool store_component_stats, ComputationRequest *request)
This function takes a NnetExample (which should already have been frame-selected, if desired...
void MergeExamples(const std::vector< NnetExample > &src, bool compress, NnetExample *merged_eg)
Merge a set of input examples into a single example (typically the size of "src" will be the minibatc...
UtteranceSplitter(const ExampleGenerationConfig &config)