paste-post.cc File Reference
#include "base/kaldi-common.h"
#include "base/io-funcs.h"
#include "util/common-utils.h"
#include "hmm/posterior.h"
#include "nnet/nnet-utils.h"
Include dependency graph for paste-post.cc:

Go to the source code of this file.

Functions

int main (int argc, char *argv[])
 Combines 2 or more streams with NN-training targets into a single stream. More...
 

Function Documentation

◆ main()

int main ( int  argc,
char *  argv[] 
)

Combines 2 or more streams with NN-training targets into a single stream.

This is handy when training a NN with more than one output layer (softmax). The format of the NN-targets is 'posterior', and the dimensionality of the output stream is the sum of the input-stream dimensions.

Definition at line 32 of file paste-post.cc.

References SequentialTableReader< Holder >::Done(), ParseOptions::GetArg(), kaldi::GetVerboseLevel(), rnnlm::i, KALDI_ASSERT, KALDI_ERR, KALDI_LOG, KALDI_VLOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), ParseOptions::PrintUsage(), ParseOptions::Read(), ParseOptions::Register(), kaldi::SplitStringToIntegers(), kaldi::nnet1::ToString(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

32  {
33  using namespace kaldi;
34  using namespace kaldi::nnet1;
35  typedef kaldi::int32 int32;
36  try {
37  const char *usage =
38  "Combine 2 or more streams with NN-training targets into single stream.\n"
39  "As the posterior streams are pasted, the output dimension is the sum\n"
40  "of the input dimensions. This is used when training NN with\n"
41  "multiple softmaxes on its output. This is used in multi-task, \n"
42  "multi-lingual or multi-database training. Depending on the context,\n"
43  "an utterance is not required to be in all the input streams.\n"
44  "For a multi-database training only 1 output layer will be active.\n"
45  "\n"
46  "The lengths of utterances are provided as 1st argument.\n"
47  "The dimensions of input stream are set as 2nd in argument.\n"
48  "Follow the input and output streams which are in 'posterior' format.\n"
49  "\n"
50  "Usage: paste-post <featlen-rspecifier> <dims-csl> <post1-rspecifier> "
51  "... <postN-rspecifier> <post-wspecifier>\n"
52  "e.g.: paste-post 'ark:feat-to-len $feats ark,t:-|' 1029:1124 "
53  "ark:post1.ark ark:post2.ark ark:pasted.ark\n";
54 
55  ParseOptions po(usage);
56 
57  bool allow_partial = false;
58  po.Register("allow-partial", &allow_partial,
59  "Produce output also when the utterance is not in all input streams.");
60 
61  po.Read(argc, argv);
62 
63  if (po.NumArgs() < 5) {
64  po.PrintUsage();
65  exit(1);
66  }
67 
68  std::string featlen_rspecifier = po.GetArg(1), // segment lengths,
69  stream_dims_str = po.GetArg(2),
70  post_wspecifier = po.GetArg(po.NumArgs());
71  int32 stream_count = po.NumArgs() - 3; // number of input posterior streams
72 
73  // read the dims of input posterior streams,
74  std::vector<int32> stream_dims;
75  if (!kaldi::SplitStringToIntegers(stream_dims_str, ":,", false, &stream_dims)) {
76  KALDI_ERR << "Invalid stream-dims string " << stream_dims_str;
77  }
78  if (stream_count != stream_dims.size()) {
79  KALDI_ERR << "Mismatch in input posterior-stream count " << stream_count
80  << " and --stream-dims count" << stream_dims.size()
81  << ", " << stream_dims_str;
82  }
83 
84  // prepare dim offsets of input streams,
85  std::vector<int32> stream_offset(stream_dims.size()+1, 0);
86  for (int32 s = 0; s < stream_dims.size(); s++) {
87  stream_offset[s+1] = stream_offset[s] + stream_dims[s];
88  }
89 
90  // open the input posterior readers,
91  std::vector<RandomAccessPosteriorReader> posterior_reader(po.NumArgs()-3);
92  for (int32 s = 0; s < stream_count; s++) {
93  posterior_reader[s].Open(po.GetArg(s+3));
94  }
95 
96  int32 num_done = 0, num_err = 0, num_empty = 0;
97  SequentialInt32Reader featlen_reader(featlen_rspecifier);
98  PosteriorWriter posterior_writer(post_wspecifier);
99 
100  // main loop, posterior pasting happens here,
101  for (; !featlen_reader.Done(); featlen_reader.Next()) {
102  bool ok = true, empty = true;
103  std::string utt = featlen_reader.Key();
104  int32 num_frames = featlen_reader.Value();
105 
106  // show which streams are non-empty,
107  if (allow_partial && GetVerboseLevel() >= 2) {
108  std::string nonempty_streams;
109  for (int32 s = 0; s < stream_count; s++) {
110  if (posterior_reader[s].HasKey(utt)) {
111  nonempty_streams += " " + ToString(s);
112  }
113  }
114  KALDI_VLOG(2) << "Processing " << utt
115  << ", frames " << num_frames
116  << ", pasted-from streams " << nonempty_streams;
117  }
118 
119  // Create output posteriors,
120  Posterior post(num_frames);
121 
122  // Fill posterior from input streams,
123  for (int32 s = 0; s < stream_count; s++) {
124  if (!posterior_reader[s].HasKey(utt)) {
125  if (!allow_partial) {
126  KALDI_WARN << "No such utterance " << utt
127  << " in set " << (s+1) << " of posteriors.";
128  ok = false;
129  break;
130  }
131  } else {
132  const Posterior& post_s = posterior_reader[s].Value(utt);
133  KALDI_ASSERT(num_frames <= post_s.size());
134  for (int32 f = 0; f < num_frames; f++) {
135  for (int32 i = 0; i < post_s[f].size(); i++) {
136  int32 id = post_s[f][i].first;
137  BaseFloat val = post_s[f][i].second;
138  KALDI_ASSERT(id < stream_dims[s]);
139  post[f].push_back(std::make_pair(stream_offset[s] + id, val));
140  }
141  }
142  empty = false;
143  }
144  }
145  if (empty) {
146  KALDI_WARN << "Uttenrace with no posteriors " << utt << ", discarding";
147  num_empty++;
148  continue;
149  }
150  if (ok) {
151  posterior_writer.Write(featlen_reader.Key(), post);
152  num_done++;
153  } else {
154  num_err++;
155  }
156  }
157  KALDI_LOG << "Pasted posteriors for " << num_done << " sentences, "
158  << "missing sentences " << num_empty << ", "
159  << "failed for " << num_err;
160  return (num_done != 0 ? 0 : 1);
161  } catch(const std::exception &e) {
162  std::cerr << e.what();
163  return -1;
164  }
165 }
std::string ToString(const T &t)
Convert basic type to a string (please don't overuse).
Definition: nnet-utils.h:52
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
Definition: text-utils.h:68
int32 GetVerboseLevel()
Get verbosity level, usually set via command line '--verbose=' switch.
Definition: kaldi-error.h:60
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
#define KALDI_LOG
Definition: kaldi-error.h:153