paste-post.cc
Go to the documentation of this file.
1 // nnetbin/paste-post.cc
2 
3 // Copyright 2015 Brno University of Technology (Author: Karel Vesely)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 
21 #include "base/kaldi-common.h"
22 #include "base/io-funcs.h"
23 #include "util/common-utils.h"
24 #include "hmm/posterior.h"
25 #include "nnet/nnet-utils.h"
26 
32 int main(int argc, char *argv[]) {
33  using namespace kaldi;
34  using namespace kaldi::nnet1;
35  typedef kaldi::int32 int32;
36  try {
37  const char *usage =
38  "Combine 2 or more streams with NN-training targets into single stream.\n"
39  "As the posterior streams are pasted, the output dimension is the sum\n"
40  "of the input dimensions. This is used when training NN with\n"
41  "multiple softmaxes on its output. This is used in multi-task, \n"
42  "multi-lingual or multi-database training. Depending on the context,\n"
43  "an utterance is not required to be in all the input streams.\n"
44  "For a multi-database training only 1 output layer will be active.\n"
45  "\n"
46  "The lengths of utterances are provided as 1st argument.\n"
47  "The dimensions of input stream are set as 2nd in argument.\n"
48  "Follow the input and output streams which are in 'posterior' format.\n"
49  "\n"
50  "Usage: paste-post <featlen-rspecifier> <dims-csl> <post1-rspecifier> "
51  "... <postN-rspecifier> <post-wspecifier>\n"
52  "e.g.: paste-post 'ark:feat-to-len $feats ark,t:-|' 1029:1124 "
53  "ark:post1.ark ark:post2.ark ark:pasted.ark\n";
54 
55  ParseOptions po(usage);
56 
57  bool allow_partial = false;
58  po.Register("allow-partial", &allow_partial,
59  "Produce output also when the utterance is not in all input streams.");
60 
61  po.Read(argc, argv);
62 
63  if (po.NumArgs() < 5) {
64  po.PrintUsage();
65  exit(1);
66  }
67 
68  std::string featlen_rspecifier = po.GetArg(1), // segment lengths,
69  stream_dims_str = po.GetArg(2),
70  post_wspecifier = po.GetArg(po.NumArgs());
71  int32 stream_count = po.NumArgs() - 3; // number of input posterior streams
72 
73  // read the dims of input posterior streams,
74  std::vector<int32> stream_dims;
75  if (!kaldi::SplitStringToIntegers(stream_dims_str, ":,", false, &stream_dims)) {
76  KALDI_ERR << "Invalid stream-dims string " << stream_dims_str;
77  }
78  if (stream_count != stream_dims.size()) {
79  KALDI_ERR << "Mismatch in input posterior-stream count " << stream_count
80  << " and --stream-dims count" << stream_dims.size()
81  << ", " << stream_dims_str;
82  }
83 
84  // prepare dim offsets of input streams,
85  std::vector<int32> stream_offset(stream_dims.size()+1, 0);
86  for (int32 s = 0; s < stream_dims.size(); s++) {
87  stream_offset[s+1] = stream_offset[s] + stream_dims[s];
88  }
89 
90  // open the input posterior readers,
91  std::vector<RandomAccessPosteriorReader> posterior_reader(po.NumArgs()-3);
92  for (int32 s = 0; s < stream_count; s++) {
93  posterior_reader[s].Open(po.GetArg(s+3));
94  }
95 
96  int32 num_done = 0, num_err = 0, num_empty = 0;
97  SequentialInt32Reader featlen_reader(featlen_rspecifier);
98  PosteriorWriter posterior_writer(post_wspecifier);
99 
100  // main loop, posterior pasting happens here,
101  for (; !featlen_reader.Done(); featlen_reader.Next()) {
102  bool ok = true, empty = true;
103  std::string utt = featlen_reader.Key();
104  int32 num_frames = featlen_reader.Value();
105 
106  // show which streams are non-empty,
107  if (allow_partial && GetVerboseLevel() >= 2) {
108  std::string nonempty_streams;
109  for (int32 s = 0; s < stream_count; s++) {
110  if (posterior_reader[s].HasKey(utt)) {
111  nonempty_streams += " " + ToString(s);
112  }
113  }
114  KALDI_VLOG(2) << "Processing " << utt
115  << ", frames " << num_frames
116  << ", pasted-from streams " << nonempty_streams;
117  }
118 
119  // Create output posteriors,
120  Posterior post(num_frames);
121 
122  // Fill posterior from input streams,
123  for (int32 s = 0; s < stream_count; s++) {
124  if (!posterior_reader[s].HasKey(utt)) {
125  if (!allow_partial) {
126  KALDI_WARN << "No such utterance " << utt
127  << " in set " << (s+1) << " of posteriors.";
128  ok = false;
129  break;
130  }
131  } else {
132  const Posterior& post_s = posterior_reader[s].Value(utt);
133  KALDI_ASSERT(num_frames <= post_s.size());
134  for (int32 f = 0; f < num_frames; f++) {
135  for (int32 i = 0; i < post_s[f].size(); i++) {
136  int32 id = post_s[f][i].first;
137  BaseFloat val = post_s[f][i].second;
138  KALDI_ASSERT(id < stream_dims[s]);
139  post[f].push_back(std::make_pair(stream_offset[s] + id, val));
140  }
141  }
142  empty = false;
143  }
144  }
145  if (empty) {
146  KALDI_WARN << "Uttenrace with no posteriors " << utt << ", discarding";
147  num_empty++;
148  continue;
149  }
150  if (ok) {
151  posterior_writer.Write(featlen_reader.Key(), post);
152  num_done++;
153  } else {
154  num_err++;
155  }
156  }
157  KALDI_LOG << "Pasted posteriors for " << num_done << " sentences, "
158  << "missing sentences " << num_empty << ", "
159  << "failed for " << num_err;
160  return (num_done != 0 ? 0 : 1);
161  } catch(const std::exception &e) {
162  std::cerr << e.what();
163  return -1;
164  }
165 }
166 
167 
168 
std::string ToString(const T &t)
Convert basic type to a string (please don't overuse).
Definition: nnet-utils.h:52
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool SplitStringToIntegers(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< I > *out)
Split a string (e.g.
Definition: text-utils.h:68
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
int32 GetVerboseLevel()
Get verbosity level, usually set via command line '--verbose=' switch.
Definition: kaldi-error.h:60
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
float BaseFloat
Definition: kaldi-types.h:29
std::vector< std::vector< std::pair< int32, BaseFloat > > > Posterior
Posterior is a typedef for storing acoustic-state (actually, transition-id) posteriors over an uttera...
Definition: posterior.h:42
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
int main(int argc, char *argv[])
Combines 2 or more streams with NN-training targets into single one.
Definition: paste-post.cc:32
int NumArgs() const
Number of positional parameters (c.f. argc-1).
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
#define KALDI_LOG
Definition: kaldi-error.h:153