nnet-relabel-egs.cc
Go to the documentation of this file.
1 // nnet2bin/nnet-relabel-egs.cc
2 
3 // Copyright 2014 Vimal Manohar
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
23 #include <sstream>
24 
25 #include "base/kaldi-common.h"
26 #include "util/common-utils.h"
27 #include "nnet2/nnet-example.h"
28 
namespace kaldi {

// Splits an egs key of the form <utt_id>-<frame_id> on its LAST '-' into the
// utterance id and the integer frame id.
//
//   key       Egs key to split.
//   utt_id    Output: everything before the last '-'.  It is cleared on entry
//             and stays empty whenever the function returns false.
//   frame_id  Output: the integer parsed from everything after the last '-'.
//             It is written only when the function returns true.
//
// Returns false if the key contains no '-', if the '-' is the first character
// (which would give an empty utterance id), or if the text after the last '-'
// is not a complete integer (empty, non-numeric, out of range, or followed by
// extra characters).
bool SplitEgsKey(const std::string &key,
                 std::string *utt_id, int32 *frame_id) {
  utt_id->clear();

  const std::string::size_type delim = key.rfind('-');
  // npos: no delimiter at all; 0: the utterance id would be empty.
  if (delim == std::string::npos || delim == 0)
    return false;

  // Parse into a local first so that the caller's frame id is never
  // overwritten with a meaningless value when the suffix is malformed.
  std::istringstream frame_stream(key.substr(delim + 1));
  int32 parsed_frame = 0;
  if (!(frame_stream >> parsed_frame))
    return false;  // e.g. "utt-" or "utt-abc"

  // Anything still readable means the suffix was not purely an integer.
  char leftover;
  if (frame_stream >> leftover)
    return false;  // e.g. "utt-12abc"

  *utt_id = key.substr(0, delim);
  *frame_id = parsed_frame;
  return true;
}

}  // namespace kaldi
52 
53 int main(int argc, char *argv[]) {
54  using namespace kaldi;
55  using namespace kaldi::nnet2;
56 
57  typedef kaldi::int32 int32;
58  typedef kaldi::int64 int64;
59  try {
60  const char *usage =
61  "Relabel neural network egs with the read pdf-id alignments, "
62  "zero-based..\n"
63  "Usage: nnet-relabel-egs [options] <pdf-aligment-rspecifier> "
64  "<egs_rspecifier1> ... <egs_rspecifierN> "
65  "<egs_wspecifier1> ... <egs_wspecifierN>\n"
66  "e.g.: \n"
67  " nnet-relabel-egs ark:1.ali egs_in/egs.1.ark egs_in/egs.2.ark "
68  "egs_out/egs.1.ark egs_out/egs.2.ark\n"
69  "See also: nnet-get-egs, nnet-copy-egs, steps/nnet2/relabel_egs.sh\n";
70 
71  ParseOptions po(usage);
72 
73  po.Read(argc, argv);
74 
75  // Here we expect equal number of input egs archive and output egs archives.
76  // So the total number of arguments including the alignment specifier must be odd.
77  if (po.NumArgs() < 3 || po.NumArgs() % 2 == 0) {
78  po.PrintUsage();
79  exit(1);
80  }
81 
82  std::string alignments_rspecifier = po.GetArg(1);
83  int32 num_archives = (po.NumArgs() - 1) / 2;
84 
85  SequentialInt32VectorReader ali_reader(alignments_rspecifier);
86 
87  unordered_map<std::string, std::vector<int32>* > utt_to_pdf_ali;
88 
89  // Keep statistics
90  int32 num_ali = 0;
91  int64 num_frames_ali = 0, num_frames_egs = 0,
92  num_frames_missing = 0, num_frames_relabelled = 0;
93 
94  // Read alignments and put the pointer in an unordered map
95  // indexed by the key. This is so that we can efficiently find the
96  // alignment corresponding to the utterance to
97  // which a particular frame belongs
98  for (; !ali_reader.Done(); ali_reader.Next(), num_ali++) {
99  std::string key = ali_reader.Key();
100  std::vector<int32> *alignment = new std::vector<int32>(ali_reader.Value());
101  std::pair<std::string, std::vector<int32>* > map(key, alignment);
102  utt_to_pdf_ali.insert(map);
103  num_frames_ali += alignment->size();
104  }
105 
106  // Read archives of egs sequentially
107  for (int32 i = 0; i < num_archives; i++) {
108  std::string egs_rspecifier(po.GetArg(i+2));
109  std::string egs_wspecifier(po.GetArg(i+2+num_archives));
110 
111  SequentialNnetExampleReader egs_reader(egs_rspecifier);
112  NnetExampleWriter egs_writer(egs_wspecifier);
113 
114  for (; !egs_reader.Done(); egs_reader.Next(), num_frames_egs++) {
115 
116  std::string key(egs_reader.Key());
117 
118  std::string utt_id;
119  int32 frame_id;
120 
121  if (!SplitEgsKey(key, &utt_id, &frame_id)) {
122  KALDI_ERR << "Unable to split key " << key << " on delimiter - "
123  << " into utterance id and frame id";
124  }
125  NnetExample eg(egs_reader.Value());
126 
127  if (utt_to_pdf_ali.find(utt_id) == utt_to_pdf_ali.end()) {
128  KALDI_WARN << "Unable to find utterance id " << utt_id;
129  egs_writer.Write(key, eg);
130  num_frames_missing++;
131  continue;
132  }
133  const std::vector<int32> *alignment = utt_to_pdf_ali[utt_id];
134 
135  int32 num_frames_in_eg = eg.labels.size();
136  for (int32 t_offset = 0; t_offset < num_frames_in_eg; t_offset++) {
137  int32 t = frame_id + t_offset;
138  if (t >= static_cast<int32>(alignment->size())) {
139  KALDI_ERR << "Time index " << t << " out of range for alignment, "
140  << "should be < " << alignment->size();
141  }
142  if (eg.GetLabelSingle(t_offset) != (*alignment)[t])
143  num_frames_relabelled++;
144  eg.SetLabelSingle(t_offset, (*alignment)[t]);
145  }
146  egs_writer.Write(key, eg);
147  }
148  }
149 
150  unordered_map<std::string, std::vector<int32>*>::iterator iter;
151 
152  for (iter = utt_to_pdf_ali.begin(); iter != utt_to_pdf_ali.end(); ++iter)
153  delete iter->second;
154 
155  KALDI_LOG << "Read " << num_ali << " alignments containing a total of "
156  << num_frames_ali << " frames; labelled "
157  << num_frames_egs - num_frames_missing << " frames out of "
158  << num_frames_egs << " examples; labels changed for "
159  << num_frames_relabelled << " of those frames.\n.";
160 
161  return (num_frames_missing > 0.5 * num_frames_egs);
162 
163  } catch(const std::exception &e) {
164  std::cerr << e.what();
165  return -1;
166  }
167 }
168 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
NnetExample is the input data and corresponding label (or labels) for one or more frames of input...
Definition: nnet-example.h:36
int main(int argc, char *argv[])
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
bool SplitEgsKey(const std::string &key, std::string *utt_id, int32 *frame_id)
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
int NumArgs() const
Number of positional parameters (c.f. argc-1).
#define KALDI_LOG
Definition: kaldi-error.h:153