ali-to-phones.cc
Go to the documentation of this file.
1 // bin/ali-to-phones.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 // 2015 IMSL, PKU-HKUST (author: Wei Shi)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 
22 #include "base/kaldi-common.h"
23 #include "hmm/transition-model.h"
24 #include "hmm/hmm-utils.h"
25 #include "util/common-utils.h"
26 #include "fst/fstlib.h"
27 
28 int main(int argc, char *argv[]) {
29  using namespace kaldi;
30  typedef kaldi::int32 int32;
31  try {
32  const char *usage =
33  "Convert model-level alignments to phone-sequences (in integer, "
34  "not text, form)\n"
35  "Usage: ali-to-phones [options] <model> <alignments-rspecifier> "
36  "<phone-transcript-wspecifier|ctm-wxfilename>\n"
37  "e.g.: \n"
38  " ali-to-phones 1.mdl ark:1.ali ark:-\n"
39  "or:\n"
40  " ali-to-phones --ctm-output 1.mdl ark:1.ali 1.ctm\n"
41  "See also: show-alignments lattice-align-phones, compare-int-vector\n";
42  ParseOptions po(usage);
43  bool per_frame = false;
44  bool write_lengths = false;
45  bool ctm_output = false;
46  BaseFloat frame_shift = 0.01;
47  po.Register("ctm-output", &ctm_output,
48  "If true, output the alignments in ctm format "
49  "(the confidences will be set to 1)");
50  po.Register("frame-shift", &frame_shift,
51  "frame shift used to control the times of the ctm output");
52  po.Register("per-frame", &per_frame,
53  "If true, write out the frame-level phone alignment "
54  "(else phone sequence)");
55  po.Register("write-lengths", &write_lengths,
56  "If true, write the #frames for each phone (different format)");
57 
58 
59  po.Read(argc, argv);
60 
61  KALDI_ASSERT(!(per_frame && write_lengths) && "Incompatible options.");
62 
63  if (po.NumArgs() != 3) {
64  po.PrintUsage();
65  exit(1);
66  }
67 
68  std::string model_filename = po.GetArg(1),
69  alignments_rspecifier = po.GetArg(2);
70 
71  TransitionModel trans_model;
72  ReadKaldiObject(model_filename, &trans_model);
73 
74  SequentialInt32VectorReader reader(alignments_rspecifier);
75  std::string empty;
76  Int32VectorWriter phones_writer(ctm_output ? empty :
77  (write_lengths ? empty : po.GetArg(3)));
78  Int32PairVectorWriter pair_writer(ctm_output ? empty :
79  (write_lengths ? po.GetArg(3) : empty));
80 
81  std::string ctm_wxfilename(ctm_output ? po.GetArg(3) : empty);
82  Output ctm_writer(ctm_wxfilename, false);
83  if (ctm_output) {
84  ctm_writer.Stream() << std::fixed;
85  ctm_writer.Stream().precision(frame_shift >= 0.01 ? 2 : 3);
86  }
87 
88  int32 n_done = 0;
89 
90  for (; !reader.Done(); reader.Next()) {
91  std::string key = reader.Key();
92  const std::vector<int32> &alignment = reader.Value();
93 
94  std::vector<std::vector<int32> > split;
95  SplitToPhones(trans_model, alignment, &split);
96 
97  if (ctm_output) {
98  BaseFloat phone_start = 0.0;
99  for (size_t i = 0; i < split.size(); i++) {
100  KALDI_ASSERT(!split[i].empty());
101  int32 phone = trans_model.TransitionIdToPhone(split[i][0]);
102  int32 num_repeats = split[i].size();
103  ctm_writer.Stream() << key << " 1 " << phone_start << " "
104  << (frame_shift * num_repeats) << " " << phone << std::endl;
105  phone_start += frame_shift * num_repeats;
106  }
107  } else if (!write_lengths) {
108  std::vector<int32> phones;
109  for (size_t i = 0; i < split.size(); i++) {
110  KALDI_ASSERT(!split[i].empty());
111  int32 phone = trans_model.TransitionIdToPhone(split[i][0]);
112  int32 num_repeats = split[i].size();
113  //KALDI_ASSERT(num_repeats!=0);
114  if (per_frame)
115  for(int32 j = 0; j < num_repeats; j++)
116  phones.push_back(phone);
117  else
118  phones.push_back(phone);
119  }
120  phones_writer.Write(key, phones);
121  } else {
122  std::vector<std::pair<int32, int32> > pairs;
123  for (size_t i = 0; i < split.size(); i++) {
124  KALDI_ASSERT(split[i].size() > 0);
125  int32 phone = trans_model.TransitionIdToPhone(split[i][0]);
126  int32 num_repeats = split[i].size();
127  //KALDI_ASSERT(num_repeats!=0);
128  pairs.push_back(std::make_pair(phone, num_repeats));
129  }
130  pair_writer.Write(key, pairs);
131  }
132  n_done++;
133  }
134  KALDI_LOG << "Done " << n_done << " utterances.";
135  } catch(const std::exception &e) {
136  std::cerr << e.what();
137  return -1;
138  }
139 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832
bool SplitToPhones(const TransitionModel &trans_model, const std::vector< int32 > &alignment, std::vector< std::vector< int32 > > *split_alignment)
SplitToPhones splits up the TransitionIds in "alignment" into their individual phones (one vector per...
Definition: hmm-utils.cc:723
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
int NumArgs() const
Number of positional parameters (c.f. argc-1).
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
int main(int argc, char *argv[])
#define KALDI_LOG
Definition: kaldi-error.h:153
int32 TransitionIdToPhone(int32 trans_id) const