extract-feature-segments.cc
Go to the documentation of this file.
1 // featbin/extract-feature-segments.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation; Govivace Inc.
4 // 2012-2013 Mirko Hannemann; Arnab Ghoshal
5 // 2015 Tanel Alumae
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #include "base/kaldi-common.h"
23 #include "util/common-utils.h"
24 #include "feat/feature-mfcc.h"
25 #include "matrix/kaldi-matrix.h"
26 
34 int main(int argc, char *argv[]) {
35  try {
36  using namespace kaldi;
37 
38  const char *usage =
39  "Create feature files by segmenting input files.\n"
40  "Note: this program should no longer be needed now that\n"
41  "'ranges' in scp files are supported; search for 'ranges' in\n"
42  "http://kaldi-asr.org/doc/io_tut.html, or see the script\n"
43  "utils/data/subsegment_data_dir.sh.\n"
44  "Usage: "
45  "extract-feature-segments [options...] <feats-rspecifier> "
46  " <segments-file> <feats-wspecifier>\n"
47  " (segments-file has lines like: "
48  "output-utterance-id input-utterance-or-spk-id 1.10 2.36)\n";
49 
50  // construct all the global objects
51  ParseOptions po(usage);
52 
53  BaseFloat min_segment_length = 0.1, // Minimum segment length in seconds.
54  max_overshoot = 0.0; // max time by which last segment can overshoot
55  int32 frame_shift = 10;
56  int32 frame_length = 25;
57  bool snip_edges = true;
58 
59  // Register the options
60  po.Register("min-segment-length", &min_segment_length,
61  "Minimum segment length in seconds (reject shorter segments)");
62  po.Register("frame-length", &frame_length, "Frame length in milliseconds");
63  po.Register("frame-shift", &frame_shift, "Frame shift in milliseconds");
64  po.Register("max-overshoot", &max_overshoot,
65  "End segments overshooting by less (in seconds) are truncated,"
66  " else rejected.");
67  po.Register("snip-edges", &snip_edges,
68  "If true, n_frames frames will be snipped from the end of each "
69  "extracted feature matrix, "
70  "where n_frames = ceil((frame_length - frame_shift) / frame_shift), "
71  "This ensures that only the feature vectors that "
72  "completely fit in the segment are extracted. "
73  "This makes the extracted segment lengths match the lengths of the "
74  "features that have been extracted from already segmented audio.");
75 
76  // OPTION PARSING ...
77  // parse options (+filling the registered variables)
78  po.Read(argc, argv);
79  // number of arguments should be 3
80  // (scriptfile, segments file and outputwav write mode)
81  if (po.NumArgs() != 3) {
82  po.PrintUsage();
83  exit(1);
84  }
85 
86  std::string rspecifier = po.GetArg(1); // get script file/feature archive
87  std::string segments_rxfilename = po.GetArg(2); // get segment file
88  std::string wspecifier = po.GetArg(3); // get written archive name
89 
90  BaseFloatMatrixWriter feat_writer(wspecifier);
91 
92  RandomAccessBaseFloatMatrixReader feat_reader(rspecifier);
93 
94  Input ki(segments_rxfilename); // no binary argment: never binary.
95 
96  int32 num_lines = 0, num_success = 0;
97 
98  int32 snip_length = 0;
99  if (snip_edges) {
100  snip_length = static_cast<int32>(ceil(
101  1.0 * (frame_length - frame_shift) / frame_shift));
102  }
103 
104  std::string line;
105  /* read each line from segments file */
106  while (std::getline(ki.Stream(), line)) {
107  num_lines++;
108  std::vector<std::string> split_line;
109  // Split the line by space or tab and check the number of fields in each
110  // line. There must be 4 fields--segment name , reacording wav file name,
111  // start time, end time; 5th field (channel info) is optional.
112  SplitStringToVector(line, " \t\r", true, &split_line);
113  if (split_line.size() != 4 && split_line.size() != 5) {
114  KALDI_WARN << "Invalid line in segments file: " << line;
115  continue;
116  }
117  std::string segment = split_line[0],
118  utterance = split_line[1],
119  start_str = split_line[2],
120  end_str = split_line[3];
121 
122  // Convert the start time and endtime to real from string. Segment is
123  // ignored if start or end time cannot be converted to real.
124  double start, end;
125  if (!ConvertStringToReal(start_str, &start)) {
126  KALDI_WARN << "Invalid line in segments file [bad start]: " << line;
127  continue;
128  }
129  if (!ConvertStringToReal(end_str, &end)) {
130  KALDI_WARN << "Invalid line in segments file [bad end]: " << line;
131  continue;
132  }
133 
134  // start time must not be negative; start time must not be greater than
135  // end time, except if end time is -1
136  if (start < 0 || end <= 0 || start >= end) {
137  KALDI_WARN << "Invalid line in segments file "
138  "[empty or invalid segment]: "
139  << line;
140  continue;
141  }
142  int32 channel = -1; // means channel info is unspecified.
143  // if each line has 5 elements then 5th element must be channel identifier
144  if (split_line.size() == 5) {
145  if (!ConvertStringToInteger(split_line[4], &channel) || channel < 0) {
146  KALDI_WARN<< "Invalid line in segments file [bad channel]: " << line;
147  continue;
148  }
149  }
150 
151  /* check whether a segment start time and end time exists in utterance
152  * if fails , skips the segment.
153  */
154  if (!feat_reader.HasKey(utterance)) {
155  KALDI_WARN << "Did not find features for utterance " << utterance
156  << ", skipping segment " << segment;
157  continue;
158  }
159  const Matrix<BaseFloat> &feats = feat_reader.Value(utterance);
160  // total number of samples present in wav data
161  int32 num_samp = feats.NumRows();
162  // total number of channels present in wav file
163  int32 num_chan = feats.NumCols();
164  // Convert start & end times of the segment to corresponding sample number
165  int32 start_samp = static_cast<int32>(round(
166  (start * 1000.0 / frame_shift)));
167  int32 end_samp = static_cast<int32>(round(end * 1000.0 / frame_shift));
168 
169  if (snip_edges) {
170  // snip the edge at the end of the segment (usually 2 frames),
171  end_samp -= snip_length;
172  }
173 
174  /* start sample must be less than total number of samples
175  * otherwise skip the segment
176  */
177  if (start_samp < 0 || start_samp >= num_samp) {
178  KALDI_WARN << "Start sample out of range " << start_samp
179  << " [length:] " << num_samp << "x" << num_chan
180  << ", skipping segment " << segment;
181  continue;
182  }
183 
184  /* end sample must be less than total number samples
185  * otherwise skip the segment
186  */
187  if (end_samp > num_samp) {
188  if (end_samp >= num_samp
189  + static_cast<int32>(
190  round(max_overshoot * 1000.0 / frame_shift))) {
191  KALDI_WARN<< "End sample too far out of range " << end_samp
192  << " [length:] " << num_samp << "x" << num_chan
193  << ", skipping segment "
194  << segment;
195  continue;
196  }
197  end_samp = num_samp; // for small differences, just truncate.
198  }
199 
200  /* check whether the segment size is less than minimum segment length(default 0.1 sec)
201  * if yes, skip the segment
202  */
203  if (end_samp
204  <= start_samp
205  + static_cast<int32>(round(
206  (min_segment_length * 1000.0 / frame_shift)))) {
207  KALDI_WARN<< "Segment " << segment << " too short, skipping it.";
208  continue;
209  }
210 
211  SubMatrix<BaseFloat> segment_matrix(feats, start_samp,
212  end_samp-start_samp, 0, num_chan);
213  Matrix<BaseFloat> outmatrix(segment_matrix);
214  // write segment in feature archive.
215  feat_writer.Write(segment, outmatrix);
216  num_success++;
217  }
218  KALDI_LOG << "Successfully processed " << num_success << " lines out of "
219  << num_lines << " in the segments file. ";
220  /* prints number of segments processed */
221  if (num_success == 0) return -1;
222  return 0;
223  } catch(const std::exception &e) {
224  std::cerr << e.what();
225  return -1;
226  }
227 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
Definition: text-utils.h:118
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
std::istream & Stream()
Definition: kaldi-io.cc:826
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const T & Value(const std::string &key)
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
bool ConvertStringToReal(const std::string &str, T *out)
ConvertStringToReal converts a string into either float or double and returns false if there was any ...
Definition: text-utils.cc:238
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
int main(int argc, char *argv[])
This is a program for extracting segments from feature files/archives.
#define KALDI_LOG
Definition: kaldi-error.h:153
Sub-matrix representation.
Definition: kaldi-matrix.h:988