extract-segments.cc
Go to the documentation of this file.
1 // featbin/extract-segments.cc
2 
3 // Copyright 2009-2011 Microsoft Corporation; Govivace Inc.
4 // 2013 Arnab Ghoshal
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include "base/kaldi-common.h"
22 #include "util/common-utils.h"
23 #include "feat/wave-reader.h"
24 
34 int main(int argc, char *argv[]) {
35  try {
36  using namespace kaldi;
37 
38  const char *usage =
39  "Extract segments from a large audio file in WAV format.\n"
40  "Usage: extract-segments [options] <wav-rspecifier> <segments-file> <wav-wspecifier>\n"
41  "e.g. extract-segments scp:wav.scp segments ark:- | <some-other-program>\n"
42  " segments-file format: each line is either\n"
43  "<segment-id> <recording-id> <start-time> <end-time>\n"
44  "e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5\n"
45  "or (less frequently, and not supported in scripts):\n"
46  "<segment-id> <wav-file-name> <start-time> <end-time> <channel>\n"
47  "where <channel> will normally be 0 (left) or 1 (right)\n"
48  "e.g. call-861225-A-0050-0065 call-861225 5.0 6.5 1\n"
49  "And <end-time> of -1 means the segment runs till the end of the WAV file\n"
50  "See also: extract-feature-segments, wav-copy, wav-to-duration\n";
51 
52  ParseOptions po(usage);
53  BaseFloat min_segment_length = 0.1, // Minimum segment length in seconds.
54  max_overshoot = 0.5; // max time by which last segment can overshoot
55  po.Register("min-segment-length", &min_segment_length,
56  "Minimum segment length in seconds (reject shorter segments)");
57  po.Register("max-overshoot", &max_overshoot,
58  "End segments overshooting audio by less than this (in seconds) "
59  "are truncated, else rejected.");
60 
61  po.Read(argc, argv);
62  if (po.NumArgs() != 3) {
63  po.PrintUsage();
64  exit(1);
65  }
66 
67  std::string wav_rspecifier = po.GetArg(1);
68  std::string segments_rxfilename = po.GetArg(2);
69  std::string wav_wspecifier = po.GetArg(3);
70 
71  RandomAccessTableReader<WaveHolder> reader(wav_rspecifier);
72  TableWriter<WaveHolder> writer(wav_wspecifier);
73  Input ki(segments_rxfilename); // No binary argment: never binary.
74 
75  int32 num_lines = 0, num_success = 0;
76 
77  std::string line;
78  // Read each line from the segments file.
79  while (std::getline(ki.Stream(), line)) {
80  num_lines++;
81  std::vector<std::string> split_line;
82  // Split the line into whitespace-separated fields and verify their
83  // number. There must be 4 or 5 fields: segment name, reacording ID, start
84  // time, end time, and the optional channel number.
85  SplitStringToVector(line, " \t\r", true, &split_line);
86  if (split_line.size() != 4 && split_line.size() != 5) {
87  KALDI_WARN << "Invalid line in segments file: " << line;
88  continue;
89  }
90  std::string segment = split_line[0],
91  recording = split_line[1],
92  start_str = split_line[2],
93  end_str = split_line[3];
94 
95  // Parse the start and end times as float values. Segment is ignored if
96  // any of end times is malformed.
97  double start, end;
98  if (!ConvertStringToReal(start_str, &start)) {
99  KALDI_WARN << "Invalid line in segments file [bad start]: " << line;
100  continue;
101  }
102  if (!ConvertStringToReal(end_str, &end)) {
103  KALDI_WARN << "Invalid line in segments file [bad end]: " << line;
104  continue;
105  }
106  // Start time must be non-negative and not greater than the end time,
107  // except if the end time is -1.
108  if (start < 0 || (end != -1.0 && end <= 0) ||
109  ((start >= end) && (end > 0))) {
110  KALDI_WARN << ("Invalid line in segments file "
111  "[empty or invalid segment]: ") << line;
112  continue;
113  }
114  int32 channel = -1; // -1 means channel is unspecified.
115  // If the line has 5 elements, then the 5th element is the channel number.
116  if (split_line.size() == 5) {
117  if (!ConvertStringToInteger(split_line[4], &channel) || channel < 0) {
118  KALDI_WARN << "Invalid line in segments file [bad channel]: " << line;
119  continue;
120  }
121  }
122 
123  // Check whether the recording ID is in wav.scp; if not, skip the segment.
124  if (!reader.HasKey(recording)) {
125  KALDI_WARN << "Could not find recording " << recording
126  << ", skipping segment " << segment;
127  continue;
128  }
129 
130  const WaveData &wave = reader.Value(recording);
131  const Matrix<BaseFloat> &wave_data = wave.Data();
132  BaseFloat samp_freq = wave.SampFreq(); // Sampling fequency.
133  int32 num_samp = wave_data.NumCols(), // Number of samples in recording.
134  num_chan = wave_data.NumRows(); // Number of channels in recording.
135  BaseFloat file_length = num_samp / samp_freq; // In seconds.
136 
137  // Start must be within the wave data, otherwise skip the segment.
138  if (start < 0 || start > file_length) {
139  KALDI_WARN << "Segment start is out of file data range [0, "
140  << file_length << "s]; skipping segment '" << line << "'";
141  continue;
142  }
143 
144  // End must be less than the file length adjusted for possible overshoot;
145  // otherwise skip the segment. end == -1 passes the check.
146  if (end > file_length + max_overshoot) {
147  KALDI_WARN << "Segment end is too far out of file data range [0,"
148  << file_length << "s]; skipping segment '" << line << "'";
149  continue;
150  }
151 
152  // Otherwise ensure the end is not beyond the end of data, and default
153  // end == -1 to the end of file data.
154  if (end < 0 || end > file_length) end = file_length;
155 
156  // Skip if segment size is less than the minimum allowed.
157  if (end - start < min_segment_length) {
158  KALDI_WARN << "Segment " << segment << " too short, skipping it.";
159  continue;
160  }
161 
162  // Check that the channel is specified in the segments file for a multi-
163  // channel file, and that the channel actually exists in the wave data.
164  if (channel == -1) {
165  if (num_chan == 1) channel = 0;
166  else {
167  KALDI_ERR << ("Your data has multiple channels. You must "
168  "specify the channel in the segments file. "
169  "Skipping segment ") << segment;
170  }
171  } else {
172  if (channel >= num_chan) {
173  KALDI_WARN << "Invalid channel " << channel << " >= " << num_chan
174  << ". Skipping segment " << segment;
175  continue;
176  }
177  }
178 
179  // Convert endpoints of the segment to sample numbers. Note that the
180  // conversion requires a proper rounding.
181  int32 start_samp = static_cast<int32>(start * samp_freq + 0.5f),
182  end_samp = static_cast<int32>(end * samp_freq + 0.5f);
183 
184  if (end_samp > num_samp)
185  end_samp = num_samp;
186 
187  // Get the range of data from the orignial wave_data matrix.
188  SubMatrix<BaseFloat> segment_matrix(wave_data, channel, 1,
189  start_samp, end_samp - start_samp);
190  WaveData segment_wave(samp_freq, segment_matrix);
191  writer.Write(segment, segment_wave); // Write the range in wave format.
192  num_success++;
193  }
194  KALDI_LOG << "Successfully processed " << num_success << " lines out of "
195  << num_lines << " in the segments file. ";
196  return 0;
197  } catch(const std::exception &e) {
198  std::cerr << e.what();
199  return -1;
200  }
201 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
bool ConvertStringToInteger(const std::string &str, Int *out)
Converts a string into an integer via strtoll and returns false if there was any kind of problem (i...
Definition: text-utils.h:118
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
BaseFloat SampFreq() const
Definition: wave-reader.h:126
const Matrix< BaseFloat > & Data() const
Definition: wave-reader.h:124
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
int main(int argc, char *argv[])
This is the main program for extracting segments from a wav file.
Allows random access to a collection of objects in an archive or script file; see The Table concept...
Definition: kaldi-table.h:233
std::istream & Stream()
Definition: kaldi-io.cc:826
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
const T & Value(const std::string &key)
void SplitStringToVector(const std::string &full, const char *delim, bool omit_empty_strings, std::vector< std::string > *out)
Split a string using any of the single character delimiters.
Definition: text-utils.cc:63
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_ERR
Definition: kaldi-error.h:147
bool ConvertStringToReal(const std::string &str, T *out)
ConvertStringToReal converts a string into either float or double and returns false if there was any ...
Definition: text-utils.cc:238
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
bool HasKey(const std::string &key)
This class's purpose is to read in Wave files.
Definition: wave-reader.h:106
int NumArgs() const
Number of positional parameters (c.f. argc-1).
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
#define KALDI_LOG
Definition: kaldi-error.h:153
Sub-matrix representation.
Definition: kaldi-matrix.h:988