25 #if !defined(_MSC_VER) 26 #include <sys/types.h> 27 #include <sys/socket.h> 28 #include <netinet/in.h> 29 #include <arpa/inet.h> 52 int main(
int argc,
char** argv) {
53 using namespace kaldi;
55 #if !defined(_MSC_VER) 59 "Sends an audio file to the KALDI audio server (onlinebin/online-audio-server-decode-faster)\n" 60 "and prints the result optionally saving it to an HTK label file or WebVTT subtitles file\n\n" 61 "e.g.: ./online-audio-client 192.168.50.12 9012 'scp:wav_files.scp'\n\n";
64 bool htk =
false, vtt =
false;
66 int32 packet_size = 1024;
68 po.
Register(
"htk", &htk,
"Save the result to an HTK label file");
69 po.
Register(
"vtt", &vtt,
"Save the result to a WebVTT subtitle file");
72 "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)");
73 po.
Register(
"packet-size", &packet_size,
"Send this many bytes per packet");
81 std::string server_addr_str = po.
GetArg(1);
82 std::string server_port_str = po.
GetArg(2);
83 int32 server_port = strtol(server_port_str.c_str(), 0, 10);
84 std::string wav_rspecifier = po.
GetArg(3);
86 int32 client_desc = socket(AF_INET, SOCK_STREAM, 0);
87 if (client_desc == -1) {
88 std::cerr <<
"ERROR: couldn't create socket!\n";
95 addr = inet_addr(server_addr_str.c_str());
96 if (addr == INADDR_NONE) {
97 hp = gethostbyname(server_addr_str.c_str());
99 std::cerr <<
"ERROR: couldn't resolve host string: " 100 << server_addr_str <<
'\n';
105 addr = *((
unsigned long*) hp->h_addr);
109 server.sin_addr.s_addr = addr;
110 server.sin_family = AF_INET;
111 server.sin_port = htons(server_port);
112 if (::connect(client_desc, (
struct sockaddr*) &server,
sizeof(server))) {
113 std::cerr <<
"ERROR: couldn't connect to server!\n";
118 KALDI_VLOG(2) <<
"Connected to KALDI server at host " << server_addr_str
119 <<
" port " << server_port;
121 char* pack_buffer =
new char[packet_size];
124 for (; !reader.
Done(); reader.
Next()) {
125 std::string wav_key = reader.
Key();
132 KALDI_ERR <<
"Sampling rates other than 16kHz are not supported!";
134 int32 num_chan = wav_data.
Data().
NumRows(), this_chan = channel;
141 KALDI_WARN <<
"Channel not specified but you have data with " 142 << num_chan <<
" channels; defaulting to zero";
144 if (this_chan >= num_chan) {
145 KALDI_WARN <<
"File with id " << wav_key <<
" has " << num_chan
146 <<
" channels but you specified channel " << channel
147 <<
", producing no output.";
155 while (au_src.Read(&data)) {
156 for (int32
i = 0;
i < data.Dim();
i++) {
157 short sample = (short) data(
i);
158 memcpy(&pack_buffer[
i * 2], (
char*) &sample, 2);
161 int32 size = data.Dim() * 2;
162 WriteFull(client_desc, (
char*) &size, 4);
164 WriteFull(client_desc, pack_buffer, size);
169 WriteFull(client_desc, (
char*) &size, 4);
171 std::string reco_output;
172 std::vector<RecognizedWord> results;
173 float total_input_dur = 0.0f, total_reco_dur = 0.0f;
180 if (line.substr(0, 7) !=
"RESULT:") {
181 if (line.substr(0, 8) ==
"PARTIAL:") {
182 std::cout << line.substr(8) <<
" " << std::flush;
185 KALDI_ERR <<
"Header parse error: " << line;
188 std::cout << std::endl;
190 if (line ==
"RESULT:DONE")
197 std::string tok, key, val;
198 size_t beg = 7,
end, eq;
201 end = line.find_first_of(
',', beg);
202 tok = line.substr(beg,
end - beg);
204 eq = tok.find_first_of(
'=');
205 if (eq == std::string::npos || eq >= tok.size() - 1) {
206 KALDI_WARN <<
"Error parsing header token " << tok;
210 key = tok.substr(0, eq);
211 val = tok.substr(eq + 1);
214 res_num = strtol(val.c_str(), 0, 10);
215 }
else if (key ==
"FORMAT") {
217 KALDI_ERR <<
"Only WSE format supported by this program!";
219 }
else if (key ==
"RECO-DUR") {
220 reco_dur = strtof(val.c_str(), 0);
221 }
else if (key ==
"INPUT-DUR") {
222 input_dur = strtof(val.c_str(), 0);
226 }
while (
end != std::string::npos);
228 total_input_dur += input_dur;
229 total_reco_dur += reco_dur;
231 for (int32
i = 0;
i < res_num;
i++) {
236 std::string word_str, start_str, end_str;
238 end = line.find_first_of(
',');
239 word_str = line.substr(0,
end);
241 end = line.find_first_of(
',', beg);
242 start_str = line.substr(beg,
end - beg);
244 end = line.find_first_of(
',', beg);
245 end_str = line.substr(beg,
end - beg);
248 word.
word = word_str;
249 word.
start = strtof(start_str.c_str(), 0);
250 word.
end = strtof(end_str.c_str(), 0);
252 results.push_back(word);
254 reco_output += word_str +
" ";
259 float speed = total_input_dur / total_reco_dur;
260 KALDI_VLOG(2) <<
"Recognized (" << speed <<
"xRT): " << reco_output;
264 std::string name = wav_key +
".lab";
265 std::ofstream htk_file(name.c_str());
266 for (
size_t i = 0;
i < results.size();
i++)
267 htk_file << (
int) (results[
i].start * 10000000) <<
" " 268 << (
int) (results[
i].end * 10000000) <<
" " 269 << results[
i].
word <<
"\n";
273 if (vtt && !results.empty()) {
274 std::vector<RecognizedWord> subtitles;
277 subtitle_cue.
start = -1;
278 subtitle_cue.
end = -1;
279 subtitle_cue.
word =
"";
281 for (
size_t i = 0;
i < results.size();
i++) {
282 if (subtitle_cue.
end >= 0) {
283 if (results[
i].
start - subtitle_cue.
end > 3.0f
284 || results[
i].word.size() + subtitle_cue.
word.size() > 64) {
286 if (results[
i].
start - subtitle_cue.
end < 0.1f)
287 subtitle_cue.
end = results[
i].start - 0.1f;
289 subtitles.push_back(subtitle_cue);
290 subtitle_cue.
start = -1;
291 subtitle_cue.
end = -1;
292 subtitle_cue.
word =
"";
297 if (subtitle_cue.
start < 0)
298 subtitle_cue.
start = results[
i].start;
300 subtitle_cue.
word +=
" ";
302 subtitle_cue.
end = results[
i].end + 1.0f;
304 subtitle_cue.
word += results[
i].word;
307 subtitles.push_back(subtitle_cue);
309 std::string name = wav_key +
".vtt";
310 std::ofstream vtt_file(name.c_str());
312 vtt_file <<
"WEBVTT FILE\n\n";
314 for (
size_t i = 0;
i < subtitles.size();
i++)
315 vtt_file << (
i + 1) <<
"\n" 318 << subtitles[
i].word <<
"\n\n";
325 delete[] pack_buffer;
328 catch (
const std::exception& e) {
329 std::cerr << e.what();
341 int32 to_write = size;
343 while (to_write > 0) {
344 int32 ret = write(desc, data + wrote, to_write);
363 if (buffer_offset >= buffer_fill) {
364 buffer_fill = read(desc, read_buffer, 1024);
366 if (buffer_fill <= 0)
373 if (read_buffer[
i] ==
'\r' || read_buffer[
i] ==
'\n') {
377 buffer_offset =
i + 1;
379 if (
i < buffer_fill) {
380 if (read_buffer[
i] ==
'\n' && read_buffer[
i + 1] ==
'\r') {
381 read_buffer[
i + 1] = 0;
382 buffer_offset =
i + 2;
384 if (read_buffer[
i] ==
'\r' && read_buffer[
i + 1] ==
'\n') {
385 read_buffer[
i + 1] = 0;
386 buffer_offset =
i + 2;
408 ms = (
int32)((time - (
float) s) * 1000.0f);
414 #if !defined(_MSC_VER) 415 snprintf(buf, 64,
"%02d:%02d:%02d.%03d", h, m, s, ms);
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for...
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
bool ReadLine(int32 desc, std::string *str)
BaseFloat SampFreq() const
const Matrix< BaseFloat > & Data() const
void Register(const std::string &name, bool *ptr, const std::string &doc)
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
This class's purpose is to read in Wave files.
int NumArgs() const
Number of positional parameters (cf. argc-1).
std::string TimeToTimecode(float time)
A class representing a vector.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
bool WriteFull(int32 desc, char *data, int32 size)
int main(int argc, char **argv)