45 "Size of chunk, in input frames. Includes the nnet " 46 "context, so the number of chunks will be more than " 47 "total-input-frames / chunk-size.");
49 "Size of the batches of chunks that we compute at once. ");
51 "If true, for utterances shorter than `chunk-size` frames " 52 "we will pad with repeats of the last frame.");
83 int32 piece_size1 = a / b,
84 piece_size2 = piece_size1 + 1,
86 int32 num_pieces_of_size1 = b - remainder,
87 num_pieces_of_size2 = remainder;
89 num_pieces_of_size2 * piece_size2);
91 for (
int32 i = 0;
i < num_pieces_of_size1;
i++)
92 pieces->push_back(piece_size1 * a_sign);
93 for (
int32 i = 0;
i < num_pieces_of_size2;
i++)
94 pieces->push_back(piece_size2 * a_sign);
113 int32 total_context);
119 void AcceptUtterance(
const std::string &utt,
126 bool XvectorReady()
const;
139 void OutputXvector(std::string *utt,
197 void SplitUtteranceIntoChunks(
int32 num_frames,
198 std::vector<int32> *start_frames);
211 void ComputeOneBatch();
276 const std::string &utt,
int32 num_chunks) {
281 task->
xvector.Resize(xvector_dim_);
284 results_tail_->
tail = task;
285 results_tail_ = task;
287 results_head_ = task;
288 results_tail_ = task;
296 int32 total_context):
298 total_context_(total_context),
300 position_in_batch_(0),
302 results_tail_(NULL) {
322 input.
name =
"input";
338 output.
name =
"output";
347 request.
outputs.push_back(output);
360 num_input_frames = input.
NumRows();
363 KALDI_ERR <<
"Feature dimension mismatch: neural net expected " 366 for (
int32 t = 0; t < T; t++) {
368 int32 src_t = t + chunk_start;
369 if (src_t >= num_input_frames) {
371 src_t = num_input_frames - 1;
393 if (new_tail == NULL)
407 Nnet *nnet_to_update = NULL;
409 nnet_, nnet_to_update);
429 const std::string &utt,
431 std::vector<int32> chunk_starts;
434 int32 num_chunks = chunk_starts.size();
437 for (
int32 i = 0;
i < num_chunks;
i++) {
446 int32 num_frames, std::vector<int32> *start_frames) {
447 start_frames->clear();
450 start_frames->push_back(0);
461 KALDI_ASSERT(modified_num_frames > modified_chunk_size);
462 int32 num_chunks1 = modified_num_frames / modified_chunk_size,
463 num_chunks2 = num_chunks1 + 1;
464 int32 num_frames1 = num_chunks1 * modified_chunk_size,
465 num_frames2 = num_chunks2 * modified_chunk_size;
473 int32 N = num_frames2 - modified_num_frames,
474 M = modified_num_frames - N;
475 KALDI_ASSERT(M + 2*N == num_frames2 && M + N == modified_num_frames);
482 variance2 = (M + 4.0*N) / ((M + 2.0*N)*(M + 2.0*N));
483 if (variance1 <= variance2) {
489 int32 num_chunks = num_chunks1,
490 num_gaps = num_chunks + 1,
491 total_gap = modified_num_frames - num_chunks * modified_chunk_size;
492 KALDI_ASSERT(0 <= total_gap && total_gap < modified_chunk_size);
493 std::vector<int32> gap_sizes;
495 int32 pos = gap_sizes[0];
496 for (
int32 i = 0;
i < num_chunks;
i++) {
497 start_frames->push_back(pos);
498 pos += modified_chunk_size + gap_sizes[
i + 1];
502 int32 num_chunks = num_chunks2,
503 num_overlaps = num_chunks - 1,
504 total_overlap = modified_num_frames - num_chunks * modified_chunk_size;
505 KALDI_ASSERT( -modified_chunk_size < total_overlap && total_overlap <= 0 );
506 std::vector<int32> overlap_sizes;
509 for (
int32 i = 0;
i < num_chunks;
i++) {
510 start_frames->push_back(pos);
511 pos += modified_chunk_size;
512 if (
i < num_overlaps)
513 pos += overlap_sizes[
i];
524 int main(
int argc,
char *argv[]) {
526 using namespace kaldi;
529 typedef kaldi::int64 int64;
532 "Propagate features through an xvector neural network model and write\n" 533 "the output vectors. \"Xvector\" is our term for a vector or\n" 534 "embedding which is the output of a particular type of neural network\n" 535 "architecture found in speaker recognition. This architecture\n" 536 "consists of several layers that operate on frames, a statistics\n" 537 "pooling layer that aggregates over the frame-level representations\n" 538 "and possibly additional layers that operate on segment-level\n" 539 "representations. The xvectors are generally extracted from an\n" 540 "output layer after the statistics pooling layer. By default, one\n" 541 "xvector is extracted directly from the set of features for each\n" 542 "utterance. Optionally, xvectors are extracted from chunks of input\n" 543 "features and averaged, to produce a single vector.\n" 545 "Usage: nnet3-xvector-compute [options] <raw-nnet-in> " 546 "<features-rspecifier> <vector-wspecifier>\n" 547 "e.g.: nnet3-xvector-compute final.raw scp:feats.scp " 548 "ark:nnet_prediction.ark\n" 549 "See also: nnet3-compute\n";
556 std::string use_gpu =
"no";
561 "yes|no|optional|wait, only has effect if compiled with CUDA");
564 CuDevice::RegisterDeviceOptions(&po);
574 CuDevice::Instantiate().SelectGpuId(use_gpu);
577 std::string nnet_rxfilename = po.
GetArg(1),
578 feature_rspecifier = po.
GetArg(2),
579 vector_wspecifier = po.
GetArg(3);
589 int32 left_context, right_context;
597 KALDI_LOG <<
"Left/right context is " << left_context <<
", " 600 total_context = left_context + right_context;
606 int32 num_utts_read = 0, num_xvectors_written = 0;
607 int64 frame_count = 0;
611 for (; !feature_reader.
Done(); feature_reader.
Next()) {
612 std::string utt = feature_reader.
Key();
614 if (features.NumRows() == 0) {
615 KALDI_WARN <<
"Zero-length utterance: " << utt;
619 frame_count += features.NumRows();
628 vector_writer.
Write(utt, xvector);
629 num_xvectors_written++;
638 vector_writer.
Write(utt, xvector);
639 num_xvectors_written++;
644 CuDevice::Instantiate().PrintProfile();
646 double elapsed = timer.
Elapsed();
648 <<
"s: real-time factor assuming 100 frames/sec is " 649 << (elapsed*100.0/frame_count);
650 KALDI_LOG <<
"Read " << num_utts_read <<
" utterances, wrote " 651 << num_xvectors_written <<
" xvectors.";
655 if (num_xvectors_written > num_utts_read / 2)
659 }
catch(
const std::exception &e) {
660 std::cerr << e.what();
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
void Register(OptionsItf *opts)
int32 InputDim(const std::string &input_name) const
void CollapseModel(const CollapseModelConfig &config, Nnet *nnet)
This function modifies the neural net for efficiency, in a way that is suitable to be done at test time...
bool store_component_stats
You should set store_component_stats to true if you need the average-activation and average-derivative...
void Flush()
Calling this will force any partial minibatch to be computed, so that any utterances that have previo...
bool need_model_derivative
if need_model_derivative is true, then we'll be doing either model training or model-derivative compu...
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
void DivideIntoPieces(int32 a, int32 b, std::vector< int32 > *pieces)
This function divides the number 'a' into 'b' pieces, such that the sum of the pieces equals 'a' and ...
void AddChunkToBatch(XvectorTask *task, const Matrix< BaseFloat > &input, int32 chunk_start)
Adds a new chunk to a batch we are preparing.
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
CachingOptimizingCompilerOptions compiler_config
NnetOptimizeOptions optimize_config
This class enables you to do the compilation and optimization in one call, and also ensures that if t...
std::shared_ptr< const NnetComputation > computation_
The compiled computation (will be the same for every batch).
Matrix< BaseFloat > input_feats_
Staging area for the input features prior to copying them to GPU.
void Register(OptionsItf *opts)
XvectorTask * CreateTask(const std::string &utt, int32 num_chunks)
This adds a newly created XvectorTask at the tail of the singly linked list whose (head...
void SetBatchnormTestMode(bool test_mode, Nnet *nnet)
This function affects only components of type BatchNormComponent.
A templated class for writing objects to an archive or script file; see The Table concept...
XvectorTask * results_head_
std::vector< IoSpecification > inputs
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
void Write(const std::string &key, const T &value) const
int32 OutputDim(const std::string &output_name) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
virtual void Register(const std::string &name, bool *ptr, const std::string &doc)=0
struct Index is intended to represent the various indexes by which we number the rows of the matrices...
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
This file contains some miscellaneous functions dealing with class Nnet.
void SetDropoutTestMode(bool test_mode, Nnet *nnet)
This function affects components of child-classes of RandomComponent.
void AcceptInput(const std::string &node_name, CuMatrix< BaseFloat > *input)
e.g.
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
void OutputXvector(std::string *utt, Vector< BaseFloat > *xvector)
This function, which must only be called if XvectorReady() has just returned true, outputs an xvector for an utterance.
Vector< BaseFloat > xvector
void ComputeOneBatch()
Does the nnet computation for one batch and distributes the computed x-vectors (of chunks) appropriat...
void AcceptUtterance(const std::string &utt, const Matrix< BaseFloat > &input)
Accepts an utterance to process into an xvector, and, if one or more batches become full...
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
std::vector< XvectorTask * > tasks_this_batch_
tasks_this_batch_ is of dimension opts_.batch_size.
void ComputeSimpleNnetContext(const Nnet &nnet, int32 *left_context, int32 *right_context)
ComputeSimpleNnetContext computes the left-context and right-context of a nnet.
int32 num_chunks_finished
void Swap(Vector< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
BatchedXvectorComputer(const BatchedXvectorComputerOptions &opts, const Nnet &nnet, int32 total_context)
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
void Register(OptionsItf *po)
const BatchedXvectorComputerOptions & opts_
void Register(OptionsItf *opts)
std::shared_ptr< const NnetComputation > Compile(const ComputationRequest &request)
Does the compilation and returns a const pointer to the result, which is owned by this class...
int NumArgs() const
Number of positional parameters (c.f. argc-1).
std::vector< Index > indexes
A class representing a vector.
class NnetComputer is responsible for executing the computation described in the "computation" object...
int32 position_in_batch_
position_in_batch_ is the number of chunks that we have filled in in the input_feats_ matrix and task...
#define KALDI_ASSERT(cond)
std::vector< IoSpecification > outputs
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
void SetRequireDirectInput(bool b, Nnet *nnet)
Calls the corresponding function in any component of type StatisticsPoolingComponent; used as a way t...
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
NnetComputeOptions compute_config
void GetOutputDestructive(const std::string &output_name, CuMatrix< BaseFloat > *output)
MatrixIndexT NumRows() const
Dimensions.
XvectorTask * results_tail_
double Elapsed() const
Returns time in seconds.
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
void SplitUtteranceIntoChunks(int32 num_frames, std::vector< int32 > *start_frames)
This decides how to split the utterance into chunks.
int main(int argc, char *argv[])
bool XvectorReady() const
Returns true if at least one xvector is pending output (i.e.
Config class for the CollapseModel function.
void Run()
This does either the forward or backward computation, depending when it is called (in a typical compu...