nnet3-xvector-compute.cc
Go to the documentation of this file.
1 // nnet3bin/nnet3-xvector-compute.cc
2 
3 // Copyright 2017 Johns Hopkins University (author: Daniel Povey)
4 // 2017 Johns Hopkins University (author: Daniel Garcia-Romero)
5 // 2017 David Snyder
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 
23 #include "base/kaldi-common.h"
24 #include "util/common-utils.h"
26 #include "base/timer.h"
27 #include "nnet3/nnet-utils.h"
28 
29 namespace kaldi {
30 namespace nnet3 {
31 
32 // Computes an xvector from a chunk of speech features.
33 static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
34  const Nnet &nnet, CachingOptimizingCompiler *compiler,
35  Vector<BaseFloat> *xvector) {
36  ComputationRequest request;
37  request.need_model_derivative = false;
38  request.store_component_stats = false;
39  request.inputs.push_back(
40  IoSpecification("input", 0, features.NumRows()));
41  IoSpecification output_spec;
42  output_spec.name = "output";
43  output_spec.has_deriv = false;
44  output_spec.indexes.resize(1);
45  request.outputs.resize(1);
46  request.outputs[0].Swap(&output_spec);
47  std::shared_ptr<const NnetComputation> computation(compiler->Compile(request));
48  Nnet *nnet_to_update = NULL; // we're not doing any update.
49  NnetComputer computer(NnetComputeOptions(), *computation,
50  nnet, nnet_to_update);
51  CuMatrix<BaseFloat> input_feats_cu(features);
52  computer.AcceptInput("input", &input_feats_cu);
53  computer.Run();
54  CuMatrix<BaseFloat> cu_output;
55  computer.GetOutputDestructive("output", &cu_output);
56  xvector->Resize(cu_output.NumCols());
57  xvector->CopyFromVec(cu_output.Row(0));
58 }
59 
60 } // namespace nnet3
61 } // namespace kaldi
62 
63 int main(int argc, char *argv[]) {
64  try {
65  using namespace kaldi;
66  using namespace kaldi::nnet3;
67  typedef kaldi::int32 int32;
68  typedef kaldi::int64 int64;
69 
70  const char *usage =
71  "Propagate features through an xvector neural network model and write\n"
72  "the output vectors. \"Xvector\" is our term for a vector or\n"
73  "embedding which is the output of a particular type of neural network\n"
74  "architecture found in speaker recognition. This architecture\n"
75  "consists of several layers that operate on frames, a statistics\n"
76  "pooling layer that aggregates over the frame-level representations\n"
77  "and possibly additional layers that operate on segment-level\n"
78  "representations. The xvectors are generally extracted from an\n"
79  "output layer after the statistics pooling layer. By default, one\n"
80  "xvector is extracted directly from the set of features for each\n"
81  "utterance. Optionally, xvectors are extracted from chunks of input\n"
82  "features and averaged, to produce a single vector.\n"
83  "\n"
84  "Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
85  "<features-rspecifier> <vector-wspecifier>\n"
86  "e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
87  "ark:nnet_prediction.ark\n"
88  "See also: nnet3-compute\n";
89 
90  ParseOptions po(usage);
91  Timer timer;
92 
94  CachingOptimizingCompilerOptions compiler_config;
95 
96  opts.acoustic_scale = 1.0; // by default do no scaling in this recipe.
97 
98  std::string use_gpu = "no";
99  std::string cached_compiler_in;
100  std::string cached_compiler_out;
101  int32 chunk_size = -1,
102  min_chunk_size = 100;
103  bool pad_input = true;
104 
105  opts.Register(&po);
106  compiler_config.Register(&po);
107 
108  po.Register("use-gpu", &use_gpu,
109  "yes|no|optional|wait, only has effect if compiled with CUDA");
110  po.Register("chunk-size", &chunk_size,
111  "If set, extracts xectors from specified chunk-size, and averages. "
112  "If not set, extracts an xvector from all available features.");
113  po.Register("min-chunk-size", &min_chunk_size,
114  "Minimum chunk-size allowed when extracting xvectors.");
115  po.Register("pad-input", &pad_input, "If true, duplicate the first and "
116  "last frames of the input features as required to equal min-chunk-size.");
117  po.Register("cached-compiler-in", &cached_compiler_in,
118  "If set, read the cached compiler from the specified file path.");
119  po.Register("cached-compiler-out", &cached_compiler_out,
120  "If set, write the cached compiler to the specified file path.");
121 
122 #if HAVE_CUDA==1
123  CuDevice::RegisterDeviceOptions(&po);
124 #endif
125 
126  po.Read(argc, argv);
127 
128  if (po.NumArgs() != 3) {
129  po.PrintUsage();
130  exit(1);
131  }
132 
133 #if HAVE_CUDA==1
134  CuDevice::Instantiate().SelectGpuId(use_gpu);
135 #endif
136 
137  std::string nnet_rxfilename = po.GetArg(1),
138  feature_rspecifier = po.GetArg(2),
139  vector_wspecifier = po.GetArg(3);
140 
141  Nnet nnet;
142  ReadKaldiObject(nnet_rxfilename, &nnet);
143  SetBatchnormTestMode(true, &nnet);
144  SetDropoutTestMode(true, &nnet);
146 
147  CachingOptimizingCompiler compiler(nnet, opts.optimize_config, compiler_config);
148 
149  if (!cached_compiler_in.empty()) {
150  KALDI_LOG << "Reading cache from " << cached_compiler_in;
151  bool cache_binary_in;
152  Input ki(cached_compiler_in, &cache_binary_in);
153  compiler.ReadCache(ki.Stream(), cache_binary_in);
154  }
155 
156  BaseFloatVectorWriter vector_writer(vector_wspecifier);
157 
158  int32 num_success = 0, num_fail = 0;
159  int64 frame_count = 0;
160  int32 xvector_dim = nnet.OutputDim("output");
161 
162  SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
163 
164  for (; !feature_reader.Done(); feature_reader.Next()) {
165  std::string utt = feature_reader.Key();
166  const Matrix<BaseFloat> &features (feature_reader.Value());
167  if (features.NumRows() == 0) {
168  KALDI_WARN << "Zero-length utterance: " << utt;
169  num_fail++;
170  continue;
171  }
172  int32 num_rows = features.NumRows(),
173  feat_dim = features.NumCols(),
174  this_chunk_size = chunk_size;
175  if (!pad_input && num_rows < min_chunk_size) {
176  KALDI_WARN << "Minimum chunk size of " << min_chunk_size
177  << " is greater than the number of rows "
178  << "in utterance: " << utt;
179  num_fail++;
180  continue;
181  } else if (num_rows < chunk_size) {
182  KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
183  << "the number of rows in utterance: " << utt
184  << ", using chunk size of " << num_rows;
185  this_chunk_size = num_rows;
186  } else if (chunk_size == -1) {
187  this_chunk_size = num_rows;
188  }
189 
190  int32 num_chunks = ceil(
191  num_rows / static_cast<BaseFloat>(this_chunk_size));
192  Vector<BaseFloat> xvector_avg(xvector_dim, kSetZero);
193  BaseFloat tot_weight = 0.0;
194 
195  // Iterate over the feature chunks.
196  for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
197  // If we're nearing the end of the input, we may need to shift the
198  // offset back so that we can get this_chunk_size frames of input to
199  // the nnet.
200  int32 offset = std::min(
201  this_chunk_size, num_rows - chunk_indx * this_chunk_size);
202  if (!pad_input && offset < min_chunk_size)
203  continue;
204  SubMatrix<BaseFloat> sub_features(
205  features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
206  Vector<BaseFloat> xvector;
207  tot_weight += offset;
208 
209  // Pad input if the offset is less than the minimum chunk size
210  if (pad_input && offset < min_chunk_size) {
211  Matrix<BaseFloat> padded_features(min_chunk_size, feat_dim);
212  int32 left_context = (min_chunk_size - offset) / 2;
213  int32 right_context = min_chunk_size - offset - left_context;
214  for (int32 i = 0; i < left_context; i++) {
215  padded_features.Row(i).CopyFromVec(sub_features.Row(0));
216  }
217  for (int32 i = 0; i < right_context; i++) {
218  padded_features.Row(min_chunk_size - i - 1).CopyFromVec(sub_features.Row(offset - 1));
219  }
220  padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(sub_features);
221  RunNnetComputation(padded_features, nnet, &compiler, &xvector);
222  } else {
223  RunNnetComputation(sub_features, nnet, &compiler, &xvector);
224  }
225  xvector_avg.AddVec(offset, xvector);
226  }
227  xvector_avg.Scale(1.0 / tot_weight);
228  vector_writer.Write(utt, xvector_avg);
229 
230  frame_count += features.NumRows();
231  num_success++;
232  }
233 
234 #if HAVE_CUDA==1
235  CuDevice::Instantiate().PrintProfile();
236 #endif
237  double elapsed = timer.Elapsed();
238  KALDI_LOG << "Time taken "<< elapsed
239  << "s: real-time factor assuming 100 frames/sec is "
240  << (elapsed*100.0/frame_count);
241  KALDI_LOG << "Done " << num_success << " utterances, failed for "
242  << num_fail;
243 
244  if (!cached_compiler_out.empty()) {
245  KALDI_LOG << "Writing cache to " << cached_compiler_out;
246  bool binary_write = true;
247  Output ko(cached_compiler_out, &binary_write);
248  compiler.WriteCache(ko.Stream(), binary_write);
249  }
250 
251  if (num_success != 0) return 0;
252  else return 1;
253  } catch(const std::exception &e) {
254  std::cerr << e.what();
255  return -1;
256  }
257 }
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void CollapseModel(const CollapseModelConfig &config, Nnet *nnet)
This function modifies the neural net for efficiency, in a way that suitable to be done in test time...
Definition: nnet-utils.cc:2100
bool store_component_stats
you should set need_component_stats to true if you need the average-activation and average-derivative...
const CuSubVector< Real > Row(MatrixIndexT i) const
Definition: cu-matrix.h:670
bool need_model_derivative
if need_model_derivative is true, then we'll be doing either model training or model-derivative compu...
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
This class enables you to do the compilation and optimization in one call, and also ensures that if t...
void SetBatchnormTestMode(bool test_mode, Nnet *nnet)
This function affects only components of type BatchNormComponent.
Definition: nnet-utils.cc:564
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368
kaldi::int32 int32
std::vector< IoSpecification > inputs
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void Resize(MatrixIndexT length, MatrixResizeType resize_type=kSetZero)
Set vector to a specified size (can be zero).
void Write(const std::string &key, const T &value) const
int32 OutputDim(const std::string &output_name) const
Definition: nnet-nnet.cc:677
void Register(const std::string &name, bool *ptr, const std::string &doc)
void ReadKaldiObject(const std::string &filename, Matrix< float > *m)
Definition: kaldi-io.cc:832
This file contains some miscellaneous functions dealing with class Nnet.
static void RunNnetComputation(const MatrixBase< BaseFloat > &features, const Nnet &nnet, CachingOptimizingCompiler *compiler, Vector< BaseFloat > *xvector)
void SetDropoutTestMode(bool test_mode, Nnet *nnet)
This function affects components of child-classes of RandomComponent.
Definition: nnet-utils.cc:573
void AcceptInput(const std::string &node_name, CuMatrix< BaseFloat > *input)
e.g.
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
std::istream & Stream()
Definition: kaldi-io.cc:826
float BaseFloat
Definition: kaldi-types.h:29
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36
std::ostream & Stream()
Definition: kaldi-io.cc:701
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
#define KALDI_WARN
Definition: kaldi-error.h:150
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
void Scale(Real alpha)
Multiplies all elements by this constant.
std::shared_ptr< const NnetComputation > Compile(const ComputationRequest &request)
Does the compilation and returns a const pointer to the result, which is owned by this class...
int NumArgs() const
Number of positional parameters (c.f. argc-1).
void ReadCache(std::istream &is, bool binary)
std::vector< Index > indexes
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
void WriteCache(std::ostream &os, bool binary)
A class representing a vector.
Definition: kaldi-vector.h:406
class NnetComputer is responsible for executing the computation described in the "computation" object...
Definition: nnet-compute.h:59
std::vector< IoSpecification > outputs
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
SubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Return a sub-part of matrix.
Definition: kaldi-matrix.h:202
int main(int argc, char *argv[])
void GetOutputDestructive(const std::string &output_name, CuMatrix< BaseFloat > *output)
#define KALDI_LOG
Definition: kaldi-error.h:153
void AddVec(const Real alpha, const VectorBase< OtherReal > &v)
Add vector : *this = *this + alpha * rv (with casting between floats and doubles) ...
double Elapsed() const
Returns time in seconds.
Definition: timer.h:74
Sub-matrix representation.
Definition: kaldi-matrix.h:988
Config class for the CollapseModel function.
Definition: nnet-utils.h:240
void Run()
This does either the forward or backward computation, depending when it is called (in a typical compu...