#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "nnet3/nnet-am-decodable-simple.h"
#include "base/timer.h"
#include "nnet3/nnet-utils.h"

Include dependency graph for nnet3-xvector-compute-batched.cc:

Classes
struct	BatchedXvectorComputerOptions

class	BatchedXvectorComputer

struct	BatchedXvectorComputer::XvectorTask

Namespaces
	kaldi
	This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for mispronunciation detection tasks; the reference:

	kaldi::nnet3

Functions
void	DivideIntoPieces (int32 a, int32 b, std::vector< int32 > *pieces)
	This function divides the number 'a' into 'b' pieces, such that the sum of the pieces equals 'a' and no two pieces differ by more than 1. More...

int	main (int argc, char *argv[])

Function Documentation

◆ main()

int main	(	int	argc,
		char *	argv[]
	)

Definition at line 524 of file nnet3-xvector-compute-batched.cc.

References BatchedXvectorComputer::AcceptUtterance(), kaldi::nnet3::CollapseModel(), kaldi::nnet3::ComputeSimpleNnetContext(), SequentialTableReader< Holder >::Done(), Timer::Elapsed(), BatchedXvectorComputer::Flush(), ParseOptions::GetArg(), KALDI_LOG, KALDI_WARN, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), BatchedXvectorComputer::OutputXvector(), ParseOptions::PrintUsage(), ParseOptions::Read(), kaldi::ReadKaldiObject(), BatchedXvectorComputerOptions::Register(), ParseOptions::Register(), kaldi::nnet3::SetBatchnormTestMode(), kaldi::nnet3::SetDropoutTestMode(), kaldi::nnet3::SetRequireDirectInput(), SequentialTableReader< Holder >::Value(), TableWriter< Holder >::Write(), and BatchedXvectorComputer::XvectorReady().

                                  {
   try {
     using namespace kaldi;
     using namespace kaldi::nnet3;
     typedef kaldi::int32 int32;
     typedef kaldi::int64 int64;
 
     const char *usage =
         "Propagate features through an xvector neural network model and write\n"
         "the output vectors.  \"Xvector\" is our term for a vector or\n"
         "embedding which is the output of a particular type of neural network\n"
         "architecture found in speaker recognition.  This architecture\n"
         "consists of several layers that operate on frames, a statistics\n"
         "pooling layer that aggregates over the frame-level representations\n"
         "and possibly additional layers that operate on segment-level\n"
         "representations.  The xvectors are generally extracted from an\n"
         "output layer after the statistics pooling layer.  By default, one\n"
         "xvector is extracted directly from the set of features for each\n"
         "utterance.  Optionally, xvectors are extracted from chunks of input\n"
         "features and averaged, to produce a single vector.\n"
         "\n"
         "Usage: nnet3-xvector-compute [options] <raw-nnet-in> "
         "<features-rspecifier> <vector-wspecifier>\n"
         "e.g.: nnet3-xvector-compute final.raw scp:feats.scp "
         "ark:nnet_prediction.ark\n"
         "See also: nnet3-compute\n";
 
     ParseOptions po(usage);
     Timer timer;
 
     BatchedXvectorComputerOptions opts;
 
     std::string use_gpu = "no";
 
     opts.Register(&po);
 
     po.Register("use-gpu", &use_gpu,
       "yes|no|optional|wait, only has effect if compiled with CUDA");
 
 #if HAVE_CUDA==1
     CuDevice::RegisterDeviceOptions(&po);
 #endif
     po.Read(argc, argv);
 
     if (po.NumArgs() != 3) {
       po.PrintUsage();
       exit(1);
     }
 
 #if HAVE_CUDA==1
     CuDevice::Instantiate().SelectGpuId(use_gpu);
 #endif
 
     std::string nnet_rxfilename = po.GetArg(1),
                 feature_rspecifier = po.GetArg(2),
                 vector_wspecifier = po.GetArg(3);
 
     Nnet nnet;
     ReadKaldiObject(nnet_rxfilename, &nnet);
     SetBatchnormTestMode(true, &nnet);
     SetDropoutTestMode(true, &nnet);
     CollapseModel(CollapseModelConfig(), &nnet);
 
     int32 total_context;
     {
       int32 left_context, right_context;
       // Compute left_context, right_context as the 'real' left/right context
       // of the network; they'll tell us how many frames on the chunk boundaries
       // won't really participate in the statistics averaging.
       // SetRequireDirectInput()  modifies how the StatisticsPoolingComponent
       // treats its dependences, so we'll get the 'real' left/right context.
       SetRequireDirectInput(true, &nnet);
       ComputeSimpleNnetContext(nnet, &left_context, &right_context);
       KALDI_LOG << "Left/right context is " << left_context << ", "
                 << right_context;
       SetRequireDirectInput(false, &nnet);
       total_context = left_context + right_context;
     }
 
     BatchedXvectorComputer computer(opts, nnet, total_context);
     BaseFloatVectorWriter vector_writer(vector_wspecifier);
 
     int32 num_utts_read = 0, num_xvectors_written = 0;
     int64 frame_count = 0;
 
     SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
 
     for (; !feature_reader.Done(); feature_reader.Next()) {
       std::string utt = feature_reader.Key();
       const Matrix<BaseFloat> &features (feature_reader.Value());
       if (features.NumRows() == 0) {
         KALDI_WARN << "Zero-length utterance: " << utt;
         continue;
       }
 
       frame_count += features.NumRows();
 
       computer.AcceptUtterance(utt, features);
       num_utts_read++;
 
       while (computer.XvectorReady()) {
         std::string utt;
         Vector<BaseFloat> xvector;
         computer.OutputXvector(&utt, &xvector);
         vector_writer.Write(utt, xvector);
         num_xvectors_written++;
       }
     }
 
     computer.Flush();
     while (computer.XvectorReady()) {
       std::string utt;
       Vector<BaseFloat> xvector;
       computer.OutputXvector(&utt, &xvector);
       vector_writer.Write(utt, xvector);
       num_xvectors_written++;
     }
 
 
 #if HAVE_CUDA==1
     CuDevice::Instantiate().PrintProfile();
 #endif
     double elapsed = timer.Elapsed();
     KALDI_LOG << "Time taken "<< elapsed
               << "s: real-time factor assuming 100 frames/sec is "
               << (elapsed*100.0/frame_count);
     KALDI_LOG << "Read " << num_utts_read << " utterances, wrote "
               << num_xvectors_written << " xvectors.";
 
     // Note: the following rule does something reasonable even if there are 0, 1
     // or 2 utterances read.
     if (num_xvectors_written > num_utts_read / 2)
       return 0;
     else
       return 1;
   } catch(const std::exception &e) {
     std::cerr << e.what();
     return -1;
   }
 }

Classes

Namespaces

Functions

Function Documentation

◆ main()