NnetBatchComputer Class Reference

This class does neural net inference in a way that is optimized for GPU use: it combines chunks of multiple utterances into minibatches for more efficient computation. More...

#include <nnet-batch-compute.h>


Classes

struct  ComputationGroupInfo
 
struct  ComputationGroupKey
 
struct  ComputationGroupKeyHasher
 
struct  MinibatchSizeInfo
 

Public Member Functions

 NnetBatchComputer (const NnetBatchComputerOptions &opts, const Nnet &nnet, const VectorBase< BaseFloat > &priors)
 Constructor. More...
 
void AcceptTask (NnetInferenceTask *task, int32 max_minibatches_full=-1)
 Accepts a task, meaning the task will be queued. More...
 
int32 NumFullPendingMinibatches () const
 Returns the number of full minibatches waiting to be computed. More...
 
bool Compute (bool allow_partial_minibatch)
 Does some kind of computation, choosing the highest-priority thing to compute. More...
 
void SplitUtteranceIntoTasks (bool output_to_cpu, const Matrix< BaseFloat > &input, const Vector< BaseFloat > *ivector, const Matrix< BaseFloat > *online_ivectors, int32 online_ivector_period, std::vector< NnetInferenceTask > *tasks)
 Split a single utterance into a list of separate tasks which can then be given to this class by AcceptTask(). More...
 
void SplitUtteranceIntoTasks (bool output_to_cpu, const CuMatrix< BaseFloat > &input, const CuVector< BaseFloat > *ivector, const CuMatrix< BaseFloat > *online_ivectors, int32 online_ivector_period, std::vector< NnetInferenceTask > *tasks)
 
const NnetBatchComputerOptions & GetOptions ()
 
 ~NnetBatchComputer ()
 

Private Types

typedef unordered_map< ComputationGroupKey, ComputationGroupInfo, ComputationGroupKeyHasher > MapType
 

Private Member Functions

 KALDI_DISALLOW_COPY_AND_ASSIGN (NnetBatchComputer)
 
double GetPriority (bool allow_partial_minibatch, const ComputationGroupInfo &info) const
 
int32 GetMinibatchSize (const ComputationGroupInfo &info) const
 
std::shared_ptr< const NnetComputation > GetComputation (const ComputationGroupInfo &info, int32 minibatch_size)
 
int32 GetActualMinibatchSize (const ComputationGroupInfo &info) const
 
void GetHighestPriorityTasks (int32 num_tasks, ComputationGroupInfo *info, std::vector< NnetInferenceTask *> *tasks)
 
MinibatchSizeInfo * GetHighestPriorityComputation (bool allow_partial_minibatch, int32 *minibatch_size, std::vector< NnetInferenceTask *> *tasks)
 This function finds and returns the computation corresponding to the highest-priority group of tasks. More...
 
void FormatInputs (int32 minibatch_size, const std::vector< NnetInferenceTask *> &tasks, CuMatrix< BaseFloat > *input, CuMatrix< BaseFloat > *ivector)
Formats the inputs to the computation and transfers them to the GPU. More...
 
void FormatOutputs (const CuMatrix< BaseFloat > &output, const std::vector< NnetInferenceTask *> &tasks)
 
void CheckAndFixConfigs ()
 
void PrintMinibatchStats ()
 

Static Private Member Functions

static void GetComputationRequest (const NnetInferenceTask &task, int32 minibatch_size, ComputationRequest *request)
 

Private Attributes

NnetBatchComputerOptions opts_
 
const Nnet & nnet_
 
CachingOptimizingCompiler compiler_
 
CuVector< BaseFloat > log_priors_
 
std::mutex mutex_
 
MapType tasks_
 
int32 num_full_minibatches_
 
std::unordered_map< int32, std::condition_variable * > no_more_than_n_minibatches_full_
 
int32 nnet_left_context_
 
int32 nnet_right_context_
 
int32 input_dim_
 
int32 ivector_dim_
 
int32 output_dim_
 

Detailed Description

This class does neural net inference in a way that is optimized for GPU use: it combines chunks of multiple utterances into minibatches for more efficient computation.

It does the computation in one background thread that accesses the GPU. It is thread safe, i.e. you can call it from multiple threads without having to worry about data races and the like.

Definition at line 207 of file nnet-batch-compute.h.
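
The following is a minimal usage sketch, not taken from the Kaldi sources; 'nnet' (an already-loaded Nnet) and 'feats' (a Matrix<BaseFloat> of input features) are assumed names, and error handling is omitted. It shows the intended flow: split an utterance into chunk-sized tasks, queue them with AcceptTask(), drive Compute(), then wait on each task's semaphore.

// Sketch only: assumes 'nnet' and 'feats' have already been loaded by the caller.
NnetBatchComputerOptions opts;
Vector<BaseFloat> priors;  // empty: no prior subtraction (e.g. a 'chain' model)
NnetBatchComputer computer(opts, nnet, priors);

std::vector<NnetInferenceTask> tasks;
computer.SplitUtteranceIntoTasks(/*output_to_cpu=*/true, feats,
                                 /*ivector=*/NULL, /*online_ivectors=*/NULL,
                                 /*online_ivector_period=*/0, &tasks);
for (size_t i = 0; i < tasks.size(); i++)
  computer.AcceptTask(&tasks[i]);

// Normally Compute() is driven from a separate background thread (as in
// NnetBatchInference); calling it inline keeps this sketch single-threaded.
while (computer.Compute(/*allow_partial_minibatch=*/true)) { }
for (size_t i = 0; i < tasks.size(); i++)
  tasks[i].semaphore.Wait();  // each task's output is ready once signalled
// tasks[i].output_cpu now holds the scaled (log-)output for that chunk.

In real use the per-chunk outputs still need to be stitched back into a single utterance-level matrix; the wrapper classes NnetBatchInference and NnetBatchDecoder take care of that pattern.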

Member Typedef Documentation

◆ MapType

typedef unordered_map< ComputationGroupKey, ComputationGroupInfo, ComputationGroupKeyHasher > MapType
private

Definition at line 344 of file nnet-batch-compute.h.

Constructor & Destructor Documentation

◆ NnetBatchComputer()

NnetBatchComputer ( const NnetBatchComputerOptions &  opts,
const Nnet &  nnet,
const VectorBase< BaseFloat > &  priors 
)

Constructor.

It stores references to all the arguments, so don't delete them until this object goes out of scope.

Parameters
[in]  opts    Options struct
[in]  nnet    The neural net which we'll be doing the computation with
[in]  priors  Either the empty vector, or a vector of prior probabilities which we'll take the log of and subtract from the neural net outputs (e.g. used in non-chain systems).

Definition at line 31 of file nnet-batch-compute.cc.
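
As an illustrative sketch (not from the Kaldi sources; 'nnet' is assumed to be an already-loaded Nnet, and 'am_nnet' an AmNnetSimple wrapper), construction might look like:

NnetBatchComputerOptions opts;
opts.minibatch_size = 128;      // tune to the GPU's memory and throughput
Vector<BaseFloat> priors;       // leave empty for 'chain' models, ...
// priors = am_nnet.Priors();   // ... or pass the stored priors for xent models
NnetBatchComputer computer(opts, nnet, priors);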

References NnetSimpleComputationOptions::CheckAndFixConfigs(), kaldi::nnet3::ComputeSimpleNnetContext(), NnetBatchComputerOptions::edge_minibatch_size, NnetBatchComputer::input_dim_, Nnet::InputDim(), NnetBatchComputer::ivector_dim_, KALDI_ASSERT, NnetBatchComputer::log_priors_, NnetBatchComputerOptions::minibatch_size, Nnet::Modulus(), NnetBatchComputer::nnet_, NnetBatchComputer::nnet_left_context_, NnetBatchComputer::nnet_right_context_, NnetBatchComputer::opts_, NnetBatchComputer::output_dim_, Nnet::OutputDim(), and NnetBatchComputerOptions::partial_minibatch_factor.

NnetBatchComputer::NnetBatchComputer(const NnetBatchComputerOptions &opts,
                                     const Nnet &nnet,
                                     const VectorBase<BaseFloat> &priors):
    opts_(opts),
    nnet_(nnet),
    compiler_(nnet_, opts.optimize_config),
    log_priors_(priors),
    num_full_minibatches_(0) {
  log_priors_.ApplyLog();
  opts_.CheckAndFixConfigs(nnet_.Modulus());
  KALDI_ASSERT(opts_.minibatch_size >= 1 && opts_.edge_minibatch_size >= 1 &&
               opts_.partial_minibatch_factor < 1.0 &&
               opts_.partial_minibatch_factor >= 0.0);

  ComputeSimpleNnetContext(nnet, &nnet_left_context_, &nnet_right_context_);
  input_dim_ = nnet.InputDim("input");
  ivector_dim_ = std::max<int32>(0, nnet.InputDim("ivector"));
  output_dim_ = nnet.OutputDim("output");
  KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0);
}

◆ ~NnetBatchComputer()

Definition at line 112 of file nnet-batch-compute.cc.

References KALDI_ASSERT, KALDI_ERR, NnetBatchComputer::mutex_, NnetBatchComputer::no_more_than_n_minibatches_full_, NnetBatchComputer::num_full_minibatches_, NnetBatchComputer::PrintMinibatchStats(), and NnetBatchComputer::tasks_.

NnetBatchComputer::~NnetBatchComputer() {
  PrintMinibatchStats();
  // the destructor shouldn't be called while the mutex is locked; if it is, it
  // likely means the program has already crashed, or it's a programming error.
  if (!mutex_.try_lock())
    KALDI_ERR << "Destructor called while object locked.";
  int32 num_pending_tasks = 0;
  for (auto iter = tasks_.begin(); iter != tasks_.end(); ++iter)
    num_pending_tasks += iter->second.tasks.size();
  if (num_pending_tasks > 0)
    KALDI_ERR << "Tasks are pending but object is being destroyed";
  for (auto iter = no_more_than_n_minibatches_full_.begin();
       iter != no_more_than_n_minibatches_full_.end(); ++iter) {
    std::condition_variable *cond = iter->second;
    // the next call will notify any threads that were waiting on this condition
    // variable -- there shouldn't be any, though, as it would be a programming
    // error, but better to wake them up so we can see any messages they print.
    cond->notify_all();
    delete cond;
  }
  KALDI_ASSERT(num_full_minibatches_ == 0);  // failure would be a coding error.
}

Member Function Documentation

◆ AcceptTask()

void AcceptTask ( NnetInferenceTask *  task,
int32  max_minibatches_full = -1 
)

Accepts a task, meaning the task will be queued.

(Note: the pointer is still owned by the caller.) If max_minibatches_full >= 0, then the calling thread will block until no more than that many full minibatches are waiting to be computed. This is a mechanism to prevent too many requests from piling up in memory.

Definition at line 568 of file nnet-batch-compute.cc.
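
A hedged illustration of the blocking behaviour, reusing the 'computer' and 'tasks' names from the sketch in the detailed description above:

// Submit tasks, but never let more than 2 full minibatches pile up in memory;
// AcceptTask() blocks this producer thread until the computing thread catches up.
for (size_t i = 0; i < tasks.size(); i++)
  computer.AcceptTask(&tasks[i], /*max_minibatches_full=*/2);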

References NnetBatchComputer::GetMinibatchSize(), NnetBatchComputer::mutex_, NnetBatchComputer::no_more_than_n_minibatches_full_, NnetBatchComputer::num_full_minibatches_, NnetBatchComputer::ComputationGroupInfo::tasks, and NnetBatchComputer::tasks_.

Referenced by NnetBatchInference::AcceptInput(), and NnetBatchDecoder::Decode().

void NnetBatchComputer::AcceptTask(NnetInferenceTask *task,
                                   int32 max_minibatches_full) {
  std::unique_lock<std::mutex> lock(mutex_);

  if (max_minibatches_full > 0 && num_full_minibatches_ > max_minibatches_full) {
    std::unordered_map<int32, std::condition_variable*>::iterator
        iter = no_more_than_n_minibatches_full_.find(max_minibatches_full);
    std::condition_variable *cond;
    if (iter != no_more_than_n_minibatches_full_.end()) {
      cond = iter->second;
    } else {
      cond = new std::condition_variable();
      no_more_than_n_minibatches_full_[max_minibatches_full] = cond;
    }
    while (num_full_minibatches_ > max_minibatches_full)
      cond->wait(lock);
  }
  ComputationGroupKey key(*task);
  ComputationGroupInfo &info = tasks_[key];
  info.tasks.push_back(task);
  int32 minibatch_size = GetMinibatchSize(info);
  if (static_cast<int32>(info.tasks.size()) % minibatch_size == 0)
    num_full_minibatches_++;
}

◆ CheckAndFixConfigs()

void CheckAndFixConfigs ( )
private

◆ Compute()

bool Compute ( bool  allow_partial_minibatch)

Does some kind of computation, choosing the highest-priority thing to compute.

It returns true if it did some kind of computation, and false otherwise. This function locks the class, but not for the entire time it's being called: only at the beginning and at the end.

Parameters
[in]  allow_partial_minibatch    If false, then this will only do the computation if a full minibatch is ready; if true, it is allowed to do computation on partial (not-full) minibatches.

Definition at line 593 of file nnet-batch-compute.cc.
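
A sketch of how a background compute thread might drive this function (illustrative only; 'input_finished()' is an assumed caller-side predicate, and the real wrappers NnetBatchInference and NnetBatchDecoder handle termination more carefully):

while (true) {
  // Prefer full minibatches while input is still arriving; once the input is
  // exhausted, allow partial minibatches so the remaining tail gets flushed.
  bool allow_partial = input_finished();
  if (!computer.Compute(allow_partial)) {
    if (allow_partial)
      break;        // input is done and nothing is left to compute
    Sleep(0.001);   // nothing ready yet; avoid busy-waiting
  }
}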

References NnetComputer::AcceptInput(), NnetSimpleComputationOptions::acoustic_scale, CuMatrixBase< Real >::AddVecToRows(), NnetBatchComputer::MinibatchSizeInfo::computation, NnetSimpleComputationOptions::compute_config, Timer::Elapsed(), NnetBatchComputer::FormatInputs(), NnetBatchComputer::FormatOutputs(), NnetBatchComputer::GetHighestPriorityComputation(), NnetComputer::GetOutputDestructive(), rnnlm::i, NnetBatchComputer::log_priors_, NnetBatchComputer::nnet_, NnetBatchComputer::MinibatchSizeInfo::num_done, CuMatrixBase< Real >::NumRows(), NnetBatchComputer::opts_, NnetComputer::Run(), CuMatrixBase< Real >::Scale(), NnetBatchComputer::MinibatchSizeInfo::seconds_taken, kaldi::SynchronizeGpu(), and NnetBatchComputer::MinibatchSizeInfo::tot_num_tasks.

Referenced by NnetBatchInference::Compute(), and NnetBatchDecoder::Compute().

bool NnetBatchComputer::Compute(bool allow_partial_minibatch) {
  int32 minibatch_size;
  std::vector<NnetInferenceTask*> tasks;
  MinibatchSizeInfo *minfo =
      GetHighestPriorityComputation(allow_partial_minibatch,
                                    &minibatch_size,
                                    &tasks);
  if (minfo == NULL)
    return false;

  Timer tim;
  Nnet *nnet_to_update = NULL;  // we're not doing any update
  NnetComputer computer(opts_.compute_config, *(minfo->computation),
                        nnet_, nnet_to_update);

  CuMatrix<BaseFloat> input;
  CuMatrix<BaseFloat> ivector;
  FormatInputs(minibatch_size, tasks, &input, &ivector);
  computer.AcceptInput("input", &input);
  if (ivector.NumRows() != 0)
    computer.AcceptInput("ivector", &ivector);
  computer.Run();
  CuMatrix<BaseFloat> output;
  computer.GetOutputDestructive("output", &output);
  if (log_priors_.Dim() != 0) {
    output.AddVecToRows(-1.0, log_priors_);
  }
  output.Scale(opts_.acoustic_scale);
  FormatOutputs(output, tasks);

  // Update the stats, for diagnostics.
  minfo->num_done++;
  minfo->tot_num_tasks += static_cast<int64>(tasks.size());
  minfo->seconds_taken += tim.Elapsed();

  SynchronizeGpu();

  for (size_t i = 0; i < tasks.size(); i++)
    tasks[i]->semaphore.Signal();

  return true;
}

◆ FormatInputs()

void FormatInputs ( int32  minibatch_size,
const std::vector< NnetInferenceTask *> &  tasks,
CuMatrix< BaseFloat > *  input,
CuMatrix< BaseFloat > *  ivector 
)
private

Formats the inputs to the computation and transfers them to the GPU.

Parameters
[in]   minibatch_size   The number of parallel sequences we're doing this computation for. This will be more than tasks.size() in some cases.
[in]   tasks            The tasks we're doing the computation for. The input comes from here.
[out]  input            The main feature input to the computation is put into here.
[out]  ivector          If we're using i-vectors, the i-vectors are put here.

Definition at line 346 of file nnet-batch-compute.cc.

References CuMatrixBase< Real >::CopyFromMat(), CuVectorBase< Real >::Data(), CuMatrixBase< Real >::Data(), kaldi::GetVerboseLevel(), KALDI_ASSERT, kaldi::kUndefined, rnnlm::n, CuMatrix< Real >::Resize(), CuMatrixBase< Real >::Row(), CuMatrixBase< Real >::RowRange(), and CuMatrixBase< Real >::Stride().

Referenced by NnetBatchComputer::Compute().

void NnetBatchComputer::FormatInputs(int32 minibatch_size,
                                     const std::vector<NnetInferenceTask*> &tasks,
                                     CuMatrix<BaseFloat> *input,
                                     CuMatrix<BaseFloat> *ivector) {
  int32 num_input_frames = tasks[0]->input.NumRows(),
      input_dim = tasks[0]->input.NumCols(),
      ivector_dim = tasks[0]->ivector.Dim(),
      num_tasks = tasks.size();
  KALDI_ASSERT(num_tasks > 0 && num_tasks <= minibatch_size);

  // destination matrix
  input->Resize(minibatch_size * num_input_frames, input_dim,
                kUndefined);

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {

    std::vector<const BaseFloat*> inputs(num_tasks);
    std::vector<BaseFloat*> outputs(num_tasks);
    std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
    std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);

    // compute matrix descriptions for each copy
    for (int32 n = 0; n < num_tasks; n++) {
      const CuMatrix<BaseFloat> &input_mat = tasks[n]->input;
      CuSubMatrix<BaseFloat> output_mat = input->RowRange(
          n * num_input_frames, num_input_frames);

      // create matrix batch description arrays
      num_rows[n] = num_input_frames;
      num_cols[n] = input_dim;
      outputs[n] = output_mat.Data();
      inputs[n] = input_mat.Data();
      ldo[n] = output_mat.Stride();
      ldi[n] = input_mat.Stride();
    }

    // execute batched copy
    cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0],
                           &ldi[0], &outputs[0], &ldo[0]);

  } else
#endif
  {
    for (int32 n = 0; n < num_tasks; n++) {
      CuSubMatrix<BaseFloat> input_part(*input,
                                        n * num_input_frames, num_input_frames,
                                        0, input_dim);
      input_part.CopyFromMat(tasks[n]->input);
    }
  }

  if (GetVerboseLevel() >= 2) {
    if (num_tasks < minibatch_size) {
      // The following will make things easier to debug if something fails, but
      // shouldn't be strictly necessary.
      input->RowRange(num_tasks * num_input_frames,
                      (minibatch_size - num_tasks) * num_input_frames).SetZero();
    }
  }

  if (ivector_dim != 0) {
    ivector->Resize(minibatch_size, ivector_dim, kUndefined);

#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {

      // using the batched matrix copy routine for this.  This isn't
      // extremely efficient but the kernel takes a minimal amount of
      // time so making a batched vector copy is not worth the effort.
      std::vector<const BaseFloat*> inputs(num_tasks);
      std::vector<BaseFloat*> outputs(num_tasks);
      std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
      std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);

      // compute source pointers for each input
      for (int32 n = 0; n < num_tasks; n++) {
        const CuVector<BaseFloat> &input_vec = tasks[n]->ivector;
        CuSubVector<BaseFloat> output_vec = ivector->Row(n);
        // create matrix batch description arrays
        num_rows[n] = 1;
        num_cols[n] = ivector_dim;
        outputs[n] = output_vec.Data();
        inputs[n] = input_vec.Data();
        ldo[n] = 1;
        ldi[n] = 1;
      }

      // execute batched copy
      cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0],
                             &ldi[0], &outputs[0], &ldo[0]);

    } else
#endif
    {
      for (int32 n = 0; n < num_tasks; n++) {
        ivector->Row(n).CopyFromVec(tasks[n]->ivector);
      }
    }

    if (GetVerboseLevel() >= 2) {
      if (num_tasks < minibatch_size) {
        // The following will make things easier to debug if something fails, but
        // shouldn't be strictly necessary.
        ivector->RowRange(num_tasks, minibatch_size - num_tasks).SetZero();
      }
    }
  }
}

◆ FormatOutputs()

void FormatOutputs ( const CuMatrix< BaseFloat > &  output,
const std::vector< NnetInferenceTask *> &  tasks 
)
private

Definition at line 459 of file nnet-batch-compute.cc.

References CuMatrixBase< Real >::Data(), KALDI_ASSERT, kaldi::kUndefined, rnnlm::n, NnetInferenceTask::num_initial_unused_output_frames, NnetInferenceTask::num_used_output_frames, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), NnetInferenceTask::output, NnetInferenceTask::output_cpu, NnetInferenceTask::output_to_cpu, Matrix< Real >::Resize(), MatrixBase< Real >::RowRange(), CuMatrixBase< Real >::RowRange(), CuMatrixBase< Real >::Stride(), and kaldi::SynchronizeGpu().

Referenced by NnetBatchComputer::Compute().

void NnetBatchComputer::FormatOutputs(const CuMatrix<BaseFloat> &output,
                                      const std::vector<NnetInferenceTask*> &tasks) {
  KALDI_ASSERT(!tasks.empty());
  int32 num_output_frames = tasks[0]->num_output_frames,
      output_dim = output.NumCols(),
      num_tasks = tasks.size();
  bool did_output_to_gpu = false;

  // We don't bother zeroing frames of the output that are unused, but you could
  // un-comment the commented lines of code below to do so and add equivalent
  // calls to the cuda version.

#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {

    std::vector<const BaseFloat*> inputs(num_tasks);
    std::vector<BaseFloat*> outputs(num_tasks);
    std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
    std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);

    int b = 0;  // batch counter
    for (int32 n = 0; n < num_tasks; n++) {
      NnetInferenceTask *task = tasks[n];

      int32 left_unused = task->num_initial_unused_output_frames,
          used = task->num_used_output_frames;
      // int32 right_unused = num_output_frames - used - left_unused;

      // TODO: do we really expect different tasks to output to CPU or GPU?
      // This adds a bit of code complexity.  Perhaps output_to_cpu should
      // be a property of the batch computer and not of the tasks.
      if (task->output_to_cpu) {
        task->output_cpu.Resize(num_output_frames, output_dim,
                                kUndefined);
        // if (left_unused > 0)
        //   task->output_cpu.RowRange(0, left_unused).SetZero();
        task->output_cpu.RowRange(left_unused, used).CopyFromMat(
            output.RowRange(n * num_output_frames + left_unused, used));
        // if (right_unused > 0)
        //   task->output_cpu.RowRange(
        //       0, left_unused + used, right_unused).SetZero();

      } else {
        did_output_to_gpu = true;
        task->output.Resize(num_output_frames, output_dim,
                            kUndefined);

        CuSubMatrix<BaseFloat> output_mat = task->output.RowRange(
            left_unused, used);
        const CuSubMatrix<BaseFloat> input_mat = output.RowRange(
            n * num_output_frames + left_unused, used);

        // create matrix batch description arrays
        num_rows[b] = output_mat.NumRows();
        num_cols[b] = output_mat.NumCols();
        outputs[b] = output_mat.Data();
        inputs[b] = input_mat.Data();
        ldo[b] = output_mat.Stride();
        ldi[b] = input_mat.Stride();
        b++;  // increase batch count
      }
    }

    // execute batched copy
    cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
                           &outputs[0], &ldo[0]);

  } else
#endif
  {
    // TODO: I don't think all of these paths are actually possible; we should
    // simplify this.  Is it possible to output to GPU with HAVE_CUDA == 0 or
    // when the device is disabled?
    for (int32 n = 0; n < num_tasks; n++) {
      NnetInferenceTask *task = tasks[n];

      int32 left_unused = task->num_initial_unused_output_frames,
          used = task->num_used_output_frames;
      // int32 right_unused = num_output_frames - used - left_unused;

      if (task->output_to_cpu) {
        task->output_cpu.Resize(num_output_frames, output_dim,
                                kUndefined);
        // if (left_unused > 0)
        //   task->output_cpu.RowRange(0, left_unused).SetZero();
        task->output_cpu.RowRange(left_unused, used).CopyFromMat(
            output.RowRange(n * num_output_frames + left_unused, used));
        // if (right_unused > 0)
        //   task->output_cpu.RowRange(0, left_unused + used, right_unused).SetZero();
      } else {
        did_output_to_gpu = true;
        task->output.Resize(num_output_frames, output_dim,
                            kUndefined);
        // if (left_unused > 0)
        //   task->output.RowRange(0, left_unused).SetZero();
        task->output.RowRange(left_unused, used).CopyFromMat(
            output.RowRange(n * num_output_frames + left_unused, used));
        // if (right_unused > 0)
        //   task->output.RowRange(0, left_unused + used, right_unused).SetZero();
      }
    }
  }
  // The output of this function will likely be consumed by another thread.
  // The following call will make sure the relevant kernels complete before
  // any kernels from the other thread use the output.
  if (did_output_to_gpu)
    SynchronizeGpu();
}

◆ GetActualMinibatchSize()

int32 GetActualMinibatchSize ( const ComputationGroupInfo &  info) const
private

Definition at line 242 of file nnet-batch-compute.cc.

References NnetBatchComputer::GetMinibatchSize(), KALDI_ASSERT, NnetBatchComputer::opts_, NnetBatchComputerOptions::partial_minibatch_factor, and NnetBatchComputer::ComputationGroupInfo::tasks.

Referenced by NnetBatchComputer::GetHighestPriorityComputation().

int32 NnetBatchComputer::GetActualMinibatchSize(
    const ComputationGroupInfo &info) const {
  KALDI_ASSERT(!info.tasks.empty());
  int32 num_tasks = info.tasks.size(),
      this_minibatch_size = GetMinibatchSize(info);
  KALDI_ASSERT(num_tasks > 0);
  while (num_tasks <
         int32(opts_.partial_minibatch_factor * this_minibatch_size))
    this_minibatch_size *= opts_.partial_minibatch_factor;
  return int32(this_minibatch_size);
}

◆ GetComputation()

std::shared_ptr< const NnetComputation > GetComputation ( const ComputationGroupInfo &  info,
int32  minibatch_size 
)
private

Definition at line 255 of file nnet-batch-compute.cc.

References CachingOptimizingCompiler::Compile(), NnetBatchComputer::compiler_, NnetBatchComputer::GetComputationRequest(), KALDI_ASSERT, and NnetBatchComputer::ComputationGroupInfo::tasks.

Referenced by NnetBatchComputer::GetHighestPriorityComputation().

std::shared_ptr<const NnetComputation> NnetBatchComputer::GetComputation(
    const ComputationGroupInfo &info, int32 minibatch_size) {
  KALDI_ASSERT(!info.tasks.empty());
  // note: all the tasks will have the same structure, in the respects that
  // would affect the computation.
  NnetInferenceTask *example_task = info.tasks[0];
  ComputationRequest request;
  GetComputationRequest(*example_task, minibatch_size, &request);
  return compiler_.Compile(request);
}

◆ GetComputationRequest()

void GetComputationRequest ( const NnetInferenceTask &  task,
int32  minibatch_size,
ComputationRequest *  request 
)
static private

Definition at line 312 of file nnet-batch-compute.cc.

References NnetInferenceTask::first_input_t, NnetInferenceTask::input, ComputationRequest::inputs, NnetInferenceTask::ivector, rnnlm::n, ComputationRequest::need_model_derivative, NnetInferenceTask::num_output_frames, NnetInferenceTask::output_t_stride, ComputationRequest::outputs, and ComputationRequest::store_component_stats.

Referenced by NnetBatchComputer::GetComputation().

void NnetBatchComputer::GetComputationRequest(
    const NnetInferenceTask &task, int32 minibatch_size,
    ComputationRequest *request) {
  request->need_model_derivative = false;
  request->store_component_stats = false;
  request->inputs.reserve(2);

  int32 num_input_frames = task.input.NumRows(),
      first_input_t = task.first_input_t,
      num_output_frames = task.num_output_frames,
      output_t_stride = task.output_t_stride;
  bool has_ivector = (task.ivector.Dim() != 0);

  std::vector<Index> input_indexes, ivector_indexes, output_indexes;
  input_indexes.reserve(minibatch_size * num_input_frames);
  output_indexes.reserve(minibatch_size * num_output_frames);
  if (has_ivector)
    ivector_indexes.reserve(minibatch_size);

  for (int32 n = 0; n < minibatch_size; n++) {
    for (int32 t = first_input_t; t < first_input_t + num_input_frames; t++)
      input_indexes.push_back(Index(n, t, 0));
    if (has_ivector)
      ivector_indexes.push_back(Index(n, 0, 0));
    for (int32 t = 0; t < num_output_frames; t++)
      output_indexes.push_back(Index(n, t * output_t_stride, 0));
  }
  request->inputs.push_back(IoSpecification("input", input_indexes));
  if (has_ivector)
    request->inputs.push_back(IoSpecification("ivector", ivector_indexes));
  request->outputs.push_back(IoSpecification("output", output_indexes));
}

◆ GetHighestPriorityComputation()

NnetBatchComputer::MinibatchSizeInfo * GetHighestPriorityComputation ( bool  allow_partial_minibatch,
int32 *  minibatch_size,
std::vector< NnetInferenceTask *> *  tasks 
)
private

This function finds and returns the computation corresponding to the highest-priority group of tasks.

Parameters
[in]   allow_partial_minibatch   If this is true, then this function may return a computation corresponding to a partial minibatch, i.e. the minibatch size in the computation may be less than the minibatch size in the options class, and/or the number of tasks may not be as many as the minibatch size in the computation.
[out]  minibatch_size            If this function returns non-NULL, then this will be set to the minibatch size that the returned computation expects. tasks->size() may be less than this, in cases where the minibatch was not 'full'.
[out]  tasks                     The tasks which we'll be doing the computation for in this minibatch are put here (and removed from tasks_), in cases where this function returns non-NULL.
Returns
This function returns a pointer to the appropriate 'MinibatchSizeInfo' object corresponding to the computation that we'll be doing for this minibatch, or NULL if there is nothing to compute.

Definition at line 136 of file nnet-batch-compute.cc.

References NnetBatchComputer::MinibatchSizeInfo::computation, NnetBatchComputer::GetActualMinibatchSize(), NnetBatchComputer::GetComputation(), NnetBatchComputer::GetHighestPriorityTasks(), NnetBatchComputer::GetPriority(), NnetBatchComputer::ComputationGroupInfo::minibatch_info, NnetBatchComputer::mutex_, and NnetBatchComputer::tasks_.

Referenced by NnetBatchComputer::Compute().

NnetBatchComputer::MinibatchSizeInfo*
NnetBatchComputer::GetHighestPriorityComputation(
    bool allow_partial_minibatch,
    int32 *minibatch_size_out,
    std::vector<NnetInferenceTask*> *tasks) {
  tasks->clear();
  std::unique_lock<std::mutex> lock(mutex_);
  MapType::iterator iter = tasks_.begin(), end = tasks_.end(),
      best_iter = tasks_.end();
  double highest_priority = -std::numeric_limits<double>::infinity();

  for (; iter != end; ++iter) {
    ComputationGroupInfo &info = iter->second;
    double this_priority = GetPriority(allow_partial_minibatch, info);
    if (this_priority > highest_priority) {
      highest_priority = this_priority;
      best_iter = iter;
    }
  }
  if (best_iter == tasks_.end()) {
    // either allow_partial_minibatch == false and there were no full
    // minibatches, or there were no pending tasks at all.
    return NULL;
  }
  ComputationGroupInfo &info = best_iter->second;
  int32 actual_minibatch_size = GetActualMinibatchSize(info);
  *minibatch_size_out = actual_minibatch_size;
  MinibatchSizeInfo *minfo = &(info.minibatch_info[actual_minibatch_size]);
  if (minfo->computation == NULL)
    minfo->computation = GetComputation(info, actual_minibatch_size);
  GetHighestPriorityTasks(actual_minibatch_size, &info, tasks);
  return minfo;
}

◆ GetHighestPriorityTasks()

void GetHighestPriorityTasks ( int32  num_tasks,
ComputationGroupInfo *  info,
std::vector< NnetInferenceTask *> *  tasks 
)
private

Definition at line 170 of file nnet-batch-compute.cc.

References NnetBatchComputer::GetMinibatchSize(), rnnlm::i, KALDI_ASSERT, NnetBatchComputer::no_more_than_n_minibatches_full_, NnetBatchComputer::num_full_minibatches_, and NnetBatchComputer::ComputationGroupInfo::tasks.

Referenced by NnetBatchComputer::GetHighestPriorityComputation().

void NnetBatchComputer::GetHighestPriorityTasks(
    int32 num_tasks_needed,
    ComputationGroupInfo *info,
    std::vector<NnetInferenceTask*> *tasks) {
  int32 num_tasks_present = info->tasks.size(),
      minibatch_size = GetMinibatchSize(*info);
  KALDI_ASSERT(tasks->empty());
  if (num_tasks_needed >= num_tasks_present) {
    tasks->swap(info->tasks);
  } else {
    int32 num_tasks_not_needed = num_tasks_present - num_tasks_needed;
    // We don't sort the tasks with a comparator that dereferences the pointers,
    // because the priorities can change asynchronously, and we're concerned that
    // something weird might happen in the sorting if the things it's comparing
    // are changing.
    std::vector<std::pair<double, NnetInferenceTask*> > pairs(num_tasks_present);
    for (int32 i = 0; i < num_tasks_present; i++) {
      pairs[i].first = info->tasks[i]->priority;
      pairs[i].second = info->tasks[i];
    }
    std::nth_element(pairs.begin(), pairs.begin() + num_tasks_not_needed,
                     pairs.end());

    // The lowest-priority 'num_tasks_not_needed' stay in the 'info' struct.
    info->tasks.clear();
    for (int32 i = 0; i < num_tasks_not_needed; i++)
      info->tasks.push_back(pairs[i].second);
    // The highest-priority 'num_tasks_needed' tasks go to the output 'tasks'
    // array.
    for (int32 i = num_tasks_not_needed; i < num_tasks_present; i++)
      tasks->push_back(pairs[i].second);
    // The following assertion checks that the is_edge and is_irregular values
    // are the same for the entire minibatch, which they should always be.
    KALDI_ASSERT(GetMinibatchSize(*info) == minibatch_size);
  }

  {
    // This block updates num_full_minibatches_ and notifies threads waiting on
    // any related condition variable.
    int32 new_num_tasks_present = info->tasks.size(),
        full_minibatch_reduction =
        (num_tasks_present / minibatch_size) -
        (new_num_tasks_present / minibatch_size);
    for (int32 i = 0; i < full_minibatch_reduction; i++) {
      num_full_minibatches_--;
      KALDI_ASSERT(num_full_minibatches_ >= 0);
      std::unordered_map<int32, std::condition_variable*>::const_iterator
          iter = no_more_than_n_minibatches_full_.find(num_full_minibatches_);
      if (iter != no_more_than_n_minibatches_full_.end()) {
        std::condition_variable *cond = iter->second;
        cond->notify_all();
      }
    }
  }
}

◆ GetMinibatchSize()

int32 GetMinibatchSize ( const ComputationGroupInfo &  info) const
inline private

Definition at line 227 of file nnet-batch-compute.cc.

References NnetBatchComputerOptions::edge_minibatch_size, NnetInferenceTask::is_edge, NnetInferenceTask::is_irregular, NnetBatchComputerOptions::minibatch_size, NnetBatchComputer::opts_, and NnetBatchComputer::ComputationGroupInfo::tasks.

Referenced by NnetBatchComputer::AcceptTask(), NnetBatchComputer::GetActualMinibatchSize(), NnetBatchComputer::GetHighestPriorityTasks(), and NnetBatchComputer::GetPriority().

int32 NnetBatchComputer::GetMinibatchSize(
    const ComputationGroupInfo &info) const {
  if (info.tasks.empty()) {
    return opts_.minibatch_size;  // actually it shouldn't matter what we return
                                  // in this case.
  }
  const NnetInferenceTask &task = *(info.tasks[0]);
  if (task.is_irregular)
    return 1;
  else if (task.is_edge)
    return opts_.edge_minibatch_size;
  else
    return opts_.minibatch_size;
}

◆ GetOptions()

const NnetBatchComputerOptions& GetOptions ( )
inline

◆ GetPriority()

double GetPriority ( bool  allow_partial_minibatch,
const ComputationGroupInfo &  info 
) const
inline private

Definition at line 268 of file nnet-batch-compute.cc.

References NnetBatchComputer::GetMinibatchSize(), rnnlm::i, and NnetBatchComputer::ComputationGroupInfo::tasks.

Referenced by NnetBatchComputer::GetHighestPriorityComputation().

double NnetBatchComputer::GetPriority(bool allow_partial_minibatch,
                                      const ComputationGroupInfo &info) const {
  if (info.tasks.empty())
    return -std::numeric_limits<double>::infinity();
  int32 this_minibatch_size = GetMinibatchSize(info);
  int32 num_tasks = info.tasks.size();

  if (!allow_partial_minibatch && num_tasks < this_minibatch_size)
    return -std::numeric_limits<double>::infinity();

  // penalty_for_not_full will be negative if the minibatch is not full, up to a
  // maximum of 10.  The 10 is a heuristic; it could be changed.
  // Note: the penalty is effectively infinity if allow_partial_minibatch == false;
  // see the 'return' above.
  double proportion_full = std::min<int32>(num_tasks, this_minibatch_size) /
      double(this_minibatch_size),
      penalty_for_not_full = 10.0 * (proportion_full - 1.0),
      task_priority_sum = 0.0;

  if (num_tasks > this_minibatch_size) {
    // Get the average of the priorities of the highest-priority tasks (no more
    // than 'minibatch_size' of them).
    std::vector<double> priorities;
    priorities.resize(num_tasks);
    for (int32 i = 0; i < num_tasks; i++)
      priorities[i] = info.tasks[i]->priority;
    // sort from greatest to least.
    std::nth_element(priorities.begin(),
                     priorities.begin() + this_minibatch_size,
                     priorities.end(),
                     std::greater<double>());
    for (int32 i = 0; i < this_minibatch_size; i++)
      task_priority_sum += priorities[i];
    return penalty_for_not_full + task_priority_sum / this_minibatch_size;
  } else {
    for (int32 i = 0; i < num_tasks; i++)
      task_priority_sum += info.tasks[i]->priority;
    return penalty_for_not_full + task_priority_sum / num_tasks;
  }
}

◆ KALDI_DISALLOW_COPY_AND_ASSIGN()

KALDI_DISALLOW_COPY_AND_ASSIGN ( NnetBatchComputer  )
private

◆ NumFullPendingMinibatches()

int32 NumFullPendingMinibatches ( ) const
inline

Returns the number of full minibatches waiting to be computed.

Definition at line 233 of file nnet-batch-compute.h.


◆ PrintMinibatchStats()

void PrintMinibatchStats ( )
private

Definition at line 53 of file nnet-batch-compute.cc.

References rnnlm::i, KALDI_LOG, NnetBatchComputer::MinibatchSizeInfo::num_done, NnetBatchComputer::ComputationGroupKey::num_input_frames, NnetBatchComputer::ComputationGroupKey::num_output_frames, operator<(), NnetBatchComputer::MinibatchSizeInfo::seconds_taken, NnetBatchComputer::tasks_, and NnetBatchComputer::MinibatchSizeInfo::tot_num_tasks.

Referenced by NnetBatchComputer::~NnetBatchComputer().

void NnetBatchComputer::PrintMinibatchStats() {
  int32 max_stats_to_print = 10;
  int64 tot_tasks = 0, tot_minibatches = 0;
  double tot_time = 0.0;
  std::ostringstream os;
  struct MinibatchStats {
    int32 num_frames_out;
    int32 num_frames_in;
    int32 minibatch_size;
    int32 num_done;
    int32 percent_full;
    BaseFloat seconds_taken;

    bool operator < (const MinibatchStats &other) const {
      return seconds_taken > other.seconds_taken;  // sort from most to least time.
    }
  };
  std::vector<MinibatchStats> all_stats;
  os << "Minibatch stats: seconds-taken,frames-in:frames-out*minibatch-size=num-done(percent-full%) ";

  for (MapType::const_iterator iter = tasks_.begin();
       iter != tasks_.end(); ++iter) {
    for (std::map<int32, MinibatchSizeInfo>::const_iterator
             miter = iter->second.minibatch_info.begin();
         miter != iter->second.minibatch_info.end(); ++miter) {
      const ComputationGroupKey &key = iter->first;
      const MinibatchSizeInfo &minfo = miter->second;
      MinibatchStats stats;
      stats.num_frames_in = key.num_input_frames;
      stats.num_frames_out = key.num_output_frames;
      stats.minibatch_size = miter->first;
      stats.num_done = minfo.num_done;
      stats.seconds_taken = minfo.seconds_taken;

      tot_tasks += minfo.tot_num_tasks;
      tot_minibatches += minfo.num_done;
      tot_time += minfo.seconds_taken;
      stats.percent_full = int32(minfo.tot_num_tasks * 100.0 /
                                 (stats.minibatch_size * stats.num_done));
      all_stats.push_back(stats);
    }
  }

  std::sort(all_stats.begin(), all_stats.end());
  os << std::fixed << std::setprecision(2);
  int32 num_stats = all_stats.size();
  for (int32 i = 0; i < std::min<int32>(num_stats, max_stats_to_print); i++) {
    MinibatchStats &stats = all_stats[i];
    os << stats.seconds_taken << ',' << stats.num_frames_in << ':'
       << stats.num_frames_out << '*' << stats.minibatch_size
       << '=' << stats.num_done << '(' << stats.percent_full << "%) ";
  }
  if (num_stats > max_stats_to_print)
    os << "...";
  KALDI_LOG << os.str();
  KALDI_LOG << "Did " << tot_tasks << " tasks in " << tot_minibatches
            << " minibatches, taking " << tot_time << " seconds.";
}

◆ SplitUtteranceIntoTasks() [1/2]

void SplitUtteranceIntoTasks ( bool  output_to_cpu,
const Matrix< BaseFloat > &  input,
const Vector< BaseFloat > *  ivector,
const Matrix< BaseFloat > *  online_ivectors,
int32  online_ivector_period,
std::vector< NnetInferenceTask > *  tasks 
)

Split a single utterance into a list of separate tasks which can then be given to this class by AcceptTask().

Parameters
[in]   output_to_cpu           Will become the 'output_to_cpu' member of the output tasks; this controls whether the computation code should transfer the outputs to CPU (which is to save GPU memory).
[in]   ivector                 If non-NULL, an i-vector for the whole utterance is expected to be supplied here (and online_ivectors should be NULL). This is relevant if you estimate i-vectors per speaker instead of online.
[in]   online_ivectors         Matrix of i-vectors, one every 'online_ivector_period' frames.
[in]   online_ivector_period   Affects the interpretation of 'online_ivectors'.
[out]  tasks                   The tasks created will be output to here. The priorities will be set to zero; setting them to a meaningful value is up to the caller.

Definition at line 843 of file nnet-batch-compute.cc.
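
An illustrative sketch (assumed names: 'computer', a feature matrix 'feats', and a per-speaker i-vector 'spk_ivector' obtained elsewhere):

std::vector<NnetInferenceTask> tasks;
computer.SplitUtteranceIntoTasks(/*output_to_cpu=*/true, feats, &spk_ivector,
                                 /*online_ivectors=*/NULL,
                                 /*online_ivector_period=*/0, &tasks);
for (size_t i = 0; i < tasks.size(); i++) {
  tasks[i].priority = -static_cast<double>(i);  // e.g. favor earlier chunks
  computer.AcceptTask(&tasks[i]);
}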

References CuMatrixBase< Real >::CopyFromMat(), CuVectorBase< Real >::CopyFromVec(), VectorBase< Real >::Dim(), kaldi::kUndefined, MatrixBase< Real >::NumCols(), MatrixBase< Real >::NumRows(), CuVector< Real >::Resize(), and CuMatrix< Real >::Resize().

Referenced by NnetBatchInference::AcceptInput(), and NnetBatchDecoder::Decode().

void NnetBatchComputer::SplitUtteranceIntoTasks(
    bool output_to_cpu,
    const Matrix<BaseFloat> &input,
    const Vector<BaseFloat> *h_ivector,
    const Matrix<BaseFloat> *h_online_ivectors,
    int32 online_ivector_period,
    std::vector<NnetInferenceTask> *tasks) {

  // The other overload expects its inputs to be in device memory, so create
  // temporary device arrays and copy the host inputs into them.
  CuMatrix<BaseFloat> cu_input(input);
  CuVector<BaseFloat> cu_ivector, *ivector = NULL;
  CuMatrix<BaseFloat> cu_online_ivectors, *online_ivectors = NULL;

  if (h_ivector != NULL) {
    cu_ivector.Resize(h_ivector->Dim(), kUndefined);
    cu_ivector.CopyFromVec(*h_ivector);
    ivector = &cu_ivector;
  }
  if (h_online_ivectors != NULL) {
    cu_online_ivectors.Resize(h_online_ivectors->NumRows(),
                              h_online_ivectors->NumCols(), kUndefined);
    cu_online_ivectors.CopyFromMat(*h_online_ivectors);
    online_ivectors = &cu_online_ivectors;
  }

  SplitUtteranceIntoTasks(output_to_cpu, cu_input, ivector,
                          online_ivectors, online_ivector_period, tasks);
}

◆ SplitUtteranceIntoTasks() [2/2]

void SplitUtteranceIntoTasks ( bool  output_to_cpu,
const CuMatrix< BaseFloat > &  input,
const CuVector< BaseFloat > *  ivector,
const CuMatrix< BaseFloat > *  online_ivectors,
int32  online_ivector_period,
std::vector< NnetInferenceTask > *  tasks 
)

Definition at line 873 of file nnet-batch-compute.cc.

References kaldi::nnet3::utterance_splitting::AddOnlineIvectorsToTasks(), CuVectorBase< Real >::Data(), CuVectorBase< Real >::Dim(), NnetSimpleComputationOptions::frame_subsampling_factor, NnetSimpleComputationOptions::frames_per_chunk, kaldi::nnet3::utterance_splitting::GetOutputFrameInfoForTasks(), rnnlm::i, NnetBatchComputer::input_dim_, NnetBatchComputer::ivector_dim_, KALDI_ASSERT, KALDI_ERR, kaldi::kUndefined, NnetBatchComputer::nnet_left_context_, NnetBatchComputer::nnet_right_context_, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), NnetBatchComputer::opts_, CuVector< Real >::Resize(), and kaldi::nnet3::utterance_splitting::SplitInputToTasks().

void NnetBatchComputer::SplitUtteranceIntoTasks(
    bool output_to_cpu,
    const CuMatrix<BaseFloat> &input,
    const CuVector<BaseFloat> *ivector,
    const CuMatrix<BaseFloat> *online_ivectors,
    int32 online_ivector_period,
    std::vector<NnetInferenceTask> *tasks) {
  using namespace utterance_splitting;

  { // This block does some checking.
    if (input.NumCols() != input_dim_) {
      KALDI_ERR << "Input features did not have expected dimension: expected "
                << input_dim_ << ", got " << input.NumCols();
    }
    int32 ivector_dim = (ivector != NULL ? ivector->Dim() :
                         (online_ivectors != NULL ?
                          online_ivectors->NumCols() : 0));
    if (ivector_dim_ != 0 && ivector_dim == 0)
      KALDI_ERR << "Model expects i-vectors but none were supplied";
    else if (ivector_dim_ == 0 && ivector_dim != 0)
      KALDI_ERR << "You supplied i-vectors but model does not expect them.";
    else if (ivector_dim != ivector_dim_)
      KALDI_ERR << "I-vector dimensions mismatch: model expects "
                << ivector_dim_ << ", you supplied " << ivector_dim;
  }

  int32 num_input_frames = input.NumRows(),
      f = opts_.frame_subsampling_factor,
      num_subsampled_frames = (num_input_frames + f - 1) / f,
      num_subsampled_frames_per_chunk = opts_.frames_per_chunk / f;

  GetOutputFrameInfoForTasks(opts_, num_subsampled_frames,
                             num_subsampled_frames_per_chunk,
                             tasks);

  SplitInputToTasks(opts_, nnet_left_context_, nnet_right_context_,
                    input, tasks);

  if (ivector != NULL) {
    KALDI_ASSERT(online_ivectors == NULL);

#if HAVE_CUDA == 1
    if (CuDevice::Instantiate().Enabled()) {
      int32_t num_tasks = tasks->size();

      std::vector<const BaseFloat*> inputs(num_tasks);
      std::vector<BaseFloat*> outputs(num_tasks);
      std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
      std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);

      int b = 0;  // batch counter

      for (size_t i = 0; i < tasks->size(); i++) {
        CuVector<BaseFloat> &output_vec = (*tasks)[i].ivector;
        const CuVector<BaseFloat> &input_vec = *ivector;

        output_vec.Resize(input_vec.Dim(), kUndefined);

        // create matrix batch description arrays
        num_rows[b] = 1;
        num_cols[b] = output_vec.Dim();
        outputs[b] = output_vec.Data();
        inputs[b] = input_vec.Data();
        ldo[b] = 0;
        ldi[b] = 0;
        b++;  // increase batch count
      }

      // execute batched copy
      cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
                             &outputs[0], &ldo[0]);
    } else
#endif
    {
      for (size_t i = 0; i < tasks->size(); i++)
        (*tasks)[i].ivector = *ivector;
    }

  } else if (online_ivectors != NULL) {
    AddOnlineIvectorsToTasks(opts_, *online_ivectors,
                             online_ivector_period, tasks);
  }

  for (size_t i = 0; i < tasks->size(); i++) {
    (*tasks)[i].output_to_cpu = output_to_cpu;
    // The priority will be set by the user; this just avoids undefined
    // behavior.
    (*tasks)[i].priority = 0.0;
  }
}

Member Data Documentation

◆ compiler_

CachingOptimizingCompiler compiler_
private

Definition at line 462 of file nnet-batch-compute.h.

Referenced by NnetBatchComputer::GetComputation().

◆ input_dim_

int32 input_dim_
private

◆ ivector_dim_

int32 ivector_dim_
private

◆ log_priors_

CuVector<BaseFloat> log_priors_
private

◆ mutex_

std::mutex mutex_
private

◆ nnet_

const Nnet& nnet_
private

◆ nnet_left_context_

int32 nnet_left_context_
private

◆ nnet_right_context_

int32 nnet_right_context_
private

◆ no_more_than_n_minibatches_full_

std::unordered_map<int32, std::condition_variable*> no_more_than_n_minibatches_full_
private

◆ num_full_minibatches_

int32 num_full_minibatches_
private

◆ opts_

NnetBatchComputerOptions opts_
private

◆ output_dim_

int32 output_dim_
private

Definition at line 492 of file nnet-batch-compute.h.

Referenced by NnetBatchComputer::NnetBatchComputer().

◆ tasks_

MapType tasks_
private


The documentation for this class was generated from the following files:
nnet-batch-compute.h
nnet-batch-compute.cc