nnet-batch-compute.cc
1 // nnet3/nnet-batch-compute.cc
2 
3 // Copyright 2012-2018 Johns Hopkins University (author: Daniel Povey)
4 // 2018 Hang Lyu
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABLITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include <algorithm>
22 #include <iomanip>
23 #include "nnet3/nnet-batch-compute.h"
24 #include "nnet3/nnet-utils.h"
26 
27 namespace kaldi {
28 namespace nnet3 {
29 
30 
31 NnetBatchComputer::NnetBatchComputer(
32  const NnetBatchComputerOptions &opts,
33  const Nnet &nnet,
34  const VectorBase<BaseFloat> &priors):
35  opts_(opts),
36  nnet_(nnet),
37  compiler_(nnet_, opts.optimize_config),
38  log_priors_(priors),
39  num_full_minibatches_(0) {
40  log_priors_.ApplyLog();
43  ComputeSimpleNnetContext(nnet, &nnet_left_context_,
44  &nnet_right_context_);
45 
47  input_dim_ = nnet.InputDim("input");
48  ivector_dim_ = std::max<int32>(0, nnet.InputDim("ivector"));
49  output_dim_ = nnet.OutputDim("output");
50  KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0);
51 }
52 
53 void NnetBatchComputer::PrintMinibatchStats() {
54  int32 max_stats_to_print = 10;
55  int64 tot_tasks = 0, tot_minibatches = 0;
56  double tot_time = 0.0;
57  std::ostringstream os;
58  struct MinibatchStats {
59  int32 num_frames_out;
60  int32 num_frames_in;
61  int32 minibatch_size;
62  int32 num_done;
63  int32 percent_full;
64  BaseFloat seconds_taken;
65 
66  bool operator < (const MinibatchStats &other) const {
67  return seconds_taken > other.seconds_taken; // sort from most to least time.
68  }
69  };
70  std::vector<MinibatchStats> all_stats;
71  os << "Minibatch stats: seconds-taken,frames-in:frames-out*minibatch-size=num-done(percent-full%) ";
72 
73  for (MapType::const_iterator iter = tasks_.begin();
74  iter != tasks_.end(); ++iter) {
75  for (std::map<int32, MinibatchSizeInfo>::const_iterator
76  miter = iter->second.minibatch_info.begin();
77  miter != iter->second.minibatch_info.end(); ++miter) {
78  const ComputationGroupKey &key = iter->first;
79  const MinibatchSizeInfo &minfo = miter->second;
80  MinibatchStats stats;
81  stats.num_frames_in = key.num_input_frames;
82  stats.num_frames_out = key.num_output_frames;
83  stats.minibatch_size = miter->first;
84  stats.num_done = minfo.num_done;
85  stats.seconds_taken = minfo.seconds_taken;
86 
87  tot_tasks += minfo.tot_num_tasks;
88  tot_minibatches += minfo.num_done;
89  tot_time += minfo.seconds_taken;
90  stats.percent_full = int32(minfo.tot_num_tasks * 100.0 /
91  (stats.minibatch_size * stats.num_done));
92  all_stats.push_back(stats);
93  }
94  }
95 
96  std::sort(all_stats.begin(), all_stats.end());
97  os << std::fixed << std::setprecision(2);
98  int32 num_stats = all_stats.size();
99  for (int32 i = 0; i < std::min<int32>(num_stats, max_stats_to_print); i++) {
100  MinibatchStats &stats = all_stats[i];
101  os << stats.seconds_taken << ',' << stats.num_frames_in << ':'
102  << stats.num_frames_out << '*' << stats.minibatch_size
103  << '=' << stats.num_done << '(' << stats.percent_full << "%) ";
104  }
105  if (num_stats > max_stats_to_print)
106  os << "...";
107  KALDI_LOG << os.str();
108  KALDI_LOG << "Did " << tot_tasks << " tasks in " << tot_minibatches
109  << " minibatches, taking " << tot_time << " seconds.";
110 }
111 
113 NnetBatchComputer::~NnetBatchComputer() {
114  // the destructor shouldn't be called while the mutex is locked; if it is, it
115  // likely means the program has already crashed, or it's a programming error.
116  if (!mutex_.try_lock())
117  KALDI_ERR << "Destructor called while object locked.";
118  int32 num_pending_tasks = 0;
119  for (auto iter = tasks_.begin(); iter != tasks_.end(); ++iter)
120  num_pending_tasks += iter->second.tasks.size();
121  if (num_pending_tasks > 0)
122  KALDI_ERR << "Tasks are pending but object is being destroyed";
123  for (auto iter = no_more_than_n_minibatches_full_.begin();
124  iter != no_more_than_n_minibatches_full_.end(); ++iter) {
125  std::condition_variable *cond = iter->second;
126  // the next call will notify any threads that were waiting on this condition
127  // variable-- there shouldn't be any, though, as it would be a programming
128  // error, but better to wake them up so we can see any messages they print.
129  cond->notify_all();
130  delete cond;
131  }
132  KALDI_ASSERT(num_full_minibatches_ == 0); // failure would be a coding error.
133 }
134 
135 NnetBatchComputer::MinibatchSizeInfo*
136 NnetBatchComputer::GetHighestPriorityComputation(
137  bool allow_partial_minibatch,
138  int32 *minibatch_size_out,
139  std::vector<NnetInferenceTask*> *tasks) {
140  tasks->clear();
141  std::unique_lock<std::mutex> lock(mutex_);
142  MapType::iterator iter = tasks_.begin(), end = tasks_.end(),
143  best_iter = tasks_.end();
144  double highest_priority = -std::numeric_limits<double>::infinity();
145 
146  for (; iter != end; ++iter) {
147  ComputationGroupInfo &info = iter->second;
148  double this_priority = GetPriority(allow_partial_minibatch, info);
149  if (this_priority > highest_priority) {
150  highest_priority = this_priority;
151  best_iter = iter;
152  }
153  }
154  if (best_iter == tasks_.end()) {
155  // either allow_partial_minibatch == false and there were no full
156  // minibatches, or there were no pending tasks at all.
157  return NULL;
158  }
159  ComputationGroupInfo &info = best_iter->second;
160  int32 actual_minibatch_size = GetActualMinibatchSize(info);
161  *minibatch_size_out = actual_minibatch_size;
162  MinibatchSizeInfo *minfo = &(info.minibatch_info[actual_minibatch_size]);
163  if (minfo->computation == NULL)
164  minfo->computation = GetComputation(info, actual_minibatch_size);
165  GetHighestPriorityTasks(actual_minibatch_size, &info, tasks);
166  return minfo;
167 }
168 
169 
170 void NnetBatchComputer::GetHighestPriorityTasks(
171  int32 num_tasks_needed,
172  ComputationGroupInfo *info,
173  std::vector<NnetInferenceTask*> *tasks) {
174  int32 num_tasks_present = info->tasks.size(),
175  minibatch_size = GetMinibatchSize(*info);
176  KALDI_ASSERT(tasks->empty());
177  if (num_tasks_needed >= num_tasks_present) {
178  tasks->swap(info->tasks);
179  } else {
180  int32 num_tasks_not_needed = num_tasks_present - num_tasks_needed;
181  // We don't sort the tasks with a comparator that dereferences the pointers,
182  // because the priorities can change asynchronously, and we're concerned that
183  // something weird might happen in the sorting if the things it's comparing
184  // are changing.
185  std::vector<std::pair<double, NnetInferenceTask*> > pairs(num_tasks_present);
186  for (int32 i = 0; i < num_tasks_present; i++) {
187  pairs[i].first = info->tasks[i]->priority;
188  pairs[i].second = info->tasks[i];
189  }
190  std::nth_element(pairs.begin(), pairs.begin() + num_tasks_not_needed,
191  pairs.end());
192 
193  // The lowest-priority 'num_tasks_not_needed' stay in the 'info' struct.
194  info->tasks.clear();
195  for (int32 i = 0; i < num_tasks_not_needed; i++)
196  info->tasks.push_back(pairs[i].second);
197  // The highest-priority 'num_tasks_needed' tasks go to the output 'tasks'
198  // array.
199  for (int32 i = num_tasks_not_needed; i < num_tasks_present; i++)
200  tasks->push_back(pairs[i].second);
201  // The following assertion checks that the is_edge and is_irregular values
202  // are the same for the entire minibatch, which they should always be.
203  KALDI_ASSERT(GetMinibatchSize(*info) == minibatch_size);
204  }
205 
206  {
207  // This block updates num_full_minibatches_ and notifies threads waiting on
208  // any related condition variable.
209  int32 new_num_tasks_present = info->tasks.size(),
210  full_minibatch_reduction =
211  (num_tasks_present / minibatch_size) -
212  (new_num_tasks_present / minibatch_size);
213  for (int32 i = 0; i < full_minibatch_reduction; i++) {
214  num_full_minibatches_--;
216  std::unordered_map<int32, std::condition_variable*>::const_iterator
217  iter = no_more_than_n_minibatches_full_.find(num_full_minibatches_);
218  if (iter != no_more_than_n_minibatches_full_.end()) {
219  std::condition_variable *cond = iter->second;
220  cond->notify_all();
221  }
222  }
223  }
224 }
225 
226 
227 int32 NnetBatchComputer::GetMinibatchSize(
228  const ComputationGroupInfo &info) const {
229  if (info.tasks.empty()) {
230  return opts_.minibatch_size; // actually it shouldn't matter what we return
231  // in this case.
232  }
233  const NnetInferenceTask &task = *(info.tasks[0]);
234  if (task.is_irregular)
235  return 1;
236  else if (task.is_edge)
237  return opts_.edge_minibatch_size;
238  else
239  return opts_.minibatch_size;
240 }
241 
242 int32 NnetBatchComputer::GetActualMinibatchSize(
243  const ComputationGroupInfo &info) const {
244  KALDI_ASSERT(!info.tasks.empty());
245  int32 num_tasks = info.tasks.size(),
246  this_minibatch_size = GetMinibatchSize(info);
247  KALDI_ASSERT(num_tasks > 0);
248  while (num_tasks <
249  int32(opts_.partial_minibatch_factor * this_minibatch_size))
250  this_minibatch_size *= opts_.partial_minibatch_factor;
251  return int32(this_minibatch_size);
252 }
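// [Added illustrative note, not part of the original source.]
// Worked example of GetActualMinibatchSize(): suppose opts_.minibatch_size is
// 128 and opts_.partial_minibatch_factor is 0.5 (hypothetical values), and only
// 20 tasks are pending.  The loop shrinks the minibatch while it is less than
// half full: 128 -> 64 -> 32; since 20 >= 0.5 * 32, the function returns 32,
// so the compiled computation is sized closer to the number of tasks actually
// available.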
253 
254 
255 std::shared_ptr<const NnetComputation> NnetBatchComputer::GetComputation(
256  const ComputationGroupInfo &info,
257  int32 minibatch_size) {
258  KALDI_ASSERT(!info.tasks.empty());
259  // note: all the tasks will have the same structure, in the respects that
260  // would affect the computation.
261  NnetInferenceTask *example_task = info.tasks[0];
262  ComputationRequest request;
263  GetComputationRequest(*example_task, minibatch_size, &request);
264  return compiler_.Compile(request);
265 }
266 
267 
268 double NnetBatchComputer::GetPriority(bool allow_partial_minibatch,
269  const ComputationGroupInfo &info) const {
270  if (info.tasks.empty())
271  return -std::numeric_limits<double>::infinity();
272  int32 this_minibatch_size = GetMinibatchSize(info);
273  int32 num_tasks = info.tasks.size();
274 
275  if (!allow_partial_minibatch && num_tasks < this_minibatch_size)
276  return -std::numeric_limits<double>::infinity();
277 
278  // penalty_for_not_full will be negative if the minibatch is not full, with a
279  // magnitude of up to 10. The 10 is a heuristic; it could be changed.
280  // Note: the penalty is effectively infinite if allow_partial_minibatch == false;
281  // see the 'return' above.
282  double proportion_full = std::min<int32>(num_tasks, this_minibatch_size) /
283  double(this_minibatch_size),
284  penalty_for_not_full = 10.0 * (proportion_full - 1.0),
285  task_priority_sum = 0.0;
286 
287 
288  if (num_tasks > this_minibatch_size) {
289  // Get the average of the priorities of the highest-priority tasks (no more
290  // than 'minibatch_size' of them).
291  std::vector<double> priorities;
292  priorities.resize(num_tasks);
293  for (int32 i = 0; i < num_tasks; i++)
294  priorities[i] = info.tasks[i]->priority;
295  // sort from greatest to least.
296  std::nth_element(priorities.begin(),
297  priorities.begin() + this_minibatch_size,
298  priorities.end(),
299  std::greater<double>());
300  for (int32 i = 0; i < this_minibatch_size; i++)
301  task_priority_sum += priorities[i];
302  return penalty_for_not_full + task_priority_sum / this_minibatch_size;
303  } else {
304  for (int32 i = 0; i < num_tasks; i++)
305  task_priority_sum += info.tasks[i]->priority;
306  return penalty_for_not_full + task_priority_sum / num_tasks;
307  }
308 }
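// [Added illustrative note, not part of the original source.]
// Worked example of GetPriority(): with a (hypothetical) minibatch size of 128
// and 32 pending tasks, proportion_full = 32/128 = 0.25 and
// penalty_for_not_full = 10.0 * (0.25 - 1.0) = -7.5, which is then added to
// the average priority of the pending tasks.  A full group therefore always
// out-prioritizes a quarter-full group whose tasks have similar priorities.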
309 
310 
311 // static
312 void NnetBatchComputer::GetComputationRequest(
313  const NnetInferenceTask &task,
314  int32 minibatch_size,
315  ComputationRequest *request) {
316  request->need_model_derivative = false;
317  request->store_component_stats = false;
318  request->inputs.reserve(2);
319 
320  int32 num_input_frames = task.input.NumRows(),
321  first_input_t = task.first_input_t,
322  num_output_frames = task.num_output_frames,
323  output_t_stride = task.output_t_stride;
324  bool has_ivector = (task.ivector.Dim() != 0);
325 
326  std::vector<Index> input_indexes, ivector_indexes, output_indexes;
327  input_indexes.reserve(minibatch_size * num_input_frames);
328  output_indexes.reserve(minibatch_size * num_output_frames);
329  if (has_ivector)
330  ivector_indexes.reserve(minibatch_size);
331 
332  for (int32 n = 0; n < minibatch_size; n++) {
333  for (int32 t = first_input_t; t < first_input_t + num_input_frames; t++)
334  input_indexes.push_back(Index(n, t, 0));
335  if (has_ivector)
336  ivector_indexes.push_back(Index(n, 0, 0));
337  for (int32 t = 0; t < num_output_frames; t++)
338  output_indexes.push_back(Index(n, t * output_t_stride, 0));
339  }
340  request->inputs.push_back(IoSpecification("input", input_indexes));
341  if (has_ivector)
342  request->inputs.push_back(IoSpecification("ivector", ivector_indexes));
343  request->outputs.push_back(IoSpecification("output", output_indexes));
344 }
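// [Added illustrative note, not part of the original source.]
// Example of the indexes GetComputationRequest() generates, for hypothetical
// values minibatch_size = 2, first_input_t = -3, num_input_frames = 5,
// num_output_frames = 2, output_t_stride = 3, with an i-vector present:
//   "input":   (n=0, t=-3..1), (n=1, t=-3..1)           (10 indexes)
//   "ivector": (n=0, t=0), (n=1, t=0)                   (2 indexes)
//   "output":  (n=0, t=0), (n=0, t=3), (n=1, t=0), (n=1, t=3)
// Every sequence n in the minibatch gets the same time range, which is what
// lets one compiled computation be reused for many groups of tasks.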
345 
346 void NnetBatchComputer::FormatInputs(
347  int32 minibatch_size,
348  const std::vector<NnetInferenceTask*> &tasks,
349  CuMatrix<BaseFloat> *input,
350  CuMatrix<BaseFloat> *ivector) {
351  int32 num_input_frames = tasks[0]->input.NumRows(),
352  input_dim = tasks[0]->input.NumCols(),
353  ivector_dim = tasks[0]->ivector.Dim(),
354  num_tasks = tasks.size();
355  KALDI_ASSERT(num_tasks > 0 && num_tasks <= minibatch_size);
356 
357  // destination matrix
358  input->Resize(minibatch_size * num_input_frames, input_dim,
359  kUndefined);
360 
361 #if HAVE_CUDA == 1
362  if (CuDevice::Instantiate().Enabled()) {
363 
364  std::vector<const BaseFloat*> inputs(num_tasks);
365  std::vector<BaseFloat*> outputs(num_tasks);
366  std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
367  std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
368 
369  // compute matrix descriptions for each copy
370  for (int32 n = 0; n < num_tasks; n++) {
371  const CuMatrix<BaseFloat> &input_mat = tasks[n]->input;
372  CuSubMatrix<BaseFloat> output_mat = input->RowRange(
373  n * num_input_frames, num_input_frames);
374 
375  // create matrix batch description arrays
376  num_rows[n] = num_input_frames;
377  num_cols[n] = input_dim;
378  outputs[n] = output_mat.Data();
379  inputs[n] = input_mat.Data();
380  ldo[n] = output_mat.Stride();
381  ldi[n] = input_mat.Stride();
382  }
383 
384  // execute batched copy
385  cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0],
386  &ldi[0], &outputs[0], &ldo[0]);
387 
388  } else
389 #endif
390  {
391  for (int32 n = 0; n < num_tasks; n++) {
392  CuSubMatrix<BaseFloat> input_part(*input,
393  n * num_input_frames, num_input_frames,
394  0, input_dim);
395  input_part.CopyFromMat(tasks[n]->input);
396  }
397  }
398 
399  if (GetVerboseLevel() >= 2) {
400  if (num_tasks < minibatch_size) {
401  // The following will make things easier to debug if something fails, but
402  // shouldn't be strictly necessary.
403  // Zero the unused rows so their contents are deterministic.
404  input->RowRange(num_tasks * num_input_frames,
405  (minibatch_size - num_tasks) * num_input_frames).SetZero();
406  }
407  }
408 
409  if (ivector_dim != 0) {
410  ivector->Resize(minibatch_size, ivector_dim, kUndefined);
411 
412 #if HAVE_CUDA == 1
413  if (CuDevice::Instantiate().Enabled()) {
414 
415  // using the batched matrix copy routine for this. This isn't
416  // extremely efficient but the kernel takes a minimal amount of
417  // time so making a batched vector copy is not worth the effort.
418  std::vector<const BaseFloat*> inputs(num_tasks);
419  std::vector<BaseFloat*> outputs(num_tasks);
420  std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
421  std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
422 
423  // compute source pointers for each input
424  for (int32 n = 0; n < num_tasks; n++) {
425  const CuVector<BaseFloat> &input_vec = tasks[n]->ivector;
426  CuSubVector<BaseFloat> output_vec = ivector->Row(n);
427  // create matrix batch description arrays
428  num_rows[n] = 1;
429  num_cols[n] = ivector_dim;
430  outputs[n] = output_vec.Data();
431  inputs[n] = input_vec.Data();
432  ldo[n] = 1;
433  ldi[n] = 1;
434  }
435 
436  // execute batched copy
437  cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
438  &outputs[0], &ldo[0]);
439 
440  } else
441 #endif
442  {
443  for (int32 n = 0; n < num_tasks; n++) {
444  ivector->Row(n).CopyFromVec(tasks[n]->ivector);
445  }
446  }
447 
448  if (GetVerboseLevel() >= 2) {
449  if (num_tasks < minibatch_size) {
450  // The following will make things easier to debug if something fails, but
451  // shouldn't be strictly necessary.
452  // Zero the unused rows so their contents are deterministic.
453  ivector->RowRange(num_tasks, minibatch_size - num_tasks).SetZero();
454  }
455  }
456  }
457 }
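// [Added illustrative note, not part of the original source.]
// Layout produced by FormatInputs(): the destination 'input' matrix has
// minibatch_size * num_input_frames rows, and task n occupies the row range
// [n * num_input_frames, (n + 1) * num_input_frames).  For example (made-up
// numbers), with num_input_frames = 21 and 3 tasks in a minibatch of 128,
// rows 0..20, 21..41 and 42..62 are filled and the remaining rows are left
// undefined (or zeroed when the verbose level is >= 2).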
458 
459 void NnetBatchComputer::FormatOutputs(
460  const CuMatrix<BaseFloat> &output,
461  const std::vector<NnetInferenceTask*> &tasks) {
462  KALDI_ASSERT(!tasks.empty());
463  int32 num_output_frames = tasks[0]->num_output_frames,
464  output_dim = output.NumCols(),
465  num_tasks = tasks.size();
466  bool did_output_to_gpu = false;
467 
468  // We don't bother zeroing frames of the output that are unused, but you could
469  // un-comment the commented lines of code below to do so and add equivalent
470  // calls to the cuda version.
471 
472 #if HAVE_CUDA == 1
473  if (CuDevice::Instantiate().Enabled()) {
474 
475  std::vector<const BaseFloat*> inputs(num_tasks);
476  std::vector<BaseFloat*> outputs(num_tasks);
477  std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
478  std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
479 
480  int b=0; // batch counter
481  for (int32 n = 0; n < num_tasks; n++) {
482  NnetInferenceTask *task = tasks[n];
483 
484  int32 left_unused = task->num_initial_unused_output_frames,
485  used = task->num_used_output_frames;
486  // int32 right_unused = num_output_frames - used - left_unused;
487 
488  // TODO do we really expect different tasks to output CPU or GPU?
489  // This adds a bit of code complexity. Perhaps output_to_cpu should
490  // be a property of the batch computer and not the tasks
491  if (task->output_to_cpu) {
492  task->output_cpu.Resize(num_output_frames, output_dim,
493  kUndefined);
494  // if (left_unused > 0)
495  // task->output_cpu.RowRange(0, left_unused).SetZero();
496  task->output_cpu.RowRange(left_unused, used).CopyFromMat(
497  output.RowRange(n * num_output_frames + left_unused, used));
498  // if (right_unused > 0)
499  // task->output_cpu.RowRange(
500  // 0, left_unused + used, right_unused).SetZero();
501 
502  } else {
503  did_output_to_gpu = true;
504  task->output.Resize(num_output_frames, output_dim,
505  kUndefined);
506 
507  CuSubMatrix<BaseFloat> output_mat = task->output.RowRange(
508  left_unused, used);
509  const CuSubMatrix<BaseFloat> input_mat = output.RowRange(
510  n * num_output_frames + left_unused, used);
511 
512  // create matrix batch description arrays
513  num_rows[b] = output_mat.NumRows();
514  num_cols[b] = output_mat.NumCols();
515  outputs[b] = output_mat.Data();
516  inputs[b] = input_mat.Data();
517  ldo[b] = output_mat.Stride();
518  ldi[b] = input_mat.Stride();
519  b++; // increase batch count
520  }
521  }
522 
523  // execute batched copy
524  cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
525  &outputs[0], &ldo[0]);
526 
527  } else
528 #endif
529  {
530  //TODO i don't think all of these paths are actually possible. We should simplify this.
531  //Is it possible to output_to_gpu with HAVE_CUDA == 0 or when the device is disabled?
532  for (int32 n = 0; n < num_tasks; n++) {
533  NnetInferenceTask *task = tasks[n];
534 
535  int32 left_unused = task->num_initial_unused_output_frames,
536  used = task->num_used_output_frames;
537  // int32 right_unused = num_output_frames - used - left_unused;
538 
539  if (task->output_to_cpu) {
540  task->output_cpu.Resize(num_output_frames, output_dim,
541  kUndefined);
542  // if (left_unused > 0)
543  // task->output_cpu.RowRange(0, left_unused).SetZero();
544  task->output_cpu.RowRange(left_unused, used).CopyFromMat(
545  output.RowRange(n * num_output_frames + left_unused, used));
546  // if (right_unused > 0)
547  // task->output_cpu.RowRange(0, left_unused + used, right_unused).SetZero();
548  } else {
549  did_output_to_gpu = true;
550  task->output.Resize(num_output_frames, output_dim,
551  kUndefined);
552  // if (left_unused > 0)
553  // task->output.RowRange(0, left_unused).SetZero();
554  task->output.RowRange(left_unused, used).CopyFromMat(
555  output.RowRange(n * num_output_frames + left_unused, used));
556  // if (right_unused > 0)
557  // task->output.RowRange(0, left_unused + used, right_unused).SetZero();
558  }
559  }
560  }
561  // The output of this function will likely be consumed by another thread.
562  // The following call will make sure the relevant kernels complete before
563  // any kernels from the other thread use the output.
564  if (did_output_to_gpu)
565  SynchronizeGpu();
566 }
567 
568 void NnetBatchComputer::AcceptTask(NnetInferenceTask *task,
569  int32 max_minibatches_full) {
570  std::unique_lock<std::mutex> lock(mutex_);
571 
572  if (max_minibatches_full > 0 && num_full_minibatches_ > max_minibatches_full) {
573  std::unordered_map<int32, std::condition_variable*>::iterator
574  iter = no_more_than_n_minibatches_full_.find(max_minibatches_full);
575  std::condition_variable *cond;
576  if (iter != no_more_than_n_minibatches_full_.end()) {
577  cond = iter->second;
578  } else {
579  cond = new std::condition_variable();
580  no_more_than_n_minibatches_full_[max_minibatches_full] = cond;
581  }
582  while (num_full_minibatches_ > max_minibatches_full)
583  cond->wait(lock);
584  }
585  ComputationGroupKey key(*task);
586  ComputationGroupInfo &info = tasks_[key];
587  info.tasks.push_back(task);
588  int32 minibatch_size = GetMinibatchSize(info);
589  if (static_cast<int32>(info.tasks.size()) % minibatch_size == 0)
590  num_full_minibatches_++;
591 }
592 
593 bool NnetBatchComputer::Compute(bool allow_partial_minibatch) {
594  int32 minibatch_size;
595  std::vector<NnetInferenceTask*> tasks;
596  MinibatchSizeInfo *minfo =
597  GetHighestPriorityComputation(allow_partial_minibatch,
598  &minibatch_size,
599  &tasks);
600  if (minfo == NULL)
601  return false;
602 
603  Timer tim;
604  Nnet *nnet_to_update = NULL; // we're not doing any update
605  NnetComputer computer(opts_.compute_config, *(minfo->computation),
606  nnet_, nnet_to_update);
607 
608 
609  CuMatrix<BaseFloat> input;
610  CuMatrix<BaseFloat> ivector;
611  FormatInputs(minibatch_size, tasks, &input, &ivector);
612  computer.AcceptInput("input", &input);
613  if (ivector.NumRows() != 0)
614  computer.AcceptInput("ivector", &ivector);
615  computer.Run();
616  CuMatrix<BaseFloat> output;
617  computer.GetOutputDestructive("output", &output);
618  if (log_priors_.Dim() != 0) {
619  output.AddVecToRows(-1.0, log_priors_);
620  }
621  output.Scale(opts_.acoustic_scale);
622  FormatOutputs(output, tasks);
623 
624  // Update the stats, for diagnostics.
625  minfo->num_done++;
626  minfo->tot_num_tasks += static_cast<int64>(tasks.size());
627  minfo->seconds_taken += tim.Elapsed();
628 
629  SynchronizeGpu();
630 
631  for (size_t i = 0; i < tasks.size(); i++)
632  tasks[i]->semaphore.Signal();
633 
634  return true;
635 }
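// [Added illustrative sketch, not part of the original source.]  The snippet
// below shows one way a background thread could drive Compute(); the classes
// further down (NnetBatchInference, NnetBatchDecoder) do essentially this, but
// use a semaphore instead of sleeping.  'ExampleComputeLoop' and 'done_flag'
// are hypothetical names, not part of Kaldi.
#if 0
void ExampleComputeLoop(NnetBatchComputer *computer, const bool *done_flag) {
  // In real multi-threaded code 'done_flag' would need to be std::atomic<bool>
  // or otherwise synchronized.
  while (!*done_flag) {
    // Process as many full minibatches as are currently available.
    while (computer->Compute(/*allow_partial_minibatch=*/false));
    Sleep(0.01);  // a real driver would wait on a semaphore instead.
  }
  // No more input is coming: flush whatever is left, even if partial.
  while (computer->Compute(/*allow_partial_minibatch=*/true));
}
#endif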
636 
637 
640 // This namespace contains utilities used in the implementation of
641 // NnetBatchComputer::SplitUtteranceIntoTasks().
642 namespace utterance_splitting {
661 void GetOutputFrameInfoForTasks(
662  const NnetBatchComputerOptions &opts,
663  int32 num_subsampled_frames,
664  int32 num_subsampled_frames_per_chunk,
665  std::vector<NnetInferenceTask> *tasks) {
666  KALDI_ASSERT(num_subsampled_frames > 0);
667  int32 fpc = num_subsampled_frames_per_chunk;
668  int32 num_tasks = (num_subsampled_frames + fpc - 1) / fpc;
669  tasks->resize(num_tasks);
670  for (int32 i = 0; i < num_tasks; i++) {
671  (*tasks)[i].output_t_stride = opts.frame_subsampling_factor;
672  }
673  if (num_subsampled_frames <= fpc) { // there is one chunk.
674  KALDI_ASSERT(num_tasks == 1); // TODO: remove this.
675  NnetInferenceTask &task = (*tasks)[0];
677  if (opts.ensure_exact_final_context) {
678  task.num_output_frames = num_subsampled_frames;
679  task.num_initial_unused_output_frames = 0;
680  task.num_used_output_frames = num_subsampled_frames;
681  task.is_irregular = true;
682  } else {
683  task.num_output_frames = fpc;
684  task.num_initial_unused_output_frames = 0;
685  task.num_used_output_frames = num_subsampled_frames;
686  task.is_irregular = false;
687  }
688  } else {
689  for (int32 i = 0; i + 1 < num_tasks; i++) {
690  NnetInferenceTask &task = (*tasks)[i];
691  task.num_output_frames = fpc;
692  task.num_initial_unused_output_frames = 0;
693  task.num_used_output_frames = fpc;
694  task.first_used_output_frame_index = i * fpc;
695  task.is_irregular = false;
696  }
697  // The last chunk will end on the last frame of the file, but we won't use
698  // the part of its output that overlaps with the preceding chunk.
699  NnetInferenceTask &task = (*tasks)[num_tasks - 1];
700  task.num_output_frames = fpc;
701  task.num_initial_unused_output_frames = ((num_tasks - 1) * fpc) -
702  (num_subsampled_frames - fpc);
703  task.num_used_output_frames =
704  num_subsampled_frames - ((num_tasks - 1) * fpc);
705  task.first_used_output_frame_index = (num_tasks - 1) * fpc;
706  task.is_irregular = false;
707  }
708 
709  if (true) {
710  // Do some checking. TODO: remove this.
711  KALDI_ASSERT((*tasks)[0].first_used_output_frame_index == 0);
712  for (int32 i = 1; i < num_tasks; i++) {
713  KALDI_ASSERT((*tasks)[i].first_used_output_frame_index ==
714  (*tasks)[i-1].first_used_output_frame_index +
715  (*tasks)[i-1].num_used_output_frames);
716  }
717  KALDI_ASSERT((*tasks)[num_tasks-1].first_used_output_frame_index +
718  (*tasks)[num_tasks-1].num_used_output_frames ==
719  num_subsampled_frames);
720  for (int32 i = 0; i < num_tasks; i++) {
721  const NnetInferenceTask &task = (*tasks)[i];
722  KALDI_ASSERT(task.num_used_output_frames +
723  task.num_initial_unused_output_frames <=
724  task.num_output_frames);
725  }
726  }
727 }
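// [Added illustrative note, not part of the original source.]
// Worked example of GetOutputFrameInfoForTasks(): with (hypothetical)
// num_subsampled_frames_per_chunk = 50 and num_subsampled_frames = 104,
// num_tasks = 3.  Tasks 0 and 1 each produce 50 output frames, all of them
// used, starting at subsampled frames 0 and 50.  Task 2 is aligned to end on
// the last frame of the file: it also produces 50 frames, but
// num_initial_unused_output_frames = 2*50 - (104 - 50) = 46 and
// num_used_output_frames = 104 - 100 = 4, so only its last 4 output frames
// (subsampled indexes 100..103) are actually used.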
728 
729 void AddOnlineIvectorsToTasks(
730  const NnetBatchComputerOptions &opts,
731  const CuMatrix<BaseFloat> &online_ivectors,
732  int32 online_ivector_period,
733  std::vector<NnetInferenceTask> *tasks) {
734  int32 f = opts.frame_subsampling_factor,
735  num_tasks = tasks->size();
736  for (int32 i = 0; i < num_tasks; i++) {
737  NnetInferenceTask &task = (*tasks)[i];
738  // begin_output_t and end_output_t are the subsampled frame indexes at
739  // the output; you'd have to multiply them by f to get real frame indexes.
740  int32 begin_output_t = task.first_used_output_frame_index -
741  task.num_initial_unused_output_frames,
742  mid_output_t = begin_output_t + (task.num_output_frames / 2),
743  mid_input_t = mid_output_t * f,
744  ivector_frame = mid_input_t / online_ivector_period,
745  num_ivector_frames = online_ivectors.NumRows(),
746  margin_in_frames = 20,
747  margin_in_ivector_frames =
748  (margin_in_frames + online_ivector_period - 1) / online_ivector_period;
749  // the 'margin' is our tolerance for when the number of rows of
750  // 'online_ivectors' is less than what we expected; we allow 20 frames of
751  // tolerance in the numbering of the original (input) features.
752  if (ivector_frame >= num_ivector_frames) {
753  if (num_ivector_frames > 0 && ivector_frame > num_ivector_frames -
754  margin_in_ivector_frames) {
755  ivector_frame = num_ivector_frames - 1; // Just take the last available one.
756  } else {
757  KALDI_ERR << "Could not get iVector for frame " << ivector_frame
758  << ", online-ivectors matrix has "
759  << online_ivectors.NumRows()
760  << " rows. Mismatched --online-ivector-period?";
761  }
762  }
763  task.ivector = online_ivectors.Row(ivector_frame);
764  }
765 }
766 
767 
768 
779 static void SplitInputToTasks(const NnetBatchComputerOptions &opts,
780  int32 nnet_left_context,
781  int32 nnet_right_context,
782  const CuMatrix<BaseFloat> &input,
783  std::vector<NnetInferenceTask> *tasks) {
784  int32 num_input_frames = input.NumRows(),
785  f = opts.frame_subsampling_factor,
786  num_subsampled_frames = (num_input_frames + f - 1) / f,
787  extra_left_context_initial = (opts.extra_left_context_initial < 0 ?
788  opts.extra_left_context :
789  opts.extra_left_context_initial),
790  extra_right_context_final = (opts.extra_right_context_final < 0 ?
791  opts.extra_right_context :
792  opts.extra_right_context_final),
793  num_tasks = tasks->size();
794 
795  for (int32 i = 0; i < num_tasks; i++) {
796  NnetInferenceTask &task = (*tasks)[i];
797  // begin_output_t and end_output_t are the subsampled frame indexes at
798  // the output; you'd have to multiply them by f to get real frame indexes.
799  int32 begin_output_t = task.first_used_output_frame_index -
800  task.num_initial_unused_output_frames,
801  end_output_t = begin_output_t + task.num_output_frames;
802  // begin_input_t and end_input_t are the real 't' values corresponding to
803  // begin_output_t and end_output_t; they are the beginning and end
804  // (i.e. first and last-plus-one) frame indexes without any left or right
805  // context.
806  int32 begin_input_t = begin_output_t * f,
807  end_input_t = end_output_t * f;
808  // Detect whether the left and right edges touch (or pass over) the left
809  // and right boundaries. Note: we don't expect begin_output_t to ever be
810  // negative.
811  bool left_edge = (begin_output_t <= 0),
812  right_edge = (end_output_t >= num_subsampled_frames);
813  int32 tot_left_context = nnet_left_context +
814  (left_edge ? extra_left_context_initial : opts.extra_left_context),
815  tot_right_context = nnet_right_context +
816  (right_edge ? extra_right_context_final : opts.extra_right_context);
817 
818  // 'is_edge' is only true if it's an edge minibatch *and* its being an
819  // edge actually made a difference to the structure of the example.
820  task.is_edge =
821  (tot_left_context != nnet_left_context + opts.extra_left_context ||
822  tot_right_context != nnet_right_context + opts.extra_right_context);
823 
824  int32 begin_input_t_padded = begin_input_t - tot_left_context,
825  end_input_t_padded = end_input_t + tot_right_context;
826 
827  // 'task.first_input_t' is a representation of 'begin_input_t_padded' in a
828  // shifted/normalized numbering where the output time indexes start from
829  // zero.
830  task.first_input_t = begin_input_t_padded - (begin_output_t * f);
831 
832  task.input.Resize(end_input_t_padded - begin_input_t_padded,
833  input.NumCols(), kUndefined);
834 
835  // Copy from 'input' into the task's input, clamping row indexes at the edges.
836  task.input.CopyRangeFromMatClamped(input, begin_input_t_padded,
837  end_input_t_padded, 0, num_input_frames-1);
838  }
839 }
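// [Added illustrative note, not part of the original source.]
// Worked example of SplitInputToTasks(): suppose (hypothetically)
// frame_subsampling_factor = 3, nnet_left_context = nnet_right_context = 29,
// and all the extra-context options are 0.  A chunk covering output frames
// [0, 50) needs input frames [0, 150), padded to [-29, 179); since this is the
// first chunk, begin_output_t == 0, so task.first_input_t = -29, and the rows
// copied from 'input' are clamped to the valid range [0, num_input_frames - 1],
// i.e. the first frame is repeated to supply the missing left context.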
840 
841 } // namespace utterance_splitting
842 
843 void NnetBatchComputer::SplitUtteranceIntoTasks(
844  bool output_to_cpu,
845  const Matrix<BaseFloat> &input,
846  const Vector<BaseFloat> *h_ivector,
847  const Matrix<BaseFloat> *h_online_ivectors,
848  int32 online_ivector_period,
849  std::vector<NnetInferenceTask> *tasks) {
850 
851  // The computation expects its inputs in device memory, so create
852  // temporary device arrays and copy the host inputs into them.
853 
854  CuMatrix<BaseFloat> cu_input(input);
855  CuVector<BaseFloat> cu_ivector, *ivector = NULL;
856  CuMatrix<BaseFloat> cu_online_ivectors, *online_ivectors = NULL;
857 
858  if (h_ivector!=NULL) {
859  cu_ivector.Resize(h_ivector->Dim(), kUndefined);
860  cu_ivector.CopyFromVec(*h_ivector);
861  ivector = &cu_ivector;
862  }
863  if (h_online_ivectors!=NULL) {
864  cu_online_ivectors.Resize(h_online_ivectors->NumRows(), h_online_ivectors->NumCols(), kUndefined);
865  cu_online_ivectors.CopyFromMat(*h_online_ivectors);
866  online_ivectors = &cu_online_ivectors;
867  }
868 
869  SplitUtteranceIntoTasks(output_to_cpu, cu_input, ivector,
870  online_ivectors, online_ivector_period, tasks);
871 }
872 
873 void NnetBatchComputer::SplitUtteranceIntoTasks(
874  bool output_to_cpu,
875  const CuMatrix<BaseFloat> &input,
876  const CuVector<BaseFloat> *ivector,
877  const CuMatrix<BaseFloat> *online_ivectors,
878  int32 online_ivector_period,
879  std::vector<NnetInferenceTask> *tasks) {
880  using namespace utterance_splitting;
881 
882 
883  { // This block does some checking.
884  if (input.NumCols() != input_dim_) {
885  KALDI_ERR << "Input features did not have expected dimension: expected "
886  << input_dim_ << ", got " << input.NumCols();
887  }
888  int32 ivector_dim = (ivector != NULL ? ivector->Dim() :
889  (online_ivectors != NULL ?
890  online_ivectors->NumCols() : 0));
891  if (ivector_dim_ != 0 && ivector_dim == 0)
892  KALDI_ERR << "Model expects i-vectors but none were supplied";
893  else if (ivector_dim_ == 0 && ivector_dim != 0)
894  KALDI_ERR << "You supplied i-vectors but model does not expect them.";
895  else if (ivector_dim != ivector_dim_)
896  KALDI_ERR << "I-vector dimensions mismatch: model expects "
897  << ivector_dim_ << ", you supplied " << ivector_dim;
898  }
899 
900 
901  int32 num_input_frames = input.NumRows(),
902  f = opts_.frame_subsampling_factor,
903  num_subsampled_frames = (num_input_frames + f - 1) / f,
904  num_subsampled_frames_per_chunk = opts_.frames_per_chunk / f;
905 
906  GetOutputFrameInfoForTasks(opts_, num_subsampled_frames,
907  num_subsampled_frames_per_chunk,
908  tasks);
909 
910  SplitInputToTasks(opts_, nnet_left_context_, nnet_right_context_,
911  input, tasks);
912 
913 
914  if (ivector != NULL) {
915  KALDI_ASSERT(online_ivectors == NULL);
916 
917 #if HAVE_CUDA == 1
918  if (CuDevice::Instantiate().Enabled()) {
919  int32_t num_tasks = tasks->size();
920 
921  std::vector<const BaseFloat*> inputs(num_tasks);
922  std::vector<BaseFloat*> outputs(num_tasks);
923  std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
924  std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
925 
926  int b=0; // batch counter
927 
928  for (size_t i = 0; i < tasks->size(); i++) {
929  CuVector<BaseFloat> &output_vec = (*tasks)[i].ivector;
930  const CuVector<BaseFloat> &input_vec = *ivector;
931 
932  output_vec.Resize(input_vec.Dim(), kUndefined);
933 
934  // create matrix batch description arrays
935  num_rows[b] = 1;
936  num_cols[b] = output_vec.Dim();
937  outputs[b] = output_vec.Data();
938  inputs[b] = input_vec.Data();
939  ldo[b] = 0;
940  ldi[b] = 0;
941  b++; // increase batch count
942  }
943 
944  // execute batched copy
945  cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
946  &outputs[0], &ldo[0]);
947  } else
948 #endif
949  {
950  for (size_t i = 0; i < tasks->size(); i++)
951  (*tasks)[i].ivector = *ivector;
952  }
953 
954  } else if (online_ivectors != NULL) {
955  AddOnlineIvectorsToTasks(opts_, *online_ivectors,
956  online_ivector_period, tasks);
957  }
958 
959  for (size_t i = 0; i < tasks->size(); i++) {
960  (*tasks)[i].output_to_cpu = output_to_cpu;
961  // The priority will be set by the user; this just avoids undefined
962  // behavior.
963  (*tasks)[i].priority = 0.0;
964  }
965 }
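// [Added illustrative sketch, not part of the original source.]  A minimal
// single-threaded way to compute one utterance with this class, assuming an
// NnetBatchComputer has already been constructed; 'ExampleComputeOneUtterance'
// is a hypothetical helper, not a Kaldi function.
#if 0
void ExampleComputeOneUtterance(NnetBatchComputer *computer,
                                const Matrix<BaseFloat> &feats,
                                const Vector<BaseFloat> *ivector,  // may be NULL
                                Matrix<BaseFloat> *nnet_output) {
  std::vector<NnetInferenceTask> tasks;
  computer->SplitUtteranceIntoTasks(/*output_to_cpu=*/true, feats, ivector,
                                    /*online_ivectors=*/NULL,
                                    /*online_ivector_period=*/0, &tasks);
  for (size_t i = 0; i < tasks.size(); i++)
    computer->AcceptTask(&(tasks[i]));
  // Normally Compute() runs in a separate thread; here we just drain the
  // queue in the calling thread.
  while (computer->Compute(/*allow_partial_minibatch=*/true));
  for (size_t i = 0; i < tasks.size(); i++)
    tasks[i].semaphore.Wait();
  MergeTaskOutput(tasks, nnet_output);
}
#endif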
966 
967 
968 void MergeTaskOutput(
969  const std::vector<NnetInferenceTask> &tasks,
970  Matrix<BaseFloat> *output) {
971  int32 num_tasks = tasks.size(),
972  num_output_frames = 0,
973  output_dim = -1;
974  for (int32 i = 0; i < num_tasks; i++) {
975  const NnetInferenceTask &task = tasks[i];
976  num_output_frames += task.num_used_output_frames;
977  if (i == 0) {
978  output_dim = (task.output_to_cpu ?
979  task.output_cpu.NumCols() :
980  task.output.NumCols());
981  }
982  }
983  KALDI_ASSERT(num_output_frames != 0 && output_dim != 0);
984  int32 cur_output_frame = 0;
985  output->Resize(num_output_frames, output_dim);
986  for (int32 i = 0; i < num_tasks; i++) {
987  const NnetInferenceTask &task = tasks[i];
988  int32 skip = task.num_initial_unused_output_frames,
989  num_used = task.num_used_output_frames;
990  KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index);
991  if (task.output_to_cpu) {
992  output->RowRange(cur_output_frame, num_used).CopyFromMat(
993  task.output_cpu.RowRange(skip, num_used));
994  } else {
995  output->RowRange(cur_output_frame, num_used).CopyFromMat(
996  task.output.RowRange(skip, num_used));
997  }
998  cur_output_frame += num_used;
999  }
1000  KALDI_ASSERT(cur_output_frame == num_output_frames);
1001 }
1002 void MergeTaskOutput(
1003  const std::vector<NnetInferenceTask> &tasks,
1004  CuMatrix<BaseFloat> *output) {
1005  int32 num_tasks = tasks.size(),
1006  num_output_frames = 0,
1007  output_dim = -1;
1008  for (int32 i = 0; i < num_tasks; i++) {
1009  const NnetInferenceTask &task = tasks[i];
1010  num_output_frames += task.num_used_output_frames;
1011  if (i == 0) {
1012  output_dim = (task.output_to_cpu ?
1013  task.output_cpu.NumCols() :
1014  task.output.NumCols());
1015  }
1016  }
1017  KALDI_ASSERT(num_output_frames != 0 && output_dim != 0);
1018  int32 cur_output_frame = 0;
1019  output->Resize(num_output_frames, output_dim, kUndefined);
1020 
1021 #if HAVE_CUDA == 1
1022  if (CuDevice::Instantiate().Enabled()) {
1023 
1024  std::vector<const BaseFloat*> inputs(num_tasks);
1025  std::vector<BaseFloat*> outputs(num_tasks);
1026  std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
1027  std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
1028 
1029  int b=0; // batch counter
1030  for (int32 i = 0; i < num_tasks; i++) {
1031  const NnetInferenceTask &task = tasks[i];
1032  int32 skip = task.num_initial_unused_output_frames,
1033  num_used = task.num_used_output_frames;
1034  KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index);
1035  if (task.output_to_cpu) {
1036  output->RowRange(cur_output_frame, num_used).CopyFromMat(
1037  task.output_cpu.RowRange(skip, num_used));
1038  } else {
1039  CuSubMatrix<BaseFloat> output_mat =
1040  output->RowRange(cur_output_frame, num_used);
1041  const CuSubMatrix<BaseFloat> input_mat =
1042  task.output.RowRange(skip, num_used);
1043 
1044  // create matrix batch description arrays
1045  num_rows[b] = output_mat.NumRows();
1046  num_cols[b] = output_mat.NumCols();
1047  outputs[b] = output_mat.Data();
1048  inputs[b] = input_mat.Data();
1049  ldo[b] = output_mat.Stride();
1050  ldi[b] = input_mat.Stride();
1051  b++; // increase batch count
1052  }
1053  cur_output_frame += num_used;
1054  }
1055 
1056  // execute batched copy
1057  cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
1058  &outputs[0], &ldo[0]);
1059 
1060  } else
1061 #endif
1062  {
1063  for (int32 i = 0; i < num_tasks; i++) {
1064  const NnetInferenceTask &task = tasks[i];
1065  int32 skip = task.num_initial_unused_output_frames,
1066  num_used = task.num_used_output_frames;
1067  KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index);
1068  if (task.output_to_cpu) {
1069  output->RowRange(cur_output_frame, num_used).CopyFromMat(
1070  task.output_cpu.RowRange(skip, num_used));
1071  } else {
1072  output->RowRange(cur_output_frame, num_used).CopyFromMat(
1073  task.output.RowRange(skip, num_used));
1074  }
1075  cur_output_frame += num_used;
1076  }
1077  }
1078 
1079  KALDI_ASSERT(cur_output_frame == num_output_frames);
1080 }
1081 
1082 
1083 NnetBatchInference::NnetBatchInference(
1084  const NnetBatchComputerOptions &opts,
1085  const Nnet &nnet,
1086  const VectorBase<BaseFloat> &priors):
1087  computer_(opts, nnet, priors),
1088  is_finished_(false),
1089  utterance_counter_(0) {
1090  // 'thread_' will run the Compute() function in the background.
1091  compute_thread_ = std::thread(ComputeFunc, this);
1092 }
1093 
1094 
1095 void NnetBatchInference::AcceptInput(
1096  const std::string &utterance_id,
1097  const Matrix<BaseFloat> &input,
1098  const Vector<BaseFloat> *ivector,
1099  const Matrix<BaseFloat> *online_ivectors,
1100  int32 online_ivector_period) {
1101 
1102  UtteranceInfo *info = new UtteranceInfo();
1103  info->utterance_id = utterance_id;
1104  info->num_tasks_finished = 0;
1105  bool output_to_cpu = true; // This wrapper is for when you need the nnet
1106  // output on CPU, e.g. because you want it
1107  // written to disk. If this needs to be
1108  // configurable in the future, we can make changes
1109  // then.
1110  computer_.SplitUtteranceIntoTasks(
1111  output_to_cpu, input, ivector, online_ivectors,
1112  online_ivector_period, &(info->tasks));
1113 
1114  // Setting this to a nonzero value will cause the AcceptTask() call below to
1115  // hang until the computation thread has made some progress, if too much
1116  // data is already queued.
1117  int32 max_full_minibatches = 2;
1118 
1119  // Earlier utterances have higher priority, which is important to make sure
1120  // that their corresponding tasks are completed and they can be output to disk.
1121  double priority = -1.0 * (utterance_counter_++);
1122  for (size_t i = 0; i < info->tasks.size(); i++) {
1123  info->tasks[i].priority = priority;
1124  computer_.AcceptTask(&(info->tasks[i]), max_full_minibatches);
1125  }
1126  utts_.push_back(info);
1127  tasks_ready_semaphore_.Signal();
1128 }
1129 
1130 bool NnetBatchInference::GetOutput(std::string *utterance_id,
1131  Matrix<BaseFloat> *output) {
1132  if (utts_.empty())
1133  return false;
1134 
1135  UtteranceInfo *info = *utts_.begin();
1136  std::vector<NnetInferenceTask> &tasks = info->tasks;
1137  int32 num_tasks = tasks.size();
1138  for (; info->num_tasks_finished < num_tasks; ++info->num_tasks_finished) {
1139  Semaphore &semaphore = tasks[info->num_tasks_finished].semaphore;
1140  if (is_finished_) {
1141  semaphore.Wait();
1142  } else {
1143  if (!semaphore.TryWait()) {
1144  // If not all of the tasks of this utterance are ready yet,
1145  // just return false.
1146  return false;
1147  }
1148  }
1149  }
1150  MergeTaskOutput(tasks, output);
1151  *utterance_id = info->utterance_id;
1152  delete info;
1153  utts_.pop_front();
1154  return true;
1155 }
1156 
1157 NnetBatchInference::~NnetBatchInference() {
1158  if (!is_finished_)
1159  KALDI_ERR << "Object destroyed before Finished() was called.";
1160  if (!utts_.empty())
1161  KALDI_ERR << "You should get all output before destroying this object.";
1162  compute_thread_.join();
1163 }
1164 
1165 void NnetBatchInference::Finished() {
1166  is_finished_ = true;
1167  tasks_ready_semaphore_.Signal();
1168 }
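// [Added illustrative sketch, not part of the original source.]  Typical use
// of NnetBatchInference from a single driver thread; the rspecifier and
// wspecifier strings are made-up examples, and 'ExampleInferenceDriver' is a
// hypothetical helper, not a Kaldi function.
#if 0
void ExampleInferenceDriver(const NnetBatchComputerOptions &opts,
                            const Nnet &nnet,
                            const VectorBase<BaseFloat> &priors) {
  NnetBatchInference inference(opts, nnet, priors);
  SequentialBaseFloatMatrixReader feature_reader("ark:feats.ark");
  BaseFloatMatrixWriter output_writer("ark:nnet_output.ark");
  std::string utt;
  Matrix<BaseFloat> output;
  for (; !feature_reader.Done(); feature_reader.Next()) {
    inference.AcceptInput(feature_reader.Key(), feature_reader.Value(),
                          NULL, NULL, 0);
    // Collect whatever has already finished, without blocking.
    while (inference.GetOutput(&utt, &output))
      output_writer.Write(utt, output);
  }
  inference.Finished();
  // After Finished(), GetOutput() blocks until each utterance is done.
  while (inference.GetOutput(&utt, &output))
    output_writer.Write(utt, output);
}
#endif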
1169 
1170 // This is run as the thread of class NnetBatchInference.
1171 void NnetBatchInference::Compute() {
1172  bool allow_partial_minibatch = false;
1173  while (true) {
1174  // keep calling Compute() as long as it makes progress.
1175  while (computer_.Compute(allow_partial_minibatch));
1176 
1177  // ... then wait on tasks_ready_semaphore_.
1178  tasks_ready_semaphore_.Wait();
1179  if (is_finished_) {
1180  allow_partial_minibatch = true;
1181  while (computer_.Compute(allow_partial_minibatch));
1182  return;
1183  }
1184  }
1185 }
1186 
1187 
1188 NnetBatchDecoder::NnetBatchDecoder(
1189  const fst::Fst<fst::StdArc> &fst,
1190  const LatticeFasterDecoderConfig &decoder_opts,
1191  const TransitionModel &trans_model,
1192  const fst::SymbolTable *word_syms,
1193  bool allow_partial,
1194  int32 num_threads,
1195  NnetBatchComputer *computer):
1196  fst_(fst), decoder_opts_(decoder_opts),
1197  trans_model_(trans_model), word_syms_(word_syms),
1198  allow_partial_(allow_partial), computer_(computer),
1199  is_finished_(false), tasks_finished_(false), priority_offset_(0.0),
1200  tot_like_(0.0), frame_count_(0), num_success_(0), num_fail_(0),
1201  num_partial_(0) {
1202  KALDI_ASSERT(num_threads > 0);
1203  for (int32 i = 0; i < num_threads; i++)
1204  decode_threads_.push_back(new std::thread(DecodeFunc, this));
1205  compute_thread_ = std::thread(ComputeFunc, this);
1206 }
1207 
1208 void NnetBatchDecoder::SetPriorities(std::vector<NnetInferenceTask> *tasks) {
1209  size_t num_tasks = tasks->size();
1210  double priority_offset = priority_offset_;
1211  for (size_t i = 0; i < num_tasks; i++)
1212  (*tasks)[i].priority = priority_offset - (double)i;
1213 }
1214 
1215 void NnetBatchDecoder::UpdatePriorityOffset(double priority) {
1216  size_t num_tasks = decode_threads_.size(),
1217  new_weight = 1.0 / num_tasks,
1218  old_weight = 1.0 - new_weight;
1219  // The next line is vulnerable to a race condition but if it happened it
1220  // wouldn't matter.
1221  priority_offset_ = priority_offset_ * old_weight + priority * new_weight;
1222 }
1223 
1224 void NnetBatchDecoder::AcceptInput(
1225  const std::string &utterance_id,
1226  const Matrix<BaseFloat> &input,
1227  const Vector<BaseFloat> *ivector,
1228  const Matrix<BaseFloat> *online_ivectors,
1229  int32 online_ivector_period){
1230  // This function basically does a handshake with one of the decoder threads.
1231  // It may have to wait till one of the decoder threads becomes ready.
1232  input_utterance_.utterance_id = utterance_id;
1233  input_utterance_.input = &input;
1234  input_utterance_.ivector = ivector;
1235  input_utterance_.online_ivectors = online_ivectors;
1236  input_utterance_.online_ivector_period = online_ivector_period;
1237 
1238 
1239  UtteranceOutput *this_output = new UtteranceOutput();
1240  this_output->utterance_id = utterance_id;
1241  pending_utts_.push_back(this_output);
1242 
1245 }
1246 
1247 int32 NnetBatchDecoder::Finished() {
1248  is_finished_ = true;
1249  for (size_t i = 0; i < decode_threads_.size(); i++)
1251  for (size_t i = 0; i < decode_threads_.size(); i++) {
1252  decode_threads_[i]->join();
1253  delete decode_threads_[i];
1254  decode_threads_[i] = NULL;
1255  }
1256  // don't clear decode_threads_, since its size is needed in the destructor to
1257  // compute timing.
1258 
1259  tasks_finished_ = true;
1261  compute_thread_.join();
1262  return num_success_;
1263 }
1264 
1265 
1266 bool NnetBatchDecoder::GetOutput(
1267  std::string *utterance_id,
1268  CompactLattice *clat,
1269  std::string *sentence) {
1270  if (!decoder_opts_.determinize_lattice)
1271  KALDI_ERR << "Don't call this version of GetOutput if you are "
1272  "not determinizing.";
1273  while (true) {
1274  if (pending_utts_.empty())
1275  return false;
1276  if (!pending_utts_.front()->finished)
1277  return false;
1278  UtteranceOutput *this_output = pending_utts_.front();
1279  pending_utts_.pop_front();
1280  if (this_output->compact_lat.NumStates() == 0) {
1281  delete this_output;
1282  // ... and continue round the loop, without returning any output to the
1283  // user for this utterance. Something went wrong in decoding: for
1284  // example, the user specified allow_partial == false and no final-states
1285  // were active on the last frame, or something more unexpected. A warning
1286  // would have been printed in the decoder thread.
1287  } else {
1288  *clat = this_output->compact_lat;
1289  utterance_id->swap(this_output->utterance_id);
1290  sentence->swap(this_output->sentence);
1291  delete this_output;
1292  return true;
1293  }
1294  }
1295 }
1296 
1297 
1298 bool NnetBatchDecoder::GetOutput(
1299  std::string *utterance_id,
1300  Lattice *lat,
1301  std::string *sentence) {
1302  if (decoder_opts_.determinize_lattice)
1303  KALDI_ERR << "Don't call this version of GetOutput if you are "
1304  "determinizing.";
1305  while (true) {
1306  if (pending_utts_.empty())
1307  return false;
1308  if (!pending_utts_.front()->finished)
1309  return false;
1310  UtteranceOutput *this_output = pending_utts_.front();
1311  pending_utts_.pop_front();
1312  if (this_output->lat.NumStates() == 0) {
1313  delete this_output;
1314  // ... and continue round the loop, without returning any output to the
1315  // user for this utterance. Something went wrong in decoding: for
1316  // example, the user specified allow_partial == false and no final-states
1317  // were active on the last frame, or something more unexpected. A warning
1318  // would have been printed in the decoder thread.
1319  } else {
1320  *lat = this_output->lat; // OpenFST has shallow copy so no need to swap.
1321  utterance_id->swap(this_output->utterance_id);
1322  sentence->swap(this_output->sentence);
1323  delete this_output;
1324  return true;
1325  }
1326  }
1327 }
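// [Added illustrative sketch, not part of the original source.]  Typical use
// of NnetBatchDecoder, assuming lattice determinization is enabled (the
// default); the table names, thread count and 'ExampleDecodingDriver' name
// are made-up examples.
#if 0
void ExampleDecodingDriver(const fst::Fst<fst::StdArc> &decode_fst,
                           const LatticeFasterDecoderConfig &decoder_opts,
                           const TransitionModel &trans_model,
                           const fst::SymbolTable *word_syms,
                           NnetBatchComputer *computer) {
  NnetBatchDecoder decoder(decode_fst, decoder_opts, trans_model, word_syms,
                           /*allow_partial=*/true, /*num_threads=*/4, computer);
  SequentialBaseFloatMatrixReader feature_reader("ark:feats.ark");
  CompactLatticeWriter clat_writer("ark:lat.ark");
  std::string utt, sentence;
  CompactLattice clat;
  for (; !feature_reader.Done(); feature_reader.Next()) {
    decoder.AcceptInput(feature_reader.Key(), feature_reader.Value(),
                        NULL, NULL, 0);
    // Write out any lattices that are already finished.
    while (decoder.GetOutput(&utt, &clat, &sentence))
      clat_writer.Write(utt, clat);
  }
  int32 num_decoded = decoder.Finished();
  while (decoder.GetOutput(&utt, &clat, &sentence))
    clat_writer.Write(utt, clat);
  KALDI_LOG << "Decoded " << num_decoded << " utterances.";
}
#endif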
1328 
1329 void NnetBatchDecoder::Compute() {
1330  while (!tasks_finished_) {
1332  bool allow_partial_minibatch = true;
1333  while (computer_->Compute(allow_partial_minibatch));
1334  }
1335 }
1336 
1337 void NnetBatchDecoder::Decode() {
1338  while (true) {
1340  if (is_finished_)
1341  return;
1342 
1343  std::vector<NnetInferenceTask> tasks;
1344  std::string utterance_id;
1345  // we can be confident that the last element of 'pending_utts_' is the one
1346  // for this utterance, as we know exactly at what point in the code the main
1347  // thread will be in AcceptInput().
1348  UtteranceOutput *output_utterance = pending_utts_.back();
1349  {
1350  UtteranceInput input_utterance(input_utterance_);
1351  utterance_id = input_utterance.utterance_id;
1352  bool output_to_cpu = true;
1353  computer_->SplitUtteranceIntoTasks(output_to_cpu,
1354  *(input_utterance.input),
1355  input_utterance.ivector,
1356  input_utterance.online_ivectors,
1357  input_utterance.online_ivector_period,
1358  &tasks);
1359  KALDI_ASSERT(output_utterance->utterance_id == utterance_id);
1361  // Now let input_utterance go out of scope; it's no longer valid as it may
1362  // be overwritten by something else.
1363  }
1364 
1365  SetPriorities(&tasks);
1366  for (size_t i = 0; i < tasks.size(); i++)
1367  computer_->AcceptTask(&(tasks[i]));
1369 
1370  {
1371  int32 frame_offset = 0;
1372  LatticeFasterDecoder decoder(fst_, decoder_opts_);
1373  decoder.InitDecoding();
1374 
1375 
1376  for (size_t i = 0; i < tasks.size(); i++) {
1377  NnetInferenceTask &task = tasks[i];
1378  task.semaphore.Wait();
1379  UpdatePriorityOffset(task.priority);
1380 
1381  SubMatrix<BaseFloat> post(task.output_cpu,
1382  task.num_initial_unused_output_frames,
1383  task.num_used_output_frames,
1384  0, task.output_cpu.NumCols());
1385  DecodableMatrixMapped decodable(trans_model_, post, frame_offset);
1386  frame_offset += post.NumRows();
1387  decoder.AdvanceDecoding(&decodable);
1388  task.output.Resize(0, 0); // Free some memory.
1389  }
1390 
1391  bool use_final_probs = true;
1392  if (!decoder.ReachedFinal()) {
1393  if (allow_partial_) {
1394  KALDI_WARN << "Outputting partial output for utterance "
1395  << utterance_id << " since no final-state reached\n";
1396  use_final_probs = false;
1397  std::unique_lock<std::mutex> lock(stats_mutex_);
1398  num_partial_++;
1399  } else {
1400  KALDI_WARN << "Not producing output for utterance " << utterance_id
1401  << " since no final-state reached and "
1402  << "--allow-partial=false.\n";
1403  std::unique_lock<std::mutex> lock(stats_mutex_);
1404  num_fail_++;
1405  continue;
1406  }
1407  }
1408  // if we reached this point, we are getting a lattice.
1409  decoder.GetRawLattice(&output_utterance->lat, use_final_probs);
1410  // Let the decoder and the decodable object go out of scope, to save
1411  // memory.
1412  }
1413  ProcessOutputUtterance(output_utterance);
1414  }
1415 }
1416 
1417 
1418 void NnetBatchDecoder::UtteranceFailed() {
1419  std::unique_lock<std::mutex> lock(stats_mutex_);
1420  num_fail_++;
1421 }
1422 
1423 void NnetBatchDecoder::ProcessOutputUtterance(UtteranceOutput *output) {
1424  fst::Connect(&(output->lat));
1425  if (output->lat.NumStates() == 0) {
1426  KALDI_WARN << "Unexpected problem getting lattice for utterance "
1427  << output->utterance_id;
1428  std::unique_lock<std::mutex> lock(stats_mutex_);
1429  num_fail_++;
1430  return;
1431  }
1432 
1433  { // This block accumulates diagnostics, prints log messages, and sets
1434  // output->sentence.
1435  Lattice best_path;
1436  LatticeWeight weight;
1437  ShortestPath(output->lat, &best_path);
1438  std::vector<int32> alignment;
1439  std::vector<int32> words;
1440  GetLinearSymbolSequence(best_path, &alignment, &words, &weight);
1441  int32 num_frames = alignment.size();
1442  if (word_syms_ != NULL) {
1443  std::ostringstream os;
1444  for (size_t i = 0; i < words.size(); i++) {
1445  std::string s = word_syms_->Find(words[i]);
1446  if (s == "")
1447  KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
1448  os << s << ' ';
1449  }
1450  output->sentence = os.str();
1451  }
1452  double likelihood = -(weight.Value1() + weight.Value2());
1453  // Note: these logging messages will be out-of-order w.r.t. the transcripts
1454  // that are printed to cerr; we keep those transcripts in the same order
1455  // that the utterances were in, but these logging messages may be out of
1456  // order (due to multiple threads).
1457  KALDI_LOG << "Log-like per frame for utterance " << output->utterance_id
1458  << " is " << (likelihood / num_frames) << " over "
1459  << num_frames << " frames.";
1460  KALDI_VLOG(2) << "Cost for utterance " << output->utterance_id << " is "
1461  << weight.Value1() << " + " << weight.Value2();
1462 
1463  std::unique_lock<std::mutex> lock(stats_mutex_);
1464  tot_like_ += likelihood;
1465  frame_count_ += num_frames;
1466  num_success_ += 1;
1467  }
1468 
1469  if (decoder_opts_.determinize_lattice) {
1470  if (!DeterminizeLatticePhonePrunedWrapper(
1471  trans_model_,
1472  &output->lat,
1473  decoder_opts_.lattice_beam,
1474  &(output->compact_lat),
1475  decoder_opts_.det_opts))
1476  KALDI_WARN << "Determinization finished earlier than the beam for "
1477  << "utterance " << output->utterance_id;
1478  output->lat.DeleteStates(); // Save memory.
1479  }
1480 
1481  // We'll write the lattice without acoustic scaling, so we need to reverse
1482  // the scale that we applied when decoding.
1483  BaseFloat acoustic_scale = computer_->GetOptions().acoustic_scale;
1484  if (acoustic_scale != 0.0) {
1485  if (decoder_opts_.determinize_lattice)
1486  fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale),
1487  &(output->compact_lat));
1488  else
1489  fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale),
1490  &(output->lat));
1491  }
1492  output->finished = true;
1493 }
1494 
1495 
1496 
1497 NnetBatchDecoder::~NnetBatchDecoder() {
1498  if (!is_finished_ || !pending_utts_.empty()) {
1499  // At this point the application is bound to fail so raising another
1500  // exception is not a big problem.
1501  KALDI_ERR << "Destroying NnetBatchDecoder object without calling "
1502  "Finished() and consuming the remaining output";
1503  }
1504  // Print diagnostics.
1505 
1506  kaldi::int64 input_frame_count =
1507  frame_count_ * computer_->GetOptions().frame_subsampling_factor;
1508  int32 num_threads = static_cast<int32>(decode_threads_.size());
1509 
1510  KALDI_LOG << "Overall likelihood per frame was "
1511  << tot_like_ / std::max<int64>(1, frame_count_)
1512  << " over " << frame_count_ << " frames.";
1513 
1514  double elapsed = timer_.Elapsed();
1515  // the +1 below is just to avoid division-by-zero errors.
1516  KALDI_LOG << "Time taken "<< elapsed
1517  << "s: real-time factor assuming 100 frames/sec is "
1518  << (num_threads * elapsed * 100.0 /
1519  std::max<int64>(input_frame_count, 1))
1520  << " (per thread; with " << num_threads << " threads).";
1521  KALDI_LOG << "Done " << num_success_ << " utterances ("
1522  << num_partial_ << " forced out); failed for "
1523  << num_fail_;
1524 }
1525 
1526 
1527 } // namespace nnet3
1528 } // namespace kaldi
int32 words[kMaxOrder]
NnetBatchInference(const NnetBatchComputerOptions &opts, const Nnet &nnet, const VectorBase< BaseFloat > &priors)
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
MatrixIndexT Stride() const
Definition: cu-matrix.h:217
int32 InputDim(const std::string &input_name) const
Definition: nnet-nnet.cc:669
static void SplitInputToTasks(const NnetBatchComputerOptions &opts, int32 nnet_left_context, int32 nnet_right_context, const CuMatrix< BaseFloat > &input, std::vector< NnetInferenceTask > *tasks)
This function sets up the &#39;input&#39; and &#39;first_input_t&#39; and &#39;is_edge&#39; members of the &#39;tasks&#39; array; it ...
bool store_component_stats
you should set need_component_stats to true if you need the average-activation and average-derivative...
static void ComputeFunc(NnetBatchInference *object)
const CuSubVector< Real > Row(MatrixIndexT i) const
Definition: cu-matrix.h:670
bool GetRawLattice(Lattice *ofst, bool use_final_probs=true) const
Outputs an FST corresponding to the raw, state-level tracebacks.
MinibatchSizeInfo * GetHighestPriorityComputation(bool allow_partial_minibatch, int32 *minibatch_size, std::vector< NnetInferenceTask *> *tasks)
This function finds and returns the computation corresponding to the highest-priority group of tasks...
bool need_model_derivative
if need_model_derivative is true, then we&#39;ll be doing either model training or model-derivative compu...
void AddOnlineIvectorsToTasks(const NnetBatchComputerOptions &opts, const CuMatrix< BaseFloat > &online_ivectors, int32 online_ivector_period, std::vector< NnetInferenceTask > *tasks)
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
For an extended explanation of the framework of which grammar-fsts are a part, please see Support for...
Definition: graph.dox:21
bool ReachedFinal() const
says whether a final-state was active on the last frame.
void GetOutputFrameInfoForTasks(const NnetBatchComputerOptions &opts, int32 num_subsampled_frames, int32 num_subsampled_frames_per_chunk, std::vector< NnetInferenceTask > *tasks)
This function figures out how many chunks are needed for this utterance, sets &#39;tasks&#39; to a vector wit...
int32 GetVerboseLevel()
Get verbosity level, usually set via command line &#39;–verbose=&#39; switch.
Definition: kaldi-error.h:60
void Signal()
increase the counter
kaldi::int32 int32
std::vector< IoSpecification > inputs
This class represents a matrix that&#39;s stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
static void ComputeFunc(NnetBatchDecoder *object)
const fst::SymbolTable * word_syms_
void AcceptInput(const std::string &utterance_id, const Matrix< BaseFloat > &input, const Vector< BaseFloat > *ivector, const Matrix< BaseFloat > *online_ivectors, int32 online_ivector_period)
The user should call this one by one for the utterances that this class needs to compute (intersperse...
bool GetLinearSymbolSequence(const Fst< Arc > &fst, std::vector< I > *isymbols_out, std::vector< I > *osymbols_out, typename Arc::Weight *tot_weight_out)
GetLinearSymbolSequence gets the symbol sequence from a linear FST.
const fst::Fst< fst::StdArc > & fst_
bool TryWait()
Returns true if Wait() goes through.
int32 OutputDim(const std::string &output_name) const
Definition: nnet-nnet.cc:677
struct Index is intended to represent the various indexes by which we number the rows of the matrices...
Definition: nnet-common.h:44
This file contains some miscellaneous functions dealing with class Nnet.
int32 Modulus() const
[Relevant for clockwork RNNs and similar].
Definition: nnet-nnet.cc:658
std::map< int32, MinibatchSizeInfo > minibatch_info
void GetHighestPriorityTasks(int32 num_tasks, ComputationGroupInfo *info, std::vector< NnetInferenceTask *> *tasks)
void Scale(Real value)
Definition: cu-matrix.cc:644
std::vector< std::vector< double > > AcousticLatticeScale(double acwt)
void AcceptInput(const std::string &node_name, CuMatrix< BaseFloat > *input)
e.g.
void CopyFromVec(const CuVectorBase< Real > &src)
Copy functions; these will crash if the dimensions do not match.
Definition: cu-vector.cc:1078
void FormatInputs(int32 minibatch_size, const std::vector< NnetInferenceTask *> &tasks, CuMatrix< BaseFloat > *input, CuMatrix< BaseFloat > *ivector)
Formats the inputs to the computation and transfers them to the GPU.
class NnetInferenceTask represents a chunk of an utterance that is requested to be computed...
void InitDecoding()
InitDecoding initializes the decoding, and should only be used if you intend to call AdvanceDecoding(...
std::shared_ptr< const NnetComputation > computation
void SplitUtteranceIntoTasks(bool output_to_cpu, const Matrix< BaseFloat > &input, const Vector< BaseFloat > *ivector, const Matrix< BaseFloat > *online_ivectors, int32 online_ivector_period, std::vector< NnetInferenceTask > *tasks)
Split a single utterance into a list of separate tasks which can then be given to this class by Accep...
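A sketch of how these pieces might fit together for a single utterance, using only the SplitUtteranceIntoTasks(), AcceptTask(), Compute() and MergeTaskOutput() signatures listed on this page. This is an assumption about usage, not code from this file: in particular it drives Compute() in the calling thread and assumes Compute() returns false once nothing is left to do, whereas the real classes typically run the computation in a dedicated thread.

#include <vector>
#include "nnet3/nnet-batch-compute.h"

using namespace kaldi;
using namespace kaldi::nnet3;

// Illustrative helper: compute one utterance with no i-vectors, keeping the
// output on the CPU.  'computer' is assumed to be already constructed.
void ComputeOneUtterance(NnetBatchComputer *computer,
                         const Matrix<BaseFloat> &feats,
                         Matrix<BaseFloat> *output) {
  std::vector<NnetInferenceTask> tasks;
  computer->SplitUtteranceIntoTasks(true /* output_to_cpu */, feats,
                                    NULL /* ivector */,
                                    NULL /* online_ivectors */,
                                    0 /* online_ivector_period */, &tasks);
  for (size_t i = 0; i < tasks.size(); i++)
    computer->AcceptTask(&(tasks[i]));
  // Drain the queue in this thread (a dedicated compute thread would normally
  // do this); assume Compute() returns false when nothing remains to compute.
  while (computer->Compute(true /* allow_partial_minibatch */));
  MergeTaskOutput(tasks, output);
}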
void AddVecToRows(Real alpha, const CuVectorBase< Real > &row, Real beta=1.0)
(for each row r of *this), r = alpha * row + beta * r
Definition: cu-matrix.cc:1261
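For instance, with alpha = -1.0 and beta = 1.0 the call subtracts the same vector from every row. A minimal sketch of that use follows; the helper name and the idea of subtracting log-priors from per-frame log-likelihoods are illustrative, not taken from this file.

#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"

using namespace kaldi;

// For each row r of 'loglikes':  r = -1.0 * log_priors + 1.0 * r,
// i.e. the same vector is subtracted from every frame.
void SubtractLogPriors(const CuVectorBase<BaseFloat> &log_priors,
                       CuMatrixBase<BaseFloat> *loglikes) {
  loglikes->AddVecToRows(-1.0, log_priors, 1.0);
}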
void ComputeSimpleNnetContext(const Nnet &nnet, int32 *left_context, int32 *right_context)
ComputeSimpleNnetContext computes the left-context and right-context of a nnet.
Definition: nnet-utils.cc:146
int32 GetActualMinibatchSize(const ComputationGroupInfo &info) const
void AcceptTask(NnetInferenceTask *task, int32 max_minibatches_full=-1)
Accepts a task, meaning the task will be queued.
void ScaleLattice(const std::vector< std::vector< ScaleFloat > > &scale, MutableFst< ArcTpl< Weight > > *fst)
Scales the pairs of weights in LatticeWeight or CompactLatticeWeight by viewing the pair (a...
void Finished()
The user should call this after the last input has been provided via AcceptInput().
void FormatOutputs(const CuMatrix< BaseFloat > &output, const std::vector< NnetInferenceTask *> &tasks)
struct rnnlm::@11::@12 n
void SynchronizeGpu()
The function SynchronizeGpu(), which for convenience is defined whether or not we have compiled for C...
Definition: cu-device.cc:638
static void DecodeFunc(NnetBatchDecoder *object)
fst::VectorFst< LatticeArc > Lattice
Definition: kaldi-lattice.h:44
void Resize(MatrixIndexT dim, MatrixResizeType t=kSetZero)
Allocate the memory.
Definition: cu-vector.cc:993
#define KALDI_ERR
Definition: kaldi-error.h:147
const TransitionModel & trans_model_
NnetBatchDecoder(const fst::Fst< fst::StdArc > &fst, const LatticeFasterDecoderConfig &decoder_config, const TransitionModel &trans_model, const fst::SymbolTable *word_syms, bool allow_partial, int32 num_threads, NnetBatchComputer *computer)
Constructor.
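A sketch of how this constructor might be wired up; only the constructor signature above is from this page, while the filenames, the thread count, and the pre-existing trans_model and computer objects are assumptions for illustration.

#include <string>
#include "decoder/lattice-faster-decoder.h"
#include "fstext/kaldi-fst-io.h"
#include "hmm/transition-model.h"
#include "nnet3/nnet-batch-compute.h"

using namespace kaldi;
using namespace kaldi::nnet3;

void RunBatchDecoder(const std::string &fst_rxfilename,
                     const std::string &word_syms_filename,
                     const TransitionModel &trans_model,
                     NnetBatchComputer *computer) {
  fst::Fst<fst::StdArc> *decode_fst = fst::ReadFstKaldiGeneric(fst_rxfilename);
  fst::SymbolTable *word_syms = fst::SymbolTable::ReadText(word_syms_filename);
  LatticeFasterDecoderConfig decoder_config;  // Normally filled in from options.
  NnetBatchDecoder decoder(*decode_fst, decoder_config, trans_model, word_syms,
                           true /* allow_partial */, 2 /* num_threads */,
                           computer);
  // Utterances would now be supplied via AcceptInput() and the results
  // collected via the lattice-returning GetOutput() overload listed further
  // down this page (cleanup of decode_fst and word_syms omitted here).
}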
#define KALDI_WARN
Definition: kaldi-error.h:150
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70
std::shared_ptr< const NnetComputation > GetComputation(const ComputationGroupInfo &info, int32 minibatch_size)
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64
void UpdatePriorityOffset(double priority)
CuSubMatrix< Real > RowRange(const MatrixIndexT row_offset, const MatrixIndexT num_rows) const
Definition: cu-matrix.h:660
fst::VectorFst< CompactLatticeArc > CompactLattice
Definition: kaldi-lattice.h:46
void SetPriorities(std::vector< NnetInferenceTask > *tasks)
bool operator<(const Int32Pair &a, const Int32Pair &b)
Definition: cu-matrixdim.h:83
const Real * Data() const
Return data pointer (const).
Definition: cu-matrix.h:746
fst::DeterminizeLatticePhonePrunedOptions det_opts
This is the "normal" lattice-generating decoder.
std::shared_ptr< const NnetComputation > Compile(const ComputationRequest &request)
Does the compilation and returns a const pointer to the result, which is owned by this class...
void AcceptInput(const std::string &utterance_id, const Matrix< BaseFloat > &input, const Vector< BaseFloat > *ivector, const Matrix< BaseFloat > *online_ivectors, int32 online_ivector_period)
The user should call this one by one for the utterances that it needs to compute (interspersed with c...
std::list< UtteranceOutput * > pending_utts_
MatrixIndexT NumCols() const
Definition: cu-matrix.h:216
NnetBatchComputer(const NnetBatchComputerOptions &opts, const Nnet &nnet, const VectorBase< BaseFloat > &priors)
Constructor.
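A sketch of constructing a computer from a standard nnet3 acoustic model; this reflects an assumption about typical usage rather than code from this file, and the use of AmNnetSimple to obtain the raw Nnet and the priors, the test-mode calls, and the model filename are all illustrative.

#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet3/am-nnet-simple.h"
#include "nnet3/nnet-utils.h"
#include "nnet3/nnet-batch-compute.h"

int main(int argc, char *argv[]) {
  using namespace kaldi;
  using namespace kaldi::nnet3;
  NnetBatchComputerOptions opts;               // Normally registered with ParseOptions.
  std::string model_rxfilename = "final.mdl";  // Placeholder path.
  TransitionModel trans_model;
  AmNnetSimple am_nnet;
  {
    bool binary;
    Input ki(model_rxfilename, &binary);
    trans_model.Read(ki.Stream(), binary);
    am_nnet.Read(ki.Stream(), binary);
    // Put the network into 'test mode' for inference.
    SetBatchnormTestMode(true, &(am_nnet.GetNnet()));
    SetDropoutTestMode(true, &(am_nnet.GetNnet()));
  }
  NnetBatchComputer computer(opts, am_nnet.GetNnet(), am_nnet.Priors());
  // ... split utterances into tasks and feed them to 'computer', e.g. as in
  // the SplitUtteranceIntoTasks() sketch earlier on this page ...
  return 0;
}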
double GetPriority(bool allow_partial_minibatch, const ComputationGroupInfo &info) const
SubMatrix< Real > RowRange(const MatrixIndexT row_offset, const MatrixIndexT num_rows) const
Definition: kaldi-matrix.h:209
void AdvanceDecoding(DecodableInterface *decodable, int32 max_num_frames=-1)
This will decode until there are no more frames ready in the decodable object.
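Together with InitDecoding(), ReachedFinal() and GetRawLattice() listed elsewhere on this page, this suggests a per-utterance decoding loop roughly like the sketch below. It is an illustration, not code from this file; the decoder class name LatticeFasterDecoder and the assumption that the decodable object already has all frames available are ours.

#include "decoder/lattice-faster-decoder.h"
#include "itf/decodable-itf.h"
#include "lat/kaldi-lattice.h"

using namespace kaldi;

// Illustrative per-utterance decoding loop; 'decoder' and 'decodable' are
// assumed to have been constructed by the caller.
bool DecodeOneUtterance(LatticeFasterDecoder *decoder,
                        DecodableInterface *decodable,
                        Lattice *raw_lat) {
  decoder->InitDecoding();
  // Decodes until the decodable object has no more frames ready.
  decoder->AdvanceDecoding(decodable);
  if (!decoder->ReachedFinal())
    KALDI_WARN << "No final state was active on the last frame.";
  // Raw state-level lattice, using final-probs if a final state was reached.
  return decoder->GetRawLattice(raw_lat, true /* use_final_probs */);
}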
A class representing a vector.
Definition: kaldi-vector.h:406
class NnetComputer is responsible for executing the computation described in the "computation" object...
Definition: nnet-compute.h:59
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
CachingOptimizingCompiler compiler_
std::vector< IoSpecification > outputs
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
NnetBatchComputerOptions opts_
Real * Data()
Returns a pointer to the start of the vector's data.
Definition: cu-vector.h:72
std::unordered_map< int32, std::condition_variable * > no_more_than_n_minibatches_full_
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156
int32 GetMinibatchSize(const ComputationGroupInfo &info) const
bool Compute(bool allow_partial_minibatch)
Does some kind of computation, choosing the highest-priority thing to compute.
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
This is like DecodableMatrixScaledMapped, but it doesn't support an acoustic scale, and it does support a frame offset, whereby you can state that the first row of 'likes' is actually the n'th row of the matrix of available log-likelihoods.
const NnetBatchComputerOptions & GetOptions()
std::list< UtteranceInfo * > utts_
void ProcessOutputUtterance(UtteranceOutput *output)
This class does neural net inference in a way that is optimized for GPU use: it combines chunks of mu...
void GetOutputDestructive(const std::string &output_name, CuMatrix< BaseFloat > *output)
MatrixIndexT NumRows() const
Dimensions.
Definition: cu-matrix.h:215
Provides a vector abstraction class.
Definition: kaldi-vector.h:41
const LatticeFasterDecoderConfig & decoder_opts_
#define KALDI_LOG
Definition: kaldi-error.h:153
double Elapsed() const
Returns time in seconds.
Definition: timer.h:74
void MergeTaskOutput(const std::vector< NnetInferenceTask > &tasks, Matrix< BaseFloat > *output)
Merges together the 'output_cpu' (if the 'output_to_cpu' members are true) or the 'output' members of...
std::vector< std::thread * > decode_threads_
Sub-matrix representation.
Definition: kaldi-matrix.h:988
bool GetOutput(std::string *utterance_id, Matrix< BaseFloat > *output)
The user should call this to obtain output.
bool GetOutput(std::string *utterance_id, CompactLattice *clat, std::string *sentence)
The user should call this to obtain output (This version should only be called if config...
bool DeterminizeLatticePhonePrunedWrapper(const kaldi::TransitionModel &trans_model, MutableFst< kaldi::LatticeArc > *ifst, double beam, MutableFst< kaldi::CompactLatticeArc > *ofst, DeterminizeLatticePhonePrunedOptions opts)
This function is a wrapper of DeterminizeLatticePhonePruned() that works for Lattice type FSTs...
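A sketch of typical lattice post-processing with this wrapper plus the ScaleLattice()/AcousticLatticeScale() functions listed above. It is an illustration of usage, not code from this file; the helper name, the beam and scale parameters, and the idea of removing the acoustic scale before storing the lattice are assumptions.

#include "fstext/lattice-utils.h"
#include "hmm/transition-model.h"
#include "lat/determinize-lattice-pruned.h"
#include "lat/kaldi-lattice.h"

using namespace kaldi;

// Illustrative helper: determinize a raw state-level lattice into a
// CompactLattice, then undo the acoustic scaling that was applied during
// decoding so that unscaled acoustic scores are stored.
void PostprocessLattice(const TransitionModel &trans_model,
                        const fst::DeterminizeLatticePhonePrunedOptions &det_opts,
                        BaseFloat lattice_beam, BaseFloat acoustic_scale,
                        Lattice *raw_lat, CompactLattice *clat) {
  if (!fst::DeterminizeLatticePhonePrunedWrapper(trans_model, raw_lat,
                                                 lattice_beam, clat, det_opts))
    KALDI_WARN << "Determinization did not complete within the beam.";
  if (acoustic_scale != 0.0)
    fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), clat);
}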
void Wait()
Decrease the counter.
void Resize(MatrixIndexT rows, MatrixIndexT cols, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Allocate the memory.
Definition: cu-matrix.cc:50
static void GetComputationRequest(const NnetInferenceTask &task, int32 minibatch_size, ComputationRequest *request)
MatrixIndexT Dim() const
Dimensions.
Definition: cu-vector.h:69
void Run()
This does either the forward or backward computation, depending on when it is called (in a typical compu...