// NnetBatchComputer::NnetBatchComputer() -- constructor (initializer-list fragments).
    compiler_(nnet_, opts.optimize_config),
    // ...
    num_full_minibatches_(0) {
// NnetBatchComputer::PrintMinibatchStats()
  int32 max_stats_to_print = 10;
  int64 tot_tasks = 0, tot_minibatches = 0;
  double tot_time = 0.0;
  std::ostringstream os;
  struct MinibatchStats {
    // ...
    bool operator < (const MinibatchStats &other) const {
      // Sort from most to least time taken.
      return seconds_taken > other.seconds_taken;
  // ...
  std::vector<MinibatchStats> all_stats;
  os << "Minibatch stats: seconds-taken,"
      "frames-in:frames-out*minibatch-size=num-done(percent-full%) ";
  for (MapType::const_iterator iter = tasks_.begin();
       iter != tasks_.end(); ++iter) {
    for (std::map<int32, MinibatchSizeInfo>::const_iterator
             miter = iter->second.minibatch_info.begin();
         miter != iter->second.minibatch_info.end(); ++miter) {
      // ...
      stats.minibatch_size = miter->first;
      // ...
                     (stats.minibatch_size * stats.num_done));
      all_stats.push_back(stats);
  // ...
  std::sort(all_stats.begin(), all_stats.end());
  os << std::fixed << std::setprecision(2);
  int32 num_stats = all_stats.size();
  for (int32 i = 0; i < std::min<int32>(num_stats, max_stats_to_print); i++) {
    MinibatchStats &stats = all_stats[i];
    os << stats.seconds_taken << ',' << stats.num_frames_in << ':'
       << stats.num_frames_out << '*' << stats.minibatch_size
       << '=' << stats.num_done << '(' << stats.percent_full << "%) ";
  // ...
  if (num_stats > max_stats_to_print)
  // ...
  KALDI_LOG << "Did " << tot_tasks << " tasks in " << tot_minibatches
            << " minibatches, taking " << tot_time << " seconds.";
// NnetBatchComputer::~NnetBatchComputer()
    KALDI_ERR << "Destructor called while object locked.";
  int32 num_pending_tasks = 0;
  for (auto iter = tasks_.begin(); iter != tasks_.end(); ++iter)
    num_pending_tasks += iter->second.tasks.size();
  if (num_pending_tasks > 0)
    KALDI_ERR << "Tasks are pending but object is being destroyed";
  // ...
    std::condition_variable *cond = iter->second;
// NnetBatchComputer::GetHighestPriorityComputation() -- finds the
// highest-priority group of waiting tasks and the computation for it.
    bool allow_partial_minibatch,
    int32 *minibatch_size_out,
    std::vector<NnetInferenceTask*> *tasks) {
  // ...
  std::unique_lock<std::mutex> lock(mutex_);
  MapType::iterator iter = tasks_.begin(), end = tasks_.end(),
  // ...
  double highest_priority = -std::numeric_limits<double>::infinity();
  // ...
  for (; iter != end; ++iter) {
    // ...
    double this_priority = GetPriority(allow_partial_minibatch, info);
    if (this_priority > highest_priority) {
      highest_priority = this_priority;
  // ...
  if (best_iter == tasks_.end()) {
  // ...
  *minibatch_size_out = actual_minibatch_size;
// NnetBatchComputer::GetHighestPriorityTasks() -- moves the highest-priority
// 'num_tasks_needed' tasks from 'info' into 'tasks'.
    int32 num_tasks_needed,
    // ...
    std::vector<NnetInferenceTask*> *tasks) {
  // ...
  if (num_tasks_needed >= num_tasks_present) {
    tasks->swap(info->tasks);
  // ...
    int32 num_tasks_not_needed = num_tasks_present - num_tasks_needed;
    // ...
    std::vector<std::pair<double, NnetInferenceTask*> > pairs(num_tasks_present);
    for (int32 i = 0; i < num_tasks_present; i++) {
      pairs[i].first = info->tasks[i]->priority;
      pairs[i].second = info->tasks[i];
    // ...
    std::nth_element(pairs.begin(), pairs.begin() + num_tasks_not_needed,
                     pairs.end());
    // The lowest-priority tasks go back to 'info'; the rest go to 'tasks'.
    for (int32 i = 0; i < num_tasks_not_needed; i++)
      info->tasks.push_back(pairs[i].second);
    // ...
    for (int32 i = num_tasks_not_needed; i < num_tasks_present; i++)
      tasks->push_back(pairs[i].second);
  // ...
  int32 new_num_tasks_present = info->tasks.size(),
      full_minibatch_reduction =
      (num_tasks_present / minibatch_size) -
      (new_num_tasks_present / minibatch_size);
  for (int32 i = 0; i < full_minibatch_reduction; i++) {
    // ...
    std::unordered_map<int32, std::condition_variable*>::const_iterator
    // ...
      std::condition_variable *cond = iter->second;
// NnetBatchComputer::GetMinibatchSize()
  if (info.tasks.empty()) {
  // ...

// NnetBatchComputer::GetActualMinibatchSize()
  // ...
  return int32(this_minibatch_size);

// NnetBatchComputer::GetComputation()
    int32 minibatch_size) {
// NnetBatchComputer::GetPriority() -- returns the priority of computing a
// minibatch for this group of tasks.
  if (info.tasks.empty())
    return -std::numeric_limits<double>::infinity();
  // ...
  if (!allow_partial_minibatch && num_tasks < this_minibatch_size)
    return -std::numeric_limits<double>::infinity();
  // ...
  double proportion_full = std::min<int32>(num_tasks, this_minibatch_size) /
      double(this_minibatch_size),
      penalty_for_not_full = 10.0 * (proportion_full - 1.0),
      task_priority_sum = 0.0;
  // ...
  if (num_tasks > this_minibatch_size) {
    // ...
    std::vector<double> priorities;
    priorities.resize(num_tasks);
    for (int32 i = 0; i < num_tasks; i++)
      priorities[i] = info.tasks[i]->priority;
    // ...
    std::nth_element(priorities.begin(),
                     priorities.begin() + this_minibatch_size,
                     priorities.end(),
                     std::greater<double>());
    for (int32 i = 0; i < this_minibatch_size; i++)
      task_priority_sum += priorities[i];
    return penalty_for_not_full + task_priority_sum / this_minibatch_size;
  } else {
    for (int32 i = 0; i < num_tasks; i++)
      task_priority_sum += info.tasks[i]->priority;
    return penalty_for_not_full + task_priority_sum / num_tasks;
  }
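// A worked example of the formula above (illustrative numbers): suppose
// this_minibatch_size is 128 and 96 tasks are waiting, each with priority 0.0.
// Then proportion_full = 96/128 = 0.75, penalty_for_not_full =
// 10.0 * (0.75 - 1.0) = -2.5, and the group's priority is -2.5 + 0.0 = -2.5.
// A group with 128 or more waiting tasks incurs no penalty, and its priority
// is the average of its 128 highest task priorities.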
// NnetBatchComputer::GetComputationRequest() -- works out the
// ComputationRequest (input, i-vector and output indexes) for a minibatch of
// tasks with this structure.
    int32 minibatch_size,
    // ...
  request->inputs.reserve(2);
  // ...
  int32 num_input_frames = task.input.NumRows(),
  // ...
  bool has_ivector = (task.ivector.Dim() != 0);
  // ...
  std::vector<Index> input_indexes, ivector_indexes, output_indexes;
  input_indexes.reserve(minibatch_size * num_input_frames);
  output_indexes.reserve(minibatch_size * num_output_frames);
  // ...
    ivector_indexes.reserve(minibatch_size);
  // ...
  for (int32 n = 0; n < minibatch_size; n++) {
    for (int32 t = first_input_t; t < first_input_t + num_input_frames; t++)
      input_indexes.push_back(Index(n, t, 0));
    // ...
      ivector_indexes.push_back(Index(n, 0, 0));
    for (int32 t = 0; t < num_output_frames; t++)
      output_indexes.push_back(Index(n, t * output_t_stride, 0));
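// To make the index layout concrete (illustrative values, not from the source):
// with minibatch_size = 2, num_input_frames = 3, first_input_t = -1,
// num_output_frames = 2 and output_t_stride = 3, the loop above produces
//   input_indexes:   (n=0,t=-1) (0,0) (0,1) (1,-1) (1,0) (1,1)
//   ivector_indexes: (0,0) (1,0)
//   output_indexes:  (0,0) (0,3) (1,0) (1,3)
// i.e. each task in the minibatch gets its own 'n' value, with 't' covering
// that task's input and output frames.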
// NnetBatchComputer::FormatInputs() -- formats the inputs to the computation
// and transfers them to the GPU (if in use).
    int32 minibatch_size,
    const std::vector<NnetInferenceTask*> &tasks,
    // ...
  int32 num_input_frames = tasks[0]->input.NumRows(),
      input_dim = tasks[0]->input.NumCols(),
      ivector_dim = tasks[0]->ivector.Dim(),
      num_tasks = tasks.size();
  KALDI_ASSERT(num_tasks > 0 && num_tasks <= minibatch_size);
  // ...
  input->Resize(minibatch_size * num_input_frames, input_dim,
  // ...
  if (CuDevice::Instantiate().Enabled()) {
    // Batched copy on the GPU: gather per-task pointers, dimensions and
    // strides, and do all the copies in one kernel call.
    std::vector<const BaseFloat*> inputs(num_tasks);
    std::vector<BaseFloat*> outputs(num_tasks);
    std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
    std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
    // ...
    for (int32 n = 0; n < num_tasks; n++) {
      // ...
                        n * num_input_frames, num_input_frames);
      // ...
      num_rows[n] = num_input_frames;
      num_cols[n] = input_dim;
      outputs[n] = output_mat.Data();
      inputs[n] = input_mat.Data();
    // ...
    cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0],
                           &ldi[0], &outputs[0], &ldo[0]);
  // ... (otherwise, when CUDA is not in use:)
    for (int32 n = 0; n < num_tasks; n++) {
      // ...
                     n * num_input_frames, num_input_frames,
  // ...
  if (num_tasks < minibatch_size) {
    // ...
    input->RowRange(num_tasks * num_input_frames,
                    (minibatch_size - num_tasks) * num_input_frames).SetZero();
  // ...
  if (ivector_dim != 0) {
    // ...
    if (CuDevice::Instantiate().Enabled()) {
      // ...
      std::vector<const BaseFloat*> inputs(num_tasks);
      std::vector<BaseFloat*> outputs(num_tasks);
      std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
      std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
      // ...
      for (int32 n = 0; n < num_tasks; n++) {
        // ...
        num_cols[n] = ivector_dim;
        outputs[n] = output_vec.Data();
        inputs[n] = input_vec.Data();
      // ...
      cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0],
                             &ldi[0], &outputs[0], &ldo[0]);
    // ... (otherwise, when CUDA is not in use:)
      for (int32 n = 0; n < num_tasks; n++) {
        ivector->Row(n).CopyFromVec(tasks[n]->ivector);
    // ...
    if (num_tasks < minibatch_size) {
      // ...
      ivector->RowRange(num_tasks, minibatch_size - num_tasks).SetZero();
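// Layout note (a reading of the fragments above): the minibatch 'input' matrix
// has minibatch_size * num_input_frames rows; task n occupies rows
// [n * num_input_frames, (n+1) * num_input_frames), and when fewer than
// minibatch_size tasks are present the trailing rows are zeroed.  The i-vector
// matrix follows the same scheme with one row per task.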
// NnetBatchComputer::FormatOutputs() -- copies the computed minibatch output
// back into the individual tasks.
    const std::vector<NnetInferenceTask*> &tasks) {
  // ...
  int32 num_output_frames = tasks[0]->num_output_frames,
      // ...
      num_tasks = tasks.size();
  bool did_output_to_gpu = false;
  // ...
  if (CuDevice::Instantiate().Enabled()) {
    // ...
    std::vector<const BaseFloat*> inputs(num_tasks);
    std::vector<BaseFloat*> outputs(num_tasks);
    std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
    std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
    // ...
    for (int32 n = 0; n < num_tasks; n++) {
      // ...
            output.RowRange(n * num_output_frames + left_unused, used));
      // ...
        did_output_to_gpu = true;
        task->output.Resize(num_output_frames, output_dim,
        // ...
            n * num_output_frames + left_unused, used);
        // ...
        num_rows[b] = output_mat.NumRows();
        num_cols[b] = output_mat.NumCols();
        outputs[b] = output_mat.Data();
        inputs[b] = input_mat.Data();
        ldo[b] = output_mat.Stride();
        ldi[b] = input_mat.Stride();
    // ...
    cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
                           &outputs[0], &ldo[0]);
  // ... (otherwise, when CUDA is not in use:)
    for (int32 n = 0; n < num_tasks; n++) {
      // ...
            output.RowRange(n * num_output_frames + left_unused, used));
      // ...
        did_output_to_gpu = true;
        task->output.Resize(num_output_frames, output_dim,
        // ...
        task->output.RowRange(left_unused, used).CopyFromMat(
            output.RowRange(n * num_output_frames + left_unused, used));
  // ...
  if (did_output_to_gpu)
// NnetBatchComputer::AcceptTask() -- queues a task; may wait on a condition
// variable if too many full minibatches are already queued.
    int32 max_minibatches_full) {
  std::unique_lock<std::mutex> lock(mutex_);
  // ...
    std::unordered_map<int32, std::condition_variable*>::iterator
    // ...
    std::condition_variable *cond;
    // ...
      cond = new std::condition_variable();
  // ...
  info.tasks.push_back(task);
  // ...
  if (static_cast<int32>(info.tasks.size()) % minibatch_size == 0)
// NnetBatchComputer::Compute() -- does one minibatch of computation, choosing
// the highest-priority group of waiting tasks.
  int32 minibatch_size;
  std::vector<NnetInferenceTask*> tasks;
  // ...
  Nnet *nnet_to_update = NULL;
  // ...
                    nnet_, nnet_to_update);
  // ...
  for (size_t i = 0; i < tasks.size(); i++)
    tasks[i]->semaphore.Signal();
namespace utterance_splitting {

// GetOutputFrameInfoForTasks() -- figures out how many chunks are needed for
// this utterance and sets up the output-frame bookkeeping of each task.
    int32 num_subsampled_frames,
    int32 num_subsampled_frames_per_chunk,
    std::vector<NnetInferenceTask> *tasks) {
  // ...
  int32 fpc = num_subsampled_frames_per_chunk;
  int32 num_tasks = (num_subsampled_frames + fpc - 1) / fpc;
  tasks->resize(num_tasks);
  for (int32 i = 0; i < num_tasks; i++) {
  // ...
  if (num_subsampled_frames <= fpc) {
  // ...
    for (int32 i = 0; i + 1 < num_tasks; i++) {
    // ...
        (num_subsampled_frames - fpc);
    // ...
        num_subsampled_frames - ((num_tasks - 1) * fpc);
  // ...
    KALDI_ASSERT((*tasks)[0].first_used_output_frame_index == 0);
    for (int32 i = 1; i < num_tasks; i++) {
      // ...
          (*tasks)[i-1].first_used_output_frame_index +
          (*tasks)[i-1].num_used_output_frames);
    // ...
    KALDI_ASSERT((*tasks)[num_tasks-1].first_used_output_frame_index +
                 (*tasks)[num_tasks-1].num_used_output_frames ==
                 num_subsampled_frames);
  // ...
  for (int32 i = 0; i < num_tasks; i++) {
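// Worked example of the chunking above (illustrative numbers; this is a
// reading of the fragments rather than a quote of the source): with
// num_subsampled_frames = 250 and fpc = 100, num_tasks = ceil(250/100) = 3.
// The first two chunks contribute output frames [0,100) and [100,200); the
// last chunk is shifted to end at frame 250 (it starts at
// num_subsampled_frames - fpc = 150), and only its final
// 250 - 2*100 = 50 output frames are marked as used, so the used ranges
// exactly tile [0, num_subsampled_frames) as the assertions require.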
// AddOnlineIvectorsToTasks() -- attaches to each task the row of the
// online-ivectors matrix closest to the midpoint of that task's chunk.
    int32 online_ivector_period,
    std::vector<NnetInferenceTask> *tasks) {
  // ...
      num_tasks = tasks->size();
  for (int32 i = 0; i < num_tasks; i++) {
    // ...
        mid_input_t = mid_output_t * f,
        ivector_frame = mid_input_t / online_ivector_period,
        num_ivector_frames = online_ivectors.NumRows(),
        margin_in_frames = 20,
        margin_in_ivector_frames =
        (margin_in_frames + online_ivector_period - 1) / online_ivector_period;
    // ...
    if (ivector_frame >= num_ivector_frames) {
      if (num_ivector_frames > 0 && ivector_frame > num_ivector_frames -
          margin_in_ivector_frames) {
        ivector_frame = num_ivector_frames - 1;
      } else {
        KALDI_ERR << "Could not get iVector for frame " << ivector_frame
                  << ", online-ivectors matrix has " << num_ivector_frames
                  << " rows. Mismatched --online-ivector-period?";
      }
    }
    // ...
    task.ivector = online_ivectors.Row(ivector_frame);
// SplitInputToTasks() -- sets up the 'input', 'first_input_t' and 'is_edge'
// members of the tasks, splitting the utterance's input into chunks with
// left/right context.
    int32 nnet_left_context,
    int32 nnet_right_context,
    // ...
    std::vector<NnetInferenceTask> *tasks) {
  // ...
      num_subsampled_frames = (num_input_frames + f - 1) / f,
      // ...
      num_tasks = tasks->size();
  // ...
  for (int32 i = 0; i < num_tasks; i++) {
    // ...
    int32 begin_input_t = begin_output_t * f,
        end_input_t = end_output_t * f;
    // ...
    bool left_edge = (begin_output_t <= 0),
        right_edge = (end_output_t >= num_subsampled_frames);
    int32 tot_left_context = nnet_left_context +
        // ...
        tot_right_context = nnet_right_context +
        // ...
    int32 begin_input_t_padded = begin_input_t - tot_left_context,
        end_input_t_padded = end_input_t + tot_right_context;
    // ...
    task.first_input_t = begin_input_t_padded - (begin_output_t * f);
    // ...
    task.input.Resize(end_input_t_padded - begin_input_t_padded,
    // ...
    task.input.CopyRangeFromMatClamped(input, begin_input_t_padded,
                                       end_input_t_padded, 0, num_input_frames-1);
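// A small numeric illustration of the padding arithmetic (made-up values):
// with f = 3, begin_output_t = 10, end_output_t = 20, tot_left_context = 29
// and tot_right_context = 32, we get begin_input_t = 30, end_input_t = 60,
// begin_input_t_padded = 1, end_input_t_padded = 92, and
// task.first_input_t = 1 - 30 = -29, so the task's input matrix has 91 rows;
// CopyRangeFromMatClamped clamps any requested rows outside
// [0, num_input_frames-1] to the first/last frame of the utterance.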
// NnetBatchComputer::SplitUtteranceIntoTasks() (host-input overload) -- copies
// the host-memory i-vector / online-ivectors into CuVector/CuMatrix objects
// and calls the CuMatrix-based overload below.
    int32 online_ivector_period,
    std::vector<NnetInferenceTask> *tasks) {
  // ...
  if (h_ivector != NULL) {
    // ...
    ivector = &cu_ivector;
  // ...
  if (h_online_ivectors != NULL) {
    // ...
    cu_online_ivectors.CopyFromMat(*h_online_ivectors);
    online_ivectors = &cu_online_ivectors;
  // ...
                        online_ivectors, online_ivector_period, tasks);
// NnetBatchComputer::SplitUtteranceIntoTasks() (CuMatrix-input overload) --
// checks dimensions, splits the utterance into tasks, and attaches i-vectors.
    int32 online_ivector_period,
    std::vector<NnetInferenceTask> *tasks) {
  using namespace utterance_splitting;
  // ...
    KALDI_ERR << "Input features did not have expected dimension: expected "
  // ...
  int32 ivector_dim = (ivector != NULL ? ivector->Dim() :
                       (online_ivectors != NULL ?
                        online_ivectors->NumCols() : 0));
  // ...
    KALDI_ERR << "Model expects i-vectors but none were supplied";
  // ...
    KALDI_ERR << "You supplied i-vectors but model does not expect them.";
  // ...
    KALDI_ERR << "I-vector dimensions mismatch: model expects "
  // ...
      num_subsampled_frames = (num_input_frames + f - 1) / f,
      // ...
                              num_subsampled_frames_per_chunk,
  // ...
  if (ivector != NULL) {
    // ...
    if (CuDevice::Instantiate().Enabled()) {
      int32_t num_tasks = tasks->size();
      // ...
      std::vector<const BaseFloat*> inputs(num_tasks);
      std::vector<BaseFloat*> outputs(num_tasks);
      std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
      std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
      // ...
      for (size_t i = 0; i < tasks->size(); i++) {
        // ...
        num_cols[b] = output_vec.Dim();
        outputs[b] = output_vec.Data();
        inputs[b] = input_vec.Data();
      // ...
      cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
                             &outputs[0], &ldo[0]);
    // ... (otherwise, when CUDA is not in use:)
      for (size_t i = 0; i < tasks->size(); i++)
        (*tasks)[i].ivector = *ivector;
    // ...
  } else if (online_ivectors != NULL) {
    // ...
                            online_ivector_period, tasks);
  // ...
  for (size_t i = 0; i < tasks->size(); i++) {
    (*tasks)[i].output_to_cpu = output_to_cpu;
    // ...
    (*tasks)[i].priority = 0.0;
// MergeTaskOutput() (Matrix<BaseFloat> version) -- concatenates the used parts
// of the tasks' outputs (from 'output_cpu' or 'output', depending on
// output_to_cpu) into a single utterance-level matrix.
    const std::vector<NnetInferenceTask> &tasks,
    // ...
  int32 num_tasks = tasks.size(),
      num_output_frames = 0,
      // ...
  for (int32 i = 0; i < num_tasks; i++) {
  // ...
  KALDI_ASSERT(num_output_frames != 0 && output_dim != 0);
  int32 cur_output_frame = 0;
  output->Resize(num_output_frames, output_dim);
  for (int32 i = 0; i < num_tasks; i++) {
    // ...
      output->RowRange(cur_output_frame, num_used).CopyFromMat(
    // ...
      output->RowRange(cur_output_frame, num_used).CopyFromMat(
          task.output.RowRange(skip, num_used));
    // ...
    cur_output_frame += num_used;
// MergeTaskOutput() (CuMatrix<BaseFloat> version) -- same as above, but merges
// into a CuMatrix, using a single batched copy kernel when CUDA is in use.
    const std::vector<NnetInferenceTask> &tasks,
    // ...
  int32 num_tasks = tasks.size(),
      num_output_frames = 0,
      // ...
  for (int32 i = 0; i < num_tasks; i++) {
  // ...
  KALDI_ASSERT(num_output_frames != 0 && output_dim != 0);
  int32 cur_output_frame = 0;
  // ...
  if (CuDevice::Instantiate().Enabled()) {
    // ...
    std::vector<const BaseFloat*> inputs(num_tasks);
    std::vector<BaseFloat*> outputs(num_tasks);
    std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
    std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
    // ...
    for (int32 i = 0; i < num_tasks; i++) {
      // ...
        output->RowRange(cur_output_frame, num_used).CopyFromMat(
      // ...
            output->RowRange(cur_output_frame, num_used);
        // ...
            task.output.RowRange(skip, num_used);
        // ...
        num_rows[b] = output_mat.NumRows();
        num_cols[b] = output_mat.NumCols();
        outputs[b] = output_mat.Data();
        inputs[b] = input_mat.Data();
        ldo[b] = output_mat.Stride();
        ldi[b] = input_mat.Stride();
      // ...
      cur_output_frame += num_used;
    // ...
    cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
                           &outputs[0], &ldo[0]);
  // ... (otherwise, when CUDA is not in use:)
    for (int32 i = 0; i < num_tasks; i++) {
      // ...
      output->RowRange(cur_output_frame, num_used).CopyFromMat(
      // ...
      output->RowRange(cur_output_frame, num_used).CopyFromMat(
          task.output.RowRange(skip, num_used));
      // ...
      cur_output_frame += num_used;
// NnetBatchInference::NnetBatchInference() -- constructor (initializer-list
// fragments).
    computer_(opts, nnet, priors),
    is_finished_(false),
    utterance_counter_(0) {

// NnetBatchInference::AcceptInput() -- splits the utterance into tasks, sets
// their priorities, and queues them on the computer.
    const std::string &utterance_id,
    // ...
    int32 online_ivector_period) {
  // ...
  bool output_to_cpu = true;
  // ...
      output_to_cpu, input, ivector, online_ivectors,
      online_ivector_period, &(info->tasks));
  // ...
  int32 max_full_minibatches = 2;
  // ...
  for (size_t i = 0; i < info->tasks.size(); i++) {
    info->tasks[i].priority = priority;
  // ...
  utts_.push_back(info);

// NnetBatchInference::GetOutput()
  // ...
  std::vector<NnetInferenceTask> &tasks = info->tasks;
  int32 num_tasks = tasks.size();

// NnetBatchInference::~NnetBatchInference()
  // ...
    KALDI_ERR << "Object destroyed before Finished() was called.";
  // ...
    KALDI_ERR << "You should get all output before destroying this object.";

// NnetBatchInference::ComputeFunc() -- body of the background compute thread.
  // ...
  bool allow_partial_minibatch = false;
  // ...
      allow_partial_minibatch = true;
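// A minimal usage sketch of NnetBatchInference (the driver loop, variable
// names and reader/writer setup here are illustrative assumptions, not from
// the source; the constructor and the AcceptInput()/Finished()/GetOutput()
// calls follow the class's declared interface):
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "nnet3/nnet-batch-compute.h"

void RunBatchInference(const kaldi::nnet3::NnetBatchComputerOptions &opts,
                       const kaldi::nnet3::Nnet &nnet,
                       const kaldi::Vector<kaldi::BaseFloat> &priors,
                       kaldi::SequentialBaseFloatMatrixReader *feature_reader,
                       kaldi::BaseFloatMatrixWriter *output_writer) {
  using namespace kaldi;
  using namespace kaldi::nnet3;
  NnetBatchInference inference(opts, nnet, priors);
  for (; !feature_reader->Done(); feature_reader->Next()) {
    // No i-vectors in this sketch: pass NULL for both i-vector arguments;
    // the online-ivector period should not matter when no online i-vectors
    // are supplied.
    inference.AcceptInput(feature_reader->Key(), feature_reader->Value(),
                          NULL, NULL, 0);
  }
  inference.Finished();  // No more input will be provided.
  std::string utt;
  Matrix<BaseFloat> output;
  // After Finished(), drain the remaining utterances in order.
  while (inference.GetOutput(&utt, &output))
    output_writer->Write(utt, output);
}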
// NnetBatchDecoder::NnetBatchDecoder() -- constructor; spawns the decoding
// threads.
    const fst::Fst<fst::StdArc> &fst,
    // ...
    const fst::SymbolTable *word_syms,
    // ...
    fst_(fst), decoder_opts_(decoder_opts),
    trans_model_(trans_model), word_syms_(word_syms),
    allow_partial_(allow_partial), computer_(computer),
    is_finished_(false), tasks_finished_(false), priority_offset_(0.0),
    tot_like_(0.0), frame_count_(0), num_success_(0), num_fail_(0),
    // ...
  for (int32 i = 0; i < num_threads; i++)

// NnetBatchDecoder::SetPriorities()
  size_t num_tasks = tasks->size();
  // ...
  for (size_t i = 0; i < num_tasks; i++)
    (*tasks)[i].priority = priority_offset - (double)i;

// NnetBatchDecoder::UpdatePriorityOffset()
  // ...
      new_weight = 1.0 / num_tasks,
      old_weight = 1.0 - new_weight;

// NnetBatchDecoder::AcceptInput()
    const std::string &utterance_id,
    // ...
    int32 online_ivector_period) {
// NnetBatchDecoder::GetOutput() (CompactLattice version) -- used when the
// decoder is determinizing.
    std::string *utterance_id,
    // ...
    std::string *sentence) {
  // ...
    KALDI_ERR << "Don't call this version of GetOutput if you are "
        "not determinizing.";
  // ...
    sentence->swap(this_output->sentence);

// NnetBatchDecoder::GetOutput() (Lattice version) -- used when the decoder is
// not determinizing.
    std::string *utterance_id,
    // ...
    std::string *sentence) {
  // ...
    KALDI_ERR << "Don't call this version of GetOutput if you are "
  // ...
    if (this_output->lat.NumStates() == 0) {
  // ...
    *lat = this_output->lat;
    // ...
    sentence->swap(this_output->sentence);
// NnetBatchDecoder::ComputeFunc() -- body of the background thread that runs
// the neural-net computation.
  // ...
  bool allow_partial_minibatch = true;

// NnetBatchDecoder::DecodeFunc() -- per-thread decoding loop: takes one
// utterance at a time, splits it into tasks, waits for the nnet output, and
// decodes.
  // ...
    std::vector<NnetInferenceTask> tasks;
    std::string utterance_id;
    // ...
      bool output_to_cpu = true;
      // ...
                                        *(input_utterance.input),
    // ...
    for (size_t i = 0; i < tasks.size(); i++)
    // ...
    int32 frame_offset = 0;
    // ...
    for (size_t i = 0; i < tasks.size(); i++) {
      // ...
      frame_offset += post.NumRows();
      // Free the task's output once it has been consumed.
      task.output.Resize(0, 0);
    bool use_final_probs = true;
    // ...
      KALDI_WARN << "Outputting partial output for utterance "
                 << utterance_id << " since no final-state reached\n";
      use_final_probs = false;
    // ...
      KALDI_WARN << "Not producing output for utterance " << utterance_id
                 << " since no final-state reached and "
                 << "--allow-partial=false.\n";
// NnetBatchDecoder::ProcessOutputUtterance() -- turns the raw lattice into the
// final (possibly determinized) output and prints per-utterance diagnostics.
  fst::Connect(&(output->lat));
  if (output->lat.NumStates() == 0) {
    KALDI_WARN << "Unexpected problem getting lattice for utterance "
  // ...
  ShortestPath(output->lat, &best_path);
  std::vector<int32> alignment;
  std::vector<int32> words;
  // ...
  int32 num_frames = alignment.size();
  // ...
  std::ostringstream os;
  for (size_t i = 0; i < words.size(); i++) {
    // ...
      KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
  // ...
  double likelihood = -(weight.Value1() + weight.Value2());
  // ...
             << " is " << (likelihood / num_frames) << " over "
             << num_frames << " frames.";
  // ...
    KALDI_WARN << "Determinization finished earlier than the beam for "
    // ...
    output->lat.DeleteStates();
  // ...
  if (acoustic_scale != 0.0) {
// NnetBatchDecoder::~NnetBatchDecoder()
    KALDI_ERR << "Destroying NnetBatchDecoder object without calling "
        "Finished() and consuming the remaining output";
  // ...

// Final diagnostics (overall likelihood and real-time factor):
  kaldi::int64 input_frame_count =
  // ...
  KALDI_LOG << "Overall likelihood per frame was "
  // ...
            << "s: real-time factor assuming 100 frames/sec is "
            << (num_threads * elapsed * 100.0 /
                std::max<int64>(input_frame_count, 1))
            << " (per thread; with " << num_threads << " threads).";