#include <nnet-example-utils.h>

Collaboration diagram for UtteranceSplitter:

Public Member Functions
	UtteranceSplitter (const ExampleGenerationConfig &config)

const ExampleGenerationConfig &	Config () const

void	GetChunksForUtterance (int32 utterance_length, std::vector< ChunkTimeInfo > *chunk_info)

bool	LengthsMatch (const std::string &utt, int32 utterance_length, int32 supervision_length, int32 length_tolerance=0) const

	~UtteranceSplitter ()

int32	ExitStatus ()

Private Member Functions
void	InitSplitForLength ()

float	DefaultDurationOfSplit (const std::vector< int32 > &split) const

int32	MaxUtteranceLength () const

void	InitSplits (std::vector< std::vector< int32 > > *splits) const

void	GetChunkSizesForUtterance (int32 utterance_length, std::vector< int32 > *chunk_sizes) const

void	GetGapSizes (int32 utterance_length, bool enforce_subsampling_factor, const std::vector< int32 > &chunk_sizes, std::vector< int32 > *gap_sizes) const

void	SetOutputWeights (int32 utterance_length, std::vector< ChunkTimeInfo > *chunk_info) const

void	AccStatsForUtterance (int32 utterance_length, const std::vector< ChunkTimeInfo > &chunk_info)

Static Private Member Functions
static void	DistributeRandomlyUniform (int32 n, std::vector< int32 > *vec)

static void	DistributeRandomly (int32 n, const std::vector< int32 > &magnitudes, std::vector< int32 > *vec)

Private Attributes
const ExampleGenerationConfig &	config_

std::vector< std::vector< std::vector< int32 > > >	splits_for_length_

int32	total_num_utterances_

int64	total_input_frames_

int64	total_frames_overlap_

int64	total_num_chunks_

int64	total_frames_in_chunks_

std::map< int32, int32 >	chunk_size_to_count_

Detailed Description

Definition at line 169 of file nnet-example-utils.h.

Constructor & Destructor Documentation

◆ UtteranceSplitter()

UtteranceSplitter ( const ExampleGenerationConfig & config )

Definition at line 342 of file nnet-example-utils.cc.

References UtteranceSplitter::InitSplitForLength(), KALDI_ERR, ExampleGenerationConfig::num_frames, and ExampleGenerationConfig::num_frames_str.

                                                                          :
     config_(config),
     total_num_utterances_(0), total_input_frames_(0),
     total_frames_overlap_(0), total_num_chunks_(0),
     total_frames_in_chunks_(0) {
   if (config.num_frames_str != "-1") {
     if (config.num_frames.empty()) {
       KALDI_ERR << "You need to call ComputeDerived() on the "
                  "ExampleGenerationConfig().";
     }
    InitSplitForLength();
   }
 }

◆ ~UtteranceSplitter()

~UtteranceSplitter ( )

Definition at line 356 of file nnet-example-utils.cc.

References UtteranceSplitter::chunk_size_to_count_, KALDI_LOG, UtteranceSplitter::total_frames_in_chunks_, UtteranceSplitter::total_frames_overlap_, UtteranceSplitter::total_input_frames_, UtteranceSplitter::total_num_chunks_, and UtteranceSplitter::total_num_utterances_.

                                       {
   KALDI_LOG << "Split " << total_num_utterances_ << " utts, with "
             << "total length " << total_input_frames_ << " frames ("
             << (total_input_frames_ / 360000.0) << " hours assuming "
             << "100 frames per second)";
   float average_chunk_length = total_frames_in_chunks_ * 1.0 / total_num_chunks_,
       overlap_percent = total_frames_overlap_ * 100.0 / total_input_frames_,
       output_percent = total_frames_in_chunks_ * 100.0 / total_input_frames_,
       output_percent_no_overlap = output_percent - overlap_percent;
 
   KALDI_LOG << "Average chunk length was " << average_chunk_length
             << " frames; overlap between adjacent chunks was "
             << overlap_percent << "% of input length; length of output was "
             << output_percent << "% of input length (minus overlap = "
             << output_percent_no_overlap << "%).";
   if (chunk_size_to_count_.size() > 1) {
     std::ostringstream os;
     os << std::setprecision(4);
     for (std::map<int32, int32>::iterator iter = chunk_size_to_count_.begin();
          iter != chunk_size_to_count_.end(); ++iter) {
       int32 chunk_size = iter->first,
           num_frames = chunk_size * iter->second;
       float percent_of_total = num_frames * 100.0 / total_frames_in_chunks_;
       if (iter != chunk_size_to_count_.begin()) os << ", ";
       os << chunk_size << " = " << percent_of_total << "%";
     }
     KALDI_LOG << "Output frames are distributed among chunk-sizes as follows: "
               << os.str();
   }
 
 }

Member Function Documentation

◆ AccStatsForUtterance()

void AccStatsForUtterance	(	int32	utterance_length,
		const std::vector< ChunkTimeInfo > &	chunk_info
	)

private

Definition at line 863 of file nnet-example-utils.cc.

References UtteranceSplitter::chunk_size_to_count_, UtteranceSplitter::total_frames_in_chunks_, UtteranceSplitter::total_frames_overlap_, UtteranceSplitter::total_input_frames_, UtteranceSplitter::total_num_chunks_, and UtteranceSplitter::total_num_utterances_.

Referenced by UtteranceSplitter::GetChunksForUtterance().

                                                 {
   total_num_utterances_ += 1;
   total_input_frames_ += utterance_length;
 
   for (size_t c = 0; c < chunk_info.size(); c++) {
     int32 chunk_size = chunk_info[c].num_frames;
     if (c > 0) {
       int32 last_chunk_end = chunk_info[c-1].first_frame +
           chunk_info[c-1].num_frames;
       if (last_chunk_end > chunk_info[c].first_frame)
         total_frames_overlap_ += last_chunk_end - chunk_info[c].first_frame;
     }
     std::map<int32, int32>::iterator iter = chunk_size_to_count_.find(
         chunk_size);
     if (iter == chunk_size_to_count_.end())
       chunk_size_to_count_[chunk_size] = 1;
     else
       iter->second++;
     total_num_chunks_ += 1;
     total_frames_in_chunks_ += chunk_size;
   }
 }

◆ Config()

const ExampleGenerationConfig& Config ( ) const

inline

Definition at line 175 of file nnet-example-utils.h.

Referenced by kaldi::nnet3::ProcessFile().

175 { return config_; }

kaldi::nnet3::UtteranceSplitter::config_

const ExampleGenerationConfig & config_

Definition: nnet-example-utils.h:282

◆ DefaultDurationOfSplit()

float DefaultDurationOfSplit ( const std::vector< int32 > & split ) const

private

Definition at line 388 of file nnet-example-utils.cc.

References UtteranceSplitter::config_, rnnlm::i, KALDI_ASSERT, ExampleGenerationConfig::num_frames, and ExampleGenerationConfig::num_frames_overlap.

Referenced by UtteranceSplitter::InitSplitForLength(), and UtteranceSplitter::InitSplits().

                                          {
   if (split.empty())  // not a valid split, but useful to handle this case.
     return 0.0;
   float principal_num_frames = config_.num_frames[0],
       num_frames_overlap = config_.num_frames_overlap;
   KALDI_ASSERT(num_frames_overlap < principal_num_frames &&
                "--num-frames-overlap value is too high");
   float overlap_proportion = num_frames_overlap / principal_num_frames;
   float ans = std::accumulate(split.begin(), split.end(), int32(0));
   for (size_t i = 0; i + 1 < split.size(); i++) {
     float min_adjacent_chunk_length = std::min(split[i], split[i + 1]),
         overlap = overlap_proportion * min_adjacent_chunk_length;
     ans -= overlap;
   }
   KALDI_ASSERT(ans > 0.0);
   return ans;
 }

◆ DistributeRandomly()

void DistributeRandomly	(	int32	n,
		const std::vector< int32 > &	magnitudes,
		std::vector< int32 > *	vec
	)

static private

Definition at line 703 of file nnet-example-utils.cc.

References rnnlm::i, KALDI_ASSERT, and rnnlm::n.

Referenced by UtteranceSplitter::GetGapSizes().

                                                                   {
   KALDI_ASSERT(!vec->empty() && vec->size() == magnitudes.size());
   int32 size = vec->size();
   if (n < 0) {
     DistributeRandomly(-n, magnitudes, vec);
     for (int32 i = 0; i < size; i++)
       (*vec)[i] *= -1;
     return;
   }
   float total_magnitude = std::accumulate(magnitudes.begin(), magnitudes.end(),
                                           int32(0));
   KALDI_ASSERT(total_magnitude > 0);
   // note: 'partial_counts' contains the negative of the partial counts, so
   // when we sort the larger partial counts come first.
   std::vector<std::pair<float, int32> > partial_counts;
   int32 total_count = 0;
   for (int32 i = 0; i < size; i++) {
     float this_count = n * float(magnitudes[i]) / total_magnitude;
     // note: cast of float to int32 rounds towards zero (down, in this
     // case, since this_count >= 0).
     int32 this_whole_count = static_cast<int32>(this_count),
         this_partial_count = this_count - this_whole_count;
     (*vec)[i] = this_whole_count;
     total_count += this_whole_count;
     partial_counts.push_back(std::pair<float, int32>(-this_partial_count, i));
   }
   KALDI_ASSERT(total_count <= n && total_count + size >= n);
   std::sort(partial_counts.begin(), partial_counts.end());
   int32 i = 0;
   // Increment by one the elements of the vector that has the largest partial
   // count, then the next largest partial count, and so on... until we reach the
   // desired total-count 'n'.
   for(; total_count < n; i++,total_count++) {
     (*vec)[partial_counts[i].second]++;
   }
   KALDI_ASSERT(std::accumulate(vec->begin(), vec->end(), int32(0)) == n);
 }

◆ DistributeRandomlyUniform()

void DistributeRandomlyUniform	(	int32	n,
		std::vector< int32 > *	vec
	)

static private

Definition at line 679 of file nnet-example-utils.cc.

References rnnlm::i, and KALDI_ASSERT.

Referenced by UtteranceSplitter::GetGapSizes().

                                                                                 {
   KALDI_ASSERT(!vec->empty());
   int32 size = vec->size();
   if (n < 0) {
     DistributeRandomlyUniform(-n, vec);
     for (int32 i = 0; i < size; i++)
       (*vec)[i] *= -1;
     return;
   }
   // from this point we know n >= 0.
   int32 common_part = n / size,
       remainder = n % size, i;
   for (i = 0; i < remainder; i++) {
     (*vec)[i] = common_part + 1;
   }
   for (; i < size; i++) {
     (*vec)[i] = common_part;
   }
   std::random_shuffle(vec->begin(), vec->end());
   KALDI_ASSERT(std::accumulate(vec->begin(), vec->end(), int32(0)) == n);
 }

◆ ExitStatus()

int32 ExitStatus ( )

inline

Definition at line 198 of file nnet-example-utils.h.

References kaldi::AccStatsForUtterance(), and rnnlm::n.

Referenced by main().

198 { return (total_frames_in_chunks_ > 0 ? 0 : 1); }

kaldi::nnet3::UtteranceSplitter::total_frames_in_chunks_

int64 total_frames_in_chunks_

Definition: nnet-example-utils.h:311

◆ GetChunksForUtterance()

void GetChunksForUtterance	(	int32	utterance_length,
		std::vector< ChunkTimeInfo > *	chunk_info
	)

Definition at line 822 of file nnet-example-utils.cc.

References UtteranceSplitter::AccStatsForUtterance(), UtteranceSplitter::config_, ChunkTimeInfo::first_frame, ExampleGenerationConfig::frame_subsampling_factor, UtteranceSplitter::GetChunkSizesForUtterance(), UtteranceSplitter::GetGapSizes(), rnnlm::i, KALDI_ASSERT, ExampleGenerationConfig::left_context, ChunkTimeInfo::left_context, ExampleGenerationConfig::left_context_initial, ChunkTimeInfo::num_frames, ExampleGenerationConfig::num_frames_str, ExampleGenerationConfig::right_context, ChunkTimeInfo::right_context, ExampleGenerationConfig::right_context_final, and UtteranceSplitter::SetOutputWeights().

Referenced by kaldi::nnet3::ProcessFile().

                                           {
   int32 t = 0;
   if (config_.num_frames_str == "-1" ) {
     ChunkTimeInfo *info;
     info = new ChunkTimeInfo;
     info->first_frame = 0;
     info->num_frames = utterance_length;
     info->left_context = (config_.left_context_initial >= 0 ?
                           config_.left_context_initial : config_.left_context);
     info->right_context = (config_.right_context_final >= 0 ?
                            config_.right_context_final : config_.right_context);
     (*chunk_info).push_back(*info);
   } else {
     std::vector<int32> chunk_sizes;
     GetChunkSizesForUtterance(utterance_length, &chunk_sizes);
     std::vector<int32> gaps(chunk_sizes.size());
     GetGapSizes(utterance_length, true, chunk_sizes, &gaps);
     int32 num_chunks = chunk_sizes.size();
     chunk_info->resize(num_chunks);
     for (int32 i = 0; i < num_chunks; i++) {
       t += gaps[i];
       ChunkTimeInfo &info = (*chunk_info)[i];
       info.first_frame = t;
       info.num_frames = chunk_sizes[i];
       info.left_context = (i == 0 && config_.left_context_initial >= 0 ?
                            config_.left_context_initial : config_.left_context);
       info.right_context = (i == num_chunks - 1 && config_.right_context_final >= 0 ?
                             config_.right_context_final : config_.right_context);
       t += chunk_sizes[i];
     }
   }
   SetOutputWeights(utterance_length, chunk_info);
   AccStatsForUtterance(utterance_length, *chunk_info);
   // check that the end of the last chunk doesn't go more than
   // 'config_.frame_subsampling_factor - 1' frames past the end
   // of the utterance.  That amount, we treat as rounding error.
   KALDI_ASSERT(t - utterance_length < config_.frame_subsampling_factor);
 }

◆ GetChunkSizesForUtterance()

void GetChunkSizesForUtterance	(	int32	utterance_length,
		std::vector< int32 > *	chunk_sizes
	)		const

private

Definition at line 580 of file nnet-example-utils.cc.

References UtteranceSplitter::config_, rnnlm::i, KALDI_ASSERT, ExampleGenerationConfig::num_frames, ExampleGenerationConfig::num_frames_overlap, kaldi::RandInt(), and UtteranceSplitter::splits_for_length_.

Referenced by UtteranceSplitter::GetChunksForUtterance().

                                                                  {
   KALDI_ASSERT(!splits_for_length_.empty());
   // 'primary_length' is the first-specified num-frames.
   // It's the only chunk that may be repeated an arbitrary number
   // of times.
   int32 primary_length = config_.num_frames[0],
       num_frames_overlap = config_.num_frames_overlap,
       max_tabulated_length = splits_for_length_.size() - 1,
       num_primary_length_repeats = 0;
   KALDI_ASSERT(primary_length - num_frames_overlap > 0);
   KALDI_ASSERT(utterance_length >= 0);
   while (utterance_length > max_tabulated_length) {
     utterance_length -= (primary_length - num_frames_overlap);
     num_primary_length_repeats++;
   }
   KALDI_ASSERT(utterance_length >= 0);
   const std::vector<std::vector<int32> > &possible_splits =
       splits_for_length_[utterance_length];
   if (possible_splits.empty()) {
     chunk_sizes->clear();
     return;
   }
   int32 num_possible_splits = possible_splits.size(),
       randomly_chosen_split = RandInt(0, num_possible_splits - 1);
   *chunk_sizes = possible_splits[randomly_chosen_split];
   for (int32 i = 0; i < num_primary_length_repeats; i++)
     chunk_sizes->push_back(primary_length);
 
   std::sort(chunk_sizes->begin(), chunk_sizes->end());
   if (RandInt(0, 1) == 0) {
     std::reverse(chunk_sizes->begin(), chunk_sizes->end());
   }
 }

◆ GetGapSizes()

void GetGapSizes	(	int32	utterance_length,
		bool	enforce_subsampling_factor,
		const std::vector< int32 > &	chunk_sizes,
		std::vector< int32 > *	gap_sizes
	)		const

private

Definition at line 744 of file nnet-example-utils.cc.

References UtteranceSplitter::config_, UtteranceSplitter::DistributeRandomly(), UtteranceSplitter::DistributeRandomlyUniform(), ExampleGenerationConfig::frame_subsampling_factor, rnnlm::i, KALDI_ASSERT, and KALDI_ERR.

Referenced by UtteranceSplitter::GetChunksForUtterance().

                                                                        {
   // Computes, for each chunk, the gap (in frames) to leave before it, and
   // writes these to '*gap_sizes' (one entry per chunk).  Negative entries
   // denote an overlap with the preceding chunk.  If 'chunk_sizes' is empty,
   // '*gap_sizes' is cleared.  When 'enforce_subsampling_factor' is true and
   // the frame-subsampling factor exceeds 1, the computation is done on
   // lengths divided by that factor and the result is scaled back up, so all
   // gaps are multiples of the factor.
   if (chunk_sizes.empty()) {
     gap_sizes->clear();
     return;
   }
   if (enforce_subsampling_factor && config_.frame_subsampling_factor > 1) {
     int32 sf = config_.frame_subsampling_factor, size = chunk_sizes.size();
     // Round the utterance length up to a whole number of subsampled frames.
     int32 utterance_length_reduced = (utterance_length + (sf - 1)) / sf;
     std::vector<int32> chunk_sizes_reduced(chunk_sizes);
     for (int32 i = 0; i < size; i++) {
       KALDI_ASSERT(chunk_sizes[i] % config_.frame_subsampling_factor == 0);
       chunk_sizes_reduced[i] /= config_.frame_subsampling_factor;
     }
     // Recurse once with enforcement switched off, in reduced units.
     GetGapSizes(utterance_length_reduced, false,
                 chunk_sizes_reduced, gap_sizes);
     KALDI_ASSERT(gap_sizes->size() == static_cast<size_t>(size));
     for (int32 i = 0; i < size; i++)
       (*gap_sizes)[i] *= config_.frame_subsampling_factor;
     return;
   }
   int32 num_chunks = chunk_sizes.size(),
       total_of_chunk_sizes = std::accumulate(chunk_sizes.begin(),
                                              chunk_sizes.end(),
                                              int32(0)),
       total_gap = utterance_length - total_of_chunk_sizes;
   gap_sizes->resize(num_chunks);
 
   if (total_gap < 0) {
     // there is an overlap.  Overlaps can only go between chunks, not at the
     // beginning or end of the utterance.  Also, we try to make the length of
     // overlap proportional to the size of the smaller of the two chunks
     // that the overlap is between.
     if (num_chunks == 1) {
       // there needs to be an overlap, but there is only one chunk... this means
       // the chunk-size exceeds the utterance length, which is not allowed.
       KALDI_ERR << "Chunk size is " << chunk_sizes[0]
                 << " but utterance length is only "
                 << utterance_length;
     }
 
     // note the elements of 'overlaps' will be <= 0.
     std::vector<int32> magnitudes(num_chunks - 1),
         overlaps(num_chunks - 1);
     // the 'magnitudes' vector will contain the minimum of the lengths of the
     // two adjacent chunks between which we are going to consider having an
     // overlap.  These will be used to assign the overlap proportional to that
     // size.
     for (int32 i = 0; i + 1 < num_chunks; i++) {
       magnitudes[i] = std::min<int32>(chunk_sizes[i], chunk_sizes[i + 1]);
     }
     DistributeRandomly(total_gap, magnitudes, &overlaps);
     for (int32 i = 0; i + 1 < num_chunks; i++) {
       // If the following condition does not hold, it's possible we
       // could get chunk start-times less than zero.  I don't believe
       // it's possible for this condition to fail, but we're checking
       // for it at this level to make debugging easier, just in case.
       // NOTE(review): since the overlaps are <= 0 and the magnitudes are
       // minima of chunk sizes, this comparison looks like it can never
       // fail; presumably '-overlaps[i] <= magnitudes[i]' was intended --
       // confirm before tightening.
       KALDI_ASSERT(overlaps[i] <= magnitudes[i]);
     }
 
     (*gap_sizes)[0] = 0;  // no gap before 1st chunk.
     for (int32 i = 1; i < num_chunks; i++)
       (*gap_sizes)[i] = overlaps[i-1];
   } else {
     // There may be a gap.  Gaps can go at the start or end of the utterance, or
     // between segments.  We try to distribute the gaps evenly.
     std::vector<int32> gaps(num_chunks + 1);
     DistributeRandomlyUniform(total_gap, &gaps);
     // the last element of 'gaps', the one at the end of the utterance, is
     // implicit and doesn't have to be written to the output.
     for (int32 i = 0; i < num_chunks; i++)
       (*gap_sizes)[i] = gaps[i];
   }
 }

◆ InitSplitForLength()

void InitSplitForLength ( )

private

Definition at line 447 of file nnet-example-utils.cc.

References UtteranceSplitter::DefaultDurationOfSplit(), kaldi::GetVerboseLevel(), UtteranceSplitter::InitSplits(), KALDI_ASSERT, KALDI_VLOG, UtteranceSplitter::MaxUtteranceLength(), and UtteranceSplitter::splits_for_length_.

Referenced by UtteranceSplitter::UtteranceSplitter().

                                            {
   int32 max_utterance_length = MaxUtteranceLength();
 
   // The 'splits' vector is a list of possible splits (a split being
   // a sorted vector of chunk-sizes).
   // The vector 'splits' is itself sorted.
   std::vector<std::vector<int32> > splits;
   InitSplits(&splits);
 
 
   // Define a split-index 0 <= s < splits.size() as index into the 'splits'
   // vector, and let a cost c >= 0 represent the mismatch between an
   // utterance length and the total length of the chunk sizes in a split:
 
   //  c(default_duration, utt_length) = (default_duration > utt_length ?
   //                                    default_duration - utt_length :
   //                                    2.0 * (utt_length - default_duration))
   // [but as a special case, set c to infinity if the largest chunk size in the
   //  split is longer than the utterance length; we couldn't, in that case, use
   //  this split for this utterance].
 
   // 'costs_for_length[u][s]', indexed by utterance-length u and then split,
   // contains the cost for utterance-length u and split s.
 
   std::vector<std::vector<float> > costs_for_length(
       max_utterance_length + 1);
   int32 num_splits = splits.size();
 
   for (int32 u = 0; u <= max_utterance_length; u++)
     costs_for_length[u].reserve(num_splits);
 
   for (int32 s = 0; s < num_splits; s++) {
     const std::vector<int32> &split = splits[s];
     float default_duration = DefaultDurationOfSplit(split);
     int32 max_chunk_size = *std::max_element(split.begin(), split.end());
     for (int32 u = 0; u <= max_utterance_length; u++) {
       // c is the cost for this utterance length and this split.  We penalize
       // gaps twice as strongly as overlaps, based on the intuition that
       // completely throwing out frames of data is worse than counting them
       // twice.
       float c = (default_duration > float(u) ? default_duration - float(u) :
                  2.0 * (u - default_duration));
       if (u < max_chunk_size)  // can't fit the largest of the chunks in this
                                // utterance
         c = std::numeric_limits<float>::max();
       KALDI_ASSERT(c >= 0);
       costs_for_length[u].push_back(c);
     }
   }
 
 
   splits_for_length_.resize(max_utterance_length + 1);
 
   for (int32 u = 0; u <= max_utterance_length; u++) {
     const std::vector<float> &costs = costs_for_length[u];
     float min_cost = *std::min_element(costs.begin(), costs.end());
     if (min_cost == std::numeric_limits<float>::max()) {
       // All costs were infinity, becaues this utterance-length u is shorter
       // than the smallest chunk-size.  Leave splits_for_length_[u] as empty
       // for this utterance-length, meaning we will not be able to choose any
       // split, and such utterances will be discarded.
       continue;
     }
     float cost_threshold = 1.9999; // We will choose pseudo-randomly from splits
                                    // that are within this distance from the
                                    // best cost.  Make the threshold just
                                    // slightly less than 2...  this will
                                    // hopefully make the behavior more
                                    // deterministic for ties.
     std::vector<int32> possible_splits;
     std::vector<float>::const_iterator iter = costs.begin(), end = costs.end();
     int32 s = 0;
     for (; iter != end; ++iter,++s)
       if (*iter < min_cost + cost_threshold)
         splits_for_length_[u].push_back(splits[s]);
   }
 
   if (GetVerboseLevel() >= 3) {
     std::ostringstream os;
     for (int32 u = 0; u <= max_utterance_length; u++) {
       if (!splits_for_length_[u].empty()) {
         os << u << "=(";
         std::vector<std::vector<int32 > >::const_iterator
             iter1 = splits_for_length_[u].begin(),
             end1 = splits_for_length_[u].end();
 
         while (iter1 != end1) {
           std::vector<int32>::const_iterator iter2 = iter1->begin(),
               end2 = iter1->end();
           while (iter2 != end2) {
             os << *iter2;
             ++iter2;
             if (iter2 != end2) os << ",";
           }
           ++iter1;
           if (iter1 != end1) os << "/";
         }
         os << ")";
         if (u < max_utterance_length) os << ", ";
       }
     }
     KALDI_VLOG(3) << "Utterance-length-to-splits map is: " << os.str();
   }
 }

◆ InitSplits()

void InitSplits ( std::vector< std::vector< int32 > > * splits ) const

private

Definition at line 631 of file nnet-example-utils.cc.

References UtteranceSplitter::config_, UtteranceSplitter::DefaultDurationOfSplit(), rnnlm::i, rnnlm::j, UtteranceSplitter::MaxUtteranceLength(), rnnlm::n, and ExampleGenerationConfig::num_frames.

Referenced by UtteranceSplitter::InitSplitForLength().

                                                                              {
   // we consider splits whose default duration (as returned by
   // DefaultDurationOfSplit()) is up to MaxUtteranceLength() + primary_length.
   // We can be confident without doing a lot of math, that splits above this
   // length will never be chosen for any utterance-length up to
   // MaxUtteranceLength() (which is the maximum we use).
   int32 primary_length = config_.num_frames[0],
       default_duration_ceiling = MaxUtteranceLength() + primary_length;
 
   typedef unordered_set<std::vector<int32>, VectorHasher<int32> > SetType;
 
   SetType splits_set;
 
   int32 num_lengths = config_.num_frames.size();
 
   // The splits we are allow are: zero to two 'alternate' lengths, plus
   // an arbitrary number of repeats of the 'primary' length.  The repeats
   // of the 'primary' length are handled by the inner loop over n.
   // The zero to two 'alternate' lengths are handled by the loops over
   // i and j.  i == 0 and j == 0 are special cases; they mean, no
   // alternate is chosen.
   for (int32 i = 0; i < num_lengths; i++) {
     for (int32 j = 0; j < num_lengths; j++) {
       std::vector<int32> vec;
       if (i > 0)
         vec.push_back(config_.num_frames[i]);
       if (j > 0)
         vec.push_back(config_.num_frames[j]);
       int32 n = 0;
       while (DefaultDurationOfSplit(vec) <= default_duration_ceiling) {
         if (!vec.empty()) // Don't allow the empty vector as a split.
           splits_set.insert(vec);
         n++;
         vec.push_back(primary_length);
         std::sort(vec.begin(), vec.end());
       }
     }
   }
   for (SetType::const_iterator iter = splits_set.begin();
        iter != splits_set.end(); ++iter)
     splits->push_back(*iter);
   std::sort(splits->begin(), splits->end());  // make the order deterministic,
                                               // for consistency of output
                                               // between runs and C libraries.
 }

◆ LengthsMatch()

bool LengthsMatch	(	const std::string &	utt,
		int32	utterance_length,
		int32	supervision_length,
		int32	length_tolerance = `0`
	)		const

Definition at line 553 of file nnet-example-utils.cc.

References UtteranceSplitter::config_, ExampleGenerationConfig::frame_subsampling_factor, and KALDI_WARN.

Referenced by kaldi::nnet3::ProcessFile().

                                                                    {
   int32 sf = config_.frame_subsampling_factor,
       expected_supervision_length = (utterance_length + sf - 1) / sf;
   if (std::abs(supervision_length - expected_supervision_length) 
       <= length_tolerance) {
     return true;
   } else {
     if (sf == 1) {
       KALDI_WARN << "Supervision does not have expected length for utterance "
                  << utt << ": expected length = " << utterance_length
                  << ", got " << supervision_length;
     } else {
       KALDI_WARN << "Supervision does not have expected length for utterance "
                  << utt << ": expected length = (" << utterance_length
                  << " + " << sf << " - 1) / " << sf << " = "
                  << expected_supervision_length
                  << ", got: " << supervision_length
                  << " (note: --frame-subsampling-factor=" << sf << ")";
     }
     return false;
   }
 }

◆ MaxUtteranceLength()

int32 MaxUtteranceLength ( ) const

private

Definition at line 616 of file nnet-example-utils.cc.

References UtteranceSplitter::config_, rnnlm::i, KALDI_ASSERT, and ExampleGenerationConfig::num_frames.

Referenced by UtteranceSplitter::InitSplitForLength(), and UtteranceSplitter::InitSplits().

                                                   {
   int32 num_lengths = config_.num_frames.size();
   KALDI_ASSERT(num_lengths > 0);
   // 'primary_length' is the first-specified num-frames.
   // It's the only chunk that may be repeated an arbitrary number
   // of times.
   int32 primary_length = config_.num_frames[0],
       max_length = primary_length;
   for (int32 i = 0; i < num_lengths; i++) {
     KALDI_ASSERT(config_.num_frames[i] > 0);
     max_length = std::max(config_.num_frames[i], max_length);
   }
   return 2 * max_length + primary_length;
 }

◆ SetOutputWeights()

void SetOutputWeights	(	int32	utterance_length,
		std::vector< ChunkTimeInfo > *	chunk_info
	)		const

private

Definition at line 889 of file nnet-example-utils.cc.

References UtteranceSplitter::config_, count, ChunkTimeInfo::first_frame, ExampleGenerationConfig::frame_subsampling_factor, rnnlm::i, ChunkTimeInfo::num_frames, and ChunkTimeInfo::output_weights.

Referenced by UtteranceSplitter::GetChunksForUtterance().

                                                 {
   int32 sf = config_.frame_subsampling_factor;
   int32 num_output_frames = (utterance_length + sf - 1) / sf;
   // num_output_frames is the number of frames of supervision.  'count[t]' will
   // be the number of chunks that this output-frame t appears in.  Note: the
   // 'first_frame' and 'num_frames' members of ChunkTimeInfo will always be
   // multiples of frame_subsampling_factor.
   std::vector<int32> count(num_output_frames, 0);
   int32 num_chunks = chunk_info->size();
   for (int32 i = 0; i < num_chunks; i++) {
     ChunkTimeInfo &chunk = (*chunk_info)[i];
     for (int32 t = chunk.first_frame / sf;
          t < (chunk.first_frame + chunk.num_frames) / sf;
          t++)
       count[t]++;
   }
   for (int32 i = 0; i < num_chunks; i++) {
     ChunkTimeInfo &chunk = (*chunk_info)[i];
     chunk.output_weights.resize(chunk.num_frames / sf);
     int32 t_start = chunk.first_frame / sf;
     for (int32 t = t_start;
          t < (chunk.first_frame + chunk.num_frames) / sf;
          t++)
       chunk.output_weights[t - t_start] = 1.0 / count[t];
   }
 }

Member Data Documentation

◆ chunk_size_to_count_

std::map<int32, int32> chunk_size_to_count_

private

Definition at line 314 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::AccStatsForUtterance(), and UtteranceSplitter::~UtteranceSplitter().

◆ config_

const ExampleGenerationConfig& config_

private

Definition at line 282 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::DefaultDurationOfSplit(), UtteranceSplitter::GetChunksForUtterance(), UtteranceSplitter::GetChunkSizesForUtterance(), UtteranceSplitter::GetGapSizes(), UtteranceSplitter::InitSplits(), UtteranceSplitter::LengthsMatch(), UtteranceSplitter::MaxUtteranceLength(), and UtteranceSplitter::SetOutputWeights().

◆ splits_for_length_

std::vector<std::vector<std::vector<int32> > > splits_for_length_

private

Definition at line 302 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::GetChunkSizesForUtterance(), and UtteranceSplitter::InitSplitForLength().

◆ total_frames_in_chunks_

int64 total_frames_in_chunks_

private

Definition at line 311 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::AccStatsForUtterance(), and UtteranceSplitter::~UtteranceSplitter().

◆ total_frames_overlap_

int64 total_frames_overlap_

private

Definition at line 308 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::AccStatsForUtterance(), and UtteranceSplitter::~UtteranceSplitter().

◆ total_input_frames_

int64 total_input_frames_

private

Definition at line 306 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::AccStatsForUtterance(), and UtteranceSplitter::~UtteranceSplitter().

◆ total_num_chunks_

int64 total_num_chunks_

private

Definition at line 310 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::AccStatsForUtterance(), and UtteranceSplitter::~UtteranceSplitter().

◆ total_num_utterances_

int32 total_num_utterances_

private

Definition at line 305 of file nnet-example-utils.h.

Referenced by UtteranceSplitter::AccStatsForUtterance(), and UtteranceSplitter::~UtteranceSplitter().

The documentation for this class was generated from the following files:

nnet3/nnet-example-utils.h
nnet3/nnet-example-utils.cc

Public Member Functions

Private Member Functions

Static Private Member Functions

Private Attributes

Detailed Description

Constructor & Destructor Documentation

◆ UtteranceSplitter()

◆ ~UtteranceSplitter()

Member Function Documentation

◆ AccStatsForUtterance()

◆ Config()

◆ DefaultDurationOfSplit()

◆ DistributeRandomly()

◆ DistributeRandomlyUniform()

◆ ExitStatus()

◆ GetChunksForUtterance()

◆ GetChunkSizesForUtterance()

◆ GetGapSizes()

◆ InitSplitForLength()

◆ InitSplits()

◆ LengthsMatch()

◆ MaxUtteranceLength()

◆ SetOutputWeights()

Member Data Documentation

◆ chunk_size_to_count_

◆ config_

◆ splits_for_length_

◆ total_frames_in_chunks_

◆ total_frames_overlap_

◆ total_input_frames_

◆ total_num_chunks_

◆ total_num_utterances_