DecisionTreeSplitter Class Reference
Collaboration diagram for DecisionTreeSplitter:

Public Member Functions

EventMapGetMap ()
 
BaseFloat BestSplit ()
 
void DoSplit (int32 *next_leaf)
 
 DecisionTreeSplitter (EventAnswerType leaf, const BuildTreeStatsType &stats, const Questions &q_opts)
 
 ~DecisionTreeSplitter ()
 

Private Member Functions

void DoSplitInternal (int32 *next_leaf)
 
void FindBestSplit ()
 

Private Attributes

const Questionsq_opts_
 
BaseFloat best_split_impr_
 
DecisionTreeSplitteryes_
 
DecisionTreeSplitterno_
 
EventAnswerType leaf_
 
BuildTreeStatsType stats_
 
EventKeyType key_
 
std::vector< EventValueTypeyes_set_
 

Detailed Description

Definition at line 429 of file build-tree-utils.cc.

Constructor & Destructor Documentation

◆ DecisionTreeSplitter()

DecisionTreeSplitter ( EventAnswerType  leaf,
const BuildTreeStatsType stats,
const Questions q_opts 
)
inline

Definition at line 447 of file build-tree-utils.cc.

References DecisionTreeSplitter::FindBestSplit().

Referenced by DecisionTreeSplitter::DoSplitInternal(), and kaldi::SplitDecisionTree().

448  : q_opts_(q_opts), yes_(NULL), no_(NULL), leaf_(leaf), stats_(stats) {
449  // not, this must work when stats is empty too. [just gives zero improvement, non-splittable].
450  FindBestSplit();
451  }
DecisionTreeSplitter * yes_
DecisionTreeSplitter * no_

◆ ~DecisionTreeSplitter()

~DecisionTreeSplitter ( )
inline

Definition at line 452 of file build-tree-utils.cc.

References DecisionTreeSplitter::no_, and DecisionTreeSplitter::yes_.

452  {
453  delete yes_;
454  delete no_;
455  }
DecisionTreeSplitter * yes_
DecisionTreeSplitter * no_

Member Function Documentation

◆ BestSplit()

BaseFloat BestSplit ( )
inline

Definition at line 438 of file build-tree-utils.cc.

References DecisionTreeSplitter::best_split_impr_.

Referenced by DecisionTreeSplitter::DoSplit(), DecisionTreeSplitter::DoSplitInternal(), and kaldi::SplitDecisionTree().

438 { return best_split_impr_; } // returns objf improvement (>=0) of best possible split.

◆ DoSplit()

void DoSplit ( int32 next_leaf)
inline

Definition at line 439 of file build-tree-utils.cc.

References DecisionTreeSplitter::best_split_impr_, DecisionTreeSplitter::BestSplit(), DecisionTreeSplitter::DoSplitInternal(), DecisionTreeSplitter::no_, and DecisionTreeSplitter::yes_.

439  {
440  if (!yes_) { // not already split; we are a leaf, so split.
441  DoSplitInternal(next_leaf);
442  } else { // find which of our children is best to split, and split that.
443  (yes_->BestSplit() >= no_->BestSplit() ? yes_ : no_)->DoSplit(next_leaf);
444  best_split_impr_ = std::max(yes_->BestSplit(), no_->BestSplit()); // may have changed.
445  }
446  }
DecisionTreeSplitter * yes_
void DoSplitInternal(int32 *next_leaf)
DecisionTreeSplitter * no_
void DoSplit(int32 *next_leaf)

◆ DoSplitInternal()

void DoSplitInternal ( int32 next_leaf)
inlineprivate

Definition at line 457 of file build-tree-utils.cc.

References kaldi::ApproxEqual(), DecisionTreeSplitter::best_split_impr_, DecisionTreeSplitter::BestSplit(), DecisionTreeSplitter::DecisionTreeSplitter(), Clusterable::Distance(), KALDI_ASSERT, KALDI_ERR, KALDI_WARN, DecisionTreeSplitter::key_, DecisionTreeSplitter::leaf_, EventMap::Lookup(), DecisionTreeSplitter::no_, DecisionTreeSplitter::q_opts_, DecisionTreeSplitter::stats_, kaldi::SumStats(), DecisionTreeSplitter::yes_, and DecisionTreeSplitter::yes_set_.

Referenced by DecisionTreeSplitter::DoSplit().

457  {
458  // Does the split; applicable only to leaf nodes.
459  KALDI_ASSERT(!yes_); // make sure children not already set up.
461  EventAnswerType yes_leaf = leaf_, no_leaf = (*next_leaf)++;
462  leaf_ = -1; // we now have no leaf.
463  // Now split the stats.
464  BuildTreeStatsType yes_stats, no_stats;
465  yes_stats.reserve(stats_.size()); no_stats.reserve(stats_.size()); // probably better than multiple resizings.
466  for (BuildTreeStatsType::const_iterator iter = stats_.begin(); iter != stats_.end(); ++iter) {
467  const EventType &vec = iter->first;
468  EventValueType val;
469  if (!EventMap::Lookup(vec, key_, &val)) KALDI_ERR << "DoSplitInternal: key has no value.";
470  if (std::binary_search(yes_set_.begin(), yes_set_.end(), val)) yes_stats.push_back(*iter);
471  else no_stats.push_back(*iter);
472  }
473 #ifdef KALDI_PARANOID
474  { // Check objf improvement.
475  Clusterable *yes_clust = SumStats(yes_stats), *no_clust = SumStats(no_stats);
476  BaseFloat impr_check = yes_clust->Distance(*no_clust);
477  // this is a negated objf improvement from merging (== objf improvement from splitting).
478  if (!ApproxEqual(impr_check, best_split_impr_, 0.01)) {
479  KALDI_WARN << "DoSplitInternal: possible problem: "<< impr_check << " != " << best_split_impr_;
480  }
481  delete yes_clust; delete no_clust;
482  }
483 #endif
484  yes_ = new DecisionTreeSplitter(yes_leaf, yes_stats, q_opts_);
485  no_ = new DecisionTreeSplitter(no_leaf, no_stats, q_opts_);
486  best_split_impr_ = std::max(yes_->BestSplit(), no_->BestSplit());
487  stats_.clear(); // note: pointers in stats_ were not owned here.
488  }
DecisionTreeSplitter * yes_
Clusterable * SumStats(const BuildTreeStatsType &stats_in)
Sums stats, or returns NULL stats_in has no non-NULL stats.
std::vector< EventValueType > yes_set_
std::vector< std::pair< EventKeyType, EventValueType > > EventType
Definition: event-map.h:58
float BaseFloat
Definition: kaldi-types.h:29
#define KALDI_ERR
Definition: kaldi-error.h:147
#define KALDI_WARN
Definition: kaldi-error.h:150
DecisionTreeSplitter * no_
DecisionTreeSplitter(EventAnswerType leaf, const BuildTreeStatsType &stats, const Questions &q_opts)
static bool Lookup(const EventType &event, EventKeyType key, EventValueType *ans)
Definition: event-map.cc:290
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
int32 EventAnswerType
As far as the event-map code itself is concerned, things of type EventAnswerType may take any value e...
Definition: event-map.h:56
int32 EventValueType
Given current code, things of type EventValueType should generally be nonnegative and in a reasonably...
Definition: event-map.h:51
std::vector< std::pair< EventType, Clusterable * > > BuildTreeStatsType
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
Definition: kaldi-math.h:265

◆ FindBestSplit()

void FindBestSplit ( )
inlineprivate

Definition at line 489 of file build-tree-utils.cc.

References DecisionTreeSplitter::best_split_impr_, kaldi::FindBestSplitForKey(), Questions::GetKeysWithQuestions(), Questions::HasQuestionsForKey(), rnnlm::i, KALDI_WARN, DecisionTreeSplitter::key_, DecisionTreeSplitter::q_opts_, DecisionTreeSplitter::stats_, and DecisionTreeSplitter::yes_set_.

Referenced by DecisionTreeSplitter::DecisionTreeSplitter().

489  {
490  // This sets best_split_impr_, key_ and yes_set_.
491  // May just pick best question, or may iterate a bit (depends on
492  // q_opts; see FindBestSplitForKey for details)
493  std::vector<EventKeyType> all_keys;
494  q_opts_.GetKeysWithQuestions(&all_keys);
495  if (all_keys.size() == 0) {
496  KALDI_WARN << "DecisionTreeSplitter::FindBestSplit(), no keys available to split on (maybe no key covered all of your events, or there was a problem with your questions configuration?)";
497  }
498  best_split_impr_ = 0;
499  for (size_t i = 0; i < all_keys.size(); i++) {
500  if (q_opts_.HasQuestionsForKey(all_keys[i])) {
501  std::vector<EventValueType> temp_yes_set;
502  BaseFloat split_improvement = FindBestSplitForKey(stats_, q_opts_, all_keys[i], &temp_yes_set);
503  if (split_improvement > best_split_impr_) {
504  best_split_impr_ = split_improvement;
505  yes_set_ = temp_yes_set;
506  key_ = all_keys[i];
507  }
508  }
509  }
510  }
void GetKeysWithQuestions(std::vector< EventKeyType > *keys_out) const
BaseFloat FindBestSplitForKey(const BuildTreeStatsType &stats, const Questions &q_opts, EventKeyType key, std::vector< EventValueType > *yes_set_out)
FindBestSplitForKey is a function used in DoDecisionTreeSplit.
std::vector< EventValueType > yes_set_
float BaseFloat
Definition: kaldi-types.h:29
#define KALDI_WARN
Definition: kaldi-error.h:150
bool HasQuestionsForKey(EventKeyType key) const

◆ GetMap()

EventMap* GetMap ( )
inline

Definition at line 431 of file build-tree-utils.cc.

References DecisionTreeSplitter::GetMap(), DecisionTreeSplitter::key_, DecisionTreeSplitter::leaf_, DecisionTreeSplitter::no_, DecisionTreeSplitter::yes_, and DecisionTreeSplitter::yes_set_.

Referenced by DecisionTreeSplitter::GetMap(), and kaldi::SplitDecisionTree().

431  {
432  if (!yes_) { // leaf.
433  return new ConstantEventMap(leaf_);
434  } else {
435  return new SplitEventMap(key_, yes_set_, yes_->GetMap(), no_->GetMap());
436  }
437  }
DecisionTreeSplitter * yes_
std::vector< EventValueType > yes_set_
DecisionTreeSplitter * no_

Member Data Documentation

◆ best_split_impr_

◆ key_

◆ leaf_

◆ no_

◆ q_opts_

const Questions& q_opts_
private

◆ stats_

◆ yes_

◆ yes_set_


The documentation for this class was generated from the following file: