build-tree-utils.h
Go to the documentation of this file.
1 // tree/build-tree-utils.h
2 
3 // Copyright 2009-2011 Microsoft Corporation
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #ifndef KALDI_TREE_BUILD_TREE_UTILS_H_
21 #define KALDI_TREE_BUILD_TREE_UTILS_H_
22 
24 
25 // build-tree-questions.h needed for this typedef:
26 // typedef std::vector<std::pair<EventType, Clusterable*> > BuildTreeStatsType;
27 // and for other #includes.
28 
29 namespace kaldi {
30 
31 
37 
38 
39 
43 
/// Writes a BuildTreeStats object to the stream `os` (binary or text mode as
/// selected by `binary`).  This works even if the Clusterable pointers in
/// `stats` are NULL.
45 void WriteBuildTreeStats(std::ostream &os, bool binary,
46  const BuildTreeStatsType &stats);
47 
/// Reads a BuildTreeStats object from the stream `is` into *stats.
/// NOTE(review): `example` presumably identifies the concrete Clusterable
/// type to be read -- confirm against the implementation.
52 void ReadBuildTreeStats(std::istream &is, bool binary,
53  const Clusterable &example, BuildTreeStatsType *stats);
54 
/// Convenience function.
/// NOTE(review): presumably collects into *ans the values that `key` takes
/// in `stats`; the meaning of the bool return value is not visible from this
/// declaration -- confirm against the implementation.
58 bool PossibleValues(EventKeyType key, const BuildTreeStatsType &stats,
59  std::vector<EventValueType> *ans);
60 
61 
/// Splits stats according to the EventMap `e`, indexing them at output
/// (*stats_out) by the leaf type.
68 void SplitStatsByMap(const BuildTreeStatsType &stats_in, const EventMap &e,
69  std::vector<BuildTreeStatsType> *stats_out);
70 
/// SplitStatsByKey splits stats up according to the value of a particular
/// key; the split stats are written to *stats_out.
/// NOTE(review): the key is apparently required to be present in every
/// event -- confirm against the implementation.
76 void SplitStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key,
77  std::vector<BuildTreeStatsType> *stats_out);
78 
79 
/// Converts stats, in place, from a given context-window (oldN) and
/// central-position (oldP) to a different N and P (newN, newP).
/// NOTE(review): the meaning of the bool return value is not visible from
/// this declaration -- confirm against the implementation.
89 bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP,
90  BuildTreeStatsType *stats);
91 
92 
/// FilterStatsByKey filters the stats according to the value of a specified
/// key.  Whether entries whose value is in `values` are the ones kept or the
/// ones dropped is selected by `include_if_present` (see the inline comments
/// below); the result is written to *stats_out.
99 void FilterStatsByKey(const BuildTreeStatsType &stats_in,
100  EventKeyType key,
101  std::vector<EventValueType> &values,
102  bool include_if_present, // true-> retain only if in "values",
103  // false-> retain only if not in "values".
104  BuildTreeStatsType *stats_out);
105 
106 
/// Sums stats, or returns NULL if stats_in has no non-NULL stats.
109 Clusterable *SumStats(const BuildTreeStatsType &stats_in);
110 
113 
/// Sums the objective function over the stats.
115 BaseFloat SumObjf(const BuildTreeStatsType &stats_in);
116 
117 
/// Sums a vector of stats, writing the results to *stats_out.
/// NOTE(review): presumably one summed Clusterable per element of stats_in
/// -- confirm against the implementation.
121 void SumStatsVec(const std::vector<BuildTreeStatsType> &stats_in, std::vector<Clusterable*> *stats_out);
122 
/// Clusters the stats given the event map `e`, and returns the total
/// objective function given those clusters.
124 BaseFloat ObjfGivenMap(const BuildTreeStatsType &stats_in, const EventMap &e);
125 
126 
/// FindAllKeys puts in *keys the (sorted, unique) list of all key identities
/// in the stats.
/// NOTE(review): `keys_type` presumably controls how keys that are not
/// present in every event are treated -- confirm against AllKeysType.
134 void FindAllKeys(const BuildTreeStatsType &stats, AllKeysType keys_type,
135  std::vector<EventKeyType> *keys);
136 
137 
139 
140 
/// Returns a tree with just one node: a ConstantEventMap, allocated with
/// `new`, whose argument is the current value of *num_leaves.  *num_leaves
/// is post-incremented as part of the call, and is asserted to be zero on
/// entry.
150 inline EventMap *TrivialTree(int32 *num_leaves) {
153  KALDI_ASSERT(*num_leaves == 0); // in envisaged usage.
154  return new ConstantEventMap( (*num_leaves)++ );
155 }
156 
/// DoTableSplit does a complete split on the key `key`.
/// NOTE(review): presumably returns a newly created EventMap and updates
/// *num_leaves -- confirm against the implementation.
161 EventMap *DoTableSplit(const EventMap &orig, EventKeyType key,
162  const BuildTreeStatsType &stats, int32 *num_leaves);
163 
164 
/// DoTableSplitMultiple does a complete split on all the keys, in order from
/// keys[0], keys[1] and so on.
171 EventMap *DoTableSplitMultiple(const EventMap &orig,
172  const std::vector<EventKeyType> &keys,
173  const BuildTreeStatsType &stats,
174  int32 *num_leaves);
175 
176 
180 // The function returns the #leaves we combined. The same leaf-ids of the leaves being clustered
181 // will be used for the clustered leaves (but other than that there is no special rule which
182 // leaf-ids should be used at output).
183 // It outputs the mapping for leaves, in "mapping", which may be empty at the start
184 // but may also contain mappings for other parts of the tree, which must contain
185 // disjoint leaves from this part. This is so that Cluster can
186 // be called multiple times for sub-parts of the tree (with disjoint sets of leaves),
187 // e.g. if we want to avoid sharing across phones. Afterwards you can use Copy function
188 // of EventMap to apply the mapping, i.e. call e_in.Copy(mapping) to get the new map.
189 // Note that the application of Cluster creates gaps in the leaves. You should then
190 // call RenumberEventMap(e_in.Copy(mapping), num_leaves).
191 // *If you only want to cluster a subset of the leaves (e.g. just non-silence, or just
192 // a particular phone), do this by providing a set of "stats" that corresponds to just
193 // this subset of leaves*. Leaves with no stats will not be clustered.
194 // See build-tree.cc for an example of usage.
/// Clusters the leaves of the EventMap `e_in`; `thresh` is a delta-likelihood
/// threshold.  The leaf mapping is output in *mapping.
195 int ClusterEventMapGetMapping(const EventMap &e_in, const BuildTreeStatsType &stats,
196  BaseFloat thresh, std::vector<EventMap*> *mapping);
197 
/// This is as ClusterEventMapGetMapping, but with a more convenient interface
/// that exposes less of the internals.
/// NOTE(review): *num_removed presumably receives the number of leaves
/// removed by clustering -- confirm against the implementation.
202 EventMap *ClusterEventMap(const EventMap &e_in, const BuildTreeStatsType &stats,
203  BaseFloat thresh, int32 *num_removed);
204 
/// This is as ClusterEventMap, but first splits the stats on the keys
/// specified in `keys`.
209 EventMap *ClusterEventMapRestrictedByKeys(const EventMap &e_in,
210  const BuildTreeStatsType &stats,
211  BaseFloat thresh,
212  const std::vector<EventKeyType> &keys,
213  int32 *num_removed);
214 
215 
/// A restricted variant of ClusterEventMap in which `e_restrict` limits what
/// the clustering is allowed to combine.
/// NOTE(review): the exact restriction rule is not visible from this
/// declaration -- confirm against the implementation.
219 EventMap *ClusterEventMapRestrictedByMap(const EventMap &e_in,
220  const BuildTreeStatsType &stats,
221  BaseFloat thresh,
222  const EventMap &e_restrict,
223  int32 *num_removed);
224 
225 
/// This version of ClusterEventMapRestrictedByMap clusters to get a specific
/// number of clusters, as specified by `num_clusters`.
228 EventMap *ClusterEventMapToNClustersRestrictedByMap(
229  const EventMap &e_in,
230  const BuildTreeStatsType &stats,
231  int32 num_clusters,
232  const EventMap &e_restrict,
233  int32 *num_removed);
234 
/// RenumberEventMap renumbers the leaves of an EventMap.  It is intended to
/// be used after calling ClusterEventMap, since clustering creates gaps in
/// the leaf numbering.
/// NOTE(review): *num_leaves presumably receives the number of leaves after
/// renumbering -- confirm against the implementation.
239 EventMap *RenumberEventMap(const EventMap &e_in, int32 *num_leaves);
240 
/// This function remaps the event-map leaves using `mapping`, which is
/// indexed by the number at the leaf.
243 EventMap *MapEventMapLeaves(const EventMap &e_in,
244  const std::vector<int32> &mapping);
245 
246 
247 
/// ShareEventMapLeaves performs a quite specific function used when
/// generating trees.
/// NOTE(review): judging by the name and parameters, leaves are shared
/// within each set of values of `key` listed in `values` -- confirm against
/// the implementation.
257 EventMap *ShareEventMapLeaves(const EventMap &e_in, EventKeyType key,
258  std::vector<std::vector<EventValueType> > &values,
259  int32 *num_leaves);
260 
261 
262 
/// Does a decision-tree split at the leaves of an EventMap.
/// NOTE(review): `thresh` is presumably the minimum objective-function
/// improvement required for a split, and the two BaseFloat pointers are
/// presumably outputs -- confirm against the implementation.
282 EventMap *SplitDecisionTree(const EventMap &orig,
283  const BuildTreeStatsType &stats,
284  Questions &qcfg,
285  BaseFloat thresh,
286  int32 max_leaves, // max_leaves<=0 -> no maximum.
287  int32 *num_leaves,
288  BaseFloat *objf_impr_out,
289  BaseFloat *smallest_split_change_out);
290 
/// CreateRandomQuestions initializes a Questions object (*cfg_out) randomly,
/// in a reasonable way; intended for testing purposes.
294 void CreateRandomQuestions(const BuildTreeStatsType &stats, int32 num_quest, Questions *cfg_out);
295 
296 
/// FindBestSplitForKey is a function used in DoDecisionTreeSplit.
/// NOTE(review): presumably returns the objective-function improvement of
/// the best split on `key` and writes the chosen "yes" value set to *yes_set
/// -- confirm against the implementation.
300 BaseFloat FindBestSplitForKey(const BuildTreeStatsType &stats,
301  const Questions &qcfg,
302  EventKeyType key,
303  std::vector<EventValueType> *yes_set);
304 
305 
320 
/// GetStubMap is used in tree-building functions to get the initial map,
/// before the decision-tree stage.
/// NOTE(review): `P` is presumably the central position of the context
/// window -- confirm against the implementation.
321 EventMap *GetStubMap(int32 P,
322  const std::vector<std::vector<int32> > &phone_sets,
323  const std::vector<int32> &phone2num_pdf_classes,
324  const std::vector<bool> &share_roots, // indexed by index into phone_sets.
325  int32 *num_leaves);
327 
329 
330 
331 }// end namespace kaldi
332 
333 #endif
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
BaseFloat SumNormalizer(const BuildTreeStatsType &stats_in)
Sums the normalizer [typically, data-count] over the stats.
EventMap * DoTableSplitMultiple(const EventMap &orig, const std::vector< EventKeyType > &keys, const BuildTreeStatsType &stats, int32 *num_leaves)
DoTableSplitMultiple does a complete split on all the keys, in order from keys[0], keys[1] and so on.
bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP, BuildTreeStatsType *stats)
Converts stats from a given context-window (N) and central-position (P) to a different N and P...
EventMap * SplitDecisionTree(const EventMap &input_map, const BuildTreeStatsType &stats, Questions &q_opts, BaseFloat thresh, int32 max_leaves, int32 *num_leaves, BaseFloat *obj_impr_out, BaseFloat *smallest_split_change_out)
Does a decision-tree split at the leaves of an EventMap.
Clusterable * SumStats(const BuildTreeStatsType &stats_in)
Sums stats, or returns NULL if stats_in has no non-NULL stats.
void SplitStatsByMap(const BuildTreeStatsType &stats, const EventMap &e, std::vector< BuildTreeStatsType > *stats_out)
Splits stats according to the EventMap, indexing them at output by the leaf type. ...
EventMap * ShareEventMapLeaves(const EventMap &e_in, EventKeyType key, std::vector< std::vector< EventValueType > > &values, int32 *num_leaves)
ShareEventMapLeaves performs a quite specific function that allows us to generate trees where...
void FindAllKeys(const BuildTreeStatsType &stats, AllKeysType keys_type, std::vector< EventKeyType > *keys_out)
FindAllKeys puts in *keys the (sorted, unique) list of all key identities in the stats.
kaldi::int32 int32
BaseFloat FindBestSplitForKey(const BuildTreeStatsType &stats, const Questions &q_opts, EventKeyType key, std::vector< EventValueType > *yes_set_out)
FindBestSplitForKey is a function used in DoDecisionTreeSplit.
void SplitStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector< BuildTreeStatsType > *stats_out)
SplitStatsByKey splits stats up according to the value of a particular key, which must be always defi...
bool PossibleValues(EventKeyType key, const BuildTreeStatsType &stats, std::vector< EventValueType > *ans)
Convenience function e.g.
void DeleteBuildTreeStats(BuildTreeStatsType *stats)
This frees the Clusterable* pointers in "stats", where non-NULL, and sets them to NULL...
void ReadBuildTreeStats(std::istream &is, bool binary, const Clusterable &example, BuildTreeStatsType *stats)
Reads BuildTreeStats object.
AllKeysType
Typedef used when we get "all keys" from a set of stats– used in specifying which kinds of questions...
float BaseFloat
Definition: kaldi-types.h:29
int32 EventKeyType
Things of type EventKeyType can take any value.
Definition: event-map.h:45
void SumStatsVec(const std::vector< BuildTreeStatsType > &stats_in, std::vector< Clusterable *> *stats_out)
Sum a vector of stats.
int ClusterEventMapGetMapping(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, std::vector< EventMap *> *mapping)
"ClusterEventMapGetMapping" clusters the leaves of the EventMap, with "thresh" a delta-likelihood thr...
EventMap * ClusterEventMapRestrictedByKeys(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, const std::vector< EventKeyType > &keys, int32 *num_removed)
This is as ClusterEventMap, but first splits the stats on the keys specified in "keys" (e...
EventMap * RenumberEventMap(const EventMap &e_in, int32 *num_leaves)
RenumberEventMap [intended to be used after calling ClusterEventMap] renumbers an EventMap so its lea...
EventMap * MapEventMapLeaves(const EventMap &e_in, const std::vector< int32 > &mapping_in)
This function remaps the event-map leaves using this mapping, indexed by the number at leaf...
BaseFloat ObjfGivenMap(const BuildTreeStatsType &stats_in, const EventMap &e)
Cluster the stats given the event map, and return the total objf given those clusters. ...
EventMap * GetStubMap(int32 P, const std::vector< std::vector< int32 > > &phone_sets, const std::vector< int32 > &phone2num_pdf_classes, const std::vector< bool > &share_roots, int32 *num_leaves_out)
GetStubMap is used in tree-building functions to get the initial to-states map, before the decision-t...
EventMap * ClusterEventMapToNClustersRestrictedByMap(const EventMap &e_in, const BuildTreeStatsType &stats, int32 num_clusters_required, const EventMap &e_restrict, int32 *num_removed_ptr)
This version of ClusterEventMapRestrictedByMap clusters to get a specific number of clusters as speci...
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
BaseFloat SumObjf(const BuildTreeStatsType &stats_in)
Sums the objective function over the stats.
EventMap * ClusterEventMap(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, int32 *num_removed_ptr)
This is as ClusterEventMapGetMapping but a more convenient interface that exposes less of the interna...
void FilterStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key, std::vector< EventValueType > &values, bool include_if_present, BuildTreeStatsType *stats_out)
FilterStatsByKey filters the stats according to the value of a specified key.
void CreateRandomQuestions(const BuildTreeStatsType &stats, int32 num_quest, Questions *cfg_out)
CreateRandomQuestions will initialize a Questions randomly, in a reasonable way [for testing purposes...
std::vector< std::pair< EventType, Clusterable * > > BuildTreeStatsType
EventMap * ClusterEventMapRestrictedByMap(const EventMap &e_in, const BuildTreeStatsType &stats, BaseFloat thresh, const EventMap &e_restrict, int32 *num_removed_ptr)
This version of ClusterEventMapRestricted restricts the clustering to only allow things that "e_restr...
EventMap * TrivialTree(int32 *num_leaves)
Returns a tree with just one node.
EventMap * DoTableSplit(const EventMap &orig, EventKeyType key, const BuildTreeStatsType &stats, int32 *num_leaves)
DoTableSplit does a complete split on this key (e.g.
void WriteBuildTreeStats(std::ostream &os, bool binary, const BuildTreeStatsType &stats)
Writes BuildTreeStats object. This works even if pointers are NULL.