mikolov-rnnlm-lib.h
// lm/mikolov-rnnlm-lib.h

// Copyright 2015   Guoguo Chen   Hainan Xu
//           2010-2012   Tomas Mikolov

// See ../../COPYING for clarification regarding multiple authors
//
// This file is based on version 0.3e of the RNNLM language modeling
// toolkit by Tomas Mikolov. Changes made by authors other than
// Tomas Mikolov are licensed under the Apache License, the short form
// of which is below. The original code by Tomas Mikolov is licensed
// under the BSD 3-clause license, whose text is further below.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
//
// Original BSD 3-clause license text:
// Copyright (c) 2010-2012 Tomas Mikolov
//
// All rights reserved. Redistribution and use in source and binary forms, with
// or without modification, are permitted provided that the following conditions
// are met: 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following
// disclaimer. 2. Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the
// distribution. 3. Neither name of copyright holders nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission. THIS SOFTWARE IS PROVIDED
// BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
// OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef KALDI_LM_MIKOLOV_RNNLM_LIB_H_
#define KALDI_LM_MIKOLOV_RNNLM_LIB_H_

#include <string>
#include <vector>
#include "util/stl-utils.h"

namespace rnnlm {

#define MAX_STRING 100
#define MAX_FILENAME_STRING 300

typedef double real;      // doubles for NN weights
typedef double direct_t;  // doubles for ME weights

struct neuron {
  real ac;  // actual value stored in neuron
  real er;  // error value in neuron, used by learning algorithm
};

struct synapse {
  real weight;  // weight of synapse
};
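// An illustrative sketch added for this documentation, not part of the
// original header: neuron::ac holds a unit's activation and synapse::weight
// one entry of a row-major weight matrix, and CRnnLM::matrixXvector()
// (declared below) combines them roughly as in this helper when propagating a
// source layer to one destination unit.  The helper's name and the exact
// layout are assumptions for illustration only.
inline real ExampleWeightedSum(const struct neuron *src, int src_size,
                               const struct synapse *weights,
                               int matrix_width, int dest_index) {
  real sum = 0;
  // Accumulate source activations weighted by one row of the matrix;
  // matrix_width is the size of the source layer.
  for (int i = 0; i < src_size; i++)
    sum += src[i].ac * weights[i + dest_index * matrix_width].weight;
  return sum;  // computeNet() then applies a logistic sigmoid to obtain ac
}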

struct vocab_word {
  int cn;
  char word[MAX_STRING];

  real prob;
  int class_index;
};

const unsigned int PRIMES[] = {108641969, 116049371, 125925907, 133333309,
    145678979, 175308587, 197530793, 234567803, 251851741, 264197411,
    330864029, 399999781,
    407407183, 459258997, 479012069, 545678687, 560493491, 607407037, 629629243,
    656789717, 716048933, 718518067, 725925469, 733332871, 753085943, 755555077,
    782715551, 790122953, 812345159, 814814293, 893826581, 923456189, 940740127,
    953085797, 985184539, 990122807};
const unsigned int PRIMES_SIZE = sizeof(PRIMES) / sizeof(PRIMES[0]);

const int MAX_NGRAM_ORDER = 20;
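// An illustrative sketch added for this documentation, not part of the
// original header: PRIMES, PRIMES_SIZE and MAX_NGRAM_ORDER are used to hash an
// n-gram history of word ids into indices of the direct-connection
// (maximum-entropy) weight array syn_d declared below.  The real hashing is in
// mikolov-rnnlm-lib.cc; the helper's name and exact formula here are
// assumptions that only sketch the idea.
inline unsigned long long ExampleDirectHash(const int *history, int order,
                                            long long direct_size) {
  unsigned long long hash =
      static_cast<unsigned long long>(PRIMES[0]) * PRIMES[1];
  for (int b = 1; b <= order; b++) {
    // Each history position contributes a prime-weighted term; the +1 keeps
    // word id 0 from vanishing.  order is at most MAX_NGRAM_ORDER.
    hash += PRIMES[(order * PRIMES[b] + b) % PRIMES_SIZE] *
            (static_cast<unsigned long long>(history[b - 1]) + 1);
  }
  return hash % direct_size;  // fold into the range of valid syn_d indices
}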

enum FileTypeEnum {TEXT, BINARY, COMPRESSED};  // COMPRESSED not yet implemented

class CRnnLM {
 protected:
  char train_file[MAX_FILENAME_STRING];
  char valid_file[MAX_FILENAME_STRING];
  char test_file[MAX_FILENAME_STRING];
  char rnnlm_file[MAX_FILENAME_STRING];
  char lmprob_file[MAX_FILENAME_STRING];

  int version;
  int filetype;

  real dynamic;

  real alpha;
  double logp, llogp;
  int iter;
  int counter;

  int anti_k;

  real beta;

  int **class_words;
  int *class_cn;

  struct vocab_word *vocab;
  void sortVocab();

  long long direct_size;
  int history[MAX_NGRAM_ORDER];

  int bptt;

  int gen;

  struct neuron *neu0;  // neurons in input layer
  struct neuron *neu1;  // neurons in hidden layer
  struct neuron *neuc;  // neurons in compression layer
  struct neuron *neu2;  // neurons in output layer

  struct synapse *syn0;  // weights between input and hidden layer
  struct synapse *syn1;  // weights between hidden and output layer
                         // (or hidden and compression if compression > 0)
  struct synapse *sync;  // weights between hidden and compression layer
  direct_t *syn_d;       // direct parameters between input and output layer
                         // (similar to Maximum Entropy model parameters)

  // backup used in training:
  struct neuron *neu0b;
  struct neuron *neu1b;
  struct neuron *neucb;
  struct neuron *neu2b;

  struct synapse *syn0b;
  struct synapse *syn1b;
  struct synapse *syncb;
  direct_t *syn_db;

  // backup used in n-best rescoring:
  struct neuron *neu1b2;

  unordered_map<std::string, float> unk_penalty;
  std::string unk_sym;

 public:
  int alpha_set, train_file_set;

  CRnnLM();

  ~CRnnLM();

  real random(real min, real max);

  void setRnnLMFile(const std::string &str);
  int getHiddenLayerSize() const { return layer1_size; }
  void setRandSeed(int newSeed);

  int getWordHash(const char *word);
  void readWord(char *word, FILE *fin);
  int searchVocab(const char *word);

  void saveWeights();  // saves current weights and unit activations
  void initNet();
  void goToDelimiter(int delim, FILE *fi);
  void restoreNet();
  void netReset();  // will erase just hidden layer state + bptt history
                    // + maxent history (called at end of sentences in
                    // the independent mode)

  void computeNet(int last_word, int word);
  void copyHiddenLayerToInput();

  void matrixXvector(struct neuron *dest, struct neuron *srcvec,
                     struct synapse *srcmatrix, int matrix_width,
                     int from, int to, int from2, int to2, int type);

  void restoreContextFromVector(const std::vector<float> &context_in);
  void saveContextToVector(std::vector<float> *context_out);

  float computeConditionalLogprob(
      std::string current_word,
      const std::vector<std::string> &history_words,
      const std::vector<float> &context_in,
      std::vector<float> *context_out);

  void setUnkSym(const std::string &unk);
  void setUnkPenalty(const std::string &filename);
  float getUnkPenalty(const std::string &word);
  bool isUnk(const std::string &word);
};

}  // namespace rnnlm

#endif  // KALDI_LM_MIKOLOV_RNNLM_LIB_H_
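A minimal usage sketch (not taken from the Kaldi sources): given the public
interface declared above, n-best rescoring code might drive the model roughly
as follows. The model-file name, the word sequence, and the surrounding main()
are assumptions for illustration only.

#include <iostream>
#include <string>
#include <vector>
#include "lm/mikolov-rnnlm-lib.h"

int main() {
  rnnlm::CRnnLM model;
  model.setRnnLMFile("rnnlm.model");    // hypothetical model file
  model.setRandSeed(1);
  model.restoreNet();                   // load weights and vocabulary
  model.setUnkSym("<unk>");

  // Score "the cat" word by word, threading the hidden-layer context along.
  std::vector<std::string> words = {"the", "cat"};
  std::vector<std::string> history;
  std::vector<float> context;
  model.saveContextToVector(&context);  // capture the initial hidden state
  float logprob = 0.0;
  for (size_t i = 0; i < words.size(); i++) {
    std::vector<float> new_context;
    logprob += model.computeConditionalLogprob(words[i], history,
                                               context, &new_context);
    history.push_back(words[i]);
    context = new_context;
  }
  std::cout << "log P(the cat) = " << logprob << std::endl;
  return 0;
}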