cu-math.cc
1 // cudamatrix/cu-math.cc
2 
3 // Copyright 2009-2012 Karel Vesely
4 // Johns Hopkins University (author: Daniel Povey)
5 
6 // See ../../COPYING for clarification regarding multiple authors
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 //
12 // http://www.apache.org/licenses/LICENSE-2.0
13 //
14 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 // MERCHANTABILITY OR NON-INFRINGEMENT.
18 // See the Apache 2 License for the specific language governing permissions and
19 // limitations under the License.
20 
21 #include "base/timer.h"
22 #include "cudamatrix/cu-common.h"
23 #include "cudamatrix/cu-matrix.h"
24 #include "cudamatrix/cu-device.h"
25 #include "cudamatrix/cu-kernels.h"
26 
27 namespace kaldi {
28 
29 namespace cu {
30 
31 /*
32  * templated functions wrapping the ANSI-C CUDA kernel functions
33  */
34 
35 
36 template<typename Real>
37 void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
38  KALDI_ASSERT(SameDim(*weight, *grad));
39 #if HAVE_CUDA == 1
40  if (CuDevice::Instantiate().Enabled()) {
41  CuTimer tim;
42 
43  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
44  dim3 dimGrid(n_blocks(weight->NumCols(), CU2DBLOCK), n_blocks(weight->NumRows(), CU2DBLOCK));
45 
46  cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
47  weight->Dim(), grad->Stride());
48  CU_SAFE_CALL(cudaGetLastError());
49 
50  CuDevice::Instantiate().AccuProfile(__func__, tim);
51  } else
52  #endif
53  {
54  MatrixBase<Real> &weight2 = weight->Mat();
55  MatrixBase<Real> &grad2 = grad->Mat();
56  for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
57  for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {
58 
59  if(weight2(r,c)==0.0) continue; // skip L1 if zero weight!
60 
61  Real l1_signed = l1;
62  if (weight2(r, c) < 0.0)
63  l1_signed = -l1;
64 
65  Real before = weight2(r, c);
66  Real after = weight2(r, c) - lr*grad2(r, c) - l1_signed;
67  if ((after > 0.0) ^ (before > 0.0)) {
68  weight2(r, c) = 0.0;
69  grad2(r, c) = 0.0;
70  } else {
71  weight2(r, c) -= l1_signed;
72  }
73  }
74  }
75  }
76 }
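
// Example usage (an illustrative sketch; the dimensions and the l1/lr values
// are arbitrary, and 'weights'/'grads' would normally come from a training
// loop). Note that, as the CPU branch above shows, this call applies only the
// signed L1 shrinkage and zeroes any weight (and its gradient) whose combined
// update would cross zero; the -lr*grad term is used only for that sign test.
//
//   CuMatrix<BaseFloat> weights(512, 1024), grads(512, 1024);
//   weights.SetRandn();
//   grads.SetRandn();
//   cu::RegularizeL1(&weights, &grads, BaseFloat(1e-6), BaseFloat(1e-3));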
77 
78 
79 template<typename Real>
80 void Randomize(const CuMatrixBase<Real> &src,
81  const CuArray<int32> &copy_from_idx,
82  CuMatrixBase<Real> *tgt) {
83 
84  KALDI_ASSERT(src.NumCols() == tgt->NumCols());
85  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
86  KALDI_ASSERT(copy_from_idx.Dim() <= tgt->NumRows());
87 
88  #if HAVE_CUDA == 1
89  if (CuDevice::Instantiate().Enabled()) {
90  CuTimer tim;
91 
92  /*
93  Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
94  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
95  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
96  */
97 
98  /*
99  * Let's use blocksize 4 x 128 (512 threads/block)
100  * and extend the randomizable matrices to: col 4*65535, row 128*65535
101  * (i.e. max-cols: 262140 (dim), max-rows: 8388480 (datapoints))
102  */
103  dim3 dimBlock(4, 128);
104  dim3 dimGrid(n_blocks(tgt->NumCols(), 4), n_blocks(copy_from_idx.Dim(), 128));
105  /*
106  */
107 
108  MatrixDim dimsrc = src.Dim(); dimsrc.rows=copy_from_idx.Dim();
109  MatrixDim dimtgt = tgt->Dim(); dimtgt.rows=copy_from_idx.Dim();
110 
111  cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
112  copy_from_idx.Data(), dimtgt, dimsrc);
113  CU_SAFE_CALL(cudaGetLastError());
114 
115  CuDevice::Instantiate().AccuProfile(__func__, tim);
116  } else
117  #endif
118  {
119  // randomize in CPU
120  const MatrixBase<Real> &srcmat = src.Mat();
121  const int32 *copy_from_idxvec = copy_from_idx.Data();
122  MatrixBase<Real> &tgtmat = tgt->Mat();
123  for(int32 i=0; i<copy_from_idx.Dim(); i++) {
124  tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
125  }
126  }
127 }
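
// Example usage (an illustrative sketch): shuffle the rows of a feature
// matrix 'feats' using a random permutation held in a CuArray<int32>; any
// way of producing the permutation works, std::random_shuffle is just one
// option.
//
//   int32 n = feats.NumRows();
//   std::vector<int32> perm(n);
//   for (int32 i = 0; i < n; i++) perm[i] = i;
//   std::random_shuffle(perm.begin(), perm.end());
//   CuArray<int32> cu_perm(perm);
//   CuMatrix<BaseFloat> shuffled(n, feats.NumCols());
//   cu::Randomize(feats, cu_perm, &shuffled);  // row i of 'shuffled' is row perm[i] of 'feats'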
128 
129 
130 
131 template<typename Real>
132 void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
133  CuMatrixBase<Real> *tgt) {
134 
135  KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
136  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
137 
138  #if HAVE_CUDA == 1
139  if (CuDevice::Instantiate().Enabled()) {
140  CuTimer tim;
141 
142  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
143  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
144 
145  cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
146  frame_offsets.Data(), tgt->Dim(), src.Dim());
147  CU_SAFE_CALL(cudaGetLastError());
148 
149  CuDevice::Instantiate().AccuProfile(__func__, tim);
150  } else
151  #endif
152  {
153  // expand in CPU
154  const MatrixBase<Real> &srcmat = src.Mat();
155  const int32 *frame_offsetvec = frame_offsets.Data();
156  int32 dim = frame_offsets.Dim();
157  MatrixBase<Real> &tgtmat = tgt->Mat();
158  //
159  for(int32 r=0; r < tgtmat.NumRows(); r++) {
160  for(int32 off=0; off < dim; off++) {
161  int32 r_off = r + frame_offsetvec[off];
162  if(r_off < 0) r_off = 0;
163  if(r_off >= srcmat.NumRows()) r_off = srcmat.NumRows()-1;
164  memcpy(tgtmat.RowData(r)+off*srcmat.NumCols(),srcmat.RowData(r_off),sizeof(Real)*srcmat.NumCols());
165  }
166  }
167  }
168 }
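
// Example usage (an illustrative sketch): splice each frame of 'feats'
// together with its +/-2 neighbours, giving 5 * feats.NumCols() output
// columns per row; offsets that run past the ends of the matrix are clamped
// to the first/last row, as in the CPU branch above.
//
//   std::vector<int32> offsets;
//   for (int32 o = -2; o <= 2; o++) offsets.push_back(o);  // 5 offsets
//   CuArray<int32> cu_offsets(offsets);
//   CuMatrix<BaseFloat> spliced(feats.NumRows(), feats.NumCols() * 5);
//   cu::Splice(feats, cu_offsets, &spliced);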
169 
170 
171 
172 template<typename Real>
173 void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
174  CuMatrixBase<Real> *tgt) {
175 
176  KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
177  KALDI_ASSERT(src.NumRows() == tgt->NumRows());
178 
179  #if HAVE_CUDA == 1
180  if (CuDevice::Instantiate().Enabled()) {
181  CuTimer tim;
182 
183  dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
184  dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
185 
186  cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
187  copy_from_indices.Data(), tgt->Dim(), src.Dim());
188  CU_SAFE_CALL(cudaGetLastError());
189 
190  CuDevice::Instantiate().AccuProfile(__func__, tim);
191  } else
192  #endif
193  {
194  // expand in CPU
195  const MatrixBase<Real> &srcmat = src.Mat();
196  const int32 *copy_from_indicesvec = copy_from_indices.Data();
197  int32 dim = copy_from_indices.Dim();
198  MatrixBase<Real> &tgtmat = tgt->Mat();
199  //
200  for(int32 r = 0; r < tgtmat.NumRows(); r++) {
201  for(int32 c = 0; c < dim; c++) {
202  tgtmat(r,c) = srcmat(r,copy_from_indicesvec[c]);
203  }
204  }
205  }
206 }
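
// Example usage (an illustrative sketch): select and reorder columns of
// 'src'; here the column order is simply reversed.
//
//   std::vector<int32> reorder;
//   for (int32 c = src.NumCols() - 1; c >= 0; c--) reorder.push_back(c);
//   CuArray<int32> cu_reorder(reorder);
//   CuMatrix<BaseFloat> reordered(src.NumRows(), src.NumCols());
//   cu::Copy(src, cu_reorder, &reordered);  // reordered(r, c) is src(r, reorder[c])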
207 
208 template <typename Real>
209 void EnsureNonzero(const CuMatrixBase<Real> &src,
210  Real epsilon,
211  CuMatrixBase<Real> *dest) {
212  KALDI_ASSERT(SameDim(*dest, src) && epsilon > 0.0);
213 #if HAVE_CUDA == 1
214  if (CuDevice::Instantiate().Enabled()) {
215  CuTimer tim;
216  dim3 dimGrid, dimBlock;
217  GetBlockSizesForSimpleMatrixOperation(src.NumRows(), src.NumCols(),
218  &dimGrid, &dimBlock);
219  cuda_ensure_nonzero(dimGrid, dimBlock, src.Data(), src.Dim(),
220  epsilon, dest->Stride(), dest->Data());
221  CU_SAFE_CALL(cudaGetLastError());
222  CuDevice::Instantiate().AccuProfile(__func__, tim);
223  } else
224 #endif
225  {
226  int32 num_rows = src.NumRows(), num_cols = src.NumCols();
227  for (int32 r = 0; r < num_rows; r++) {
228  const Real *src_data = src.RowData(r);
229  Real *dest_data = dest->RowData(r);
230  for (int32 c = 0; c < num_cols; c++) {
231  Real x = src_data[c], y;
232  if (x <= -epsilon || x >= epsilon) y = x;
233  else if (x >= 0.0) y = epsilon;
234  else y = -epsilon;
235  dest_data[c] = y;
236  }
237  }
238  }
239 }
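
// Example usage (an illustrative sketch): push every element of 'm' away
// from zero by at least epsilon = 0.001; values in [0, epsilon) become
// +epsilon and values in (-epsilon, 0) become -epsilon.
//
//   CuMatrix<BaseFloat> m(10, 20), floored(10, 20);
//   m.SetRandn();
//   cu::EnsureNonzero(m, BaseFloat(0.001), &floored);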
240 
241 
242 // instantiate the templates.
243 template
244 void RegularizeL1(CuMatrixBase<float> *weight, CuMatrixBase<float> *grad, float l1, float lr);
245 template
246 void RegularizeL1(CuMatrixBase<double> *weight, CuMatrixBase<double> *grad, double l1, double lr);
247 
248 template
249 void Splice(const CuMatrixBase<float> &src, const CuArray<int32> &frame_offsets,
250  CuMatrixBase<float> *tgt);
251 template
252 void Splice(const CuMatrixBase<double> &src, const CuArray<int32> &frame_offsets,
253  CuMatrixBase<double> *tgt);
254 template
255 void Copy(const CuMatrixBase<float> &src, const CuArray<int32> &copy_from_indices,
256  CuMatrixBase<float> *tgt);
257 template
258 void Copy(const CuMatrixBase<double> &src, const CuArray<int32> &copy_from_indices,
259  CuMatrixBase<double> *tgt);
260 
261 template
262 void Randomize(const CuMatrixBase<float> &src,
263  const CuArray<int32> &copy_from_idx,
264  CuMatrixBase<float> *tgt);
265 template
266 void Randomize(const CuMatrixBase<double> &src,
267  const CuArray<int32> &copy_from_idx,
268  CuMatrixBase<double> *tgt);
269 
270 // The output y_i = scale * x_i,
271 // and we want the RMS value of the y_i to equal target_rms,
272 // so y^t y = D * target_rms^2 (if y is one row of the input).
273 // we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)).
274 // there is also flooring involved, to avoid division-by-zero
275 // problems. It's important for the backprop that the floor's
276 // square root is exactly representable as float.
277 // If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D)))
278 // is an extra dimension of the output.
279 template<typename Real>
280 void NormalizePerRow(const CuMatrixBase<Real>& in, const Real target_rms,
281  const bool add_log_stddev, CuMatrixBase<Real>* out) {
282  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
283  if (add_log_stddev) {
284  KALDI_ASSERT(in.NumRows() == out->NumRows());
285  KALDI_ASSERT(in.NumCols() + 1 == out->NumCols());
286  } else {
287  KALDI_ASSERT(SameDim(in, *out));
288  }
289 
290 #if HAVE_CUDA == 1
291  if (CuDevice::Instantiate().Enabled()) {
292  CuTimer tim;
293  size_t dimBlock = CU1DBLOCK;
294  size_t dimGrid = out->NumRows();
295  cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(),
296  in.Data(), in.Dim(), target_rms, add_log_stddev);
297  CU_SAFE_CALL(cudaGetLastError());
298  CuDevice::Instantiate().AccuProfile(__func__, tim);
299  } else
300 #endif
301  {
302  CuSubMatrix<Real> out_no_log(*out, 0, out->NumRows(), 0, in.NumCols());
303  if (in.Data() != out_no_log.Data())
304  out_no_log.CopyFromMat(in);
305  CuVector<Real> in_norm(in.NumRows());
306  Real d_scaled = in.NumCols() * target_rms * target_rms;
307  in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
308  in_norm.ApplyFloor(kSquaredNormFloor);
309  in_norm.ApplyPow(-0.5);
310  out_no_log.MulRowsVec(in_norm);
311  if (add_log_stddev) {
312  in_norm.ApplyLog();
313  in_norm.Scale(-1.0);
314  in_norm.Add(log(target_rms));
315  out->CopyColFromVec(in_norm, in.NumCols());
316  }
317  }
318 }
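
// Example usage (an illustrative sketch): scale each row of 'in' so that its
// root-mean-square value becomes target_rms = 1.0; with add_log_stddev = true
// the output needs one extra column, which receives log(sqrt(x^T x / D))
// (subject to the flooring above).
//
//   int32 rows = 50, dim = 100;
//   CuMatrix<BaseFloat> in(rows, dim), out(rows, dim + 1);
//   in.SetRandn();
//   cu::NormalizePerRow(in, BaseFloat(1.0), true, &out);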
319 
320 template
321 void NormalizePerRow(const CuMatrixBase<float>& in, const float target_rms,
322  const bool add_log_stddev, CuMatrixBase<float>* out);
323 template
324 void NormalizePerRow(const CuMatrixBase<double>& in, const double target_rms,
325  const bool add_log_stddev, CuMatrixBase<double>* out);
326 
327 
328 // A note on the derivative of NormalizeComponent...
329 // let both row_in and row_out be vectors of dimension D.
330 // Let p = row_in^T row_in / (D * target_rms^2), and let
331 // f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as:
332 // row_out = f row_in.
333 // Suppose we have a quantity deriv_out which is the derivative
334 // of the objective function w.r.t. row_out. We want to compute
335 // deriv_in which is the derivative of the objective function w.r.t.
336 // row_in. Let the objective function be F. One term is obvious: we have
337 // deriv_in = f deriv_out + ....
338 // next we have to take into account the derivative that gets back-propagated
339 // through f. Obviously, dF/df = deriv_out^T row_in.
340 // And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3),
341 // and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued].
342 // So this term in dF/d(row_in) equals:
343 // dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in
344 // So
345 // deriv_in = f deriv_out + (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -f^3 / (D * target_rms^2)) (deriv_out^T row_in) row_in
346 // If add_log_stddev_ is true, deriv_in has an extra term from the log-stddev
347 // output y = log(sqrt(x^T x / D)): dF/dx_i += dF/dy * dy/dx_i = dF/dy * x_i / (x^T x).
348 template<typename Real>
349 void DiffNormalizePerRow(const CuMatrixBase<Real> &in_value,
350  const CuMatrixBase<Real> &out_deriv,
351  const Real target_rms, const bool add_log_stddev,
352  CuMatrixBase<Real>* in_deriv) {
353  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
354 #if HAVE_CUDA == 1
355  if (CuDevice::Instantiate().Enabled()) {
356  CuTimer tim;
357  size_t dimBlock = CU1DBLOCK;
358  size_t dimGrid = in_deriv->NumRows();
359  cuda_diff_normalize_per_row(dimGrid, dimBlock, in_deriv->Data(),
360  in_deriv->Stride(), in_value.Data(),
361  in_value.Dim(), out_deriv.Data(),
362  out_deriv.Stride(), target_rms, add_log_stddev);
363  CU_SAFE_CALL(cudaGetLastError());
364  CuDevice::Instantiate().AccuProfile(__func__, tim);
365  } else
366 #endif
367  {
368  const CuSubMatrix<Real> out_deriv_no_log(out_deriv, 0, out_deriv.NumRows(),
369  0, in_value.NumCols());
370  CuVector<Real> dot_products(out_deriv.NumRows());
371  dot_products.AddDiagMatMat(1.0, out_deriv_no_log, kNoTrans, in_value,
372  kTrans, 0.0);
373  CuVector<Real> in_norm(in_value.NumRows());
374  Real d_scaled = (in_value.NumCols() * target_rms * target_rms);
375  in_norm.AddDiagMat2(1.0, in_value, kNoTrans, 0.0);
376 
377  if (add_log_stddev) {
378  CuVector<Real> log_stddev_deriv(in_norm), // log_stddev deriv as dF/dy .* (x^T x)^-1
379  out_deriv_for_stddev(out_deriv.NumRows(), kUndefined);
380  // f = log(sqrt(max(epsi, x^T x / D)))
381  // df/dx = epsi^2 * D < x^T x ? (1/(x^T x)) * x : 0.
382  // we don't compute this exactly below for the case when x^T x is very
383  // small, but we do make sure that the deriv isn't infinity when the input
384  // is zero.
385  log_stddev_deriv.ApplyFloor(in_value.NumCols() * kSquaredNormFloor);
386  log_stddev_deriv.ApplyPow(-1.0);
387  out_deriv_for_stddev.CopyColFromMat(out_deriv, (out_deriv.NumCols() - 1));
388  log_stddev_deriv.MulElements(out_deriv_for_stddev);
389  if (in_deriv)
390  in_deriv->AddDiagVecMat(1.0, log_stddev_deriv, in_value, kNoTrans, 1.0);
391  }
392  in_norm.Scale(1.0 / d_scaled);
393  in_norm.ApplyFloor(kSquaredNormFloor);
394  in_norm.ApplyPow(-0.5);
395  if (in_deriv) {
396  if (in_deriv->Data() != out_deriv_no_log.Data())
397  in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0);
398  else
399  in_deriv->MulRowsVec(in_norm);
400  in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0);
401  in_norm.ApplyPow(3.0);
402  dot_products.MulElements(in_norm);
403 
404  in_deriv->AddDiagVecMat(-1.0 / d_scaled, dot_products, in_value, kNoTrans,
405  1.0);
406  }
407  }
408 }
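
// Example usage (an illustrative sketch of the backward pass matching the
// NormalizePerRow call sketched above, with add_log_stddev = true). Note
// that the result is added into 'in_deriv', so it is zeroed first here.
//
//   CuMatrix<BaseFloat> in_value(rows, dim), out_deriv(rows, dim + 1),
//       in_deriv(rows, dim);
//   in_value.SetRandn();
//   out_deriv.SetRandn();
//   in_deriv.SetZero();
//   cu::DiffNormalizePerRow(in_value, out_deriv, BaseFloat(1.0), true,
//                           &in_deriv);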
409 
410 template
411 void DiffNormalizePerRow(const CuMatrixBase<float> &in_value,
412  const CuMatrixBase<float> &out_deriv,
413  const float target_rms, const bool add_log_stddev,
414  CuMatrixBase<float>* in_deriv);
415 template
416 void DiffNormalizePerRow(const CuMatrixBase<double> &in_value,
417  const CuMatrixBase<double> &out_deriv,
418  const double target_rms, const bool add_log_stddev,
419  CuMatrixBase<double>* in_deriv);
420 
421 
422 // not calling this Sigmoid to reduce the chance of future collisions.
423 template<typename Real>
424 static inline Real ScalarSigmoid(Real a) {
425  if (a > Real(0)) {
426  return Real(1) / (Real(1) + Exp(-a));
427  } else {
428  Real x = Exp(a);
429  return x / (x + Real(1));
430  }
431 }
432 
433 template<typename Real>
434 static inline Real ScalarTanh(Real a) {
435  if (a > Real(0)) {
436  Real inv_expa = Exp(-a);
437  return -Real(1) + Real(2) / (Real(1) + inv_expa * inv_expa);
438  } else {
439  Real expa = Exp(a);
440  return Real(1) - Real(2) / (Real(1) + expa * expa);
441  }
442 }
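
// Note on the branches above: Exp() is only ever applied to a non-positive
// argument, so it can underflow towards zero but never overflow to +inf.
// For instance, with Real = float, ScalarSigmoid(-100.0f) returns roughly
// 3.7e-44, whereas the single-branch formula 1/(1 + Exp(100.0f)) would
// overflow the Exp() call and return exactly 0.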
443 
444 template<typename Real>
445 void CpuComputeLstmNonlinearity(const MatrixBase<Real> &input_mat,
446  const MatrixBase<Real> &params_mat,
447  MatrixBase<Real> *output) {
448  int32 num_rows = input_mat.NumRows(),
449  input_cols = input_mat.NumCols(),
450  cell_dim = input_cols / 5;
451  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
452  KALDI_ASSERT(output->NumRows() == num_rows);
453  KALDI_ASSERT(params_mat.NumRows() == 3);
454  KALDI_ASSERT(params_mat.NumCols() == cell_dim);
455  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
456 
457  MatrixBase<Real> &output_mat = *output;
458  const Real *params_data = params_mat.Data();
459  int32 params_stride = params_mat.Stride();
460  for (int32 r = 0; r < num_rows; r++) {
461  const Real *input_row = input_mat.RowData(r);
462  // i_scale, f_scale and o_scale relate to dropout; they will normally be 1.0.
463  Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]),
464  f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]),
465  o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]);
466 
467  Real *output_row = output_mat.RowData(r);
468  for (int32 c = 0; c < cell_dim; c++) {
469  Real i_part = input_row[c];
470  Real f_part = input_row[c + cell_dim];
471  Real c_part = input_row[c + 2 * cell_dim];
472  Real o_part = input_row[c + 3 * cell_dim];
473  Real c_prev = input_row[c + 4 * cell_dim];
474  Real w_ic = params_data[c];
475  Real w_fc = params_data[c + params_stride];
476  Real w_oc = params_data[c + params_stride * 2];
477  Real i_t = ScalarSigmoid(i_part + w_ic * c_prev);
478  Real f_t = ScalarSigmoid(f_part + w_fc * c_prev);
479  Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part);
480  Real o_t = ScalarSigmoid(o_part + w_oc * c_t);
481  Real m_t = o_t * o_scale * ScalarTanh(c_t);
482  output_row[c] = c_t;
483  output_row[c + cell_dim] = m_t;
484  }
485  }
486 }
487 
488 template<typename Real>
489 void ComputeLstmNonlinearity(const CuMatrixBase<Real> &input,
490  const CuMatrixBase<Real> &params,
491  CuMatrixBase<Real> *output) {
492  int32 num_rows = input.NumRows(),
493  input_cols = input.NumCols(),
494  cell_dim = input_cols / 5;
495  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
496  KALDI_ASSERT(output->NumRows() == num_rows);
497  KALDI_ASSERT(params.NumRows() == 3);
498  KALDI_ASSERT(params.NumCols() == cell_dim);
499  KALDI_ASSERT(output->NumCols() == 2 * cell_dim);
500 
501 #if HAVE_CUDA == 1
502  if (CuDevice::Instantiate().Enabled()) {
503  CuTimer tim;
504 
505  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
506 
507  // Each thread block is working on 1 row of the data.
508  // It's best that cell dim is a multiple of CU1DBLOCK
509  dim3 dimBlock(CU1DBLOCK);
510  dim3 dimGrid(num_rows);
511 
512  cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(),
513  params.Data(), params.Stride(), output->Stride(),
514  cell_dim, have_dropout_mask, num_rows, output->Data());
515  CU_SAFE_CALL(cudaGetLastError());
516 
517  CuDevice::Instantiate().AccuProfile(__func__, tim);
518  } else
519 #endif
520  {
521  CpuComputeLstmNonlinearity(input.Mat(), params.Mat(), &output->Mat());
522  }
523 }
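
// Example usage (an illustrative sketch; num_rows and cell_dim are
// arbitrary). Each input row holds [ i_part, f_part, c_part, o_part, c_prev ],
// each block of width cell_dim (optionally followed by 3 dropout-scale
// columns); each output row holds [ c_t, m_t ]; 'params' holds the peephole
// weights w_ic, w_fc, w_oc, one row per gate.
//
//   int32 num_rows = 32, cell_dim = 256;
//   CuMatrix<BaseFloat> input(num_rows, 5 * cell_dim),
//       params(3, cell_dim), output(num_rows, 2 * cell_dim);
//   input.SetRandn();
//   params.SetRandn();
//   cu::ComputeLstmNonlinearity(input, params, &output);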
524 
525 template
526 void CpuComputeLstmNonlinearity(const MatrixBase<float> &input_mat,
527  const MatrixBase<float> &params_mat,
528  MatrixBase<float> *output);
529 template
530 void CpuComputeLstmNonlinearity(const MatrixBase<double> &input_mat,
531  const MatrixBase<double> &params_mat,
532  MatrixBase<double> *output);
533 template
534 void ComputeLstmNonlinearity(const CuMatrixBase<float> &input,
535  const CuMatrixBase<float> &params,
536  CuMatrixBase<float> *output);
537 template
538 void ComputeLstmNonlinearity(const CuMatrixBase<double> &input,
539  const CuMatrixBase<double> &params,
540  CuMatrixBase<double> *output);
541 
542 template<typename Real>
543 void CpuBackpropLstmNonlinearity(const MatrixBase<Real> &input,
544  const MatrixBase<Real> &params,
545  const MatrixBase<Real> &output_deriv,
546  const MatrixBase<double> &deriv_sum_in,
547  const VectorBase<Real> &self_repair_config,
548  double count_in,
549  MatrixBase<Real> *input_deriv,
550  MatrixBase<Real> *params_deriv,
551  MatrixBase<double> *value_sum_out,
552  MatrixBase<double> *deriv_sum_out,
553  MatrixBase<Real> *self_repair_sum_out) {
554  int32 num_rows = input.NumRows(),
555  input_cols = input
556  .NumCols(),
557  cell_dim = input.NumCols() / 5;
558  // Check dimensions.
559  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3);
560  KALDI_ASSERT(params.NumRows() == 3);
561  KALDI_ASSERT(params.NumCols() == cell_dim);
562  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
563  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
564  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
565  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
566  KALDI_ASSERT(self_repair_config.Dim() == 10);
567  if (input_deriv != NULL) {
568  KALDI_ASSERT(SameDim(input, *input_deriv));
569  }
570  if (params_deriv == NULL) {
571  KALDI_ASSERT(value_sum_out == NULL);
572  KALDI_ASSERT(deriv_sum_out == NULL);
573  KALDI_ASSERT(self_repair_sum_out == NULL);
574  } else {
575  KALDI_ASSERT(value_sum_out != NULL);
576  KALDI_ASSERT(deriv_sum_out != NULL);
577  KALDI_ASSERT(self_repair_sum_out != NULL);
578  KALDI_ASSERT(SameDim(params, *params_deriv));
579  KALDI_ASSERT(value_sum_out->NumRows() == 5);
580  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
581  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
582  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
583  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
584  }
585 
586  const MatrixBase<Real> &input_mat = input;
587  const MatrixBase<Real> &params_mat = params;
588  const MatrixBase<Real> &output_deriv_mat = output_deriv;
589  const MatrixBase<double> &deriv_sum_in_mat = deriv_sum_in;
590  const VectorBase<Real> &sr_config = self_repair_config;
591  MatrixBase<Real> *input_deriv_mat = (
592  input_deriv == NULL ? NULL : input_deriv);
593  MatrixBase<Real> *params_deriv_mat = NULL;
594  MatrixBase<Real> *self_repair_sum_out_mat = NULL;
595  MatrixBase<double> *value_sum_out_mat = NULL;
596  MatrixBase<double> *deriv_sum_out_mat = NULL;
597  if (params_deriv != NULL) {
598  params_deriv_mat = params_deriv;
599  value_sum_out_mat = value_sum_out;
600  deriv_sum_out_mat = deriv_sum_out;
601  self_repair_sum_out_mat = self_repair_sum_out;
602  }
603 
604 
605  // We add 1.0 (i.e. a small value) to the count to avoid division by zero.
606  Real count = 1.0 + count_in;
607  for (int32 c = 0; c < cell_dim; c++) {
608  // parameters
609  Real w_ic = params_mat(0, c);
610  Real w_fc = params_mat(1, c);
611  Real w_oc = params_mat(2, c);
612  // derivative sums w.r.t. parameters.
613  Real w_ic_deriv_sum = 0.0;
614  Real w_fc_deriv_sum = 0.0;
615  Real w_oc_deriv_sum = 0.0;
616 
617  // average derivatives, for self-repair.
618  // The 5 nonlinearities that are subject to self-repair are written as:
619  // Sigmoid(i_t_input), Sigmoid(f_t_input),
620  // Tanh(c_part), Sigmoid(o_t_input), Tanh(c_t)
621  Real i_t_self_repair = (
622  deriv_sum_in_mat(0, c) / count < sr_config(0) ? sr_config(5) : 0.0);
623  Real f_t_self_repair = (
624  deriv_sum_in_mat(1, c) / count < sr_config(1) ? sr_config(6) : 0.0);
625  Real c_part_self_repair = (
626  deriv_sum_in_mat(2, c) / count < sr_config(2) ? sr_config(7) : 0.0);
627  Real o_t_self_repair = (
628  deriv_sum_in_mat(3, c) / count < sr_config(3) ? sr_config(8) : 0.0);
629  Real c_t_self_repair = (
630  deriv_sum_in_mat(4, c) / count < sr_config(4) ? sr_config(9) : 0.0);
631  // Note on how we add self-repair for sigmoids/tanh's. If self-repair
632  // is activated for this unit, then...
633  // For sigmoids we'd add -self_repair_scale * (2 * sigmoid(x) - 1.0)
634  // ... to the input-deriv;
635  // For tanh's we'd add -self_repair_scale * tanh(x)
636  // If self-repair is not activated, the 'self_repair' scales are set to zero.
637 
638  // The following variables are for the accumulation of stats on the
639  // sigmoid and tanh units.
640  Real i_t_value_sum = 0.0, i_t_deriv_sum = 0.0;
641  Real f_t_value_sum = 0.0, f_t_deriv_sum = 0.0;
642  Real c_part_value_sum = 0.0, c_part_deriv_sum = 0.0;
643  Real o_t_value_sum = 0.0, o_t_deriv_sum = 0.0;
644  Real c_t_value_sum = 0.0, c_t_deriv_sum = 0.0;
645 
646 
647  for (int32 r = 0; r < num_rows; r++) {
648  Real i_part = input_mat(r, c),
649  f_part = input_mat(r, c + cell_dim),
650  c_part = input_mat(r, c + 2 * cell_dim),
651  o_part = input_mat(r, c + 3 * cell_dim),
652  c_prev = input_mat(r, c + 4 * cell_dim);
653 
654  Real i_scale = (input_cols == cell_dim * 5 ? 1.0 :
655  input_mat(r, cell_dim * 5)),
656  f_scale = (input_cols == cell_dim * 5 ? 1.0 :
657  input_mat(r, cell_dim * 5 + 1)),
658  o_scale = (input_cols == cell_dim * 5 ? 1.0 :
659  input_mat(r, cell_dim * 5 + 2));
660 
661  // For greater clarity, we give some of the quantities in the
662  // forward equations their own names.
663  Real i_t_input = i_part + w_ic * c_prev,
664  i_t = ScalarSigmoid(i_t_input),
665  f_t_input = f_part + w_fc * c_prev,
666  f_t = ScalarSigmoid(f_t_input),
667  tanh_c_part = ScalarTanh(c_part),
668  c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part,
669  o_t_input = o_part + w_oc * c_t,
670  o_t = ScalarSigmoid(o_t_input),
671  tanh_c_t = ScalarTanh(c_t);
672  // we'd also compute, in the forward pass,
673  // m_t = o_t * tanh_c_t;
674  // but this variable is not needed.
675 
676  // Accumulate nonlinearity value and derivative stats.
677  // Note:
678  // tanh'(x) = sech^2(x) = -(tanh(x)+1) (tanh(x)-1) = 1 - tanh^2(x)
679  // sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
680  i_t_value_sum += i_t;
681  i_t_deriv_sum += i_t * (1.0F - i_t);
682  f_t_value_sum += f_t;
683  f_t_deriv_sum += f_t * (1.0F - f_t);
684  c_part_value_sum += tanh_c_part;
685  c_part_deriv_sum += 1.0F - tanh_c_part * tanh_c_part;
686  o_t_value_sum += o_t;
687  o_t_deriv_sum += o_t * (1.0F - o_t);
688  c_t_value_sum += tanh_c_t;
689  c_t_deriv_sum += 1.0F - tanh_c_t * tanh_c_t;
690 
691 
692  // the derivative of the objective function w.r.t. a particular quantity
693  // will be written by prepending "d" to the name.
694  // We compute these derivatives in the reverse of the order in which
695  // we computed the original quantities.
696  // dc_t_out is the part of the derivative w.r.t. c_t that
697  // comes directly from the output of this function.
698  Real dc_t_out = output_deriv_mat(r, c);
699  Real dm_t = output_deriv_mat(r, c + cell_dim);
700  Real dtanh_c_t = o_t * o_scale * dm_t;
701  Real do_t = o_scale * tanh_c_t * dm_t;
702  Real do_t_input = (o_t * (1.0F - o_t) * do_t
703  - (2.0F * o_t - 1.0F) * o_t_self_repair);
704  Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out
705  + do_t_input * w_oc) - tanh_c_t * c_t_self_repair;
706  Real dtanh_c_part = i_t * i_scale * dc_t;
707  Real df_t = dc_t * f_scale * c_prev;
708  Real df_t_input = ((df_t * f_t * (1.0F - f_t)
709  - (2.0F * f_t - 1.0F) * f_t_self_repair));
710  Real di_t = dc_t * i_scale * tanh_c_part;
711  Real di_t_input = ((di_t * i_t * (1.0F - i_t)
712  - (2.0F * i_t - 1.0F) * i_t_self_repair));
713 
714  w_ic_deriv_sum += c_prev * di_t_input;
715  w_fc_deriv_sum += c_prev * df_t_input;
716  w_oc_deriv_sum += c_t * do_t_input;
717 
718  Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t;
719  Real do_part = do_t_input;
720  Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part
721  - tanh_c_part * c_part_self_repair);
722  Real df_part = df_t_input;
723  Real di_part = di_t_input;
724 
725  if (input_deriv_mat != NULL) {
726  (*input_deriv_mat)(r, c) = di_part;
727  (*input_deriv_mat)(r, c + cell_dim) = df_part;
728  (*input_deriv_mat)(r, c + 2 * cell_dim) = dc_part;
729  (*input_deriv_mat)(r, c + 3 * cell_dim) = do_part;
730  (*input_deriv_mat)(r, c + 4 * cell_dim) = dc_prev;
731  }
732  }
733 
734  if (params_deriv != NULL) {
735  // note: for optimizing things you can assume that params_deriv and
736  // input_deriv_mat are non-NULL (i.e. all the output matrices are
737  // non-NULL). The situation where some of the output matrices are NULL
738  // does not happen often (mainly only in testing code).
739 
740  (*params_deriv_mat)(0, c) = w_ic_deriv_sum;
741  (*params_deriv_mat)(1, c) = w_fc_deriv_sum;
742  (*params_deriv_mat)(2, c) = w_oc_deriv_sum;
743 
744  (*value_sum_out_mat)(0, c) += i_t_value_sum;
745  (*value_sum_out_mat)(1, c) += f_t_value_sum;
746  (*value_sum_out_mat)(2, c) += c_part_value_sum;
747  (*value_sum_out_mat)(3, c) += o_t_value_sum;
748  (*value_sum_out_mat)(4, c) += c_t_value_sum;
749 
750  // need to update self_repair_sum_out before deriv_sum_out, because
751  // deriv_sum_out and deriv_sum_in might point to the same memory.
752  for (int32 i = 0; i < 5; i++)
753  (*self_repair_sum_out_mat)(i, c) =
754  (deriv_sum_in_mat(i, c) / count < sr_config(i) ? num_rows : 0);
755 
756  (*deriv_sum_out_mat)(0, c) += i_t_deriv_sum;
757  (*deriv_sum_out_mat)(1, c) += f_t_deriv_sum;
758  (*deriv_sum_out_mat)(2, c) += c_part_deriv_sum;
759  (*deriv_sum_out_mat)(3, c) += o_t_deriv_sum;
760  (*deriv_sum_out_mat)(4, c) += c_t_deriv_sum;
761  }
762  }
763 }
764 
765 
766 
767 template<typename Real>
768 void BackpropLstmNonlinearity(const CuMatrixBase<Real> &input,
769  const CuMatrixBase<Real> &params,
770  const CuMatrixBase<Real> &output_deriv,
771  const CuMatrixBase<double> &deriv_sum_in,
772  const CuVectorBase<Real> &self_repair_config,
773  double count_in,
774  CuMatrixBase<Real> *input_deriv,
775  CuMatrixBase<Real> *params_deriv,
776  CuMatrixBase<double> *value_sum_out,
777  CuMatrixBase<double> *deriv_sum_out,
778  CuMatrixBase<Real> *self_repair_sum_out) {
779  int32 num_rows = input.NumRows(),
780  cell_dim = input.NumCols() / 5,
781  input_cols = input.NumCols();
782  // Check dimensions.
783  KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3);
784  KALDI_ASSERT(params.NumRows() == 3);
785  KALDI_ASSERT(params.NumCols() == cell_dim);
786  KALDI_ASSERT(output_deriv.NumRows() == num_rows);
787  KALDI_ASSERT(output_deriv.NumCols() == 2 * cell_dim);
788  KALDI_ASSERT(deriv_sum_in.NumRows() == 5);
789  KALDI_ASSERT(deriv_sum_in.NumCols() == cell_dim);
790  KALDI_ASSERT(self_repair_config.Dim() == 10);
791  if (input_deriv != NULL) {
792  KALDI_ASSERT(SameDim(input, *input_deriv));
793  }
794  if (params_deriv == NULL) {
795  KALDI_ASSERT(value_sum_out == NULL);
796  KALDI_ASSERT(deriv_sum_out == NULL);
797  KALDI_ASSERT(self_repair_sum_out == NULL);
798  } else {
799  KALDI_ASSERT(value_sum_out != NULL);
800  KALDI_ASSERT(deriv_sum_out != NULL);
801  KALDI_ASSERT(self_repair_sum_out != NULL);
802  KALDI_ASSERT(SameDim(params, *params_deriv));
803  KALDI_ASSERT(value_sum_out->NumRows() == 5);
804  KALDI_ASSERT(value_sum_out->NumCols() == cell_dim);
805  KALDI_ASSERT(SameDim(*value_sum_out, *deriv_sum_out));
806  KALDI_ASSERT(self_repair_sum_out->NumRows() == 5);
807  KALDI_ASSERT(self_repair_sum_out->NumCols() == cell_dim);
808  }
809 
810 
811 #if HAVE_CUDA == 1
812  if (CuDevice::Instantiate().Enabled()) {
813  CuTimer tim;
814  // Each thread block is working on 1 row of the data.
815  // It's best that cell dim is a multiple of CU1DBLOCK
816 
817  int have_dropout_mask = (input_cols == (cell_dim * 5) + 3);
818 
819  // Use 2D block (8x32 threads) as we need to compute column sum.
820  // Use 1D grid to cover the data matrix width `cell_dim`.
821  const int kWarpSize = 32;
822  dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
823 // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x),
824 // n_blocks(num_rows, dimBlock.y));
825 // if (dimGrid.x * dimGrid.y > 1024) {
826 // dimGrid.y = std::max(1024 / dimGrid.x, 1);
827 // }
828  dim3 dimGrid(n_blocks(cell_dim, dimBlock.x));
829  if (input_deriv == NULL) {
830  if (params_deriv == NULL) {
831  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
832  have_dropout_mask, num_rows,
833  input.Data(), input.Stride(), params.Data(),
834  params.Stride(), output_deriv.Data(),
835  output_deriv.Stride(), deriv_sum_in.Data(),
836  deriv_sum_in.Stride(),
837  self_repair_config.Data(), count_in + 1,
838  NULL,
839  0,
840  NULL,
841  0,
842  NULL,
843  0,
844  NULL,
845  0,
846  NULL,
847  0);
848 
849  } else {
850  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
851  have_dropout_mask, num_rows,
852  input.Data(), input.Stride(), params.Data(),
853  params.Stride(), output_deriv.Data(),
854  output_deriv.Stride(), deriv_sum_in.Data(),
855  deriv_sum_in.Stride(),
856  self_repair_config.Data(), count_in + 1,
857  NULL,
858  0, params_deriv->Data(),
859  params_deriv->Stride(),
860  value_sum_out->Data(),
861  value_sum_out->Stride(),
862  deriv_sum_out->Data(),
863  deriv_sum_out->Stride(),
864  self_repair_sum_out->Data(),
865  self_repair_sum_out->Stride());
866  }
867  } else {
868  if (params_deriv == NULL) {
869  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
870  have_dropout_mask, num_rows,
871  input.Data(), input.Stride(), params.Data(),
872  params.Stride(), output_deriv.Data(),
873  output_deriv.Stride(), deriv_sum_in.Data(),
874  deriv_sum_in.Stride(),
875  self_repair_config.Data(), count_in + 1,
876  input_deriv->Data(), input_deriv->Stride(),
877  NULL,
878  0, NULL, 0, NULL, 0, NULL, 0);
879  } else {
880  cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim,
881  have_dropout_mask, num_rows,
882  input.Data(), input.Stride(), params.Data(),
883  params.Stride(), output_deriv.Data(),
884  output_deriv.Stride(), deriv_sum_in.Data(),
885  deriv_sum_in.Stride(),
886  self_repair_config.Data(), count_in + 1,
887  input_deriv->Data(), input_deriv->Stride(),
888  params_deriv->Data(),
889  params_deriv->Stride(),
890  value_sum_out->Data(),
891  value_sum_out->Stride(),
892  deriv_sum_out->Data(),
893  deriv_sum_out->Stride(),
894  self_repair_sum_out->Data(),
895  self_repair_sum_out->Stride());
896  }
897  }
898 
899  CU_SAFE_CALL(cudaGetLastError());
900 
901  CuDevice::Instantiate().AccuProfile(__func__, tim);
902  } else
903 #endif
904  {
905  CpuBackpropLstmNonlinearity(input.Mat(), params.Mat(), output_deriv.Mat(),
906  deriv_sum_in.Mat(), self_repair_config.Vec(),
907  count_in, &(input_deriv->Mat()),
908  &(params_deriv->Mat()), &(value_sum_out->Mat()),
909  &(deriv_sum_out->Mat()),
910  &(self_repair_sum_out->Mat()));
911  }
912 }
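
// Example usage (an illustrative sketch continuing the ComputeLstmNonlinearity
// sketch above; in real use 'deriv_sum_in', the count and the self-repair
// configuration come from the accumulated stats of LstmNonlinearityComponent).
// All optional outputs are supplied here; 'value_sum_out' and 'deriv_sum_out'
// are accumulated into rather than overwritten.
//
//   CuMatrix<BaseFloat> output_deriv(num_rows, 2 * cell_dim),
//       input_deriv(num_rows, 5 * cell_dim),
//       params_deriv(3, cell_dim), self_repair_sum(5, cell_dim);
//   CuMatrix<double> deriv_sum_in(5, cell_dim),
//       value_sum_out(5, cell_dim), deriv_sum_out(5, cell_dim);
//   CuVector<BaseFloat> self_repair_config(10);  // 5 thresholds, then 5 scales
//   output_deriv.SetRandn();
//   cu::BackpropLstmNonlinearity(input, params, output_deriv, deriv_sum_in,
//                                self_repair_config, 0.0, &input_deriv,
//                                &params_deriv, &value_sum_out, &deriv_sum_out,
//                                &self_repair_sum);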
913 
914 template <typename Real>
915 void EnsureNonzero(const CuVectorBase<Real> &src,
916  Real epsilon,
917  CuVectorBase<Real> *dest) {
918  KALDI_ASSERT(src.Dim() == dest->Dim());
919  int32 dim = src.Dim();
920  // fake it with a 1-row matrix.
921  CuSubMatrix<Real> src_mat(src.Data(), 1, dim, dim),
922  dest_mat(dest->Data(), 1, dim, dim);
923  EnsureNonzero(src_mat, epsilon, &dest_mat);
924 }
925 
926 // Instantiate the templates we defined above.
927 
928 template
929 void EnsureNonzero(const CuMatrixBase<float> &src,
930  float epsilon,
931  CuMatrixBase<float> *dest);
932 template
933 void EnsureNonzero(const CuMatrixBase<double> &src,
934  double epsilon,
935  CuMatrixBase<double> *dest);
936 
937 template
938 void EnsureNonzero(const CuVectorBase<float> &src,
939  float epsilon,
940  CuVectorBase<float> *dest);
941 template
942 void EnsureNonzero(const CuVectorBase<double> &src,
943  double epsilon,
944  CuVectorBase<double> *dest);
945 
946 template
947 void CpuBackpropLstmNonlinearity(const MatrixBase<float> &input,
948  const MatrixBase<float> &params,
949  const MatrixBase<float> &output_deriv,
950  const MatrixBase<double> &deriv_sum_in,
951  const VectorBase<float> &self_repair_config,
952  double count_in,
953  MatrixBase<float> *input_deriv,
954  MatrixBase<float> *params_deriv,
955  MatrixBase<double> *value_sum_out,
956  MatrixBase<double> *deriv_sum_out,
957  MatrixBase<float> *self_repair_sum_out);
958 template
959 void CpuBackpropLstmNonlinearity(const MatrixBase<double> &input,
960  const MatrixBase<double> &params,
961  const MatrixBase<double> &output_deriv,
962  const MatrixBase<double> &deriv_sum_in,
963  const VectorBase<double> &self_repair_config,
964  double count_in,
965  MatrixBase<double> *input_deriv,
966  MatrixBase<double> *params_deriv,
967  MatrixBase<double> *value_sum_out,
968  MatrixBase<double> *deriv_sum_out,
969  MatrixBase<double> *self_repair_sum_out);
970 template
971 void BackpropLstmNonlinearity(const CuMatrixBase<float> &input,
972  const CuMatrixBase<float> &params,
973  const CuMatrixBase<float> &output_deriv,
974  const CuMatrixBase<double> &deriv_sum_in,
975  const CuVectorBase<float> &self_repair_config,
976  double count_in,
977  CuMatrixBase<float> *input_deriv,
978  CuMatrixBase<float> *params_deriv,
979  CuMatrixBase<double> *value_sum_out,
980  CuMatrixBase<double> *deriv_sum_out,
981  CuMatrixBase<float> *self_repair_sum_out);
982 template
983 void BackpropLstmNonlinearity(const CuMatrixBase<double> &input,
984  const CuMatrixBase<double> &params,
985  const CuMatrixBase<double> &output_deriv,
986  const CuMatrixBase<double> &deriv_sum_in,
987  const CuVectorBase<double> &self_repair_config,
988  double count_in,
989  CuMatrixBase<double> *input_deriv,
990  CuMatrixBase<double> *params_deriv,
991  CuMatrixBase<double> *value_sum_out,
992  CuMatrixBase<double> *deriv_sum_out,
993  CuMatrixBase<double> *self_repair_sum_out);
994 
995 
996 
997 } //namespace cu
998 
999 } //namespace kaldi