cu-matrix-speed-test.cc
Go to the documentation of this file.
1 // cudamatrix/cu-matrix-speed-test.cc
2 
3 // Copyright 2013 Johns Hopkins University (author: Daniel Povey)
4 // 2015 Guoguo Chen
5 // 2017 Shiyin Kang
6 
7 // See ../../COPYING for clarification regarding multiple authors
8 //
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 //
13 // http://www.apache.org/licenses/LICENSE-2.0
14 //
15 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
17 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
18 // MERCHANTABLITY OR NON-INFRINGEMENT.
19 // See the Apache 2 License for the specific language governing permissions and
20 // limitations under the License.
21 
22 
23 #include <iostream>
24 #include <vector>
25 #include <cstdlib>
26 
27 #include "base/kaldi-common.h"
28 #include "util/common-utils.h"
29 #include "cudamatrix/cu-matrix.h"
30 #include "cudamatrix/cu-vector.h"
31 #include "cudamatrix/cu-math.h"
35 
36 using namespace kaldi;
37 
38 
39 namespace kaldi {
40 
// Returns the tag appended to log labels for the element type: "<double>"
// when sizeof(Real) is 8, "<float>" for every other size.
template<typename Real>
std::string NameOf() {
  if (sizeof(Real) == 8) {
    return "<double>";
  }
  return "<float>";
}
45 
46 template<typename Real> void TestCuMatrixSum(int32 dim) {
47  BaseFloat time_in_secs = 0.025;
48  CuMatrix<Real> M(dim, dim);
49  M.SetRandn();
50 
51  Timer tim;
52  int32 iter = 0;
53  Real result = 0;
54  for (; tim.Elapsed() < time_in_secs; iter++) {
55  result = M.Sum();
56  }
57  BaseFloat fdim = dim;
58  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
59  KALDI_LOG<< "For CuMatrix::TestCuMatrixSum" << NameOf<Real>() << ", for dim = "
60  << dim << ", speed was " << gflops << " gigaflops, result = " << result;
61 }
62 
63 template<typename Real> void TestCuMatrixMax(int32 dim) {
64  BaseFloat time_in_secs = 0.025;
65  CuMatrix<Real> M(dim, dim);
66  M.SetRandn();
67 
68  Timer tim;
69  int32 iter = 0;
70  Real result = 0;
71  for (; tim.Elapsed() < time_in_secs; iter++) {
72  result = M.Max();
73  }
74  BaseFloat fdim = dim;
75  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
76  KALDI_LOG<< "For CuMatrix::TestCuMatrixMax" << NameOf<Real>() << ", for dim = "
77  << dim << ", speed was " << gflops << " gigaflops, result = " << result;
78 }
79 
80 template<typename Real> void TestCuMatrixMin(int32 dim) {
81  BaseFloat time_in_secs = 0.025;
82  CuMatrix<Real> M(dim, dim);
83  M.SetRandn();
84 
85  Timer tim;
86  int32 iter = 0;
87  Real result = 0;
88  for (; tim.Elapsed() < time_in_secs; iter++) {
89  result = M.Min();
90  }
91  BaseFloat fdim = dim;
92  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
93  KALDI_LOG<< "For CuMatrix::TestCuMatrixMin" << NameOf<Real>() << ", for dim = "
94  << dim << ", speed was " << gflops << " gigaflops, result = " << result;
95 }
96 
97 template<typename Real> void TestCuMatrixDivRowsVec(int32 dim) {
98  BaseFloat time_in_secs = 0.025;
99  CuMatrix<Real> M(dim, dim);
100  CuVector<Real> V(dim);
101  M.SetRandn();
102  V.SetRandn();
103 
104  Timer tim;
105  int32 iter = 0;
106  for (; tim.Elapsed() < time_in_secs; iter++) {
107  M.DivRowsVec(V);
108  }
109 
110  BaseFloat fdim = dim;
111  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
112  KALDI_LOG<< "For CuMatrix::DivRowsVec" << NameOf<Real>() << ", for dim = "
113  << dim << ", speed was " << gflops << " gigaflops.";
114 }
115 
116 template<typename Real> void TestCuMatrixTransposeNS(int32 dim) {
117  BaseFloat time_in_secs = 0.025;
118  CuMatrix<Real> M(dim, dim / 2);
119  M.SetRandn();
120 
121  Timer tim;
122  int32 iter = 0;
123  for (; tim.Elapsed() < time_in_secs; iter++) {
124  M.Transpose();
125  }
126  BaseFloat fdim = dim;
127  BaseFloat gflops = (fdim * fdim * iter / 2) / (tim.Elapsed() * 1.0e+09);
128  KALDI_LOG<< "For CuMatrix::TransposeNS" << NameOf<Real>() << ", for dim = "
129  << dim << ", speed was " << gflops << " gigaflops.";
130 }
131 
132 template<typename Real> void TestCuMatrixTransposeS(int32 dim) {
133  BaseFloat time_in_secs = 0.025;
134  CuMatrix<Real> M(dim, dim);
135  M.SetRandn();
136 
137  Timer tim;
138  int32 iter = 0;
139  for (; tim.Elapsed() < time_in_secs; iter++) {
140  M.Transpose();
141  }
142  BaseFloat fdim = dim;
143  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
144  KALDI_LOG<< "For CuMatrix::TransposeS" << NameOf<Real>() << ", for dim = "
145  << dim << ", speed was " << gflops << " gigaflops.";
146 }
147 
148 template<typename Real> void TestCuMatrixTransposeCross(int32 dim) {
149  BaseFloat time_in_secs = 0.025;
150  CuMatrix<float> Mf(dim / 2, dim), ref(dim, dim / 2);
151  CuMatrix<Real> Md(dim, dim / 2);
152  Mf.SetRandn();
153  ref = Mf;
154 
155  Timer tim;
156  int32 iter = 0;
157  for (; tim.Elapsed() < time_in_secs; iter++) {
158  Md.CopyFromMat(Mf, kTrans);
159  Mf.CopyFromMat(Md, kTrans);
160  }
161  BaseFloat fdim = dim;
162  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
163  KALDI_LOG<< "For CuMatrix::TransposeCross" << NameOf<Real>() << ", for dim = "
164  << dim << ", speed was " << gflops << " gigaflops.";
165 
166  AssertEqual(ref, Mf);
167 }
168 
169 template<typename Real> void TestCuMatrixAddMat(int32 dim, int32 num_row_blocks,
170  int32 num_col_blocks) {
171  BaseFloat time_in_secs = 0.025;
172  CuMatrix<Real> A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks);
173  A.SetRandn();
174  B.SetRandn();
175  Timer tim;
176  int32 iter = 0;
177  for (;tim.Elapsed() < time_in_secs; iter++) {
178  for (int32 i = 0; i < num_row_blocks; i++) {
179  for (int32 j = 0; j < num_col_blocks; j++) {
180  A.AddMat(0.0, CuSubMatrix<Real>(B, i * dim, dim, j * dim, dim));
181  }
182  }
183  }
184  BaseFloat fdim = dim;
185  BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter)
186  / (tim.Elapsed() * 1.0e+09);
187  KALDI_LOG << "For CuMatrix::AddMat" << NameOf<Real>() << ", for dim = "
188  << dim << "numRowBlocks = "<< num_row_blocks << "numColBlocks = "
189  << num_col_blocks << ", speed was " << gflops << " gigaflops.";
190 }
191 
192 template<typename Real> void TestCuMatrixAddMatBlocks(int32 dim,
193  int32 num_row_blocks,
194  int32 num_col_blocks) {
195  BaseFloat time_in_secs = 0.025;
196  CuMatrix<Real> A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks);
197  A.SetRandn();
198  B.SetRandn();
199  Timer tim;
200  int32 iter = 0;
201  for (;tim.Elapsed() < time_in_secs; iter++) {
202  A.AddMatBlocks(0.0, B);
203  }
204  BaseFloat fdim = dim;
205  BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter)
206  / (tim.Elapsed() * 1.0e+09);
207  KALDI_LOG << "For CuMatrix::AddMatBlocks" << NameOf<Real>() << ", for dim = "
208  << dim << ", numRowBlocks = "<< num_row_blocks << ", numColBlocks = "
209  << num_col_blocks << ", speed was " << gflops << " gigaflops.";
210 }
211 
212 template<typename Real> void TestCuMatrixMatMat(int32 dim) {
213  BaseFloat time_in_secs = 0.025;
214  CuMatrix<Real> M(dim, dim), N(dim, dim), O(dim, dim);
215  M.SetRandn();
216  N.SetRandn();
217  Timer tim;
218  int32 iter = 0;
219  for (;tim.Elapsed() < time_in_secs; iter++) {
220  O.AddMatMat(1.0, M, kNoTrans, N, kNoTrans, 0.0);
221  }
222 
223  BaseFloat fdim = dim;
224  BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
225  KALDI_LOG << "For CuMatrix::AddMatMat" << NameOf<Real>() << ", for dim = "
226  << dim << ", speed was " << gflops << " gigaflops.";
227 }
228 
229 template<typename Real> void TestCuMatrixMatMatBatched(int32 dim, int32 batchCount) {
230  std::vector<CuMatrix<Real>* > a(batchCount), b(batchCount), c(batchCount);
231  std::vector<CuSubMatrix<Real>* > A, B, C;
232 
233  for (int32 i = 0; i < batchCount; i++) {
234  // first create a Matrix intance and then creat a SubMatrix instance from that
235  a[i] = new CuMatrix<Real>(dim, dim);
236  b[i] = new CuMatrix<Real>(dim, dim);
237  c[i] = new CuMatrix<Real>(dim, dim);
238  a[i]->SetRandn();
239  b[i]->SetRandn();
240  A.push_back(new CuSubMatrix<Real>(*(a[i]), 0, a[i]->NumRows(), 0,
241  a[i]->NumCols()));
242  B.push_back(new CuSubMatrix<Real>(*(b[i]), 0, b[i]->NumRows(), 0,
243  b[i]->NumCols()));
244  C.push_back(new CuSubMatrix<Real>(*(c[i]), 0, c[i]->NumRows(), 0,
245  c[i]->NumCols()));
246  }
247  BaseFloat time_in_secs = 0.025;
248  Timer tim;
249  int32 iter = 0;
250  for (;tim.Elapsed() < time_in_secs; iter++) {
251  AddMatMatBatched(static_cast<Real>(1.0), C, A, kNoTrans, B, kNoTrans,
252  static_cast<Real>(0.0));
253  }
254  for (int32 i = 0; i< batchCount; i++) {
255  delete a[i]; delete b[i]; delete c[i];
256  delete A[i]; delete B[i]; delete C[i];
257  }
258 
259  BaseFloat fdim = dim;
260  BaseFloat gflops = (fdim * fdim * fdim * iter * batchCount) / (tim.Elapsed() * 1.0e+09);
261  KALDI_LOG << "For CuMatrix::AddMatMatBatched" << NameOf<Real>() << ", for dim = " << dim
262  << ", batchSize = " << batchCount << ", speed was " << gflops << " gigaflops.";
263 }
264 
265 template<typename Real> void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans) {
266  BaseFloat time_in_secs = 0.015;
267  CuMatrix<Real> M(dim, dim), N(dim, dim);
268  CuVector<Real> v(dim);
269  M.SetRandn();
270  v.SetRandn();
271  Timer tim;
272  int32 iter = 0;
273  for (;tim.Elapsed() < time_in_secs; iter++)
274  N.AddDiagVecMat(1.0, v, M, trans, 0.0);
275 
276  BaseFloat fdim = dim;
277  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
278  KALDI_LOG << "For CuMatrix::AddDiagVecMat" << NameOf<Real>()
279  << (trans == kTrans ? "[trans]" : "[no-trans]")
280  << ", for dim = " << dim << ", speed was "
281  << gflops << " gigaflops.";
282 }
283 
284 
285 
286 template<typename Real> void TestSymInvertPosDef(int32 dim) {
287  BaseFloat time_in_secs = 0.025;
288  CuMatrix<Real> M(dim, dim * 2), N(dim, dim);
289  M.SetRandn();
290  N.SymAddMat2(1.0, M, kNoTrans, 0.0);
291  CuMatrix<Real> Ncopy(N);
292 
293  int iter = 0;
294  Timer tim;
295  for (;tim.Elapsed() < time_in_secs; iter++) {
296  Ncopy.CopyFromMat(N);
297  Ncopy.SymInvertPosDef();
298  }
299 
300  BaseFloat fdim = dim;
301  BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
302  KALDI_LOG << "For CuMatrix::TestCuInvertPosDef" << NameOf<Real>() << ", for dim = "
303  << dim << ", speed was " << gflops << " gigaflops.";
304 }
305 
306 
307 template<typename Real>
309  BaseFloat time_in_secs = 0.025;
310  // Previously tested for larger dims, but test was slow.
311 
312  int32 n_r = dim, n_c = dim + Rand() % 5;
313 
314  CuMatrix<Real> A(n_r, n_c), B(n_r, n_c);
315  B.SetRandn();
316  B.Add(1.0);
317  B.ApplyFloor(1.0e-10);
318 
319  std::vector<MatrixElement<Real> > labels;
320  for(int i = 0; i < n_r; i++) {
321  for(int j = 0; j < n_c; j++) {
322  // have approximately one weight per row of the matrix.
323  if (Rand() % n_c == 0) {
324  A(i, j) = RandUniform();
325  MatrixElement<Real> t = {i, j, A(i, j)};
326  labels.push_back(t);
327  }
328  }
329  }
330  CuMatrix<Real> C(n_r, n_c);
331 
332  int iter = 0;
333  Timer tim;
334  Real a = 0.0, b = 0.0;
335  for (;tim.Elapsed() < time_in_secs; iter++)
336  C.CompObjfAndDeriv(labels, B, &a, &b);
337 
338  BaseFloat gflops = (n_r * n_c * iter) / (tim.Elapsed() * 1.0e+09);
339  KALDI_LOG << "For CuMatrix::CompObjfAndDeriv" << NameOf<Real>() << ", for dim = "
340  << dim << ", speed was " << gflops << " gigaflops.";
341 
342 
343  // do it one more time for correctness test.
344  C.SetZero();
345  C.CompObjfAndDeriv(labels, B, &a, &b);
346 
347  KALDI_ASSERT(ApproxEqual(b, A.Sum()));
348 
349  // repeat the real test.
350  Real sum2; // sum(i, j) A(i, j) log(B(i, j));
351  {
352  CuMatrix<Real> Bcopy(B);
353  Bcopy.ApplyLog();
354  sum2 = TraceMatMat(Bcopy, A, kTrans);
355  }
356 
357  KALDI_ASSERT(ApproxEqual(a, sum2));
358 
359  B.InvertElements();
360  A.MulElements(B); // each element of A is now A(i, j) / B(i, j);
361  KALDI_ASSERT(ApproxEqual(A, C));
362 
363 
364 }
365 
366 
367 template<typename Real>
368 static void TestCuFindRowMaxId(int32 dim) {
369 
370  int32 dimM = dim, dimN = dimM + Rand() % 5;
371 
372  Matrix<Real> Hi(dimM, dimN);
373  Hi.SetRandn();
374 
375  CuMatrix<Real> Di(dimM, dimN);
376  Di.CopyFromMat(Hi);
377 
378  std::vector<int32> Hmax(dimM);
379  CuArray<int32> Dmax(dimN);
380 
381  BaseFloat time_in_secs = 0.025;
382  int iter = 0;
383  Timer tim;
384  for (;tim.Elapsed() < time_in_secs; iter++)
385  Di.FindRowMaxId(&Dmax);
386 
387 
388  BaseFloat fdim = dim;
389  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
390  KALDI_LOG << "For CuMatrix::FindRowMaxId" << NameOf<Real>() << ", for dim = "
391  << dim << ", speed was " << gflops << " gigaflops.";
392 
393 
394  // on cpu
395  for(MatrixIndexT r=0; r<Hi.NumRows(); r++) {
396  Real max=-1.0e+20; int32 idx=-1;
397  for(MatrixIndexT c=0; c<Hi.NumCols(); c++) {
398  if(Hi(r,c) > max) { idx=c; max=Hi(r,c); }
399  }
400  Hmax[r] = idx;
401  }
402  std::vector<int32> Hmax2(dimM);
403  Dmax.CopyToVec(&Hmax2);
404 
405  KALDI_ASSERT(Hmax == Hmax2);
406 }
407 
408 
409 
410 template<typename Real> void TestCuMatrixSigmoid(int32 dim) {
411  BaseFloat time_in_secs = 0.025;
412  CuMatrix<Real> M(dim, dim), N(dim, dim);
413  M.SetRandn();
414  N.SetRandn();
415  Timer tim;
416  int32 iter = 0;
417  for (;tim.Elapsed() < time_in_secs; iter++) {
418  N.Sigmoid(M);
419  }
420 
421  BaseFloat fdim = dim;
422  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
423  KALDI_LOG << "For CuMatrix::Sigmoid" << NameOf<Real>() << ", for dim = "
424  << dim << ", speed was " << gflops << " gigaflops.";
425 }
426 
427 template<typename Real> void TestCuMatrixHeaviside(int32 dim) {
428  BaseFloat time_in_secs = 0.025;
429  CuMatrix<Real> M(dim, dim), N(dim, dim);
430  M.SetRandn();
431  N.SetRandn();
432  Timer tim;
433  int32 iter = 0;
434  for (;tim.Elapsed() < time_in_secs; iter++) {
435  N.ApplyHeaviside();
436  }
437 
438  BaseFloat fdim = dim;
439  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
440  KALDI_LOG << "For CuMatrix::Heaviside" << NameOf<Real>() << ", for dim = "
441  << dim << ", speed was " << gflops << " gigaflops.";
442 }
443 
444 
445 template<typename Real> void TestCuMatrixMulRowsGroupMat(int32 dim) {
446  BaseFloat time_in_secs = 0.025;
447 
448  int32 group_size = 5;
449  CuMatrix<Real> M(dim, dim * group_size), N(dim, dim);
450  M.SetRandn();
451  N.SetRandn();
452  Timer tim;
453  int32 iter = 0;
454  for (;tim.Elapsed() < time_in_secs; iter++) {
455  M.MulRowsGroupMat(N);
456  }
457 
458  BaseFloat fdim = dim;
459  BaseFloat gflops = (fdim * fdim * group_size * iter) / (tim.Elapsed() * 1.0e+09);
460  KALDI_LOG << "For CuMatrix::MulRowsGroupMat" << NameOf<Real>() << ", for dim = "
461  << dim << ", speed was " << gflops << " gigaflops.";
462 }
463 
464 template<typename Real> void TestCuMatrixDiffSoftmax(int32 dim) {
465  BaseFloat time_in_secs = 0.025;
466  CuMatrix<Real> M(dim, dim), N(dim, dim), L(dim, dim);
467  M.SetRandn();
468  N.SetRandn();
469  L.SetRandn();
470  Timer tim;
471  int32 iter = 0;
472  for (; tim.Elapsed() < time_in_secs; iter++) {
473  N.DiffSoftmaxPerRow(M, L);
474  }
475 
476  BaseFloat fdim = dim;
477  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
478  KALDI_LOG << "For CuMatrix::DiffSoftmaxPerRow" << NameOf<Real>() << ", for dim = "
479  << dim << ", speed was " << gflops << " gigaflops.";
480 }
481 
482 template<typename Real> void TestCuMatrixDiffLogSoftmax(int32 dim) {
483  BaseFloat time_in_secs = 0.025;
484  CuMatrix<Real> M(dim, dim), N(dim, dim), L(dim, dim);
485  M.SetRandn();
486  N.SetRandn();
487  L.SetRandn();
488  Timer tim;
489  int32 iter = 0;
490  for (; tim.Elapsed() < time_in_secs; iter++) {
491  N.DiffLogSoftmaxPerRow(M, L);
492  }
493 
494  BaseFloat fdim = dim;
495  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
496  KALDI_LOG << "For CuMatrix::DiffLogSoftmaxPerRow" << NameOf<Real>() << ", for dim = "
497  << dim << ", speed was " << gflops << " gigaflops.";
498 }
499 
500 template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
501  BaseFloat time_in_secs = 0.025;
502  CuMatrix<Real> M(dim, dim), N(dim, dim);
503  M.SetRandn();
504  N.SetRandn();
505  Timer tim;
506  int32 iter = 0;
507  for (;tim.Elapsed() < time_in_secs; iter++) {
508  N.SoftMaxPerRow(M);
509  }
510 
511  BaseFloat fdim = dim;
512  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
513  KALDI_LOG << "For CuMatrix::Softmax" << NameOf<Real>() << ", for dim = "
514  << dim << ", speed was " << gflops << " gigaflops.";
515 }
516 
517 
518 template<typename Real> void TestCuMatrixLogSoftmax(int32 dim) {
519  BaseFloat time_in_secs = 0.025;
520  CuMatrix<Real> M(dim, dim), N(dim, dim);
521  M.SetRandn();
522  N.SetRandn();
523  Timer tim;
524  int32 iter = 0;
525  for (;tim.Elapsed() < time_in_secs; iter++) {
526  N.LogSoftMaxPerRow(M);
527  }
528 
529  BaseFloat fdim = dim;
530  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
531  KALDI_LOG << "For CuMatrix::LogSoftmax" << NameOf<Real>() << ", for dim = "
532  << dim << ", speed was " << gflops << " gigaflops.";
533 }
534 
535 
536 template<typename Real> void TestCuMatrixGroupPnorm(int32 dim) {
537  BaseFloat time_in_secs = 0.025;
538  int32 group_size = 4;
539  CuMatrix<Real> M(dim, dim), N(dim, dim / group_size);
540  M.SetRandn();
541  Timer tim;
542  int32 iter = 0;
543  for (;tim.Elapsed() < time_in_secs; iter++)
544  N.GroupPnorm(M, 2.0);
545 
546  BaseFloat fdim = dim;
547  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
548  KALDI_LOG << "For CuMatrix::GroupPnorm" << NameOf<Real>() << ", for dim = "
549  << dim << ", speed was " << gflops << " gigaflops.";
550 }
551 
552 
553 template<typename Real> void TestCuMatrixDiffGroupPnorm(int32 dim) {
554  BaseFloat time_in_secs = 0.025;
555  int32 group_size = 8;
556  CuMatrix<Real> iv(dim, dim), ov(dim, dim / group_size);
557  CuMatrix<Real> id(dim, dim), od(dim, dim / group_size);
558  iv.SetRandn();
559  od.SetRandn();
560  ov.GroupPnorm(iv, 2.0);
561  Timer tim;
562  int32 iter = 0;
563 
564  for (; tim.Elapsed() < time_in_secs; iter++)
565  id.DiffGroupPnorm(iv, ov, od, 2.0);
566 
567  BaseFloat fdim = dim;
568  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
569  KALDI_LOG << "For CuMatrix::DiffGroupPnorm" << NameOf<Real>() << ", for dim = "
570  << dim << ", speed was " << gflops << " gigaflops.";
571 }
572 
573 template<typename Real> void TestCuMatrixGroupMax(int32 dim) {
574  BaseFloat time_in_secs = 0.025;
575  int32 group_size = 4;
576  CuMatrix<Real> M(dim, dim), N(dim, dim / group_size);
577  M.SetRandn();
578  Timer tim;
579  int32 iter = 0;
580  for (;tim.Elapsed() < time_in_secs; iter++)
581  N.GroupMax(M);
582 
583  BaseFloat fdim = dim;
584  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
585  KALDI_LOG << "For CuMatrix::GroupMax" << NameOf<Real>() << ", for dim = "
586  << dim << ", speed was " << gflops << " gigaflops.";
587 }
588 
589 template<typename Real> void TestCuMatrixGroupMaxAllGroupSizes(int32 dim) {
590  BaseFloat time_in_secs = 0.025;
591  CuMatrix<Real> M(dim, dim);
592  M.SetRandn();
593  Timer tim;
594  int32 iter = 0;
595  for (; tim.Elapsed() < time_in_secs;) {
596  for (int group_size = 1; group_size <= dim; group_size++) {
597  if (dim % group_size == 0) {
598  CuMatrix<Real> N(dim, dim / group_size, kUndefined);
599  N.GroupMax(M);
600  iter++;
601  }
602  }
603  }
604 
605  BaseFloat fdim = dim;
606  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
607  KALDI_LOG << "For CuMatrix::GroupMax (all group sizes)" << NameOf<Real>()
608  << ", for dim = " << dim << ", speed was " << gflops
609  << " gigaflops.";
610 }
611 
612 template<typename Real> void TestCuMatrixGroupMaxDeriv(int32 dim) {
613  BaseFloat time_in_secs = 0.025;
614  int32 group_size = 4;
615  CuMatrix<Real> M(dim, dim), N(dim, dim / group_size), O(dim, dim);
616  M.SetRandn();
617  N.GroupMax(M);
618  Timer tim;
619  int32 iter = 0;
620 
621  for (;tim.Elapsed() < time_in_secs; iter++)
622  O.GroupMaxDeriv(M, N);
623 
624  BaseFloat fdim = dim;
625  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
626  KALDI_LOG << "For CuMatrix::GroupMaxDeriv" << NameOf<Real>() << ", for dim = "
627  << dim << ", speed was " << gflops << " gigaflops.";
628 }
629 
630 template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
631  for (int32 n = 0; n < 2; n++) {
632  MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans);
633  BaseFloat time_in_secs = 0.02;
634 
635  CuMatrix<Real> M(dim, dim), N(dim, dim);
636  M.SetRandn();
637  N.SetRandn();
638  Timer tim;
639  int32 iter = 0;
640  for (;tim.Elapsed() < time_in_secs; iter++) {
641  TraceMatMat(M, N, trans);
642  }
643  BaseFloat fdim = dim;
644  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
645  KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf<Real>()
646  << (trans == kTrans ? " [transposed]" : "") << ", for dim = "
647  << dim << ", speed was " << gflops << " gigaflops.";
648  }
649 }
650 
651 
652 template<typename Real> void TestCuMatrixCholesky(int32 dim) {
653  BaseFloat time_in_secs = 0.025;
654 
655  CuMatrix<Real> M(dim, dim);
656  M.AddToDiag(100.0);
657  Timer tim;
658  int32 iter = 0;
659  for (;tim.Elapsed() < time_in_secs; iter++)
660  M.Cholesky();
661 
662  BaseFloat fdim = dim;
663  BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
664  KALDI_LOG << "For CuMatrix::Cholesky" << NameOf<Real>()
665  << ", for dim = " << dim << ", speed was " << gflops << " gigaflops.";
666 }
667 
668 
669 
670 template<typename Real> void TestCuMatrixCopyLowerToUpper(int32 dim) {
671  BaseFloat time_in_secs = 0.025;
672  CuMatrix<Real> M(dim, dim);
673  M.SetRandn();
674  Timer tim;
675  int32 iter = 0;
676  for (; tim.Elapsed() < time_in_secs; iter++) {
677  M.CopyLowerToUpper();
678  }
679  CuMatrix<Real> M2(M, kTrans);
680  AssertEqual(M, M2);
681  BaseFloat fdim = dim;
682  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
683  KALDI_LOG << "For CuMatrix::CopyLowerToUpper" << NameOf<Real>() << ", for dim = "
684  << dim << ", speed was " << gflops << " gigaflops.";
685 }
686 
687 
688 template<typename Real> void TestCuMatrixCopyFromTp(int32 dim, MatrixTransposeType trans) {
689  BaseFloat time_in_secs = 0.025;
690  CuTpMatrix<Real> T(dim);
691  T.SetRandn();
692  CuMatrix<Real> M(dim, dim);
693 
694  Timer tim;
695  int32 iter = 0;
696  for (; tim.Elapsed() < time_in_secs; iter++) {
697  M.CopyFromTp(T, trans);
698  }
699  TpMatrix<Real> T_cpu(T);
700  Matrix<Real> M_cpu(T_cpu, trans);
701  Matrix<Real> M2_cpu(M);
702  AssertEqual(M_cpu, M2_cpu);
703 
704  BaseFloat fdim = dim;
705  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
706  KALDI_LOG << "For CuMatrix::CopyFromTp" << (trans == kNoTrans ? "[NoTrans]":"[Trans]")
707  << NameOf<Real>() << ", for dim = "
708  << dim << ", speed was " << gflops << " gigaflops.";
709 }
710 
711 
712 template<typename Real> void TestCuMatrixCopyFromSp(int32 dim) {
713  BaseFloat time_in_secs = 0.025;
714  CuSpMatrix<Real> S(dim);
715  S.SetRandn();
716  CuMatrix<Real> M(dim, dim);
717 
718  Timer tim;
719  int32 iter = 0;
720  for (; tim.Elapsed() < time_in_secs; iter++) {
721  M.CopyFromSp(S);
722  }
723  SpMatrix<Real> S_cpu(S);
724  Matrix<Real> M_cpu(S_cpu);
725  Matrix<Real> M2_cpu(M);
726  AssertEqual(M_cpu, M2_cpu);
727 
728  BaseFloat fdim = dim;
729  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
730  KALDI_LOG << "For CuMatrix::CopyFromSp" << NameOf<Real>() << ", for dim = "
731  << dim << ", speed was " << gflops << " gigaflops.";
732 }
733 
734 
735 
736 template<typename Real> void TestCuMatrixCopyUpperToLower(int32 dim) {
737  BaseFloat time_in_secs = 0.025;
738  CuMatrix<Real> M(dim, dim);
739  M.SetRandn();
740  Timer tim;
741  int32 iter = 0;
742  for (; tim.Elapsed() < time_in_secs; iter++) {
743  M.CopyUpperToLower();
744  }
745  CuMatrix<Real> M2(M, kTrans);
746  AssertEqual(M, M2);
747  BaseFloat fdim = dim;
748  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
749  KALDI_LOG << "For CuMatrix::CopyUpperToLower" << NameOf<Real>() << ", for dim = "
750  << dim << ", speed was " << gflops << " gigaflops.";
751 }
752 
753 
754 template<typename Real> void TestCuMatrixResize(int32 dim) {
755  BaseFloat time_in_secs = 0.025;
756  Timer tim;
757  int32 iter = 0;
758  for (; tim.Elapsed() < time_in_secs; iter++) {
759  CuMatrix<Real>M(dim, dim, kUndefined); // we are testing the allocation and deallocation time.
760  }
761  BaseFloat fdim = dim;
762  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
763  KALDI_LOG << "For CuMatrix::TestCuMatrixResize" << NameOf<Real>() << ", for dim = "
764  << dim << ", speed was " << gflops << " gigaflops.";
765 }
766 
767 template<typename Real> void TestCuMatrixSetZeroAboveDiag(int32 dim) {
768  BaseFloat time_in_secs = 0.025;
769  CuMatrix<Real> M(dim, dim);
770  M.SetRandn();
771  Timer tim;
772  int32 iter = 0;
773  for (; tim.Elapsed() < time_in_secs; iter++)
774  M.SetZeroAboveDiag();
775  BaseFloat fdim = dim;
776  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
777  KALDI_LOG << "For CuMatrix::SetZeroAboveDiag" << NameOf<Real>() << ", for dim = "
778  << dim << ", speed was " << gflops << " gigaflops.";
779 }
780 
781 template<typename Real>
783  BaseFloat time_in_secs = 0.025;
784  int32 dimM = dim, dimN = dim;
785  CuMatrix<Real> H(dimM, dimN);
786  H.SetRandn();
787  std::vector<Int32Pair> indices;
788  std::vector<Real> reference;
789  std::vector<Real> output;
790  // Generates the indices and the reference.
791  int32 num_index = dim * dim;
792  output.resize(num_index);
793  for (int32 j = 0; j < num_index; j++) {
794  MatrixIndexT r = Rand() % dimM;
795  MatrixIndexT c = Rand() % dimN;
796 
797  Int32Pair tmp_pair;
798  tmp_pair.first = r;
799  tmp_pair.second = c;
800  indices.push_back(tmp_pair);
801  reference.push_back(H(r, c));
802  }
803  Timer tim;
804  int32 iter = 0;
805  for (; tim.Elapsed()< time_in_secs; iter++)
806  H.Lookup(indices, &(output[0]));
807 
808  BaseFloat fdim = dim;
809  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
810  KALDI_LOG << "For CuMatrix::Lookup" << NameOf<Real>() << ", for dim = "
811  << dim << ", speed was " << gflops << " gigaflops.";
812 }
813 
814 template<typename Real> void TestCuMatrixCopyRows1(int32 dim) {
815  BaseFloat time_in_secs = 0.025;
816  CuMatrix<Real> M(dim, dim), N(dim, dim);
817  M.SetRandn();
818  N.SetRandn();
819 
820  std::vector<int32> reorder(dim);
821  for (int32 i = 0; i < dim; i++) {
822  reorder[i] = i;
823  }
824  CuArray<int32> reorder_cuda(reorder);
825 
826  Timer tim;
827  int32 iter = 0;
828  for (; tim.Elapsed() < time_in_secs; iter++) {
829  M.CopyRows(N, reorder_cuda);
830  }
831 
832  BaseFloat fdim = dim;
833  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
834  KALDI_LOG << "For CuMatrix::CopyRows" << NameOf<Real>() << ", for dim = "
835  << dim << ", speed was " << gflops << " gigaflops.";
836 }
837 
838 template<typename Real> void TestCuMatrixCopyRows2(int32 dim) {
839  BaseFloat time_in_secs = 0.025;
840  CuMatrix<Real> M(dim, dim), N(dim, dim);
841  M.SetRandn();
842  N.SetRandn();
843 
844  std::vector<const Real*> reorder_src(dim, NULL);
845  for (int32 i = 0; i < dim; i++) {
846  reorder_src[i] = N.RowData(i);
847  }
848  CuArray<const Real*> reorder_src_cuda(reorder_src);
849 
850  Timer tim;
851  int32 iter = 0;
852  for (; tim.Elapsed() < time_in_secs; iter++) {
853  M.CopyRows(reorder_src_cuda);
854  }
855 
856  BaseFloat fdim = dim;
857  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
858  KALDI_LOG << "For CuMatrix::CopyRows" << NameOf<Real>() << ", for dim = "
859  << dim << ", speed was " << gflops << " gigaflops.";
860 }
861 
862 template<typename Real> void TestCuMatrixCopyToRows(int32 dim) {
863  BaseFloat time_in_secs = 0.025;
864  CuMatrix<Real> M(dim, dim), N(dim, dim);
865  M.SetRandn();
866  N.SetRandn();
867 
868  std::vector<Real*> reorder_dst(dim, NULL);
869  for (int32 i = 0; i < dim; i++) {
870  reorder_dst[i] = N.RowData(i);
871  }
872  CuArray<Real*> reorder_dst_cuda(reorder_dst);
873 
874  Timer tim;
875  int32 iter = 0;
876  for (; tim.Elapsed() < time_in_secs; iter++) {
877  M.CopyToRows(reorder_dst_cuda);
878  }
879 
880  BaseFloat fdim = dim;
881  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
882  KALDI_LOG << "For CuMatrix::CopyToRows" << NameOf<Real>() << ", for dim = "
883  << dim << ", speed was " << gflops << " gigaflops.";
884 }
885 
886 template<typename Real> void TestCuMatrixAddRows1(int32 dim) {
887  BaseFloat time_in_secs = 0.025;
888  CuMatrix<Real> M(dim, dim), N(dim, dim);
889  M.SetRandn();
890  N.SetRandn();
891 
892  std::vector<int32> reorder(dim);
893  for (int32 i = 0; i < dim; i++) {
894  reorder[i] = i;
895  }
896  CuArray<int32> reorder_cuda(reorder);
897 
898  Timer tim;
899  int32 iter = 0;
900  for (; tim.Elapsed() < time_in_secs; iter++) {
901  M.AddRows(0.5, N, reorder_cuda);
902  }
903 
904  BaseFloat fdim = dim;
905  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
906  KALDI_LOG << "For CuMatrix::AddRows" << NameOf<Real>() << ", for dim = "
907  << dim << ", speed was " << gflops << " gigaflops.";
908 }
909 
910 template<typename Real> void TestCuMatrixAddRows2(int32 dim) {
911  BaseFloat time_in_secs = 0.025;
912  CuMatrix<Real> M(dim, dim), N(dim, dim);
913  M.SetRandn();
914  N.SetRandn();
915 
916  std::vector<const Real*> reorder_src(dim, NULL);
917  for (int32 i = 0; i < dim; i++) {
918  reorder_src[i] = N.RowData(i);
919  }
920  CuArray<const Real*> reorder_src_cuda(reorder_src);
921 
922  Timer tim;
923  int32 iter = 0;
924  for (; tim.Elapsed() < time_in_secs; iter++) {
925  M.AddRows(0.5, reorder_src_cuda);
926  }
927 
928  BaseFloat fdim = dim;
929  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
930  KALDI_LOG << "For CuMatrix::AddRows" << NameOf<Real>() << ", for dim = "
931  << dim << ", speed was " << gflops << " gigaflops.";
932 }
933 
934 template<typename Real> void TestCuMatrixAddToRows(int32 dim) {
935  BaseFloat time_in_secs = 0.025;
936  CuMatrix<Real> M(dim, dim), N(dim, dim);
937  M.SetRandn();
938  N.SetRandn();
939 
940  std::vector<Real*> reorder_dst(dim, NULL);
941  for (int32 i = 0; i < dim; i++) {
942  reorder_dst[i] = N.RowData(i);
943  }
944  CuArray<Real*> reorder_dst_cuda(reorder_dst);
945 
946  Timer tim;
947  int32 iter = 0;
948  for (; tim.Elapsed() < time_in_secs; iter++) {
949  M.AddToRows(0.5, reorder_dst_cuda);
950  }
951 
952  BaseFloat fdim = dim;
953  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
954  KALDI_LOG << "For CuMatrix::AddToRows" << NameOf<Real>() << ", for dim = "
955  << dim << ", speed was " << gflops << " gigaflops.";
956 }
957 
958 template<typename Real> void TestCuMatrixAddRowRanges(int32 dim) {
959  BaseFloat time_in_secs = 0.025;
960  CuMatrix<Real> M(dim, dim), N(dim, dim);
961  M.SetRandn();
962  N.SetRandn();
963 
964  std::vector<Int32Pair> indexes(dim);
965  for (int32 i = 0; i < dim; i++) {
966  indexes[i].first = i;
967  indexes[i].second = i + 1;
968  }
969  CuArray<Int32Pair> indexes_cuda(indexes);
970 
971  Timer tim;
972  int32 iter = 0;
973  for (; tim.Elapsed() < time_in_secs; iter++) {
974  M.AddRowRanges(N, indexes_cuda);
975  }
976 
977  BaseFloat fdim = dim;
978  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
979  KALDI_LOG << "For CuMatrix::AddRowRanges" << NameOf<Real>() << ", for dim = "
980  << dim << ", speed was " << gflops << " gigaflops.";
981 }
982 
983 template<typename Real> void TestCuSparseMatrixTraceMatSmat(int32 dim) {
984  for (int32 n = 0; n < 2; n++) {
985  MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans);
986  BaseFloat time_in_secs = 0.02;
987 
988  CuMatrix<Real> M(dim, dim);
989  M.SetRandn();
990 
991  std::vector<std::vector<std::pair<MatrixIndexT, Real> > > pairs(dim);
992  for (auto && row : pairs) {
993  row.push_back( { MatrixIndexT(Rand() % dim), Real(Rand() % dim) });
994  }
995  SparseMatrix<Real> Ncpu(dim, pairs);
996  CuSparseMatrix<Real> N(Ncpu);
997 
998  Timer tim;
999  int32 iter = 0;
1000  for (;tim.Elapsed() < time_in_secs; iter++) {
1001  TraceMatSmat(M, N, trans);
1002  }
1003  BaseFloat fdim = dim;
1004  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
1005  KALDI_LOG << "For CuSparseMatrix::TraceMatSmat" << NameOf<Real>()
1006  << (trans == kTrans ? " [transposed]" : "") << ", for dim = "
1007  << dim << ", speed was " << gflops << " gigaflops.";
1008  }
1009 }
1010 
1011 
1012 template<typename Real> void CudaMatrixSpeedTest() {
1013  std::vector<int32> sizes;
1014  sizes.push_back(16);
1015  sizes.push_back(32);
1016  sizes.push_back(64);
1017  sizes.push_back(128);
1018  sizes.push_back(256);
1019  sizes.push_back(512);
1020  sizes.push_back(1024);
1021  int32 ns = sizes.size();
1022  for (int32 s = 0; s < ns; s++)
1023  TestCuMatrixDivRowsVec<Real>(sizes[s]);
1024  for (int32 s = 0; s < ns; s++)
1025  TestCuMatrixResize<Real>(sizes[s]);
1026  for (int32 s = 0; s < ns; s++)
1027  TestCuMatrixAddMat<Real>(sizes[s], 3, 3);
1028  for (int32 s = 0; s < ns; s++)
1029  TestCuMatrixAddMatBlocks<Real>(sizes[s], 3, 3);
1030  for (int32 s = 0; s < ns; s++)
1031  TestCuMatrixMatMat<Real>(sizes[s]);
1032  for (int32 s = 0; s + 1 < ns; s++)
1033  TestCuMatrixMatMatBatched<Real>(sizes[s], 10);
1034  for (int32 s = 0; s < ns; s++) {
1035  TestCuMatrixAddDiagVecMat<Real>(sizes[s], kNoTrans);
1036  TestCuMatrixAddDiagVecMat<Real>(sizes[s], kTrans);
1037  }
1038  for (int32 s = 0; s < ns; s++)
1039  TestSymInvertPosDef<Real>(sizes[s]);
1040  for (int32 s = 0; s < ns; s++)
1041  TestCuMatrixCholesky<Real>(sizes[s]);
1042  for (int32 s = 0; s < ns; s++)
1043  TestCuMatrixSigmoid<Real>(sizes[s]);
1044  for (int32 s = 0; s < ns; s++)
1045  TestCuMatrixHeaviside<Real>(sizes[s]);
1046  for (int32 s = 0; s < ns; s++)
1047  TestCuFindRowMaxId<Real>(sizes[s]);
1048  for (int32 s = 0; s < ns; s++)
1049  TestCuMatrixCompObjfAndDeriv<Real>(sizes[s]);
1050  for (int32 s = 0; s < ns; s++)
1051  TestCuMatrixMulRowsGroupMat<Real>(sizes[s]);
1052  for (int32 s = 0; s < ns; s++)
1053  TestCuMatrixSoftmax<Real>(sizes[s]);
1054  for (int32 s = 0; s < ns; s++)
1055  TestCuMatrixDiffSoftmax<Real>(sizes[s]);
1056  for (int32 s = 0; s < ns; s++)
1057  TestCuMatrixDiffLogSoftmax<Real>(sizes[s]);
1058  for (int32 s = 0; s < ns; s++)
1059  TestCuMatrixLogSoftmax<Real>(sizes[s]);
1060  for (int32 s = 0; s < ns; s++)
1061  TestCuMatrixGroupPnorm<Real>(sizes[s]);
1062  for (int32 s = 0; s < ns; s++)
1063  TestCuMatrixDiffGroupPnorm<Real>(sizes[s]);
1064  for (int32 s = 0; s < ns; s++)
1065  TestCuMatrixGroupMax<Real>(sizes[s]);
1066  for (int32 s = 0; s < ns; s++)
1067  TestCuMatrixGroupMaxAllGroupSizes<Real>(sizes[s]);
1068  for (int32 s = 0; s < ns; s++)
1069  TestCuMatrixGroupMaxDeriv<Real>(sizes[s]);
1070  for (int32 s = 0; s < ns; s++)
1071  TestCuMatrixTraceMatMat<Real>(sizes[s]);
1072  for (int32 s = 0; s < ns; s++)
1073  TestCuSparseMatrixTraceMatSmat<Real>(sizes[s]);
1074  for (int32 s = 0; s < ns; s++)
1075  TestCuMatrixCopyLowerToUpper<Real>(sizes[s]);
1076  for (int32 s = 0; s < ns; s++)
1077  TestCuMatrixCopyFromTp<Real>(sizes[s], kNoTrans);
1078  for (int32 s = 0; s < ns; s++)
1079  TestCuMatrixCopyFromTp<Real>(sizes[s], kTrans);
1080  for (int32 s = 0; s < ns; s++)
1081  TestCuMatrixCopyFromSp<Real>(sizes[s]);
1082  for (int32 s = 0; s < ns; s++)
1083  TestCuMatrixCopyUpperToLower<Real>(sizes[s]);
1084  for (int32 s = 0; s < ns; s++)
1085  TestCuMatrixSetZeroAboveDiag<Real>(sizes[s]);
1086  for (int32 s = 0; s + 2 < ns; s++)
1087  TestCuMatrixLookup<Real>(sizes[s]);
1088  for (int32 s = 0; s < ns; s++)
1089  TestCuMatrixCopyRows1<Real>(sizes[s]);
1090  for (int32 s = 0; s < ns; s++)
1091  TestCuMatrixCopyRows2<Real>(sizes[s]);
1092  for (int32 s = 0; s < ns; s++)
1093  TestCuMatrixCopyToRows<Real>(sizes[s]);
1094  for (int32 s = 0; s < ns; s++)
1095  TestCuMatrixAddRows1<Real>(sizes[s]);
1096  for (int32 s = 0; s < ns; s++)
1097  TestCuMatrixAddRows2<Real>(sizes[s]);
1098  for (int32 s = 0; s < ns; s++)
1099  TestCuMatrixAddToRows<Real>(sizes[s]);
1100  for (int32 s = 0; s < ns; s++)
1101  TestCuMatrixAddRowRanges<Real>(sizes[s]);
1102  for (int32 s = 0; s < ns; s++)
1103  TestCuMatrixTransposeCross<Real>(sizes[s]);
1104  for (int32 s = 0; s < ns; s++)
1105  TestCuMatrixTransposeS<Real>(sizes[s]);
1106  for (int32 s = 0; s < ns; s++)
1107  TestCuMatrixTransposeNS<Real>(sizes[s]);
1108  for (int32 s = 0; s < ns; s++)
1109  TestCuMatrixSum<Real>(sizes[s]);
1110  for (int32 s = 0; s < ns; s++)
1111  TestCuMatrixMax<Real>(sizes[s]);
1112  for (int32 s = 0; s < ns; s++)
1113  TestCuMatrixMin<Real>(sizes[s]);
1114 }
1115 
1116 
1117 } // namespace kaldi
1118 
1119 
1120 int main() {
1121  SetVerboseLevel(1);
1122 #if HAVE_CUDA == 1
1123  int32 loop = 0;
1124  for (loop = 0; loop < 2; loop++) {
1125  if (loop == 0)
1126  CuDevice::Instantiate().SelectGpuId("no");
1127  else
1128  CuDevice::Instantiate().SelectGpuId("yes");
1129 #endif
1130 
1131  kaldi::CudaMatrixSpeedTest<float>();
1132 #if HAVE_CUDA == 1
1133  if (CuDevice::Instantiate().DoublePrecisionSupported()) {
1134  kaldi::CudaMatrixSpeedTest<double>();
1135  } else {
1136  KALDI_WARN << "Double precision not supported";
1137  }
1138 #else
1139  kaldi::CudaMatrixSpeedTest<double>();
1140 #endif
1141 #if HAVE_CUDA == 1
1142  } // No for loop if 'HAVE_CUDA != 1',
1143  CuDevice::Instantiate().PrintProfile();
1144 #endif
1145  KALDI_LOG << "Tests succeeded.";
1146 }
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:344
void TestCuMatrixCholesky(int32 dim)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void TestCuMatrixLookup(int32 dim)
Packed symmetric matrix class.
Definition: matrix-common.h:62
void TestCuMatrixCopyToRows(int32 dim)
void TestCuMatrixSetZeroAboveDiag(int32 dim)
void TestCuMatrixGroupMaxAllGroupSizes(int32 dim)
void GroupMax(const CuMatrixBase< Real > &src)
Apply the function y(i) = max_{j = i*G}^{(i+1)*G-1} x_j, where G = x.NumCols() / y.NumCols() must be an integer.
Definition: cu-matrix.cc:1617
void TestCuMatrixTransposeCross(int32 dim)
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
Definition: kaldi-math.h:151
void CopyToVec(std::vector< T > *dst) const
This function resizes *dst if needed.
Definition: cu-array-inl.h:177
void TestCuMatrixResize(int32 size_multiple)
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67
void TestCuMatrixSum(int32 dim)
void TestSymInvertPosDef(int32 dim)
void TestCuMatrixDiffSoftmax(int32 dim)
Real Sum() const
Definition: cu-matrix.cc:3012
void ApplyFloor(Real floor_val)
Definition: cu-matrix.h:451
std::string NameOf()
kaldi::int32 int32
void TestCuMatrixMulRowsGroupMat(int32 dim)
void TestCuMatrixAddRowRanges(int32 dim)
void AddToDiag(Real value)
Adds "value" to the diagonal elements of the matrix.
Definition: cu-matrix.cc:604
void DivRowsVec(const CuVectorBase< Real > &div)
divide i'th row by scale[i]
Definition: cu-matrix.cc:899
A class for storing matrices.
Definition: kaldi-matrix.h:823
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void TestCuMatrixMatMatBatched(int32 dim, int32 batchCount)
void TestCuMatrixMin(int32 dim)
void Min(const CuMatrixBase< Real > &A)
Do, elementwise, *this = min(*this, A).
Definition: cu-matrix.cc:740
void TestCuMatrixCopyRows2(int32 dim)
void SetRandn()
< Set to unit matrix.
void InvertElements()
invert the matrix by elements.
Definition: cu-matrix.cc:932
int main()
void TestCuMatrixCopyUpperToLower(int32 dim)
void Lookup(const std::vector< Int32Pair > &indexes, Real *output) const
Definition: cu-matrix.cc:3370
void SetVerboseLevel(int32 i)
This should be rarely used, except by programs using Kaldi as library; command-line programs set the ...
Definition: kaldi-error.h:64
void SymInvertPosDef()
Inversion for positive definite symmetric matrices.
Definition: cu-matrix.cc:2111
void CompObjfAndDeriv(const std::vector< MatrixElement< Real > > &elements, const CuMatrix< Real > &A, Real *tot_objf, Real *tot_weight)
Here, A is interpreted as a matrix of probabilities, and "elements" as a list of posteriors (possibly...
Definition: cu-matrix.cc:1661
void AddMatMatBatched(const Real alpha, std::vector< CuSubMatrix< Real > * > &C, const std::vector< CuSubMatrix< Real > * > &A, MatrixTransposeType transA, const std::vector< CuSubMatrix< Real > * > &B, MatrixTransposeType transB, const Real beta)
Does multiple matrix multiplications, executing them in parallel using cuBLAS's gemmBatched if we are...
Definition: cu-matrix.cc:2207
void TestCuMatrixGroupPnorm(int32 dim)
void TestCuMatrixAddMat(int32 dim, int32 num_row_blocks, int32 num_col_blocks)
int32 MatrixIndexT
Definition: matrix-common.h:98
void Max(const CuMatrixBase< Real > &A)
Do, elementwise, *this = max(*this, A).
Definition: cu-matrix.cc:715
void TestCuMatrixDiffLogSoftmax(int32 dim)
void TestCuMatrixTransposeNS(int32 dim)
void CopyFromSp(const CuSpMatrix< Real > &M)
Definition: cu-matrix.cc:360
void Sigmoid(const CuMatrixBase< Real > &src)
Set each element to the sigmoid of the corresponding element of "src": element by element...
Definition: cu-matrix.cc:1534
void Add(Real value)
Definition: cu-matrix.cc:582
void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans)
void SetZero()
Math operations, some calling kernels.
Definition: cu-matrix.cc:509
void SoftMaxPerRow(const CuMatrixBase< Real > &src)
Softmax nonlinearity Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, with attention to avoiding overflow or underflow.
Definition: cu-matrix.cc:1717
void TestCuMatrixAddRows2(int32 dim)
struct rnnlm::@11::@12 n
void SymAddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transA, Real beta)
*this = beta * *this + alpha * M M^T, for symmetric matrices.
Definition: cu-matrix.cc:1353
void TestCuMatrixCopyLowerToUpper(int32 dim)
void TestCuMatrixMax(int32 dim)
void SetRandn()
Sets to random values of a normal distribution.
void TestCuMatrixGroupMax(int32 dim)
void GroupPnorm(const CuMatrixBase< Real > &src, Real pow)
Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / p) where G = x...
Definition: cu-matrix.cc:1576
Packed symmetric matrix class.
Definition: matrix-common.h:63
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
Definition: cu-matrix.cc:1291
void Cholesky(CuMatrixBase< Real > *inv_cholesky=NULL)
This function sets *this to the Cholesky factor of *this (i.e.
Definition: cu-matrix.cc:1987
#define KALDI_WARN
Definition: kaldi-error.h:150
Real TraceMatMat(const MatrixBase< Real > &A, const MatrixBase< Real > &B, MatrixTransposeType trans)
We need to declare this here as it will be a friend function.
This class is used for a piece of a CuMatrix.
Definition: matrix-common.h:70
void CudaMatrixSpeedTest()
void GroupMaxDeriv(const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &output)
Calculate derivatives for the GroupMax function above, where "input" is the input to the GroupMax fun...
Definition: cu-matrix.cc:874
int Rand(struct RandomState *state)
Definition: kaldi-math.cc:45
static void TestCuMatrixCompObjfAndDeriv(int32 dim)
void FindRowMaxId(CuArray< int32 > *id) const
Find the id of the maximal element for each row (resizes the 'id' array to the appropriate size)...
Definition: cu-matrix.cc:1829
void TestCuMatrixGroupMaxDeriv(int32 dim)
void TestCuSparseMatrixTraceMatSmat(int32 dim)
void TestCuMatrixCopyFromTp(int32 dim, MatrixTransposeType trans)
void TestCuMatrixDivRowsVec(int32 dim)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void TestCuMatrixSoftmax(int32 dim)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64
void TestCuMatrixAddRows1(int32 dim)
void TestCuMatrixCopyFromSp(int32 dim)
void CopyFromTp(const CuTpMatrix< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Definition: cu-matrix.cc:280
MatrixTransposeType
Definition: matrix-common.h:32
static void AssertEqual(float a, float b, float relative_tolerance=0.001)
assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b))
Definition: kaldi-math.h:276
Real TraceMatSmat(const MatrixBase< Real > &A, const SparseMatrix< Real > &B, MatrixTransposeType trans)
int32_cuda second
Definition: cu-matrixdim.h:80
void TestCuMatrixTransposeS(int32 dim)
void LogSoftMaxPerRow(const CuMatrixBase< Real > &src)
LogSoftmax nonlinearity Y = LogSoftmax(X) : Yij = Xij - log(sum_k(e^Xik)), done to each row...
Definition: cu-matrix.cc:1740
void TestCuMatrixDiffGroupPnorm(int32 dim)
void TestCuMatrixAddToRows(int32 dim)
void TestCuMatrixAddMatBlocks(int32 dim, int32 num_row_blocks, int32 num_col_blocks)
static void TestCuFindRowMaxId(int32 dim)
#define KALDI_LOG
Definition: kaldi-error.h:153
void TestCuMatrixSigmoid(int32 dim)
void TestCuMatrixTraceMatMat(int32 dim)
double Elapsed() const
Returns time in seconds.
Definition: timer.h:74
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
Definition: kaldi-math.h:265
void TestCuMatrixCopyRows1(int32 dim)
void TestCuMatrixLogSoftmax(int32 dim)
void SetZeroAboveDiag()
Zeroes all elements for which col > row.
Definition: cu-matrix.cc:554
void TestCuMatrixMatMat(int32 dim)
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].
Definition: cu-matrix.cc:1382
void TestCuMatrixHeaviside(int32 dim)
const Real * RowData(MatrixIndexT r) const
Get raw row pointer (const).
Definition: cu-matrix.h:740
int32_cuda first
Definition: cu-matrixdim.h:79