cu-vector-speed-test.cc
Go to the documentation of this file.
1 // cudamatrix/cu-vector-speed-test.cc
2 
3 // Copyright 2013 Johns Hopkins University (author: Daniel Povey)
4 // 2017 Daniel Galvez
5 // 2016-2018 Shiyin Kang
6 
7 
8 // See ../../COPYING for clarification regarding multiple authors
9 //
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 //
14 // http://www.apache.org/licenses/LICENSE-2.0
15 //
16 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
18 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
19 // MERCHANTABLITY OR NON-INFRINGEMENT.
20 // See the Apache 2 License for the specific language governing permissions and
21 // limitations under the License.
22 
23 
24 #include <iostream>
25 #include <vector>
26 #include <cstdlib>
27 
28 #include "base/kaldi-common.h"
29 #include "util/common-utils.h"
30 #include "cudamatrix/cu-matrix.h"
31 #include "cudamatrix/cu-vector.h"
32 #include "cudamatrix/cu-math.h"
33 
34 using namespace kaldi;
35 
36 
37 namespace kaldi {
38 
// Returns a printable tag for the floating-point type Real, used to label
// log lines: "<double>" when Real is 8 bytes wide, "<float>" otherwise.
template<typename Real>
std::string NameOf() {
  if (sizeof(Real) == 8) {
    return "<double>";
  }
  return "<float>";
}
43 
44 template<typename Real> void TestCuVectorSoftmax(int32 dim) {
45  BaseFloat time_in_secs = 0.02;
46  CuVector<Real> M(dim);
47  M.SetRandn();
48 
49  Timer tim;
50  int32 iter = 0;
51  for (;tim.Elapsed() < time_in_secs; iter++) {
52  M.ApplySoftMax();
53  }
54 
55  BaseFloat fdim = dim;
56  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
57  KALDI_LOG << "For CuVector::Softmax" << NameOf<Real>() << ", for dim = "
58  << dim << ", speed was " << gflops << " gigaflops.";
59 }
60 
61 
62 template<typename Real> void TestCuVectorSum(int32 dim) {
63  BaseFloat time_in_secs = 0.02;
64  CuVector<Real> M(dim);
65  M.SetRandn();
66 
67  Timer tim;
68  int32 iter = 0;
69  for (;tim.Elapsed() < time_in_secs; iter++) {
70  M.Sum();
71  }
72 
73  BaseFloat fdim = dim;
74  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
75  KALDI_LOG << "For CuVector::Sum" << NameOf<Real>() << ", for dim = "
76  << dim << ", speed was " << gflops << " gigaflops.";
77 }
78 
79 template<typename Real, typename OtherReal> void TestCuVectorCopyFromVec(int32 dim) {
80  BaseFloat time_in_secs = 0.02;
81  CuVector<Real> M(dim);
82  M.SetRandn();
83 
84  Timer tim;
85  int32 iter = 0;
86  for (;tim.Elapsed() < time_in_secs; iter++) {
87  CuVector<OtherReal> v(dim);
88  v.CopyFromVec(M);
89  }
90 
91  BaseFloat fdim = dim;
92  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
93  KALDI_LOG << "For CuVector::CopyFromVec" << NameOf<Real>() << " to "
94  << NameOf<OtherReal>() << ", for dim = "
95  << dim << ", speed was " << gflops << " gigaflops.";
96 }
97 
98 
99 #if HAVE_CUDA == 1
100 // This test choose the min length of vectors to be reduced on GPU.
101 // Smaller vector will be copied to RAM and reduced on CPU.
102 template<typename Real> void TestCuVectorSumChooseMinLength() {
103  BaseFloat time_in_secs = 0.02;
104  for (int dim = 100; dim < 1000000; dim = dim * 1.5 + 1 ) {
105  CuVector<Real> M(dim);
106  BaseFloat gflops, gflops_cpu;
107  Real result = 0, result_cpu = 0;
108  M.SetRandn();
109  {
110  Timer tim;
111  int32 iter = 0;
112  for (; tim.Elapsed() < time_in_secs; iter++) {
113  // Force GPU reduction
114  int dimBlock = CU1DBLOCK;
115  int dimGrid = n_blocks(M.Dim(), dimBlock);
116  if (dimGrid > 256) {
117  dimGrid = 256;
118  }
119  CuVector<Real> ans(dimGrid, kUndefined);
120  cuda_vec_sum(dimGrid, dimBlock, M.Data(), ans.Data(), M.Dim(), 1);
121  CU_SAFE_CALL(cudaGetLastError());
122  Vector<Real> ans_cpu(ans);
123  result = ans_cpu.Sum();
124  }
125 
126  BaseFloat fdim = dim;
127  gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
128  }
129  {
130  Timer tim;
131  int32 iter = 0;
132  for (; tim.Elapsed() < time_in_secs; iter++) {
133  Vector<Real> M_cpu(M);
134  result_cpu = M_cpu.Sum();
135  }
136 
137  BaseFloat fdim = dim;
138  gflops_cpu = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
139  }
140  KALDI_LOG << "CuVector::Sum" << NameOf<Real>() << ", dim: " << dim
141  << ", speed: GPU " << (gflops > gflops_cpu ? ">" : "<")
142  << " CPU, GPU speed: " << gflops << " Gflops. CPU speed: "
143  << gflops_cpu << " Gflops. Result diff: " << (result - result_cpu);
144  }
145 }
146 #endif
147 
148 template<typename Real> void TestCuVectorVecVecOne(int32 dim) {
149  BaseFloat time_in_secs = 0.02;
150  CuVector<Real> M(dim);
151  M.SetRandn();
152 
153  Timer tim;
154  int32 iter = 0;
155  for (;tim.Elapsed() < time_in_secs; iter++) {
156  CuVector<Real> ones(dim);
157  ones.Set(1.0);
158  VecVec(M, ones);
159  }
160 
161  BaseFloat fdim = dim;
162  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
163  KALDI_LOG << "For CuVector::VecVecOne" << NameOf<Real>() << ", for dim = "
164  << dim << ", speed was " << gflops << " gigaflops.";
165 }
166 
167 
168 
169 
170 template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim,
171  MatrixTransposeType transN,
172  MatrixTransposeType transO) {
173  BaseFloat time_in_secs = 0.02;
174  CuVector<Real> v(dim);
175  v.SetRandn();
176  CuMatrix<Real> N(dim, dim), O(dim, dim);
177  N.SetRandn();
178  O.SetRandn();
179 
180  Timer tim;
181  int32 iter = 0;
182 
183  for (;tim.Elapsed() < time_in_secs; iter++) {
184  v.AddDiagMatMat(1.0, N, transN, O, transO, 1.0);
185  }
186 
187  BaseFloat fdim = dim;
188  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
189  KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>()
190  << (transN == kNoTrans ? "[no-trans],":"[trans],")
191  << (transO == kNoTrans ? "[no-trans],":"[trans],")
192  << " for dim = "<< dim << ", speed was " << gflops << " gigaflops.";
193 }
194 
195 
196 template<typename Real> void TestCuVectorAddDiagMat2OnVariousShapes(
197  int32 dim, MatrixTransposeType trans) {
198  BaseFloat time_in_secs = 0.02;
199  int32 size = 1024 * 32;
200  CuVector<Real> v(trans == kNoTrans ? size / dim : dim);
201  v.SetRandn();
202  CuMatrix<Real> N(size / dim, dim);
203  N.SetRandn();
204 
205  Timer tim;
206  int32 iter = 0;
207 
208  for (; tim.Elapsed() < time_in_secs; iter++) {
209  v.AddDiagMat2(1.0, N, trans, 0.0);
210  }
211 
212  BaseFloat fdim = size;
213  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
214  KALDI_LOG << "For CuVector::AddDiagMat2Shapes" << NameOf<Real>()
215  << (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = ("
216  << size / dim << ", " << dim << "), speed was " << gflops
217  << " gigaflops.";
218 }
219 
220 
221 
222 template<typename Real> void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans) {
223  BaseFloat time_in_secs = 0.02;
224  CuVector<Real> v(dim);
225  v.SetRandn();
226  CuMatrix<Real> N(dim, dim);
227  N.SetRandn();
228 
229  Timer tim;
230  int32 iter = 0;
231 
232  for (;tim.Elapsed() < time_in_secs; iter++) {
233  v.AddDiagMat2(1.0, N, trans, 0.0);
234  }
235 
236  BaseFloat fdim = dim;
237  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
238  KALDI_LOG << "For CuVector::AddDiagMat2" << NameOf<Real>()
239  << (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = "
240  << dim << ", speed was " << gflops << " gigaflops.";
241 }
242 
243 
244 template<typename Real> void TestCuVectorAddRowSumMat(int32 dim, MatrixTransposeType trans) {
245  BaseFloat time_in_secs = 0.02;
246  CuVector<Real> v(dim);
247  v.SetRandn();
248  CuMatrix<Real> N(dim, dim);
249  N.SetRandn();
250 
251  Timer tim;
252  int32 iter = 0;
253 
254  for (;tim.Elapsed() < time_in_secs; iter++) {
255  v.AddRowSumMat(1.0, N, 0.5);
256  }
257 
258  BaseFloat fdim = dim;
259  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
260  KALDI_LOG << "For CuVector::AddRowSumMat" << NameOf<Real>()
261  << (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = "
262  << dim << ", speed was " << gflops << " gigaflops.";
263 }
264 
265 
266 template<typename Real> void TestCuVectorAddColSumMat(int32 dim, MatrixTransposeType trans) {
267  BaseFloat time_in_secs = 0.02;
268  CuVector<Real> v(dim);
269  v.SetRandn();
270  CuMatrix<Real> N(dim, dim);
271  N.SetRandn();
272 
273  Timer tim;
274  int32 iter = 0;
275 
276  for (;tim.Elapsed() < time_in_secs; iter++) {
277  v.AddColSumMat(1.0, N, 0.5);
278  }
279 
280  BaseFloat fdim = dim;
281  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
282  KALDI_LOG << "For CuVector::AddColSumMat" << NameOf<Real>()
283  << (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = "
284  << dim << ", speed was " << gflops << " gigaflops.";
285 }
286 
287 
288 template<typename Real> void TestCuVectorApplyFloor(int32 dim) {
289  BaseFloat time_in_secs = 0.02;
290  CuVector<Real> v(dim);
291  v.SetRandn();
292  Real threshold = RandInt(-35000, 35000) / Real(100);
293 
294  Timer tim;
295  int32 iter = 0;
296  for (;tim.Elapsed() < time_in_secs; iter++) {
297  MatrixIndexT dummy_count;
298  v.ApplyFloor(threshold, &dummy_count);
299  }
300 
301  BaseFloat fdim = dim;
302  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
303  KALDI_LOG << "For CuVector::ApplyFloor" << NameOf<Real>() << ", for dim = "
304  << dim << ", speed was " << gflops << " gigaflops.";
305 
306 }
307 
308 
309 template<typename Real> void TestCuVectorApplyFloorNoCount(int32 dim) {
310  BaseFloat time_in_secs = 0.02;
311  CuVector<Real> v(dim);
312  v.SetRandn();
313  Real threshold = RandInt(-35000, 35000) / Real(100);
314 
315  Timer tim;
316  int32 iter = 0;
317  for (;tim.Elapsed() < time_in_secs; iter++) {
318  v.ApplyFloor(threshold, nullptr);
319  }
320 
321  BaseFloat fdim = dim;
322  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
323  KALDI_LOG << "For CuVector::ApplyFloor (no count variety)" << NameOf<Real>()
324  << ", for dim = " << dim << ", speed was " << gflops
325  << " gigaflops.";
326 
327 }
328 
329 
330 template<typename Real> void TestCuVectorApplyCeiling(int32 dim) {
331  BaseFloat time_in_secs = 0.02;
332  CuVector<Real> v(dim);
333  v.SetRandn();
334  Real threshold = RandInt(-35000, 35000) / Real(100);
335 
336  Timer tim;
337  int32 iter = 0;
338  for (;tim.Elapsed() < time_in_secs; iter++) {
339  MatrixIndexT dummy_count;
340  v.ApplyCeiling(threshold, &dummy_count);
341  }
342 
343  BaseFloat fdim = dim;
344  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
345  KALDI_LOG << "For CuVector::ApplyCeiling" << NameOf<Real>() << ", for dim = "
346  << dim << ", speed was " << gflops << " gigaflops.";
347 
348 }
349 
350 
351 template<typename Real> void TestCuVectorApplyCeilingNoCount(int32 dim) {
352  BaseFloat time_in_secs = 0.02;
353  CuVector<Real> v(dim);
354  v.SetRandn();
355  Real threshold = RandInt(-35000, 35000) / Real(100);
356 
357  Timer tim;
358  int32 iter = 0;
359  for (;tim.Elapsed() < time_in_secs; iter++) {
360  v.ApplyCeiling(threshold, nullptr);
361  }
362 
363  BaseFloat fdim = dim;
364  BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09);
365  KALDI_LOG << "For CuVector::ApplyCeiling (no count variety)" << NameOf<Real>()
366  << ", for dim = " << dim << ", speed was " << gflops
367  << " gigaflops.";
368 
369 }
370 
371 
372 template<typename Real> void TestCuVectorAddDiagMatMatShape(
373  int32 num_rows, int32 num_cols, MatrixTransposeType transM,
374  MatrixTransposeType transN) {
375  BaseFloat time_in_secs = 0.02;
376  CuVector<Real> v(transM == kTrans ? num_cols : num_rows);
377  v.SetRandn();
378  CuMatrix<Real> M(num_rows, num_cols);
379  CuMatrix<Real> N(transM != transN ? num_rows : num_cols,
380  transM != transN ? num_cols : num_rows);
381  M.SetRandn();
382  N.SetRandn();
383 
384  Timer tim;
385  int32 iter = 0;
386 
387  for (;tim.Elapsed() < time_in_secs; iter++) {
388  v.AddDiagMatMat(1.0, M, transM, N, transN, 1.0);
389  }
390 
391  BaseFloat fnr = num_rows;
392  BaseFloat fnc = num_cols;
393  BaseFloat gflops = (fnr * fnc * iter) / (tim.Elapsed() * 1.0e+09);
394  KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>()
395  << (transM == kNoTrans ? "[no-trans],":"[trans],")
396  << (transN == kNoTrans ? "[no-trans],":"[trans],")
397  << " for dim = "<< num_rows << ", " << num_cols
398  << ", speed was " << gflops << " gigaflops.";
399 }
400 
401 
402 template<typename Real> void CudaVectorSpeedTest() {
403  const size_t a = 1 << 5;
404  const size_t b = 1 << 8;
405  for (size_t i = a; i <= b; i *= 2) {
406  for (size_t j = a; j <= b; j *= 2) {
407  if (i * j <= a * b) {
408  TestCuVectorAddDiagMatMatShape<Real>(i, j, kNoTrans, kNoTrans);
409  TestCuVectorAddDiagMatMatShape<Real>(i, j, kNoTrans, kTrans);
410  TestCuVectorAddDiagMatMatShape<Real>(i, j, kTrans, kNoTrans);
411  TestCuVectorAddDiagMatMatShape<Real>(i, j, kTrans, kTrans);
412  }
413  }
414  }
415 
416  std::vector<int32> sizes;
417  for (int i = 32; i <= 1024; i *= 2) {
418  sizes.push_back(i);
419  }
420  int32 ns = sizes.size();
421  for (int32 s = 0; s < ns; s++)
422  TestCuVectorSoftmax<Real>(sizes[s]);
423 #if HAVE_CUDA == 1
424  TestCuVectorSumChooseMinLength<Real>();
425 #endif
426  for (int32 s = 0; s < ns; s++)
427  TestCuVectorSum<Real>(sizes[s]);
428  for (int32 s = 0; s < ns; s++)
429  TestCuVectorVecVecOne<Real>(sizes[s]);
430  for (int32 s = 0; s < ns; s++)
431  TestCuVectorCopyFromVec<Real, float>(sizes[s]);
432  for (int32 s = 0; s < ns; s++)
433  TestCuVectorCopyFromVec<Real, double>(sizes[s]);
434  for (int32 s = 0; s < ns; s++) {
435  TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kNoTrans);
436  TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kTrans);
437  TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kNoTrans);
438  TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kTrans);
439  }
440  for (int32 s = 0; s < ns; s++) {
441  TestCuVectorAddDiagMat2OnVariousShapes<Real>(sizes[s], kNoTrans);
442  TestCuVectorAddDiagMat2OnVariousShapes<Real>(sizes[s], kTrans);
443  }
444  for (int32 s = 0; s < ns; s++) {
445  TestCuVectorAddDiagMat2<Real>(sizes[s], kNoTrans);
446  TestCuVectorAddDiagMat2<Real>(sizes[s], kTrans);
447  }
448  for (int32 s = 0; s < ns; s++) {
449  TestCuVectorAddRowSumMat<Real>(sizes[s], kNoTrans);
450  TestCuVectorAddRowSumMat<Real>(sizes[s], kTrans);
451  }
452  for (int32 s = 0; s < ns; s++) {
453  TestCuVectorAddColSumMat<Real>(sizes[s], kNoTrans);
454  TestCuVectorAddColSumMat<Real>(sizes[s], kTrans);
455  }
456  for (int32 s = 0; s < ns; s++) {
457  TestCuVectorApplyFloor<Real>(sizes[s]);
458  TestCuVectorApplyFloorNoCount<Real>(sizes[s]);
459  }
460  for (int32 s = 0; s < ns; s++) {
461  TestCuVectorApplyCeiling<Real>(sizes[s]);
462  TestCuVectorApplyCeilingNoCount<Real>(sizes[s]);
463  }
464 
465 }
466 
467 
468 } // namespace kaldi
469 
470 
471 int main() {
473  //Select the GPU
474 #if HAVE_CUDA == 1
475  CuDevice::Instantiate().SelectGpuId("yes"); //-2 .. automatic selection
476 #endif
477 
478  kaldi::CudaVectorSpeedTest<float>();
479 #if HAVE_CUDA == 1
480  if (CuDevice::Instantiate().DoublePrecisionSupported()) {
481  kaldi::CudaVectorSpeedTest<double>();
482  } else {
483  KALDI_WARN << "Double precision not supported";
484  }
485 #else
486  kaldi::CudaVectorSpeedTest<double>();
487 #endif
488  KALDI_LOG << "Tests succeeded.";
489 }
490 
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void TestCuVectorApplyFloor(int32 dim)
void ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count=NULL)
Definition: cu-vector.h:143
void TestCuVectorSoftmax(int32 dim)
Real Sum() const
Definition: cu-vector.cc:297
void Set(Real value)
Definition: cu-vector.cc:1135
void AddDiagMat2(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType trans, Real beta)
Add the diagonal of a matrix times itself: *this = diag(M M^T) + beta * *this (if trans == kNoTrans)...
Definition: cu-vector.cc:595
std::string NameOf()
kaldi::int32 int32
void TestCuVectorSum(int32 dim)
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
Definition: matrix-common.h:71
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=NULL)
Definition: cu-vector.h:139
void TestCuVectorAddDiagMat2OnVariousShapes(int32 dim, MatrixTransposeType trans)
void TestCuVectorVecVecOne(int32 dim)
void AddDiagMatMat(Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const CuMatrixBase< Real > &N, MatrixTransposeType transN, Real beta=1.0)
Add the diagonal of a matrix product: *this = diag(M N), assuming the "trans" arguments are both kNoT...
Definition: cu-vector.cc:611
void SetVerboseLevel(int32 i)
This should be rarely used, except by programs using Kaldi as library; command-line programs set the ...
Definition: kaldi-error.h:64
int32 MatrixIndexT
Definition: matrix-common.h:98
void CopyFromVec(const CuVectorBase< Real > &src)
Copy functions; these will crash if the dimension do not match.
Definition: cu-vector.cc:1078
void AddColSumMat(Real alpha, const CuMatrixBase< Real > &mat, Real beta=1.0)
Sum the columns of the matrix, add to vector.
Definition: cu-vector.cc:1298
void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans)
void TestCuVectorApplyCeilingNoCount(int32 dim)
#define CU1DBLOCK
Definition: cu-matrixdim.h:57
#define KALDI_WARN
Definition: kaldi-error.h:150
void TestCuVectorAddDiagMatMat(int32 dim, MatrixTransposeType transN, MatrixTransposeType transO)
void TestCuVectorCopyFromVec(int32 dim)
Real Sum() const
Returns sum of the elements.
A class representing a vector.
Definition: kaldi-vector.h:406
void TestCuVectorAddDiagMatMatShape(int32 num_rows, int32 num_cols, MatrixTransposeType transM, MatrixTransposeType transN)
MatrixTransposeType
Definition: matrix-common.h:32
Real * Data()
Returns a pointer to the start of the vector's data.
Definition: cu-vector.h:72
void CudaVectorSpeedTest()
void TestCuVectorApplyCeiling(int32 dim)
void TestCuVectorAddColSumMat(int32 dim, MatrixTransposeType trans)
void TestCuVectorApplyFloorNoCount(int32 dim)
#define KALDI_LOG
Definition: kaldi-error.h:153
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37
double Elapsed() const
Returns time in seconds.
Definition: timer.h:74
void AddRowSumMat(Real alpha, const CuMatrixBase< Real > &mat, Real beta=1.0)
Sum the rows of the matrix, add to vector.
Definition: cu-vector.cc:1277
MatrixIndexT Dim() const
Dimensions.
Definition: cu-vector.h:69
int32 RandInt(int32 min_val, int32 max_val, struct RandomState *state)
Definition: kaldi-math.cc:95
int main()
void TestCuVectorAddRowSumMat(int32 dim, MatrixTransposeType trans)