cu-array-inl.h
Go to the documentation of this file.
1 // cudamatrix/cu-array-inl.h
2 
3 // Copyright 2009-2016 Karel Vesely
4 // 2013 Johns Hopkins University (author: Daniel Povey)
5 // 2017 Shiyin Kang
6 
7 
8 // See ../../COPYING for clarification regarding multiple authors
9 //
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 //
14 // http://www.apache.org/licenses/LICENSE-2.0
15 //
16 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
18 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
19 // MERCHANTABLITY OR NON-INFRINGEMENT.
20 // See the Apache 2 License for the specific language governing permissions and
21 // limitations under the License.
22 
23 
24 
25 #ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
26 #define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_
27 
28 #include <algorithm>
29 
30 #if HAVE_CUDA == 1
31 #include <cuda_runtime_api.h>
32 #include "cudamatrix/cu-common.h"
33 #include "cudamatrix/cu-device.h"
34 #include "cudamatrix/cu-kernels.h"
35 #endif
36 
37 #include "base/timer.h"
38 
39 namespace kaldi {
40 
41 
42 template<typename T>
44  KALDI_ASSERT((resize_type == kSetZero || resize_type == kUndefined) && dim >= 0);
45  if (this->dim_ == dim) {
46  if (resize_type == kSetZero)
47  this->SetZero();
48  return;
49  }
50 
51  Destroy();
52 
53  if (dim == 0) return;
54 
55 #if HAVE_CUDA == 1
56  if (CuDevice::Instantiate().Enabled()) {
57  CuTimer tim;
58  this->data_ = static_cast<T*>(CuDevice::Instantiate().Malloc(dim * sizeof(T)));
59  this->dim_ = dim;
60  if (resize_type == kSetZero) this->SetZero();
61  CuDevice::Instantiate().AccuProfile("CuArray::Resize", tim);
62  } else
63 #endif
64  {
65  this->data_ = static_cast<T*>(malloc(dim * sizeof(T)));
66  // We allocate with malloc because we don't want constructors being called.
67  // We basically ignore memory alignment issues here-- we assume the malloc
68  // implementation is forgiving enough that it will automatically align on
69  // sensible boundaries.
70  if (this->data_ == 0)
71  KALDI_ERR << "Memory allocation failed when initializing CuVector "
72  << "with dimension " << dim << " object size in bytes: "
73  << sizeof(T);
74  }
75 
76  this->dim_ = dim;
77  if (resize_type == kSetZero)
78  this->SetZero();
79 }
80 
81 template<typename T>
83 #if HAVE_CUDA == 1
84  if (CuDevice::Instantiate().Enabled()) {
85  if (this->data_ != NULL) {
86  CuDevice::Instantiate().Free(this->data_);
87  }
88  } else
89 #endif
90  {
91  if (this->data_ != NULL)
92  free(this->data_);
93  }
94  this->dim_ = 0;
95  this->data_ = NULL;
96 }
97 
98 
99 template<typename T>
100 void CuArrayBase<T>::CopyFromVec(const std::vector<T> &src) {
101  KALDI_ASSERT(dim_ == src.size());
102  if (src.empty())
103  return;
104 #if HAVE_CUDA == 1
105  if (CuDevice::Instantiate().Enabled()) {
106  CuTimer tim;
107  CU_SAFE_CALL(
108  cudaMemcpyAsync(data_, &src.front(), src.size() * sizeof(T),
109  cudaMemcpyHostToDevice, cudaStreamPerThread));
110  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
111  CuDevice::Instantiate().AccuProfile(__func__, tim);
112  } else
113 #endif
114  {
115  memcpy(data_, &src.front(), src.size() * sizeof(T));
116  }
117 }
118 
119 template<typename T>
120 void CuArray<T>::CopyFromVec(const std::vector<T> &src) {
121  Resize(src.size(), kUndefined);
122  if (src.empty()) return;
123 #if HAVE_CUDA == 1
124  if (CuDevice::Instantiate().Enabled()) {
125  CuTimer tim;
126  CU_SAFE_CALL(cudaMemcpyAsync(this->data_, &src.front(),
127  src.size()*sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread));
128  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
129  CuDevice::Instantiate().AccuProfile(__func__, tim);
130  } else
131 #endif
132  {
133  memcpy(this->data_, &src.front(), src.size()*sizeof(T));
134  }
135 }
136 
137 
138 template<typename T>
140  this->Resize(src.Dim(), kUndefined);
141  if (this->dim_ == 0) return;
142 #if HAVE_CUDA == 1
143  if (CuDevice::Instantiate().Enabled()) {
144  CuTimer tim;
145  CU_SAFE_CALL(cudaMemcpyAsync(this->data_, src.data_, this->dim_ * sizeof(T),
146  cudaMemcpyDeviceToDevice,
147  cudaStreamPerThread));
148  CuDevice::Instantiate().AccuProfile(__func__, tim);
149  } else
150 #endif
151  {
152  memcpy(this->data_, src.data_, this->dim_ * sizeof(T));
153  }
154 }
155 
156 template<typename T>
158  KALDI_ASSERT(src.Dim() == Dim());
159  if (dim_ == 0)
160  return;
161 #if HAVE_CUDA == 1
162  if (CuDevice::Instantiate().Enabled()) {
163  CuTimer tim;
164  CU_SAFE_CALL(
165  cudaMemcpyAsync(this->data_, src.data_, dim_ * sizeof(T),
166  cudaMemcpyDeviceToDevice, cudaStreamPerThread));
167  CuDevice::Instantiate().AccuProfile(__func__, tim);
168  } else
169 #endif
170  {
171  memcpy(this->data_, src.data_, dim_ * sizeof(T));
172  }
173 }
174 
175 
176 template<typename T>
177 void CuArrayBase<T>::CopyToVec(std::vector<T> *dst) const {
178  if (static_cast<MatrixIndexT>(dst->size()) != this->dim_) {
179  dst->resize(this->dim_);
180  }
181  if (this->dim_ == 0) return;
182 #if HAVE_CUDA == 1
183  if (CuDevice::Instantiate().Enabled()) {
184  CuTimer tim;
185  CU_SAFE_CALL(cudaMemcpyAsync(&dst->front(), Data(), this->dim_ * sizeof(T),
186  cudaMemcpyDeviceToHost, cudaStreamPerThread));
187  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
188  CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim);
189  } else
190 #endif
191  {
192  memcpy(&dst->front(), this->data_, this->dim_ * sizeof(T));
193  }
194 }
195 
196 
197 template<typename T>
198 void CuArrayBase<T>::CopyToHost(T *dst) const {
199  if (this->dim_ == 0) return;
200  KALDI_ASSERT(dst != NULL);
201 #if HAVE_CUDA == 1
202  if (CuDevice::Instantiate().Enabled()) {
203  CuTimer tim;
204  CU_SAFE_CALL(cudaMemcpyAsync(dst, Data(), this->dim_ * sizeof(T),
205  cudaMemcpyDeviceToHost, cudaStreamPerThread));
206  CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
207  CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim);
208  } else
209 #endif
210  {
211  memcpy(dst, this->data_, this->dim_ * sizeof(T));
212  }
213 }
214 
215 
216 template<typename T>
218  if (this->dim_ == 0) return;
219 #if HAVE_CUDA == 1
220  if (CuDevice::Instantiate().Enabled()) {
221  CuTimer tim;
222  CU_SAFE_CALL(cudaMemsetAsync(this->data_, 0, this->dim_ * sizeof(T),
223  cudaStreamPerThread));
224  CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim);
225  } else
226 #endif
227  {
228  memset(static_cast<void*>(this->data_), 0, this->dim_ * sizeof(T));
229  }
230 }
231 
232 
233 template<class T>
234 void CuArrayBase<T>::Set(const T &value) {
235  // This is not implemented yet, we'll do so if it's needed.
236  KALDI_ERR << "CuArray<T>::Set not implemented yet for this type.";
237 }
238 // int32 specialization implemented in 'cudamatrix/cu-array.cc',
239 template<>
240 void CuArrayBase<int32>::Set(const int32 &value);
241 
242 
243 template<class T>
244 void CuArrayBase<T>::Sequence(const T base) {
245  // This is not implemented yet, we'll do so if it's needed.
246  KALDI_ERR << "CuArray<T>::Sequence not implemented yet for this type.";
247 }
248 // int32 specialization implemented in 'cudamatrix/cu-array.cc',
249 template<>
250 void CuArrayBase<int32>::Sequence(const int32 base);
251 
252 
253 template<class T>
254 void CuArrayBase<T>::Add(const T &value) {
255  // This is not implemented yet, we'll do so if it's needed.
256  KALDI_ERR << "CuArray<T>::Add not implemented yet for this type.";
257 }
258 // int32 specialization implemented in 'cudamatrix/cu-array.cc',
259 template<>
260 void CuArrayBase<int32>::Add(const int32 &value);
261 
262 
263 template<class T>
264 inline T CuArrayBase<T>::Min() const {
265  KALDI_ASSERT(this->Dim() > 0);
266 #if HAVE_CUDA == 1
267  CuTimer tim;
268 #endif
269  std::vector<T> tmp(Dim());
270  CopyToVec(&tmp);
271  T ans = *std::min_element(tmp.begin(), tmp.end());
272 #if HAVE_CUDA == 1
273  if (CuDevice::Instantiate().Enabled()) {
274  CuDevice::Instantiate().AccuProfile(__func__, tim);
275  }
276 #endif
277  return ans;
278 }
279 
280 
281 template<class T>
282 inline T CuArrayBase<T>::Max() const {
283  KALDI_ASSERT(this->Dim() > 0);
284 #if HAVE_CUDA == 1
285  CuTimer tim;
286 #endif
287  std::vector<T> tmp(Dim());
288  CopyToVec(&tmp);
289  T ans = *std::max_element(tmp.begin(), tmp.end());
290 #if HAVE_CUDA == 1
291  if (CuDevice::Instantiate().Enabled()) {
292  CuDevice::Instantiate().AccuProfile(__func__, tim);
293  }
294 #endif
295  return ans;
296 }
297 
298 
299 template<typename T>
300 void CuArray<T>::Read(std::istream& in, bool binary) {
301  std::vector<T> tmp;
302  ReadIntegerVector(in, binary, &tmp);
303  (*this) = tmp;
304 }
305 
306 template<typename T>
307 void CuArray<T>::Write(std::ostream& out, bool binary) const {
308  std::vector<T> tmp(this->Dim());
309  this->CopyToVec(&tmp);
310  WriteIntegerVector(out, binary, tmp);
311 }
312 
313 
314 template<typename T>
316  MatrixIndexT offset,
317  MatrixIndexT dim) {
318  KALDI_ASSERT(offset >= 0 && dim >= 0 &&
319  offset + dim <= src.Dim());
320  this->data_ = src.data_ + offset;
321  this->dim_ = dim;
322 }
323 
324 
328 template<typename T>
329 std::ostream &operator << (std::ostream &out, const CuArray<T> &vec) {
330  std::vector<T> tmp;
331  vec.CopyToVec(&tmp);
332  out << "[";
333  for(int32 i=0; i<tmp.size(); i++) {
334  out << " " << tmp[i];
335  }
336  out << " ]\n";
337  return out;
338 }
339 
340 template <typename T>
342  std::swap(this->dim_, other->dim_);
343  std::swap(this->data_, other->data_);
344 }
345 
346 
347 } // namespace kaldi
348 
349 #endif
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
MatrixResizeType
Definition: matrix-common.h:37
void CopyFromVec(const std::vector< T > &src)
This function resizes if needed.
Definition: cu-array-inl.h:120
T Min() const
Get minimum value (for now implemented on CPU, reimplement if slow).
Definition: cu-array-inl.h:264
void CopyToVec(std::vector< T > *dst) const
This function resizes *dst if needed.
Definition: cu-array-inl.h:177
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
kaldi::int32 int32
void CopyFromArray(const CuArrayBase< T > &src)
The caller is responsible to ensure dim is equal between *this and src.
Definition: cu-array-inl.h:157
uint64 data_
CuSubArray(const CuArrayBase< T > &src, MatrixIndexT offset, MatrixIndexT dim)
Constructor as a range of an existing CuArray or CuSubArray.
Definition: cu-array-inl.h:315
void Add(const T &value)
Add a constant value.
Definition: cu-array-inl.h:254
void CopyFromArray(const CuArrayBase< T > &src)
This function resizes if needed.
Definition: cu-array-inl.h:139
int32 MatrixIndexT
Definition: matrix-common.h:98
void ReadIntegerVector(std::istream &is, bool binary, std::vector< T > *v)
Function for reading STL vector of integer types.
Definition: io-funcs-inl.h:232
void Swap(CuArray< T > *other)
Shallow swap with another CuArray<T>.
Definition: cu-array-inl.h:341
T * data_
GPU data pointer (if GPU not available, will point to CPU memory).
Definition: cu-array.h:111
void Read(std::istream &is, bool binary)
I/O.
Definition: cu-array-inl.h:300
void Set(const T &value)
Set to a constant value.
Definition: cu-array-inl.h:234
#define KALDI_ERR
Definition: kaldi-error.h:147
void CopyToHost(T *dst) const
Version of the above function that copies contents to a host array (i.e.
Definition: cu-array-inl.h:198
MatrixIndexT dim_
dimension of the vector
Definition: cu-array.h:113
void Destroy()
Deallocate the memory and set dim_ and data_ to zero.
Definition: cu-array-inl.h:82
void Write(std::ostream &is, bool binary) const
Definition: cu-array-inl.h:307
Class CuArrayBase, CuSubArray and CuArray are analogues of classes CuVectorBase, CuSubVector and CuVe...
Definition: cu-array.h:44
Class CuArray represents a vector of an integer or struct of type T.
Definition: cu-array.h:32
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
void CopyFromVec(const std::vector< T > &src)
The caller is responsible to ensure dim is equal between *this and src.
Definition: cu-array-inl.h:100
void Resize(MatrixIndexT dim, MatrixResizeType resize_type=kSetZero)
Allocate the memory.
Definition: cu-array-inl.h:43
void WriteIntegerVector(std::ostream &os, bool binary, const std::vector< T > &v)
Function for writing STL vectors of integer types.
Definition: io-funcs-inl.h:198
void SetZero()
Sets the memory for the object to zero, via memset.
Definition: cu-array-inl.h:217
void Sequence(const T base)
Fill with the sequence [base ...
Definition: cu-array-inl.h:244
T Max() const
Get minimum value (for now implemented on CPU, reimplement if slow).
Definition: cu-array-inl.h:282
MatrixIndexT Dim() const
Return the vector dimension.
Definition: cu-array.h:49