25 #ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ 26 #define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ 31 #include <cuda_runtime_api.h> 45 if (this->dim_ == dim) {
56 if (CuDevice::Instantiate().Enabled()) {
58 this->
data_ =
static_cast<T*
>(CuDevice::Instantiate().Malloc(dim *
sizeof(T)));
60 if (resize_type ==
kSetZero) this->SetZero();
61 CuDevice::Instantiate().AccuProfile(
"CuArray::Resize", tim);
65 this->
data_ =
static_cast<T*
>(malloc(dim *
sizeof(T)));
71 KALDI_ERR <<
"Memory allocation failed when initializing CuVector " 72 <<
"with dimension " << dim <<
" object size in bytes: " 84 if (CuDevice::Instantiate().Enabled()) {
85 if (this->
data_ != NULL) {
86 CuDevice::Instantiate().Free(this->
data_);
91 if (this->
data_ != NULL)
105 if (CuDevice::Instantiate().Enabled()) {
108 cudaMemcpyAsync(
data_, &src.front(), src.size() *
sizeof(T),
109 cudaMemcpyHostToDevice, cudaStreamPerThread));
110 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
111 CuDevice::Instantiate().AccuProfile(__func__, tim);
115 memcpy(
data_, &src.front(), src.size() *
sizeof(T));
122 if (src.empty())
return;
124 if (CuDevice::Instantiate().Enabled()) {
126 CU_SAFE_CALL(cudaMemcpyAsync(this->
data_, &src.front(),
127 src.size()*
sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread));
128 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
129 CuDevice::Instantiate().AccuProfile(__func__, tim);
133 memcpy(this->
data_, &src.front(), src.size()*
sizeof(T));
141 if (this->dim_ == 0)
return;
143 if (CuDevice::Instantiate().Enabled()) {
145 CU_SAFE_CALL(cudaMemcpyAsync(this->
data_, src.
data_, this->dim_ *
sizeof(T),
146 cudaMemcpyDeviceToDevice,
147 cudaStreamPerThread));
148 CuDevice::Instantiate().AccuProfile(__func__, tim);
152 memcpy(this->
data_, src.
data_, this->dim_ *
sizeof(T));
162 if (CuDevice::Instantiate().Enabled()) {
165 cudaMemcpyAsync(this->
data_, src.
data_, dim_ *
sizeof(T),
166 cudaMemcpyDeviceToDevice, cudaStreamPerThread));
167 CuDevice::Instantiate().AccuProfile(__func__, tim);
171 memcpy(this->
data_, src.
data_, dim_ *
sizeof(T));
178 if (static_cast<MatrixIndexT>(dst->size()) != this->dim_) {
179 dst->resize(this->dim_);
181 if (this->dim_ == 0)
return;
183 if (CuDevice::Instantiate().Enabled()) {
185 CU_SAFE_CALL(cudaMemcpyAsync(&dst->front(), Data(), this->dim_ *
sizeof(T),
186 cudaMemcpyDeviceToHost, cudaStreamPerThread));
187 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
188 CuDevice::Instantiate().AccuProfile(
"CuArray::CopyToVecD2H", tim);
192 memcpy(&dst->front(), this->
data_, this->dim_ *
sizeof(T));
199 if (this->dim_ == 0)
return;
202 if (CuDevice::Instantiate().Enabled()) {
204 CU_SAFE_CALL(cudaMemcpyAsync(dst, Data(), this->dim_ *
sizeof(T),
205 cudaMemcpyDeviceToHost, cudaStreamPerThread));
206 CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
207 CuDevice::Instantiate().AccuProfile(
"CuArray::CopyToVecD2H", tim);
211 memcpy(dst, this->
data_, this->dim_ *
sizeof(T));
218 if (this->dim_ == 0)
return;
220 if (CuDevice::Instantiate().Enabled()) {
222 CU_SAFE_CALL(cudaMemsetAsync(this->
data_, 0, this->dim_ *
sizeof(T),
223 cudaStreamPerThread));
224 CuDevice::Instantiate().AccuProfile(
"CuArray::SetZero", tim);
228 memset(static_cast<void*>(this->
data_), 0, this->dim_ *
sizeof(T));
236 KALDI_ERR <<
"CuArray<T>::Set not implemented yet for this type.";
246 KALDI_ERR <<
"CuArray<T>::Sequence not implemented yet for this type.";
256 KALDI_ERR <<
"CuArray<T>::Add not implemented yet for this type.";
269 std::vector<T> tmp(Dim());
271 T ans = *std::min_element(tmp.begin(), tmp.end());
273 if (CuDevice::Instantiate().Enabled()) {
274 CuDevice::Instantiate().AccuProfile(__func__, tim);
287 std::vector<T> tmp(Dim());
289 T ans = *std::max_element(tmp.begin(), tmp.end());
291 if (CuDevice::Instantiate().Enabled()) {
292 CuDevice::Instantiate().AccuProfile(__func__, tim);
308 std::vector<T> tmp(this->Dim());
309 this->CopyToVec(&tmp);
319 offset + dim <= src.
Dim());
329 std::ostream &operator << (std::ostream &out, const CuArray<T> &vec) {
334 out <<
" " << tmp[
i];
340 template <
typename T>
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
void CopyFromVec(const std::vector< T > &src)
This function resizes if needed.
T Min() const
Get minimum value (for now implemented on CPU, reimplement if slow).
void CopyToVec(std::vector< T > *dst) const
This function resizes *dst if needed.
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
void CopyFromArray(const CuArrayBase< T > &src)
The caller is responsible to ensure dim is equal between *this and src.
CuSubArray(const CuArrayBase< T > &src, MatrixIndexT offset, MatrixIndexT dim)
Constructor as a range of an existing CuArray or CuSubArray.
void Add(const T &value)
Add a constant value.
void CopyFromArray(const CuArrayBase< T > &src)
This function resizes if needed.
void ReadIntegerVector(std::istream &is, bool binary, std::vector< T > *v)
Function for reading STL vector of integer types.
void Swap(CuArray< T > *other)
Shallow swap with another CuArray<T>.
T * data_
GPU data pointer (if GPU not available, will point to CPU memory).
void Read(std::istream &is, bool binary)
I/O.
void Set(const T &value)
Set to a constant value.
void CopyToHost(T *dst) const
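Version of the above function that copies contents to a host array (i.e. to regular host memory rather than GPU memory); dst must already be allocated with room for at least dim_ elements, since it is not resized.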
MatrixIndexT dim_
dimension of the vector
void Destroy()
Deallocate the memory and set dim_ and data_ to zero.
void Write(std::ostream &is, bool binary) const
Class CuArrayBase, CuSubArray and CuArray are analogues of classes CuVectorBase, CuSubVector and CuVector, but hold integers or structs of type T.
Class CuArray represents a vector of an integer or struct of type T.
#define KALDI_ASSERT(cond)
void CopyFromVec(const std::vector< T > &src)
The caller is responsible to ensure dim is equal between *this and src.
void Resize(MatrixIndexT dim, MatrixResizeType resize_type=kSetZero)
Allocate the memory.
void WriteIntegerVector(std::ostream &os, bool binary, const std::vector< T > &v)
Function for writing STL vectors of integer types.
void SetZero()
Sets the memory for the object to zero, via cudaMemsetAsync when the GPU is enabled and via memset otherwise.
void Sequence(const T base)
Fill with the sequence [base ... base + Dim()).
T Max() const
Get maximum value (for now implemented on CPU, reimplement if slow).
MatrixIndexT Dim() const
Return the vector dimension.