25 #ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_    26 #define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_    31 #include <cuda_runtime_api.h>    45   if (this->dim_ == dim) {
    56   if (CuDevice::Instantiate().Enabled()) {
    58     this->
data_ = 
static_cast<T*
>(CuDevice::Instantiate().Malloc(dim * 
sizeof(T)));
    60     if (resize_type == 
kSetZero) this->SetZero();
    61     CuDevice::Instantiate().AccuProfile(
"CuArray::Resize", tim);
    65     this->
data_ = 
static_cast<T*
>(malloc(dim * 
sizeof(T)));
    71       KALDI_ERR << 
"Memory allocation failed when initializing CuVector "    72                 << 
"with dimension " << dim << 
" object size in bytes: "    84   if (CuDevice::Instantiate().Enabled()) {
    85     if (this->
data_ != NULL) {
    86       CuDevice::Instantiate().Free(this->
data_);
    91     if (this->
data_ != NULL)
   105   if (CuDevice::Instantiate().Enabled()) {
   108         cudaMemcpyAsync(
data_, &src.front(), src.size() * 
sizeof(T),
   109                    cudaMemcpyHostToDevice, cudaStreamPerThread));
   110     CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
   111     CuDevice::Instantiate().AccuProfile(__func__, tim);
   115     memcpy(
data_, &src.front(), src.size() * 
sizeof(T));
   122   if (src.empty()) 
return;
   124   if (CuDevice::Instantiate().Enabled()) {
   126     CU_SAFE_CALL(cudaMemcpyAsync(this->
data_, &src.front(), 
   127           src.size()*
sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread));
   128     CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
   129     CuDevice::Instantiate().AccuProfile(__func__, tim);
   133     memcpy(this->
data_, &src.front(), src.size()*
sizeof(T));
   141   if (this->dim_ == 0) 
return;
   143   if (CuDevice::Instantiate().Enabled()) {
   145     CU_SAFE_CALL(cudaMemcpyAsync(this->
data_, src.
data_, this->dim_ * 
sizeof(T),
   146                                  cudaMemcpyDeviceToDevice,
   147                                  cudaStreamPerThread));
   148     CuDevice::Instantiate().AccuProfile(__func__, tim);
   152     memcpy(this->
data_, src.
data_, this->dim_ * 
sizeof(T));
   162   if (CuDevice::Instantiate().Enabled()) {
   165       cudaMemcpyAsync(this->
data_, src.
data_, dim_ * 
sizeof(T),
   166                       cudaMemcpyDeviceToDevice, cudaStreamPerThread));
   167     CuDevice::Instantiate().AccuProfile(__func__, tim);
   171     memcpy(this->
data_, src.
data_, dim_ * 
sizeof(T));
   178   if (static_cast<MatrixIndexT>(dst->size()) != this->dim_) {
   179     dst->resize(this->dim_);
   181   if (this->dim_ == 0) 
return;
   183   if (CuDevice::Instantiate().Enabled()) {
   185     CU_SAFE_CALL(cudaMemcpyAsync(&dst->front(), Data(), this->dim_ * 
sizeof(T),
   186           cudaMemcpyDeviceToHost, cudaStreamPerThread));
   187     CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
   188     CuDevice::Instantiate().AccuProfile(
"CuArray::CopyToVecD2H", tim);
   192     memcpy(&dst->front(), this->
data_, this->dim_ * 
sizeof(T));
   199   if (this->dim_ == 0) 
return;
   202   if (CuDevice::Instantiate().Enabled()) {
   204     CU_SAFE_CALL(cudaMemcpyAsync(dst, Data(), this->dim_ * 
sizeof(T),
   205           cudaMemcpyDeviceToHost, cudaStreamPerThread));
   206     CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
   207     CuDevice::Instantiate().AccuProfile(
"CuArray::CopyToVecD2H", tim);
   211     memcpy(dst, this->
data_, this->dim_ * 
sizeof(T));
   218   if (this->dim_ == 0) 
return;
   220   if (CuDevice::Instantiate().Enabled()) {
   222     CU_SAFE_CALL(cudaMemsetAsync(this->
data_, 0, this->dim_ * 
sizeof(T),
   223           cudaStreamPerThread));
   224     CuDevice::Instantiate().AccuProfile(
"CuArray::SetZero", tim);
   228     memset(static_cast<void*>(this->
data_), 0, this->dim_ * 
sizeof(T));
   236   KALDI_ERR << 
"CuArray<T>::Set not implemented yet for this type.";
   246   KALDI_ERR << 
"CuArray<T>::Sequence not implemented yet for this type.";
   256   KALDI_ERR << 
"CuArray<T>::Add not implemented yet for this type.";
   269   std::vector<T> tmp(Dim());
   271   T ans = *std::min_element(tmp.begin(), tmp.end());
   273   if (CuDevice::Instantiate().Enabled()) {
   274     CuDevice::Instantiate().AccuProfile(__func__, tim);
   287   std::vector<T> tmp(Dim());
   289   T ans = *std::max_element(tmp.begin(), tmp.end());
   291   if (CuDevice::Instantiate().Enabled()) {
   292     CuDevice::Instantiate().AccuProfile(__func__, tim);
   308   std::vector<T> tmp(this->Dim());
   309   this->CopyToVec(&tmp);
   319                offset + dim <= src.
Dim());
   329 std::ostream &operator << (std::ostream &out, const CuArray<T> &vec) {
   334     out << 
" " << tmp[
i];
   340 template <
typename T>
 This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
 
void CopyFromVec(const std::vector< T > &src)
This function resizes if needed. 
 
T Min() const
Get minimum value (for now implemented on CPU, reimplement if slow). 
 
void CopyToVec(std::vector< T > *dst) const
This function resizes *dst if needed. 
 
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
 
void CopyFromArray(const CuArrayBase< T > &src)
The caller is responsible to ensure dim is equal between *this and src. 
 
CuSubArray(const CuArrayBase< T > &src, MatrixIndexT offset, MatrixIndexT dim)
Constructor as a range of an existing CuArray or CuSubArray. 
 
void Add(const T &value)
Add a constant value. 
 
void CopyFromArray(const CuArrayBase< T > &src)
This function resizes if needed. 
 
void ReadIntegerVector(std::istream &is, bool binary, std::vector< T > *v)
Function for reading STL vector of integer types. 
 
void Swap(CuArray< T > *other)
Shallow swap with another CuArray<T>. 
 
T * data_
GPU data pointer (if GPU not available, will point to CPU memory). 
 
void Read(std::istream &is, bool binary)
I/O: read the array from the stream `is`, in binary or text mode according to `binary`.
 
void Set(const T &value)
Set to a constant value. 
 
void CopyToHost(T *dst) const
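Version of CopyToVec() that copies the contents to a host array (i.e. ordinary CPU memory rather than GPU memory). The function does not allocate: dst must already have room for Dim() elements, i.e. dim_ * sizeof(T) bytes.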
 
MatrixIndexT dim_
dimension of the vector 
 
void Destroy()
Deallocate the memory and set dim_ and data_ to zero. 
 
void Write(std::ostream &is, bool binary) const
 
Class CuArrayBase, CuSubArray and CuArray are analogues of classes CuVectorBase, CuSubVector and CuVector, except that they hold integers or small structs of type T rather than floating-point values.
 
Class CuArray represents a vector of an integer or struct of type T. 
 
#define KALDI_ASSERT(cond)
 
void CopyFromVec(const std::vector< T > &src)
The caller is responsible to ensure dim is equal between *this and src. 
 
void Resize(MatrixIndexT dim, MatrixResizeType resize_type=kSetZero)
Allocate the memory. 
 
void WriteIntegerVector(std::ostream &os, bool binary, const std::vector< T > &v)
Function for writing STL vectors of integer types. 
 
void SetZero()
Sets the memory for the object to zero: via cudaMemsetAsync on the per-thread stream when the GPU is enabled, otherwise via memset on the CPU.
 
void Sequence(const T base)
Fill with the sequence [base ... base + Dim()), i.e. element i is set to base + i.
 
T Max() const
Get maximum value (for now implemented on CPU, reimplement if slow).
 
MatrixIndexT Dim() const
Return the vector dimension.