doc/kaldi-matrix_8cc_source.html

 // matrix/kaldi-matrix.cc

 // Copyright 2009-2011   Lukas Burget;  Ondrej Glembek;  Go Vivace Inc.;
 //                       Microsoft Corporation;  Saarland University;
 //                       Yanmin Qian;  Petr Schwarz;  Jan Silovsky;
 //                       Haihua Xu
 //           2017        Shiyin Kang
 //           2019        Yiwen Shao

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.

 #include "matrix/kaldi-matrix.h"
 #include "matrix/sp-matrix.h"
 #include "matrix/jama-svd.h"
 #include "matrix/jama-eig.h"
 #include "matrix/compressed-matrix.h"
 #include "matrix/sparse-matrix.h"

 static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
     "kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!");

 namespace kaldi {

 template<typename Real>
 void MatrixBase<Real>::Invert(Real *log_det, Real *det_sign,
                               bool inverse_needed) {
   KALDI_ASSERT(num_rows_ == num_cols_);
   if (num_rows_ == 0) {
     if (det_sign) *det_sign = 1;
     if (log_det) *log_det = 0.0;
     return;
   }
 #ifndef HAVE_ATLAS
   KaldiBlasInt *pivot = new KaldiBlasInt[num_rows_];
   KaldiBlasInt M = num_rows_;
   KaldiBlasInt N = num_cols_;
   KaldiBlasInt LDA = stride_;
   KaldiBlasInt result = -1;
   KaldiBlasInt l_work = std::max<KaldiBlasInt>(1, N);
   Real *p_work;
   void *temp;
   if ((p_work = static_cast<Real*>(
           KALDI_MEMALIGN(16, sizeof(Real)*l_work, &temp))) == NULL) {
     delete[] pivot;
     throw std::bad_alloc();
   }

   clapack_Xgetrf2(&M, &N, data_, &LDA, pivot, &result);
   const int pivot_offset = 1;
 #else
   int *pivot = new int[num_rows_];
   int result;
   clapack_Xgetrf(num_rows_, num_cols_, data_, stride_, pivot, &result);
   const int pivot_offset = 0;
 #endif
   KALDI_ASSERT(result >= 0 && "Call to CLAPACK sgetrf_ or ATLAS clapack_sgetrf "
                "called with wrong arguments");
   if (result > 0) {
     if (inverse_needed) {
       KALDI_ERR << "Cannot invert: matrix is singular";
     } else {
       if (log_det) *log_det = -std::numeric_limits<Real>::infinity();
       if (det_sign) *det_sign = 0;
       delete[] pivot;
 #ifndef HAVE_ATLAS
       KALDI_MEMALIGN_FREE(p_work);
 #endif
       return;
     }
   }
   if (det_sign != NULL) {
     int sign = 1;
     for (MatrixIndexT i = 0; i < num_rows_; i++)
       if (pivot[i] != static_cast<int>(i) + pivot_offset) sign *= -1;
     *det_sign = sign;
   }
   if (log_det != NULL || det_sign != NULL) {  // Compute log determinant.
     if (log_det != NULL) *log_det = 0.0;
     Real prod = 1.0;
     for (MatrixIndexT i = 0; i < num_rows_; i++) {
       prod *= (*this)(i, i);
       if (i == num_rows_ - 1 || std::fabs(prod) < 1.0e-10 ||
           std::fabs(prod) > 1.0e+10) {
         if (log_det != NULL) *log_det += kaldi::Log(std::fabs(prod));
         if (det_sign != NULL) *det_sign *= (prod > 0 ? 1.0 : -1.0);
         prod = 1.0;
       }
     }
   }
 #ifndef HAVE_ATLAS
   if (inverse_needed) clapack_Xgetri2(&M, data_, &LDA, pivot, p_work, &l_work,
                               &result);
   delete[] pivot;
   KALDI_MEMALIGN_FREE(p_work);
 #else
   if (inverse_needed)
     clapack_Xgetri(num_rows_, data_, stride_, pivot, &result);
   delete [] pivot;
 #endif
   KALDI_ASSERT(result == 0 && "Call to CLAPACK sgetri_ or ATLAS clapack_sgetri "
                "called with wrong arguments");
 }

 template<>
 template<>
 void MatrixBase<float>::AddVecVec(const float alpha,
                                   const VectorBase<float> &a,
                                   const VectorBase<float> &rb) {
   KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_);
   cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(),
              1, data_, stride_);
 }

 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::AddVecVec(const Real alpha,
                                  const VectorBase<OtherReal> &a,
                                  const VectorBase<OtherReal> &b) {
   KALDI_ASSERT(a.Dim() == num_rows_ && b.Dim() == num_cols_);
   if (num_rows_ * num_cols_ > 100) { // It's probably worth it to allocate
     // temporary vectors of the right type and use BLAS.
     Vector<Real> temp_a(a), temp_b(b);
     cblas_Xger(num_rows_, num_cols_, alpha, temp_a.Data(), 1,
                temp_b.Data(), 1, data_, stride_);
   } else {
     const OtherReal *a_data = a.Data(), *b_data = b.Data();
     Real *row_data = data_;
     for (MatrixIndexT i = 0; i < num_rows_; i++, row_data += stride_) {
       BaseFloat alpha_ai = alpha * a_data[i];
       for (MatrixIndexT j = 0; j < num_cols_; j++)
         row_data[j] += alpha_ai * b_data[j];
     }
   }
 }

 // instantiate the template above.
 template
 void MatrixBase<float>::AddVecVec(const float alpha,
                                   const VectorBase<double> &a,
                                   const VectorBase<double> &b);
 template
 void MatrixBase<double>::AddVecVec(const double alpha,
                                    const VectorBase<float> &a,
                                    const VectorBase<float> &b);

 template<>
 template<>
 void MatrixBase<double>::AddVecVec(const double alpha,
                                    const VectorBase<double> &a,
                                    const VectorBase<double> &rb) {
   KALDI_ASSERT(a.Dim() == num_rows_ && rb.Dim() == num_cols_);
   if (num_rows_ == 0) return;
   cblas_Xger(a.Dim(), rb.Dim(), alpha, a.Data(), 1, rb.Data(),
              1, data_, stride_);
 }

 template<typename Real>
 void MatrixBase<Real>::AddMatMat(const Real alpha,
                                   const MatrixBase<Real>& A,
                                   MatrixTransposeType transA,
                                   const MatrixBase<Real>& B,
                                   MatrixTransposeType transB,
                                   const Real beta) {
   KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
                || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
                || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
                || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
   KALDI_ASSERT(&A !=  this && &B != this);
   if (num_rows_ == 0) return;
   cblas_Xgemm(alpha, transA, A.data_, A.num_rows_, A.num_cols_, A.stride_,
               transB, B.data_, B.stride_, beta, data_, num_rows_, num_cols_, stride_);

 }

 template<typename Real>
 void MatrixBase<Real>::SetMatMatDivMat(const MatrixBase<Real>& A,
                                        const MatrixBase<Real>& B,
                                        const MatrixBase<Real>& C) {
   KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
   KALDI_ASSERT(A.NumRows() == C.NumRows() && A.NumCols() == C.NumCols());
   for (int32 r = 0; r < A.NumRows(); r++) { // each frame...
     for (int32 c = 0; c < A.NumCols(); c++) {
       BaseFloat i = C(r, c), o = B(r, c), od = A(r, c),
           id;
       if (i != 0.0) {
         id = od * (o / i);
       } else {
         id = od;
       }
       (*this)(r, c) = id;
     }
   }
 }


 template<typename Real>
 void MatrixBase<Real>::CopyLowerToUpper() {
   KALDI_ASSERT(num_rows_ == num_cols_);
   Real *data = data_;
   MatrixIndexT num_rows = num_rows_, stride = stride_;
   for (int32 i = 0; i < num_rows; i++)
     for (int32 j = 0; j < i; j++)
       data[j * stride + i ] = data[i * stride + j];
 }


 template<typename Real>
 void MatrixBase<Real>::CopyUpperToLower() {
   KALDI_ASSERT(num_rows_ == num_cols_);
   Real *data = data_;
   MatrixIndexT num_rows = num_rows_, stride = stride_;
   for (int32 i = 0; i < num_rows; i++)
     for (int32 j = 0; j < i; j++)
       data[i * stride + j] = data[j * stride + i];
 }

 template<typename Real>
 void MatrixBase<Real>::SymAddMat2(const Real alpha,
                                   const MatrixBase<Real> &A,
                                   MatrixTransposeType transA,
                                   Real beta) {
   KALDI_ASSERT(num_rows_ == num_cols_ &&
                ((transA == kNoTrans && A.num_rows_ == num_rows_) ||
                 (transA == kTrans && A.num_cols_ == num_cols_)));
   KALDI_ASSERT(A.data_ != data_);
   if (num_rows_ == 0) return;

 #ifdef HAVE_ATLAS
   if (transA == kTrans && num_rows_ >= 56) {
     this->AddMatMat(alpha, A, kTrans, A, kNoTrans, beta);
     return;
   }
 #endif // HAVE_ATLAS

   MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_);

   // This function call is hard-coded to update the lower triangle.
   cblas_Xsyrk(transA, num_rows_, A_other_dim, alpha, A.Data(),
               A.Stride(), beta, this->data_, this->stride_);
 }


 template<typename Real>
 void MatrixBase<Real>::AddMatSmat(const Real alpha,
                                   const MatrixBase<Real> &A,
                                   MatrixTransposeType transA,
                                   const MatrixBase<Real> &B,
                                   MatrixTransposeType transB,
                                   const Real beta) {
   KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
                || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
                || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
                || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
   KALDI_ASSERT(&A !=  this && &B != this);

   // We iterate over the columns of B.

   MatrixIndexT Astride = A.stride_, Bstride = B.stride_, stride = this->stride_,
       Arows = A.num_rows_, Acols = A.num_cols_;
   Real *data = this->data_, *Adata = A.data_, *Bdata = B.data_;
   MatrixIndexT num_cols = this->num_cols_;
   if (transB == kNoTrans) {
     // Iterate over the columns of *this and of B.
     for (MatrixIndexT c = 0; c < num_cols; c++) {
       // for each column of *this, do
       // [this column] = [alpha * A * this column of B] + [beta * this column]
       Xgemv_sparsevec(transA, Arows, Acols, alpha, Adata, Astride,
                       Bdata + c, Bstride, beta, data + c, stride);
     }
   } else {
     // Iterate over the columns of *this and the rows of B.
     for (MatrixIndexT c = 0; c < num_cols; c++) {
       // for each column of *this, do
       // [this column] = [alpha * A * this row of B] + [beta * this column]
       Xgemv_sparsevec(transA, Arows, Acols, alpha, Adata, Astride,
                       Bdata + (c * Bstride), 1, beta, data + c, stride);
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddSmatMat(const Real alpha,
                                   const MatrixBase<Real> &A,
                                   MatrixTransposeType transA,
                                   const MatrixBase<Real> &B,
                                   MatrixTransposeType transB,
                                   const Real beta) {
   KALDI_ASSERT((transA == kNoTrans && transB == kNoTrans && A.num_cols_ == B.num_rows_ && A.num_rows_ == num_rows_ && B.num_cols_ == num_cols_)
                || (transA == kTrans && transB == kNoTrans && A.num_rows_ == B.num_rows_ && A.num_cols_ == num_rows_ && B.num_cols_ == num_cols_)
                || (transA == kNoTrans && transB == kTrans && A.num_cols_ == B.num_cols_ && A.num_rows_ == num_rows_ && B.num_rows_ == num_cols_)
                || (transA == kTrans && transB == kTrans && A.num_rows_ == B.num_cols_ && A.num_cols_ == num_rows_ && B.num_rows_ == num_cols_));
   KALDI_ASSERT(&A !=  this && &B != this);

   MatrixIndexT Astride = A.stride_, Bstride = B.stride_, stride = this->stride_,
       Brows = B.num_rows_, Bcols = B.num_cols_;
   MatrixTransposeType invTransB = (transB == kTrans ? kNoTrans : kTrans);
   Real *data = this->data_, *Adata = A.data_, *Bdata = B.data_;
   MatrixIndexT num_rows = this->num_rows_;
   if (transA == kNoTrans) {
     // Iterate over the rows of *this and of A.
     for (MatrixIndexT r = 0; r < num_rows; r++) {
       // for each row of *this, do
       // [this row] = [alpha * (this row of A) * B^T] + [beta * this row]
       Xgemv_sparsevec(invTransB, Brows, Bcols, alpha, Bdata, Bstride,
                       Adata + (r * Astride), 1, beta, data + (r * stride), 1);
     }
   } else {
     // Iterate over the rows of *this and the columns of A.
     for (MatrixIndexT r = 0; r < num_rows; r++) {
       // for each row of *this, do
       // [this row] = [alpha * (this column of A) * B^T] + [beta * this row]
       Xgemv_sparsevec(invTransB, Brows, Bcols, alpha, Bdata, Bstride,
                       Adata + r, Astride, beta, data + (r * stride), 1);
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddSpSp(const Real alpha, const SpMatrix<Real> &A_in,
                                 const SpMatrix<Real> &B_in, const Real beta) {
   MatrixIndexT sz = num_rows_;
   KALDI_ASSERT(sz == num_cols_ && sz == A_in.NumRows() && sz == B_in.NumRows());

   Matrix<Real> A(A_in), B(B_in);
   // CblasLower or CblasUpper would work below as symmetric matrix is copied
   // fully (to save work, we used the matrix constructor from SpMatrix).
   // CblasLeft means A is on the left: C <-- alpha A B + beta C
   if (sz == 0) return;
   cblas_Xsymm(alpha, sz, A.data_, A.stride_, B.data_, B.stride_, beta, data_, stride_);
 }

 template<typename Real>
 void MatrixBase<Real>::AddMat(const Real alpha, const MatrixBase<Real>& A,
                                MatrixTransposeType transA) {
   if (&A == this) {
     if (transA == kNoTrans) {
       Scale(alpha + 1.0);
     } else {
       KALDI_ASSERT(num_rows_ == num_cols_ && "AddMat: adding to self (transposed): not symmetric.");
       Real *data = data_;
       if (alpha == 1.0) {  // common case-- handle separately.
         for (MatrixIndexT row = 0; row < num_rows_; row++) {
           for (MatrixIndexT col = 0; col < row; col++) {
             Real *lower = data + (row * stride_) + col, *upper = data + (col
                                                                           * stride_) + row;
             Real sum = *lower + *upper;
             *lower = *upper = sum;
           }
           *(data + (row * stride_) + row) *= 2.0;  // diagonal.
         }
       } else {
         for (MatrixIndexT row = 0; row < num_rows_; row++) {
           for (MatrixIndexT col = 0; col < row; col++) {
             Real *lower = data + (row * stride_) + col, *upper = data + (col
                                                                           * stride_) + row;
             Real lower_tmp = *lower;
             *lower += alpha * *upper;
             *upper += alpha * lower_tmp;
           }
           *(data + (row * stride_) + row) *= (1.0 + alpha);  // diagonal.
         }
       }
     }
   } else {
     int aStride = (int) A.stride_, stride = stride_;
     Real *adata = A.data_, *data = data_;
     if (transA == kNoTrans) {
       KALDI_ASSERT(A.num_rows_ == num_rows_ && A.num_cols_ == num_cols_);
       if (num_rows_ == 0) return;
       for (MatrixIndexT row = 0; row < num_rows_; row++, adata += aStride,
                data += stride) {
         cblas_Xaxpy(num_cols_, alpha, adata, 1, data, 1);
       }
     } else {
       KALDI_ASSERT(A.num_cols_ == num_rows_ && A.num_rows_ == num_cols_);
       if (num_rows_ == 0) return;
       for (MatrixIndexT row = 0; row < num_rows_; row++, adata++, data += stride)
         cblas_Xaxpy(num_cols_, alpha, adata, aStride, data, 1);
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddSmat(Real alpha, const SparseMatrix<Real> &A,
                                MatrixTransposeType trans) {
   if (trans == kNoTrans) {
     KALDI_ASSERT(NumRows() == A.NumRows());
     KALDI_ASSERT(NumCols() == A.NumCols());
     MatrixIndexT a_num_rows = A.NumRows();
     for (MatrixIndexT i = 0; i < a_num_rows; ++i) {
       const SparseVector<Real> &row = A.Row(i);
       MatrixIndexT num_elems = row.NumElements();
       for (MatrixIndexT id = 0; id < num_elems; ++id) {
         (*this)(i, row.GetElement(id).first) += alpha
             * row.GetElement(id).second;
       }
     }
   } else {
     KALDI_ASSERT(NumRows() == A.NumCols());
     KALDI_ASSERT(NumCols() == A.NumRows());
     MatrixIndexT a_num_rows = A.NumRows();
     for (MatrixIndexT i = 0; i < a_num_rows; ++i) {
       const SparseVector<Real> &row = A.Row(i);
       MatrixIndexT num_elems = row.NumElements();
       for (MatrixIndexT id = 0; id < num_elems; ++id) {
         (*this)(row.GetElement(id).first, i) += alpha
             * row.GetElement(id).second;
       }
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddSmatMat(Real alpha, const SparseMatrix<Real> &A,
                                   MatrixTransposeType transA,
                                   const MatrixBase<Real> &B, Real beta) {
   if (transA == kNoTrans) {
     KALDI_ASSERT(NumRows() == A.NumRows());
     KALDI_ASSERT(NumCols() == B.NumCols());
     KALDI_ASSERT(A.NumCols() == B.NumRows());

     this->Scale(beta);
     MatrixIndexT a_num_rows = A.NumRows(),
         this_num_cols = this->NumCols();
     for (MatrixIndexT i = 0; i < a_num_rows; ++i) {
       Real *this_row_i = this->RowData(i);
       const SparseVector<Real> &A_row_i = A.Row(i);
       MatrixIndexT num_elems = A_row_i.NumElements();
       for (MatrixIndexT e = 0; e < num_elems; ++e) {
         const std::pair<MatrixIndexT, Real> &p = A_row_i.GetElement(e);
         MatrixIndexT k = p.first;
         Real alpha_A_ik = alpha * p.second;
         const Real *b_row_k = B.RowData(k);
         cblas_Xaxpy(this_num_cols, alpha_A_ik, b_row_k, 1,
                     this_row_i, 1);
         //for (MatrixIndexT j = 0; j < this_num_cols; ++j)
         //  this_row_i[j] += alpha_A_ik * b_row_k[j];
       }
     }
   } else {
     KALDI_ASSERT(NumRows() == A.NumCols());
     KALDI_ASSERT(NumCols() == B.NumCols());
     KALDI_ASSERT(A.NumRows() == B.NumRows());

     this->Scale(beta);
     Matrix<Real> buf(NumRows(), NumCols(), kSetZero);
     MatrixIndexT a_num_rows = A.NumRows(),
         this_num_cols = this->NumCols();
     for (int k = 0; k < a_num_rows; ++k) {
       const Real *b_row_k = B.RowData(k);
       const SparseVector<Real> &A_row_k = A.Row(k);
       MatrixIndexT num_elems = A_row_k.NumElements();
       for (MatrixIndexT e = 0; e < num_elems; ++e) {
         const std::pair<MatrixIndexT, Real> &p = A_row_k.GetElement(e);
         MatrixIndexT i = p.first;
         Real alpha_A_ki = alpha * p.second;
         Real *this_row_i = this->RowData(i);
         cblas_Xaxpy(this_num_cols, alpha_A_ki, b_row_k, 1,
                     this_row_i, 1);
         //for (MatrixIndexT j = 0; j < this_num_cols; ++j)
         // this_row_i[j] += alpha_A_ki * b_row_k[j];
       }
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddMatSmat(Real alpha, const MatrixBase<Real> &A,
                                   const SparseMatrix<Real> &B,
                                   MatrixTransposeType transB, Real beta) {
   if (transB == kNoTrans) {
     KALDI_ASSERT(NumRows() == A.NumRows());
     KALDI_ASSERT(NumCols() == B.NumCols());
     KALDI_ASSERT(A.NumCols() == B.NumRows());

     this->Scale(beta);
     MatrixIndexT b_num_rows = B.NumRows(),
         this_num_rows = this->NumRows();
     // Iterate over the rows of sparse matrix B and columns of A.
     for (MatrixIndexT k = 0; k < b_num_rows; ++k) {
       const SparseVector<Real> &B_row_k = B.Row(k);
       MatrixIndexT num_elems = B_row_k.NumElements();
       const Real *a_col_k = A.Data() + k;
       for (MatrixIndexT e = 0; e < num_elems; ++e) {
         const std::pair<MatrixIndexT, Real> &p = B_row_k.GetElement(e);
         MatrixIndexT j = p.first;
         Real alpha_B_kj = alpha * p.second;
         Real *this_col_j = this->Data() + j;
         // Add to entire 'j'th column of *this at once using cblas_Xaxpy.
         // pass stride to write a colmun as matrices are stored in row major order.
         cblas_Xaxpy(this_num_rows, alpha_B_kj, a_col_k, A.stride_,
                     this_col_j, this->stride_);
         //for (MatrixIndexT i = 0; i < this_num_rows; ++i)
         // this_col_j[i*this->stride_] +=  alpha_B_kj * a_col_k[i*A.stride_];
       }
     }
   } else {
     KALDI_ASSERT(NumRows() == A.NumRows());
     KALDI_ASSERT(NumCols() == B.NumRows());
     KALDI_ASSERT(A.NumCols() == B.NumCols());

     this->Scale(beta);
     MatrixIndexT b_num_rows = B.NumRows(),
         this_num_rows = this->NumRows();
     // Iterate over the rows of sparse matrix B and columns of *this.
     for (MatrixIndexT j = 0; j < b_num_rows; ++j) {
       const SparseVector<Real> &B_row_j = B.Row(j);
       MatrixIndexT num_elems = B_row_j.NumElements();
       Real *this_col_j = this->Data() + j;
       for (MatrixIndexT e = 0; e < num_elems; ++e) {
         const std::pair<MatrixIndexT, Real> &p = B_row_j.GetElement(e);
         MatrixIndexT k = p.first;
         Real alpha_B_jk = alpha * p.second;
         const Real *a_col_k = A.Data() + k;
         // Add to entire 'j'th column of *this at once using cblas_Xaxpy.
         // pass stride to write a column as matrices are stored in row major order.
         cblas_Xaxpy(this_num_rows, alpha_B_jk, a_col_k, A.stride_,
                     this_col_j, this->stride_);
         //for (MatrixIndexT i = 0; i < this_num_rows; ++i)
         // this_col_j[i*this->stride_] +=  alpha_B_jk * a_col_k[i*A.stride_];
       }
     }
   }
 }

 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::AddSp(const Real alpha, const SpMatrix<OtherReal> &S) {
   KALDI_ASSERT(S.NumRows() == NumRows() && S.NumRows() == NumCols());
   Real *data = data_; const OtherReal *sdata = S.Data();
   MatrixIndexT num_rows = NumRows(), stride = Stride();
   for (MatrixIndexT i = 0; i < num_rows; i++) {
     for (MatrixIndexT j = 0; j < i; j++, sdata++) {
       data[i*stride + j] += alpha * *sdata;
       data[j*stride + i] += alpha * *sdata;
     }
     data[i*stride + i] += alpha * *sdata++;
   }
 }

 // instantiate the template above.
 template
 void MatrixBase<float>::AddSp(const float alpha, const SpMatrix<float> &S);
 template
 void MatrixBase<double>::AddSp(const double alpha, const SpMatrix<double> &S);
 template
 void MatrixBase<float>::AddSp(const float alpha, const SpMatrix<double> &S);
 template
 void MatrixBase<double>::AddSp(const double alpha, const SpMatrix<float> &S);


 template<typename Real>
 void MatrixBase<Real>::AddDiagVecMat(
     const Real alpha, const VectorBase<Real> &v,
     const MatrixBase<Real> &M,
     MatrixTransposeType transM,
     Real beta) {
   if (beta != 1.0) this->Scale(beta);

   if (transM == kNoTrans) {
     KALDI_ASSERT(SameDim(*this, M));
   } else {
     KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
   }
   KALDI_ASSERT(v.Dim() == this->NumRows());

   MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1, stride = stride_,
       num_rows = num_rows_, num_cols = num_cols_;
   if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
   Real *data = data_;
   const Real *Mdata = M.Data(), *vdata = v.Data();
   if (num_rows_ == 0) return;
   for (MatrixIndexT i = 0; i < num_rows; i++, data += stride, Mdata += M_row_stride, vdata++)
     cblas_Xaxpy(num_cols, alpha * *vdata, Mdata, M_col_stride, data, 1);
 }

 template<typename Real>
 void MatrixBase<Real>::AddMatDiagVec(
     const Real alpha,
     const MatrixBase<Real> &M, MatrixTransposeType transM,
     VectorBase<Real> &v,
     Real beta) {

   if (beta != 1.0) this->Scale(beta);

   if (transM == kNoTrans) {
     KALDI_ASSERT(SameDim(*this, M));
   } else {
     KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
   }
   KALDI_ASSERT(v.Dim() == this->NumCols());

   MatrixIndexT M_row_stride = M.Stride(),
                M_col_stride = 1,
                stride = stride_,
                num_rows = num_rows_,
                num_cols = num_cols_;

   if (transM == kTrans)
     std::swap(M_row_stride, M_col_stride);

   Real *data = data_;
   const Real *Mdata = M.Data(), *vdata = v.Data();
   if (num_rows_ == 0) return;
   for (MatrixIndexT i = 0; i < num_rows; i++){
       for(MatrixIndexT j = 0; j < num_cols; j ++ ){
           data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride];
       }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddMatMatElements(const Real alpha,
                                          const MatrixBase<Real>& A,
                                          const MatrixBase<Real>& B,
                                          const Real beta) {
     KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
     KALDI_ASSERT(A.NumRows() == NumRows() && A.NumCols() == NumCols());
     Real *data = data_;
     const Real *dataA = A.Data();
     const Real *dataB = B.Data();

     for (MatrixIndexT i = 0; i < num_rows_; i++) {
         for (MatrixIndexT j = 0; j < num_cols_; j++) {
             data[j] = beta*data[j] + alpha*dataA[j]*dataB[j];
         }
         data += Stride();
         dataA += A.Stride();
         dataB += B.Stride();
     }
 }

 #if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
 // ****************************************************************************
 // ****************************************************************************
 template<typename Real>
 void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
                                    MatrixBase<Real> *V_in) {
   KALDI_ASSERT(s != NULL && U_in != this && V_in != this);

   Matrix<Real> tmpU, tmpV;
   if (U_in == NULL) tmpU.Resize(this->num_rows_, 1);  // work-space if U_in empty.
   if (V_in == NULL) tmpV.Resize(1, this->num_cols_);  // work-space if V_in empty.


   KaldiBlasInt M   = num_cols_;
   KaldiBlasInt N   = num_rows_;
   KaldiBlasInt LDA = Stride();

   KALDI_ASSERT(N>=M);  // NumRows >= columns.

   if (U_in) {
     KALDI_ASSERT((int)U_in->num_rows_ == N && (int)U_in->num_cols_ == M);
   }
   if (V_in) {
     KALDI_ASSERT((int)V_in->num_rows_ == M && (int)V_in->num_cols_ == M);
   }
   KALDI_ASSERT((int)s->Dim() == std::min(M, N));

   MatrixBase<Real> *U = (U_in ? U_in : &tmpU);
   MatrixBase<Real> *V = (V_in ? V_in : &tmpV);

   KaldiBlasInt V_stride      = V->Stride();
   KaldiBlasInt U_stride      = U->Stride();

   // Original LAPACK recipe
   // KaldiBlasInt l_work = std::max(std::max<long int>
   //   (1, 3*std::min(M, N)+std::max(M, N)), 5*std::min(M, N))*2;
   KaldiBlasInt l_work = -1;
   Real   work_query;
   KaldiBlasInt result;

   // query for work space
   char *u_job = const_cast<char*>(U_in ? "s" : "N");  // "s" == skinny, "N" == "none."
   char *v_job = const_cast<char*>(V_in ? "s" : "N");  // "s" == skinny, "N" == "none."
   clapack_Xgesvd(v_job, u_job,
                  &M, &N, data_, &LDA,
                  s->Data(),
                  V->Data(), &V_stride,
                  U->Data(), &U_stride,
                  &work_query, &l_work,
                  &result);

   KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong arguments");

   l_work = static_cast<KaldiBlasInt>(work_query);
   Real *p_work;
   void *temp;
   if ((p_work = static_cast<Real*>(
           KALDI_MEMALIGN(16, sizeof(Real)*l_work, &temp))) == NULL)
     throw std::bad_alloc();

   // perform svd
   clapack_Xgesvd(v_job, u_job,
                  &M, &N, data_, &LDA,
                  s->Data(),
                  V->Data(), &V_stride,
                  U->Data(), &U_stride,
                  p_work, &l_work,
                  &result);

   KALDI_ASSERT(result >= 0 && "Call to CLAPACK dgesvd_ called with wrong arguments");

   if (result != 0) {
     KALDI_WARN << "CLAPACK sgesvd_ : some weird convergence not satisfied";
   }
   KALDI_MEMALIGN_FREE(p_work);
 }

 #endif

 // Copy constructor.  Copies data to newly allocated memory.
 template<typename Real>
 Matrix<Real>::Matrix (const MatrixBase<Real> & M,
                       MatrixTransposeType trans/*=kNoTrans*/)
     : MatrixBase<Real>() {
   if (trans == kNoTrans) {
     Resize(M.num_rows_, M.num_cols_);
     this->CopyFromMat(M);
   } else {
     Resize(M.num_cols_, M.num_rows_);
     this->CopyFromMat(M, kTrans);
   }
 }

 // Copy constructor.  Copies data to newly allocated memory.
 template<typename Real>
 Matrix<Real>::Matrix (const Matrix<Real> & M):
     MatrixBase<Real>() {
   Resize(M.num_rows_, M.num_cols_);
   this->CopyFromMat(M);
 }

 template<typename Real>
 template<typename OtherReal>
 Matrix<Real>::Matrix(const MatrixBase<OtherReal> & M,
                      MatrixTransposeType trans) : MatrixBase<Real>() {
   if (trans == kNoTrans) {
     Resize(M.NumRows(), M.NumCols());
     this->CopyFromMat(M);
   } else {
     Resize(M.NumCols(), M.NumRows());
     this->CopyFromMat(M, kTrans);
   }
 }

 // Instantiate this constructor for float->double and double->float.
 template
 Matrix<float>::Matrix(const MatrixBase<double> & M,
                       MatrixTransposeType trans);
 template
 Matrix<double>::Matrix(const MatrixBase<float> & M,
                        MatrixTransposeType trans);

 template<typename Real>
 inline void Matrix<Real>::Init(const MatrixIndexT rows,
                                const MatrixIndexT cols,
                                const MatrixStrideType stride_type) {
   if (rows * cols == 0) {
     KALDI_ASSERT(rows == 0 && cols == 0);
     this->num_rows_ = 0;
     this->num_cols_ = 0;
     this->stride_ = 0;
     this->data_ = NULL;
     return;
   }
   KALDI_ASSERT(rows > 0 && cols > 0);
   MatrixIndexT skip, stride;
   size_t size;
   void *data;  // aligned memory block
   void *temp;  // memory block to be really freed

   // compute the size of skip and real cols
   skip = ((16 / sizeof(Real)) - cols % (16 / sizeof(Real)))
       % (16 / sizeof(Real));
   stride = cols + skip;
   size = static_cast<size_t>(rows) * static_cast<size_t>(stride)
       * sizeof(Real);

   // allocate the memory and set the right dimensions and parameters
   if (NULL != (data = KALDI_MEMALIGN(16, size, &temp))) {
     MatrixBase<Real>::data_        = static_cast<Real *> (data);
     MatrixBase<Real>::num_rows_      = rows;
     MatrixBase<Real>::num_cols_      = cols;
     MatrixBase<Real>::stride_  = (stride_type == kDefaultStride ? stride : cols);
   } else {
     throw std::bad_alloc();
   }
 }

 template<typename Real>
 void Matrix<Real>::Resize(const MatrixIndexT rows,
                           const MatrixIndexT cols,
                           MatrixResizeType resize_type,
                           MatrixStrideType stride_type) {
   // the next block uses recursion to handle what we have to do if
   // resize_type == kCopyData.
   if (resize_type == kCopyData) {
     if (this->data_ == NULL || rows == 0) resize_type = kSetZero;  // nothing to copy.
     else if (rows == this->num_rows_ && cols == this->num_cols_ &&
              (stride_type == kDefaultStride || this->stride_ == this->num_cols_)) { return; } // nothing to do.
     else {
       // set tmp to a matrix of the desired size; if new matrix
       // is bigger in some dimension, zero it.
       MatrixResizeType new_resize_type =
           (rows > this->num_rows_ || cols > this->num_cols_) ? kSetZero : kUndefined;
       Matrix<Real> tmp(rows, cols, new_resize_type, stride_type);
       MatrixIndexT rows_min = std::min(rows, this->num_rows_),
           cols_min = std::min(cols, this->num_cols_);
       tmp.Range(0, rows_min, 0, cols_min).
           CopyFromMat(this->Range(0, rows_min, 0, cols_min));
       tmp.Swap(this);
       // and now let tmp go out of scope, deleting what was in *this.
       return;
     }
   }
   // At this point, resize_type == kSetZero or kUndefined.

   if (MatrixBase<Real>::data_ != NULL) {
     if (rows == MatrixBase<Real>::num_rows_
         && cols == MatrixBase<Real>::num_cols_) {
       if (resize_type == kSetZero)
         this->SetZero();
       return;
     }
     else
       Destroy();
   }
   Init(rows, cols, stride_type);
   if (resize_type == kSetZero) MatrixBase<Real>::SetZero();
 }

 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::CopyFromMat(const MatrixBase<OtherReal> &M,
                                    MatrixTransposeType Trans) {
   if (sizeof(Real) == sizeof(OtherReal) &&
       static_cast<const void*>(M.Data()) ==
       static_cast<const void*>(this->Data())) {
     // CopyFromMat called on same data.  Nothing to do (except sanity checks).
     KALDI_ASSERT(Trans == kNoTrans && M.NumRows() == NumRows() &&
                  M.NumCols() == NumCols() && M.Stride() == Stride());
     return;
   }
   if (Trans == kNoTrans) {
     KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == M.NumCols());
     for (MatrixIndexT i = 0; i < num_rows_; i++)
       (*this).Row(i).CopyFromVec(M.Row(i));
   } else {
     KALDI_ASSERT(num_cols_ == M.NumRows() && num_rows_ == M.NumCols());
     int32 this_stride = stride_, other_stride = M.Stride();
     Real *this_data = data_;
     const OtherReal *other_data = M.Data();
     for (MatrixIndexT i = 0; i < num_rows_; i++)
       for (MatrixIndexT j = 0; j < num_cols_; j++)
         this_data[i * this_stride + j] = other_data[j * other_stride + i];
   }
 }

 // template instantiations.
 template
 void MatrixBase<float>::CopyFromMat(const MatrixBase<double> & M,
                                     MatrixTransposeType Trans);
 template
 void MatrixBase<double>::CopyFromMat(const MatrixBase<float> & M,
                                      MatrixTransposeType Trans);
 template
 void MatrixBase<float>::CopyFromMat(const MatrixBase<float> & M,
                                     MatrixTransposeType Trans);
 template
 void MatrixBase<double>::CopyFromMat(const MatrixBase<double> & M,
                                      MatrixTransposeType Trans);

 // Specialize the template for CopyFromSp for float, float.
 template<>
 template<>
 void MatrixBase<float>::CopyFromSp(const SpMatrix<float> & M) {
   KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
   MatrixIndexT num_rows = num_rows_, stride = stride_;
   const float *Mdata = M.Data();
   float *row_data = data_, *col_data = data_;
   for (MatrixIndexT i = 0; i < num_rows; i++) {
     cblas_scopy(i+1, Mdata, 1, row_data, 1); // copy to the row.
     cblas_scopy(i, Mdata, 1, col_data, stride); // copy to the column.
     Mdata += i+1;
     row_data += stride;
     col_data += 1;
   }
 }

 // Specialize the template for CopyFromSp for double, double.
 template<>
 template<>
 void MatrixBase<double>::CopyFromSp(const SpMatrix<double> & M) {
   KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
   MatrixIndexT num_rows = num_rows_, stride = stride_;
   const double *Mdata = M.Data();
   double *row_data = data_, *col_data = data_;
   for (MatrixIndexT i = 0; i < num_rows; i++) {
     cblas_dcopy(i+1, Mdata, 1, row_data, 1); // copy to the row.
     cblas_dcopy(i, Mdata, 1, col_data, stride); // copy to the column.
     Mdata += i+1;
     row_data += stride;
     col_data += 1;
   }
 }


 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::CopyFromSp(const SpMatrix<OtherReal> & M) {
   KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
   // MORE EFFICIENT IF LOWER TRIANGULAR!  Reverse code otherwise.
   for (MatrixIndexT i = 0; i < num_rows_; i++) {
     for (MatrixIndexT j = 0; j < i; j++) {
       (*this)(j, i)  = (*this)(i, j) = M(i, j);
     }
     (*this)(i, i) = M(i, i);
   }
 }

 // Instantiate this function
 template
 void MatrixBase<float>::CopyFromSp(const SpMatrix<double> & M);
 template
 void MatrixBase<double>::CopyFromSp(const SpMatrix<float> & M);


 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::CopyFromTp(const TpMatrix<OtherReal> & M,
                                   MatrixTransposeType Trans) {
   if (Trans == kNoTrans) {
     KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
     SetZero();
     Real *out_i = data_;
     const OtherReal *in_i = M.Data();
     for (MatrixIndexT i = 0; i < num_rows_; i++, out_i += stride_, in_i += i) {
       for (MatrixIndexT j = 0; j <= i; j++)
         out_i[j] = in_i[j];
     }
   } else {
     SetZero();
     KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
     MatrixIndexT stride = stride_;
     Real *out_i = data_;
     const OtherReal *in_i = M.Data();
     for (MatrixIndexT i = 0; i < num_rows_; i++, out_i ++, in_i += i) {
       for (MatrixIndexT j = 0; j <= i; j++)
         out_i[j*stride] = in_i[j];
     }
   }
 }

 template
 void MatrixBase<float>::CopyFromTp(const TpMatrix<float> & M,
                                    MatrixTransposeType trans);
 template
 void MatrixBase<float>::CopyFromTp(const TpMatrix<double> & M,
                                    MatrixTransposeType trans);
 template
 void MatrixBase<double>::CopyFromTp(const TpMatrix<float> & M,
                                     MatrixTransposeType trans);
 template
 void MatrixBase<double>::CopyFromTp(const TpMatrix<double> & M,
                                     MatrixTransposeType trans);


 template<typename Real>
 void MatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &rv) {
   if (rv.Dim() == num_rows_*num_cols_) {
     if (stride_ == num_cols_) {
       // one big copy operation.
       const Real *rv_data = rv.Data();
       std::memcpy(data_, rv_data, sizeof(Real)*num_rows_*num_cols_);
     } else {
       const Real *rv_data = rv.Data();
       for (MatrixIndexT r = 0; r < num_rows_; r++) {
         Real *row_data = RowData(r);
         for (MatrixIndexT c = 0; c < num_cols_; c++) {
           row_data[c] = rv_data[c];
         }
         rv_data += num_cols_;
       }
     }
   } else if (rv.Dim() == num_cols_) {
     const Real *rv_data = rv.Data();
     for (MatrixIndexT r = 0; r < num_rows_; r++)
       std::memcpy(RowData(r), rv_data, sizeof(Real)*num_cols_);
   } else {
     KALDI_ERR << "Wrong sized arguments";
   }
 }

 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::CopyRowsFromVec(const VectorBase<OtherReal> &rv) {
   if (rv.Dim() == num_rows_*num_cols_) {
     const OtherReal *rv_data = rv.Data();
     for (MatrixIndexT r = 0; r < num_rows_; r++) {
       Real *row_data = RowData(r);
       for (MatrixIndexT c = 0; c < num_cols_; c++) {
         row_data[c] = static_cast<Real>(rv_data[c]);
       }
       rv_data += num_cols_;
     }
   } else if (rv.Dim() == num_cols_) {
     const OtherReal *rv_data = rv.Data();
     Real *first_row_data = RowData(0);
     for (MatrixIndexT c = 0; c < num_cols_; c++)
       first_row_data[c] = rv_data[c];
     for (MatrixIndexT r = 1; r < num_rows_; r++)
       std::memcpy(RowData(r), first_row_data, sizeof(Real)*num_cols_);
   } else {
     KALDI_ERR << "Wrong sized arguments.";
   }
 }


 template
 void MatrixBase<float>::CopyRowsFromVec(const VectorBase<double> &rv);
 template
 void MatrixBase<double>::CopyRowsFromVec(const VectorBase<float> &rv);

 template<typename Real>
 void MatrixBase<Real>::CopyColsFromVec(const VectorBase<Real> &rv) {
   if (rv.Dim() == num_rows_*num_cols_) {
     const Real *v_inc_data = rv.Data();
     Real *m_inc_data = data_;

     for (MatrixIndexT c = 0; c < num_cols_; c++) {
       for (MatrixIndexT r = 0; r < num_rows_; r++) {
         m_inc_data[r * stride_] = v_inc_data[r];
       }
       v_inc_data += num_rows_;
       m_inc_data ++;
     }
   } else if (rv.Dim() == num_rows_) {
     const Real *v_inc_data = rv.Data();
     Real *m_inc_data = data_;
     for (MatrixIndexT r = 0; r < num_rows_; r++) {
       Real value = *(v_inc_data++);
       for (MatrixIndexT c = 0; c < num_cols_; c++)
         m_inc_data[c] = value;
       m_inc_data += stride_;
     }
   } else {
     KALDI_ERR << "Wrong size of arguments.";
   }
 }


 template<typename Real>
 void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIndexT row) {
   KALDI_ASSERT(rv.Dim() == num_cols_ &&
                static_cast<UnsignedMatrixIndexT>(row) <
                static_cast<UnsignedMatrixIndexT>(num_rows_));

   const Real *rv_data = rv.Data();
   Real *row_data = RowData(row);

   std::memcpy(row_data, rv_data, num_cols_ * sizeof(Real));
 }

 template<typename Real>
 void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
   KALDI_ASSERT(rv.Dim() == std::min(num_cols_, num_rows_));
   const Real *rv_data = rv.Data(), *rv_end = rv_data + rv.Dim();
   Real *my_data = this->Data();
   for (; rv_data != rv_end; rv_data++, my_data += (this->stride_+1))
     *my_data = *rv_data;
 }

 template<typename Real>
 void MatrixBase<Real>::CopyColFromVec(const VectorBase<Real> &rv,
                                       const MatrixIndexT col) {
   KALDI_ASSERT(rv.Dim() == num_rows_ &&
                static_cast<UnsignedMatrixIndexT>(col) <
                static_cast<UnsignedMatrixIndexT>(num_cols_));

   const Real *rv_data = rv.Data();
   Real *col_data = data_ + col;

   for (MatrixIndexT r = 0; r < num_rows_; r++)
     col_data[r * stride_] = rv_data[r];
 }


 template<typename Real>
 void Matrix<Real>::RemoveRow(MatrixIndexT i) {
   KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                static_cast<UnsignedMatrixIndexT>(MatrixBase<Real>::num_rows_)
                && "Access out of matrix");
   for (MatrixIndexT j = i + 1; j <  MatrixBase<Real>::num_rows_; j++)
     MatrixBase<Real>::Row(j-1).CopyFromVec( MatrixBase<Real>::Row(j));
   MatrixBase<Real>::num_rows_--;
 }

 template<typename Real>
 void Matrix<Real>::Destroy() {
   // we need to free the data block if it was defined
   if (NULL != MatrixBase<Real>::data_)
     KALDI_MEMALIGN_FREE( MatrixBase<Real>::data_);
   MatrixBase<Real>::data_ = NULL;
   MatrixBase<Real>::num_rows_ = MatrixBase<Real>::num_cols_
       = MatrixBase<Real>::stride_ = 0;
 }


 template<typename Real>
 void MatrixBase<Real>::MulElements(const MatrixBase<Real> &a) {
   KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);

   if (num_cols_ == stride_ && num_cols_ == a.stride_) {
     mul_elements(num_rows_ * num_cols_, a.data_, data_);
   } else {
     MatrixIndexT a_stride = a.stride_, stride = stride_;
     Real *data = data_, *a_data = a.data_;
     for (MatrixIndexT i = 0; i < num_rows_; i++) {
       mul_elements(num_cols_, a_data, data);
       a_data += a_stride;
       data += stride;
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::DivElements(const MatrixBase<Real> &a) {
   KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);
   MatrixIndexT i;
   MatrixIndexT j;

   for (i = 0; i < num_rows_; i++) {
     for (j = 0; j < num_cols_; j++) {
       (*this)(i, j) /= a(i, j);
     }
   }
 }

 template<typename Real>
 Real MatrixBase<Real>::Sum() const {
   double sum = 0.0;

   for (MatrixIndexT i = 0; i < num_rows_; i++) {
     for (MatrixIndexT j = 0; j < num_cols_; j++) {
       sum += (*this)(i, j);
     }
   }

   return (Real)sum;
 }

 template<typename Real> void MatrixBase<Real>::Max(const MatrixBase<Real> &A) {
   KALDI_ASSERT(A.NumRows() == NumRows() && A.NumCols() == NumCols());
   for (MatrixIndexT row = 0; row < num_rows_; row++) {
     Real *row_data = RowData(row);
     const Real *other_row_data = A.RowData(row);
     MatrixIndexT num_cols = num_cols_;
     for (MatrixIndexT col = 0; col < num_cols; col++) {
       row_data[col] = std::max(row_data[col],
                                other_row_data[col]);
     }
   }
 }

 template<typename Real> void MatrixBase<Real>::Min(const MatrixBase<Real> &A) {
   KALDI_ASSERT(A.NumRows() == NumRows() && A.NumCols() == NumCols());
   for (MatrixIndexT row = 0; row < num_rows_; row++) {
     Real *row_data = RowData(row);
     const Real *other_row_data = A.RowData(row);
     MatrixIndexT num_cols = num_cols_;
     for (MatrixIndexT col = 0; col < num_cols; col++) {
       row_data[col] = std::min(row_data[col],
                                other_row_data[col]);
     }
   }
 }


 template<typename Real> void MatrixBase<Real>::Scale(Real alpha) {
   if (alpha == 1.0) return;
   if (num_rows_ == 0) return;
   if (num_cols_ == stride_) {
     cblas_Xscal(static_cast<size_t>(num_rows_) * static_cast<size_t>(num_cols_),
                 alpha, data_,1);
   } else {
     Real *data = data_;
     for (MatrixIndexT i = 0; i < num_rows_; ++i, data += stride_) {
       cblas_Xscal(num_cols_, alpha, data,1);
     }
   }
 }

 template<typename Real>  // scales each row by scale[i].
 void MatrixBase<Real>::MulRowsVec(const VectorBase<Real> &scale) {
   KALDI_ASSERT(scale.Dim() == num_rows_);
   MatrixIndexT M = num_rows_, N = num_cols_;

   for (MatrixIndexT i = 0; i < M; i++) {
     Real this_scale = scale(i);
     for (MatrixIndexT j = 0; j < N; j++) {
       (*this)(i, j) *= this_scale;
     }
   }
 }


 template<typename Real>
 void MatrixBase<Real>::MulRowsGroupMat(const MatrixBase<Real> &src) {
   KALDI_ASSERT(src.NumRows() == this->NumRows() &&
                this->NumCols() % src.NumCols() == 0);
   int32 group_size = this->NumCols() / src.NumCols(),
       num_groups = this->NumCols() / group_size,
       num_rows = this->NumRows();

   for (MatrixIndexT i = 0; i < num_rows; i++) {
     Real *data = this->RowData(i);
     for (MatrixIndexT j = 0; j < num_groups; j++, data += group_size) {
       Real scale = src(i, j);
       cblas_Xscal(group_size, scale, data, 1);
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &input,
                                        const MatrixBase<Real> &output,
                                        Real power) {
   KALDI_ASSERT(input.NumCols() == this->NumCols() && input.NumRows() == this->NumRows());
   KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 &&
                this->NumRows() == output.NumRows());

   int group_size = this->NumCols() / output.NumCols(),
     num_rows = this->NumRows(), num_cols = this->NumCols();

   if (power == 1.0) {
     for (MatrixIndexT i = 0; i < num_rows; i++) {
       for (MatrixIndexT j = 0; j < num_cols; j++) {
         Real input_val = input(i, j);
         (*this)(i, j) = (input_val == 0 ? 0 : (input_val > 0 ? 1 : -1));
       }
     }
   } else if (power == std::numeric_limits<Real>::infinity()) {
     for (MatrixIndexT i = 0; i < num_rows; i++) {
       for (MatrixIndexT j = 0; j < num_cols; j++) {
         Real output_val = output(i, j / group_size), input_val = input(i, j);
         if (output_val == 0)
           (*this)(i, j) = 0;
         else
           (*this)(i, j) = (std::abs(input_val) == output_val ? 1.0 : 0.0)
               * (input_val >= 0 ? 1 : -1);
       }
     }
   } else {
     for (MatrixIndexT i = 0; i < num_rows; i++) {
       for (MatrixIndexT j = 0; j < num_cols; j++) {
         Real output_val = output(i, j / group_size),
           input_val = input(i, j);
         if (output_val == 0)
           (*this)(i, j) = 0;
          else
             (*this)(i, j) = pow(std::abs(input_val), power - 1) *
               pow(output_val, 1 - power) * (input_val >= 0 ? 1 : -1) ;
       }
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::GroupMaxDeriv(const MatrixBase<Real> &input,
                                      const MatrixBase<Real> &output) {
   KALDI_ASSERT(input.NumCols() == this->NumCols() &&
               input.NumRows() == this->NumRows());
   KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 &&
                this->NumRows() == output.NumRows());

   int group_size = this->NumCols() / output.NumCols(),
       num_rows = this->NumRows(), num_cols = this->NumCols();

   for (MatrixIndexT i = 0; i < num_rows; i++) {
     for (MatrixIndexT j = 0; j < num_cols; j++) {
       Real input_val = input(i, j);
       Real output_val = output(i, j / group_size);
       (*this)(i, j) = (input_val == output_val ? 1 : 0);
     }
   }
 }

 template<typename Real>  // scales each column by scale[i].
 void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
   KALDI_ASSERT(scale.Dim() == num_cols_);
   for (MatrixIndexT i = 0; i < num_rows_; i++) {
     for (MatrixIndexT j = 0; j < num_cols_; j++) {
       Real this_scale = scale(j);
       (*this)(i, j) *= this_scale;
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::SetZero() {
   if (num_cols_ == stride_)
     memset(data_, 0, sizeof(Real)*num_rows_*num_cols_);
   else
     for (MatrixIndexT row = 0; row < num_rows_; row++)
       memset(data_ + row*stride_, 0, sizeof(Real)*num_cols_);
 }

 template<typename Real>
 void MatrixBase<Real>::Set(Real value) {
   for (MatrixIndexT row = 0; row < num_rows_; row++) {
     for (MatrixIndexT col = 0; col < num_cols_; col++) {
       (*this)(row, col) = value;
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::SetUnit() {
   SetZero();
   for (MatrixIndexT row = 0; row < std::min(num_rows_, num_cols_); row++)
     (*this)(row, row) = 1.0;
 }

 template<typename Real>
 void MatrixBase<Real>::SetRandn() {
   kaldi::RandomState rstate;
   for (MatrixIndexT row = 0; row < num_rows_; row++) {
     Real *row_data = this->RowData(row);
     MatrixIndexT nc = (num_cols_ % 2 == 1) ? num_cols_ - 1 : num_cols_;
     for (MatrixIndexT col = 0; col < nc; col += 2) {
       kaldi::RandGauss2(row_data + col, row_data + col + 1, &rstate);
     }
     if (nc != num_cols_) row_data[nc] = static_cast<Real>(kaldi::RandGauss(&rstate));
   }
 }

 template<typename Real>
 void MatrixBase<Real>::SetRandUniform() {
   kaldi::RandomState rstate;
   for (MatrixIndexT row = 0; row < num_rows_; row++) {
     Real *row_data = this->RowData(row);
     for (MatrixIndexT col = 0; col < num_cols_; col++, row_data++) {
       *row_data = static_cast<Real>(kaldi::RandUniform(&rstate));
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
   if (!os.good()) {
     KALDI_ERR << "Failed to write matrix to stream: stream not good";
   }
   if (binary) {  // Use separate binary and text formats,
     // since in binary mode we need to know if it's float or double.
     std::string my_token = (sizeof(Real) == 4 ? "FM" : "DM");

     WriteToken(os, binary, my_token);
     {
       int32 rows = this->num_rows_;  // make the size 32-bit on disk.
       int32 cols = this->num_cols_;
       KALDI_ASSERT(this->num_rows_ == (MatrixIndexT) rows);
       KALDI_ASSERT(this->num_cols_ == (MatrixIndexT) cols);
       WriteBasicType(os, binary, rows);
       WriteBasicType(os, binary, cols);
     }
     if (Stride() == NumCols())
       os.write(reinterpret_cast<const char*> (Data()), sizeof(Real)
                * static_cast<size_t>(num_rows_) * static_cast<size_t>(num_cols_));
     else
       for (MatrixIndexT i = 0; i < num_rows_; i++)
         os.write(reinterpret_cast<const char*> (RowData(i)), sizeof(Real)
                  * num_cols_);
     if (!os.good()) {
       KALDI_ERR << "Failed to write matrix to stream";
     }
   } else {  // text mode.
     if (num_cols_ == 0) {
       os << " [ ]\n";
     } else {
       os << " [";
       for (MatrixIndexT i = 0; i < num_rows_; i++) {
         os << "\n  ";
         for (MatrixIndexT j = 0; j < num_cols_; j++)
           os << (*this)(i, j) << " ";
       }
       os << "]\n";
     }
   }
 }


 template<typename Real>
 void MatrixBase<Real>::Read(std::istream & is, bool binary, bool add) {
   if (add) {
     Matrix<Real> tmp(num_rows_, num_cols_);
     tmp.Read(is, binary, false);  // read without adding.
     if (tmp.num_rows_ != this->num_rows_ || tmp.num_cols_ != this->num_cols_)
       KALDI_ERR << "MatrixBase::Read, size mismatch "
                 << this->num_rows_ << ", " << this->num_cols_
                 << " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
     this->AddMat(1.0, tmp);
     return;
   }
   // now assume add == false.

   //  In order to avoid rewriting this, we just declare a Matrix and
   // use it to read the data, then copy.
   Matrix<Real> tmp;
   tmp.Read(is, binary, false);
   if (tmp.NumRows() != NumRows() || tmp.NumCols() != NumCols()) {
     KALDI_ERR << "MatrixBase<Real>::Read, size mismatch "
               << NumRows() << " x " << NumCols() << " versus "
               << tmp.NumRows() << " x " << tmp.NumCols();
   }
   CopyFromMat(tmp);
 }


 template<typename Real>
 void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
   if (add) {
     Matrix<Real> tmp;
     tmp.Read(is, binary, false);  // read without adding.
     if (this->num_rows_ == 0) this->Resize(tmp.num_rows_, tmp.num_cols_);
     else {
       if (this->num_rows_ != tmp.num_rows_ || this->num_cols_ != tmp.num_cols_) {
         if (tmp.num_rows_ == 0) return;  // do nothing in this case.
         else KALDI_ERR << "Matrix::Read, size mismatch "
                        << this->num_rows_ <<  ", " << this->num_cols_
                        << " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
       }
     }
     this->AddMat(1.0, tmp);
     return;
   }

   // now assume add == false.
   MatrixIndexT pos_at_start = is.tellg();
   std::ostringstream specific_error;

   if (binary) {  // Read in binary mode.
     int peekval = Peek(is, binary);
     if (peekval == 'C') {
       // This code enables us to read CompressedMatrix as a regular matrix.
       CompressedMatrix compressed_mat;
       compressed_mat.Read(is, binary); // at this point, add == false.
       this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols());
       compressed_mat.CopyToMat(this);
       return;
     }
     const char *my_token =  (sizeof(Real) == 4 ? "FM" : "DM");
     char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
     if (peekval == other_token_start) {  // need to instantiate the other type to read it.
       typedef typename OtherReal<Real>::Real OtherType;  // if Real == float, OtherType == double, and vice versa.
       Matrix<OtherType> other(this->num_rows_, this->num_cols_);
       other.Read(is, binary, false);  // add is false at this point anyway.
       this->Resize(other.NumRows(), other.NumCols());
       this->CopyFromMat(other);
       return;
     }
     std::string token;
     ReadToken(is, binary, &token);
     if (token != my_token) {
       if (token.length() > 20) token = token.substr(0, 17) + "...";
       specific_error << ": Expected token " << my_token << ", got " << token;
       goto bad;
     }
     int32 rows, cols;
     ReadBasicType(is, binary, &rows);  // throws on error.
     ReadBasicType(is, binary, &cols);  // throws on error.
     if ((MatrixIndexT)rows != this->num_rows_ || (MatrixIndexT)cols != this->num_cols_) {
       this->Resize(rows, cols);
     }
     if (this->Stride() == this->NumCols() && rows*cols!=0) {
       is.read(reinterpret_cast<char*>(this->Data()),
               sizeof(Real)*rows*cols);
       if (is.fail()) goto bad;
     } else {
       for (MatrixIndexT i = 0; i < (MatrixIndexT)rows; i++) {
         is.read(reinterpret_cast<char*>(this->RowData(i)), sizeof(Real)*cols);
         if (is.fail()) goto bad;
       }
     }
     if (is.eof()) return;
     if (is.fail()) goto bad;
     return;
   } else {  // Text mode.
     std::string str;
     is >> str; // get a token
     if (is.fail()) { specific_error << ": Expected \"[\", got EOF"; goto bad; }
     // if ((str.compare("DM") == 0) || (str.compare("FM") == 0)) {  // Back compatibility.
     // is >> str;  // get #rows
     //  is >> str;  // get #cols
     //  is >> str;  // get "["
     // }
     if (str == "[]") { Resize(0, 0); return; } // Be tolerant of variants.
     else if (str != "[") {
       if (str.length() > 20) str = str.substr(0, 17) + "...";
       specific_error << ": Expected \"[\", got \"" << str << '"';
       goto bad;
     }
     // At this point, we have read "[".
     std::vector<std::vector<Real>* > data;
     std::vector<Real> *cur_row = new std::vector<Real>;
     while (1) {
       int i = is.peek();
       if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto cleanup; }
       else if (static_cast<char>(i) == ']') {  // Finished reading matrix.
         is.get();  // eat the "]".
         i = is.peek();
         if (static_cast<char>(i) == '\r') {
           is.get();
           is.get();  // get \r\n (must eat what we wrote)
         } else if (static_cast<char>(i) == '\n') { is.get(); } // get \n (must eat what we wrote)
         if (is.fail()) {
           KALDI_WARN << "After end of matrix data, read error.";
           // we got the data we needed, so just warn for this error.
         }
         // Now process the data.
         if (!cur_row->empty()) data.push_back(cur_row);
         else delete(cur_row);
         cur_row = NULL;
         if (data.empty()) { this->Resize(0, 0); return; }
         else {
           int32 num_rows = data.size(), num_cols = data[0]->size();
           this->Resize(num_rows, num_cols);
           for (int32 i = 0; i < num_rows; i++) {
             if (static_cast<int32>(data[i]->size()) != num_cols) {
               specific_error << "Matrix has inconsistent #cols: " << num_cols
                              << " vs." << data[i]->size() << " (processing row"
                              << i << ")";
               goto cleanup;
             }
             for (int32 j = 0; j < num_cols; j++)
               (*this)(i, j) = (*(data[i]))[j];
             delete data[i];
             data[i] = NULL;
           }
         }
         return;
       } else if (static_cast<char>(i) == '\n' || static_cast<char>(i) == ';') {
         // End of matrix row.
         is.get();
         if (cur_row->size() != 0) {
           data.push_back(cur_row);
           cur_row = new std::vector<Real>;
           cur_row->reserve(data.back()->size());
         }
       } else if ( (i >= '0' && i <= '9') || i == '-' ) {  // A number...
         Real r;
         is >> r;
         if (is.fail()) {
           specific_error << "Stream failure/EOF while reading matrix data.";
           goto cleanup;
         }
         cur_row->push_back(r);
       } else if (isspace(i)) {
         is.get();  // eat the space and do nothing.
       } else {  // NaN or inf or error.
         std::string str;
         is >> str;
         if (!KALDI_STRCASECMP(str.c_str(), "inf") ||
             !KALDI_STRCASECMP(str.c_str(), "infinity")) {
           cur_row->push_back(std::numeric_limits<Real>::infinity());
           KALDI_WARN << "Reading infinite value into matrix.";
         } else if (!KALDI_STRCASECMP(str.c_str(), "nan")) {
           cur_row->push_back(std::numeric_limits<Real>::quiet_NaN());
           KALDI_WARN << "Reading NaN value into matrix.";
         } else {
           if (str.length() > 20) str = str.substr(0, 17) + "...";
           specific_error << "Expecting numeric matrix data, got " << str;
           goto cleanup;
         }
       }
     }
     // Note, we never leave the while () loop before this
     // line (we return from it.)
  cleanup: // We only reach here in case of error in the while loop above.
     if(cur_row != NULL)
       delete cur_row;
     for (size_t i = 0; i < data.size(); i++)
       if(data[i] != NULL)
         delete data[i];
     // and then go on to "bad" below, where we print error.
   }
 bad:
   KALDI_ERR << "Failed to read matrix from stream.  " << specific_error.str()
             << " File position at start is "
             << pos_at_start << ", currently " << is.tellg();
 }


 // Constructor... note that this is not const-safe as it would
 // be quite complicated to implement a "const SubMatrix" class that
 // would not allow its contents to be changed.
 template<typename Real>
 SubMatrix<Real>::SubMatrix(const MatrixBase<Real> &M,
                            const MatrixIndexT ro,
                            const MatrixIndexT r,
                            const MatrixIndexT co,
                            const MatrixIndexT c) {
   if (r == 0 || c == 0) {
     // we support the empty sub-matrix as a special case.
     KALDI_ASSERT(c == 0 && r == 0);
     this->data_ = NULL;
     this->num_cols_ = 0;
     this->num_rows_ = 0;
     this->stride_ = 0;
     return;
   }
   KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(ro) <
                static_cast<UnsignedMatrixIndexT>(M.num_rows_) &&
                static_cast<UnsignedMatrixIndexT>(co) <
                static_cast<UnsignedMatrixIndexT>(M.num_cols_) &&
                static_cast<UnsignedMatrixIndexT>(r) <=
                static_cast<UnsignedMatrixIndexT>(M.num_rows_ - ro) &&
                static_cast<UnsignedMatrixIndexT>(c) <=
                static_cast<UnsignedMatrixIndexT>(M.num_cols_ - co));
   // point to the begining of window
   MatrixBase<Real>::num_rows_ = r;
   MatrixBase<Real>::num_cols_ = c;
   MatrixBase<Real>::stride_ = M.Stride();
   MatrixBase<Real>::data_ = M.Data_workaround() +
       static_cast<size_t>(co) +
       static_cast<size_t>(ro) * static_cast<size_t>(M.Stride());
 }


 template<typename Real>
 SubMatrix<Real>::SubMatrix(Real *data,
                            MatrixIndexT num_rows,
                            MatrixIndexT num_cols,
                            MatrixIndexT stride):
     MatrixBase<Real>(data, num_cols, num_rows, stride) { // caution: reversed order!
   if (data == NULL) {
     KALDI_ASSERT(num_rows * num_cols == 0);
     this->num_rows_ = 0;
     this->num_cols_ = 0;
     this->stride_ = 0;
   } else {
     KALDI_ASSERT(this->stride_ >= this->num_cols_);
   }
 }


 template<typename Real>
 void MatrixBase<Real>::Add(const Real alpha) {
   Real *data = data_;
   MatrixIndexT stride = stride_;
   for (MatrixIndexT r = 0; r < num_rows_; r++)
     for (MatrixIndexT c = 0; c < num_cols_; c++)
       data[c + stride*r] += alpha;
 }

 template<typename Real>
 void MatrixBase<Real>::AddToDiag(const Real alpha) {
   Real *data = data_;
   MatrixIndexT this_stride = stride_ + 1,
       num_to_add = std::min(num_rows_, num_cols_);
   for (MatrixIndexT r = 0; r < num_to_add; r++)
     data[r * this_stride] += alpha;
 }


 template<typename Real>
 Real MatrixBase<Real>::Cond() const {
   KALDI_ASSERT(num_rows_ > 0&&num_cols_ > 0);
   Vector<Real> singular_values(std::min(num_rows_, num_cols_));
   Svd(&singular_values);  // Get singular values...
   Real min = singular_values(0), max = singular_values(0);  // both absolute values...
   for (MatrixIndexT i = 1;i < singular_values.Dim();i++) {
     min = std::min((Real)std::abs(singular_values(i)), min); max = std::max((Real)std::abs(singular_values(i)), max);
   }
   if (min > 0) return max/min;
   else return std::numeric_limits<Real>::infinity();
 }

 template<typename Real>
 Real MatrixBase<Real>::Trace(bool check_square) const  {
   KALDI_ASSERT(!check_square || num_rows_ == num_cols_);
   Real ans = 0.0;
   for (MatrixIndexT r = 0;r < std::min(num_rows_, num_cols_);r++) ans += data_ [r + stride_*r];
   return ans;
 }

 template<typename Real>
 Real MatrixBase<Real>::Max() const {
   KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
   Real ans= *data_;
   for (MatrixIndexT r = 0; r < num_rows_; r++)
     for (MatrixIndexT c = 0; c < num_cols_; c++)
       if (data_[c + stride_*r] > ans)
         ans = data_[c + stride_*r];
   return ans;
 }

 template<typename Real>
 Real MatrixBase<Real>::Min() const {
   KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
   Real ans= *data_;
   for (MatrixIndexT r = 0; r < num_rows_; r++)
     for (MatrixIndexT c = 0; c < num_cols_; c++)
       if (data_[c + stride_*r] < ans)
         ans = data_[c + stride_*r];
   return ans;
 }


 template <typename Real>
 void MatrixBase<Real>::AddMatMatMat(Real alpha,
                                     const MatrixBase<Real> &A, MatrixTransposeType transA,
                                     const MatrixBase<Real> &B, MatrixTransposeType transB,
                                     const MatrixBase<Real> &C, MatrixTransposeType transC,
                                     Real beta) {
   // Note on time taken with different orders of computation.  Assume not transposed in this /
   // discussion. Firstly, normalize expressions using A.NumCols == B.NumRows and B.NumCols == C.NumRows, prefer
   // rows where there is a choice.
   // time taken for (AB) is:  A.NumRows*B.NumRows*C.Rows
   // time taken for (AB)C is A.NumRows*C.NumRows*C.Cols
   // so this order is A.NumRows*B.NumRows*C.NumRows + A.NumRows*C.NumRows*C.NumCols.

   // time taken for (BC) is: B.NumRows*C.NumRows*C.Cols
   // time taken for A(BC) is: A.NumRows*B.NumRows*C.Cols
   // so this order is B.NumRows*C.NumRows*C.NumCols + A.NumRows*B.NumRows*C.Cols

   MatrixIndexT ARows = A.num_rows_, ACols = A.num_cols_, BRows = B.num_rows_, BCols = B.num_cols_,
       CRows = C.num_rows_, CCols = C.num_cols_;
   if (transA == kTrans) std::swap(ARows, ACols);
   if (transB == kTrans) std::swap(BRows, BCols);
   if (transC == kTrans) std::swap(CRows, CCols);

   MatrixIndexT AB_C_time = ARows*BRows*CRows + ARows*CRows*CCols;
   MatrixIndexT A_BC_time = BRows*CRows*CCols + ARows*BRows*CCols;

   if (AB_C_time < A_BC_time) {
     Matrix<Real> AB(ARows, BCols);
     AB.AddMatMat(1.0, A, transA, B, transB, 0.0);  // AB = A * B.
     (*this).AddMatMat(alpha, AB, kNoTrans, C, transC, beta);
   } else {
     Matrix<Real> BC(BRows, CCols);
     BC.AddMatMat(1.0, B, transB, C, transC, 0.0);  // BC = B * C.
     (*this).AddMatMat(alpha, A, transA, BC, kNoTrans, beta);
   }
 }


 template<typename Real>
 void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) {
   // Svd, *this = U*diag(s)*Vt.
   // With (*this).num_rows_ == m, (*this).num_cols_ == n,
   // Support only skinny Svd with m>=n (NumRows>=NumCols), and zero sizes for U and Vt mean
   // we do not want that output.  We expect that s.Dim() == m,
   // U is either 0 by 0 or m by n, and rv is either 0 by 0 or n by n.
   // Throws exception on error.

   KALDI_ASSERT(num_rows_>=num_cols_ && "Svd requires that #rows by >= #cols.");  // For compatibility with JAMA code.
   KALDI_ASSERT(s->Dim() == num_cols_);  // s should be the smaller dim.
   KALDI_ASSERT(U == NULL || (U->num_rows_ == num_rows_&&U->num_cols_ == num_cols_));
   KALDI_ASSERT(Vt == NULL || (Vt->num_rows_ == num_cols_&&Vt->num_cols_ == num_cols_));

   Real prescale = 1.0;
   if ( std::abs((*this)(0, 0) ) < 1.0e-30) {  // Very tiny value... can cause problems in Svd.
     Real max_elem = LargestAbsElem();
     if (max_elem != 0) {
       prescale = 1.0 / max_elem;
       if (std::abs(prescale) == std::numeric_limits<Real>::infinity()) { prescale = 1.0e+40; }
       (*this).Scale(prescale);
     }
   }

 #if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
   // "S" == skinny Svd (only one we support because of compatibility with Jama one which is only skinny),
   // "N"== no eigenvectors wanted.
   LapackGesvd(s, U, Vt);
 #else
   /*  if (num_rows_ > 1 && num_cols_ > 1 && (*this)(0, 0) == (*this)(1, 1)
       && Max() == Min() && (*this)(0, 0) != 0.0) { // special case that JamaSvd sometimes crashes on.
       KALDI_WARN << "Jama SVD crashes on this type of matrix, perturbing it to prevent crash.";
       for(int32 i = 0; i < NumRows(); i++)
       (*this)(i, i)  *= 1.00001;
       }*/
   bool ans = JamaSvd(s, U, Vt);
   if (Vt != NULL) Vt->Transpose();  // possibly to do: change this and also the transpose inside the JamaSvd routine.  note, Vt is square.
   if (!ans) {
     KALDI_ERR << "Error doing Svd";  // This one will be caught.
   }
 #endif
   if (prescale != 1.0) s->Scale(1.0/prescale);
 }

 template<typename Real>
 void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
   try {
     if (num_rows_ >= num_cols_) {
       Matrix<Real> tmp(*this);
       tmp.DestructiveSvd(s, U, Vt);
     } else {
       Matrix<Real> tmp(*this, kTrans);  // transpose of *this.
       // rVt will have different dim so cannot transpose in-place --> use a temp matrix.
       Matrix<Real> Vt_Trans(Vt ? Vt->num_cols_ : 0, Vt ? Vt->num_rows_ : 0);
       // U will be transpose
       tmp.DestructiveSvd(s, Vt ? &Vt_Trans : NULL, U);
       if (U) U->Transpose();
       if (Vt) Vt->CopyFromMat(Vt_Trans, kTrans);  // copy with transpose.
     }
   } catch (...) {
     KALDI_ERR << "Error doing Svd (did not converge), first part of matrix is\n"
               << SubMatrix<Real>(*this, 0, std::min((MatrixIndexT)10, num_rows_),
                                  0, std::min((MatrixIndexT)10, num_cols_))
               << ", min and max are: " << Min() << ", " << Max();
   }
 }

 template<typename Real>
 bool MatrixBase<Real>::IsSymmetric(Real cutoff) const {
   MatrixIndexT R = num_rows_, C = num_cols_;
   if (R != C) return false;
   Real bad_sum = 0.0, good_sum = 0.0;
   for (MatrixIndexT i = 0;i < R;i++) {
     for (MatrixIndexT j = 0;j < i;j++) {
       Real a = (*this)(i, j), b = (*this)(j, i), avg = 0.5*(a+b), diff = 0.5*(a-b);
       good_sum += std::abs(avg); bad_sum += std::abs(diff);
     }
     good_sum += std::abs((*this)(i, i));
   }
   if (bad_sum > cutoff*good_sum) return false;
   return true;
 }

 template<typename Real>
 bool MatrixBase<Real>::IsDiagonal(Real cutoff) const{
   MatrixIndexT R = num_rows_, C = num_cols_;
   Real bad_sum = 0.0, good_sum = 0.0;
   for (MatrixIndexT i = 0;i < R;i++) {
     for (MatrixIndexT j = 0;j < C;j++) {
       if (i == j) good_sum += std::abs((*this)(i, j));
       else bad_sum += std::abs((*this)(i, j));
     }
   }
   return (!(bad_sum > good_sum * cutoff));
 }

 // This does nothing, it's designed to trigger Valgrind errors
 // if any memory is uninitialized.
 template<typename Real>
 void MatrixBase<Real>::TestUninitialized() const {
   MatrixIndexT R = num_rows_, C = num_cols_, positive = 0;
   for (MatrixIndexT i = 0; i < R; i++)
     for (MatrixIndexT j = 0; j < C; j++)
       if ((*this)(i, j) > 0.0) positive++;
   if (positive > R * C)
     KALDI_ERR << "Error....";
 }


 template<typename Real>
 bool MatrixBase<Real>::IsUnit(Real cutoff) const {
   MatrixIndexT R = num_rows_, C = num_cols_;
   Real bad_max = 0.0;
   for (MatrixIndexT i = 0; i < R;i++)
     for (MatrixIndexT j = 0; j < C;j++)
       bad_max = std::max(bad_max, static_cast<Real>(std::abs( (*this)(i, j) - (i == j?1.0:0.0))));
   return (bad_max <= cutoff);
 }

 template<typename Real>
 bool MatrixBase<Real>::IsZero(Real cutoff)const {
   MatrixIndexT R = num_rows_, C = num_cols_;
   Real bad_max = 0.0;
   for (MatrixIndexT i = 0;i < R;i++)
     for (MatrixIndexT j = 0;j < C;j++)
       bad_max = std::max(bad_max, static_cast<Real>(std::abs( (*this)(i, j) )));
   return (bad_max <= cutoff);
 }

 template<typename Real>
 Real MatrixBase<Real>::FrobeniusNorm() const{
   return std::sqrt(TraceMatMat(*this, *this, kTrans));
 }

 template<typename Real>
 bool MatrixBase<Real>::ApproxEqual(const MatrixBase<Real> &other, float tol) const {
   if (num_rows_ != other.num_rows_ || num_cols_ != other.num_cols_)
     KALDI_ERR << "ApproxEqual: size mismatch.";
   Matrix<Real> tmp(*this);
   tmp.AddMat(-1.0, other);
   return (tmp.FrobeniusNorm() <= static_cast<Real>(tol) *
           this->FrobeniusNorm());
 }

 template<typename Real>
 bool MatrixBase<Real>::Equal(const MatrixBase<Real> &other) const {
   if (num_rows_ != other.num_rows_ || num_cols_ != other.num_cols_)
     KALDI_ERR << "Equal: size mismatch.";
   for (MatrixIndexT i = 0; i < num_rows_; i++)
     for (MatrixIndexT j = 0; j < num_cols_; j++)
       if ( (*this)(i, j) != other(i, j))
         return false;
   return true;
 }


 template<typename Real>
 Real MatrixBase<Real>::LargestAbsElem() const{
   MatrixIndexT R = num_rows_, C = num_cols_;
   Real largest = 0.0;
   for (MatrixIndexT i = 0;i < R;i++)
     for (MatrixIndexT j = 0;j < C;j++)
       largest = std::max(largest, (Real)std::abs((*this)(i, j)));
   return largest;
 }


 template<typename Real>
 void MatrixBase<Real>::OrthogonalizeRows() {
   KALDI_ASSERT(NumRows() <= NumCols());
   MatrixIndexT num_rows = num_rows_;
   for (MatrixIndexT i = 0; i < num_rows; i++) {
     int32 counter = 0;
     while (1) {
       Real start_prod = VecVec(this->Row(i), this->Row(i));
       if (start_prod - start_prod != 0.0 || start_prod == 0.0) {
         KALDI_WARN << "Self-product of row " << i << " of matrix is "
                    << start_prod << ", randomizing.";
         this->Row(i).SetRandn();
         counter++;
         continue;  // loop again.
       }
       for (MatrixIndexT j = 0; j < i; j++) {
         Real prod = VecVec(this->Row(i), this->Row(j));
         this->Row(i).AddVec(-prod, this->Row(j));
       }
       Real end_prod = VecVec(this->Row(i), this->Row(i));
       if (end_prod <= 0.01 * start_prod) { // We removed
         // almost all of the vector during orthogonalization,
         // so we have reason to doubt (for roundoff reasons)
         // that it's still orthogonal to the other vectors.
         // We need to orthogonalize again.
         if (end_prod == 0.0) { // Row is exactly zero:
           // generate random direction.
           this->Row(i).SetRandn();
         }
         counter++;
         if (counter > 100)
           KALDI_ERR << "Loop detected while orthogalizing matrix.";
       } else {
         this->Row(i).Scale(1.0 / std::sqrt(end_prod));
         break;
       }
     }
   }
 }


 // Uses Svd to compute the eigenvalue decomposition of a symmetric positive semidefinite
 //   matrix:
 // (*this) = rU * diag(rs) * rU^T, with rU an orthogonal matrix so rU^{-1} = rU^T.
 // Does this by computing svd (*this) = U diag(rs) V^T ... answer is just U diag(rs) U^T.
 // Throws exception if this failed to within supplied precision (typically because *this was not
 // symmetric positive definite).

 template<typename Real>
 void MatrixBase<Real>::SymPosSemiDefEig(VectorBase<Real> *rs, MatrixBase<Real> *rU, Real check_thresh) // e.g. check_thresh = 0.001
 {
   const MatrixIndexT D = num_rows_;

   KALDI_ASSERT(num_rows_ == num_cols_);
   KALDI_ASSERT(IsSymmetric() && "SymPosSemiDefEig: expecting input to be symmetrical.");
   KALDI_ASSERT(rU->num_rows_ == D && rU->num_cols_ == D && rs->Dim() == D);

   Matrix<Real>  Vt(D, D);
   Svd(rs, rU, &Vt);

   // First just zero any singular values if the column of U and V do not have +ve dot product--
   // this may mean we have small negative eigenvalues, and if we zero them the result will be closer to correct.
   for (MatrixIndexT i = 0;i < D;i++) {
     Real sum = 0.0;
     for (MatrixIndexT j = 0;j < D;j++) sum += (*rU)(j, i) * Vt(i, j);
     if (sum < 0.0) (*rs)(i) = 0.0;
   }

   {
     Matrix<Real> tmpU(*rU); Vector<Real> tmps(*rs); tmps.ApplyPow(0.5);
     tmpU.MulColsVec(tmps);
     SpMatrix<Real> tmpThis(D);
     tmpThis.AddMat2(1.0, tmpU, kNoTrans, 0.0);
     Matrix<Real> tmpThisFull(tmpThis);
     float new_norm = tmpThisFull.FrobeniusNorm();
     float old_norm = (*this).FrobeniusNorm();
     tmpThisFull.AddMat(-1.0, (*this));

     if (!(old_norm == 0 && new_norm == 0)) {
       float diff_norm = tmpThisFull.FrobeniusNorm();
       if (std::abs(new_norm-old_norm) > old_norm*check_thresh || diff_norm > old_norm*check_thresh) {
         KALDI_WARN << "SymPosSemiDefEig seems to have failed " << diff_norm << " !<< "
                    << check_thresh << "*" << old_norm << ", maybe matrix was not "
                    << "positive semi definite.  Continuing anyway.";
       }
     }
   }
 }


 template<typename Real>
 Real MatrixBase<Real>::LogDet(Real *det_sign) const {
   Real log_det;
   Matrix<Real> tmp(*this);
   tmp.Invert(&log_det, det_sign, false);  // false== output not needed (saves some computation).
   return log_det;
 }

 template<typename Real>
 void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
                                     bool inverse_needed) {
   double log_det_tmp, det_sign_tmp;
   Matrix<double> dmat(*this);
   dmat.Invert(&log_det_tmp, &det_sign_tmp, inverse_needed);
   if (inverse_needed) (*this).CopyFromMat(dmat);
   if (log_det) *log_det = log_det_tmp;
   if (det_sign) *det_sign = det_sign_tmp;
 }

 template<class Real>
 void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
   mat.CopyToMat(this);
 }

 template<class Real>
 Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
   Resize(M.NumRows(), M.NumCols(), kUndefined);
   M.CopyToMat(this);
 }


 template<typename Real>
 void MatrixBase<Real>::InvertElements() {
   for (MatrixIndexT r = 0; r < num_rows_; r++) {
     for (MatrixIndexT c = 0; c < num_cols_; c++) {
       (*this)(r, c) = static_cast<Real>(1.0 / (*this)(r, c));
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Transpose() {
   KALDI_ASSERT(num_rows_ == num_cols_);
   MatrixIndexT M = num_rows_;
   for (MatrixIndexT i = 0;i < M;i++)
     for (MatrixIndexT j = 0;j < i;j++) {
       Real &a = (*this)(i, j), &b = (*this)(j, i);
       std::swap(a, b);
     }
 }


 template<typename Real>
 void Matrix<Real>::Transpose() {
   if (this->num_rows_ != this->num_cols_) {
     Matrix<Real> tmp(*this, kTrans);
     Resize(this->num_cols_, this->num_rows_);
     this->CopyFromMat(tmp);
   } else {
     (static_cast<MatrixBase<Real>&>(*this)).Transpose();
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Heaviside(const MatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
       row_data[col] = (src_row_data[col] > 0 ? 1.0 : 0.0);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Exp(const MatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
       row_data[col] = kaldi::Exp(src_row_data[col]);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Pow(const MatrixBase<Real> &src, Real power) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++) {
       row_data[col] = pow(src_row_data[col], power);
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::PowAbs(const MatrixBase<Real> &src, Real power, bool include_sign) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col ++) {
       if (include_sign == true && src_row_data[col] < 0) {
         row_data[col] = -pow(std::abs(src_row_data[col]), power);
       } else {
         row_data[col] = pow(std::abs(src_row_data[col]), power);
       }
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Floor(const MatrixBase<Real> &src, Real floor_val) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
       row_data[col] = (src_row_data[col] < floor_val ? floor_val : src_row_data[col]);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Ceiling(const MatrixBase<Real> &src, Real ceiling_val) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
       row_data[col] = (src_row_data[col] > ceiling_val ? ceiling_val : src_row_data[col]);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Log(const MatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
       row_data[col] = kaldi::Log(src_row_data[col]);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::ExpSpecial(const MatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++)
       row_data[col] = (src_row_data[col] < Real(0) ? kaldi::Exp(src_row_data[col]) : (src_row_data[col] + Real(1)));
   }
 }

 template<typename Real>
 void MatrixBase<Real>::ExpLimited(const MatrixBase<Real> &src, Real lower_limit, Real upper_limit) {
   KALDI_ASSERT(SameDim(*this, src));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
   Real *row_data = data_;
   const Real *src_row_data = src.Data();
   for (MatrixIndexT row = 0; row < num_rows;
        row++,row_data += stride_, src_row_data += src.stride_) {
     for (MatrixIndexT col = 0; col < num_cols; col++) {
       const Real x = src_row_data[col];
       if (!(x >= lower_limit))
         row_data[col] = kaldi::Exp(lower_limit);
       else if (x > upper_limit)
         row_data[col] = kaldi::Exp(upper_limit);
       else
         row_data[col] = kaldi::Exp(x);
     }
   }
 }

 template<typename Real>
 bool MatrixBase<Real>::Power(Real power) {
   KALDI_ASSERT(num_rows_ > 0 && num_rows_ == num_cols_);
   MatrixIndexT n = num_rows_;
   Matrix<Real> P(n, n);
   Vector<Real> re(n), im(n);
   this->Eig(&P, &re, &im);
   // Now attempt to take the complex eigenvalues to this power.
   for (MatrixIndexT i = 0; i < n; i++)
     if (!AttemptComplexPower(&(re(i)), &(im(i)), power))
       return false;  // e.g. real and negative, or zero, eigenvalues.

   Matrix<Real> D(n, n);  // D to the power.
   CreateEigenvalueMatrix(re, im, &D);

   Matrix<Real> tmp(n, n);  // P times D
   tmp.AddMatMat(1.0, P, kNoTrans, D, kNoTrans, 0.0);  // tmp := P*D
   P.Invert();
   // next line is: *this = tmp * P^{-1} = P * D * P^{-1}
   (*this).AddMatMat(1.0, tmp, kNoTrans, P, kNoTrans, 0.0);
   return true;
 }

 template<typename Real>
 void Matrix<Real>::Swap(Matrix<Real> *other) {
   std::swap(this->data_, other->data_);
   std::swap(this->num_cols_, other->num_cols_);
   std::swap(this->num_rows_, other->num_rows_);
   std::swap(this->stride_, other->stride_);
 }

 // Repeating this comment that appeared in the header:
 // Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
 // P^{-1}.  Be careful: the relationship of D to the eigenvalues we output is
 // slightly complicated, due to the need for P to be real.  In the symmetric
 // case D is diagonal and real, but in
 // the non-symmetric case there may be complex-conjugate pairs of eigenvalues.
 // In this case, for the equation (*this) = P D P^{-1} to hold, D must actually
 // be block diagonal, with 2x2 blocks corresponding to any such pairs.  If a
 // pair is lambda +- i*mu, D will have a corresponding 2x2 block
 // [lambda, mu; -mu, lambda].
 // Note that if the input matrix (*this) is non-invertible, P may not be invertible
 // so in this case instead of the equation (*this) = P D P^{-1} holding, we have
 // instead (*this) P = P D.
 //
 // By making the pointer arguments non-NULL or NULL, the user can choose to take
 // not to take the eigenvalues directly, and/or the matrix D which is block-diagonal
 // with 2x2 blocks.
 template<typename Real>
 void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
                            VectorBase<Real> *r,
                            VectorBase<Real> *i) const {
   EigenvalueDecomposition<Real>  eig(*this);
   if (P) eig.GetV(P);
   if (r) eig.GetRealEigenvalues(r);
   if (i) eig.GetImagEigenvalues(i);
 }


 // Begin non-member function definitions.

 //  /**
 //   * @brief Extension of the HTK header
 //  */
 // struct HtkHeaderExt
 //  {
 // INT_32 mHeaderSize;
 // INT_32 mVersion;
 // INT_32 mSampSize;
 // };

 template<typename Real>
 bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
 {
   // check instantiated with double or float.
   KALDI_ASSERT_IS_FLOATING_TYPE(Real);
   Matrix<Real> &M = *M_ptr;
   HtkHeader htk_hdr;

   // TODO(arnab): this fails if the HTK file has CRC cheksum or is compressed.
   is.read((char*)&htk_hdr, sizeof(htk_hdr));  // we're being really POSIX here!
   if (is.fail()) {
     KALDI_WARN << "Could not read header from HTK feature file ";
     return false;
   }

   KALDI_SWAP4(htk_hdr.mNSamples);
   KALDI_SWAP4(htk_hdr.mSamplePeriod);
   KALDI_SWAP2(htk_hdr.mSampleSize);
   KALDI_SWAP2(htk_hdr.mSampleKind);

   bool has_checksum = false;
   {
     // See HParm.h in HTK code for sources of these things.
     enum BaseParmKind{
       Waveform, Lpc, Lprefc, Lpcepstra, Lpdelcep,
       Irefc, Mfcc, Fbank, Melspec, User, Discrete, Plp, Anon };

     const int32 IsCompressed = 02000, HasChecksum = 010000, HasVq = 040000,
         Problem = IsCompressed | HasVq;
     int32 base_parm = htk_hdr.mSampleKind & (077);
     has_checksum = (base_parm & HasChecksum) != 0;
     htk_hdr.mSampleKind &= ~HasChecksum; // We don't support writing with
                                          // checksum so turn it off.
     if (htk_hdr.mSampleKind & Problem)
       KALDI_ERR << "Code to read HTK features does not support compressed "
           "features, or features with VQ.";
     if (base_parm == Waveform || base_parm == Irefc || base_parm == Discrete)
       KALDI_ERR << "Attempting to read HTK features from unsupported type "
           "(e.g. waveform or discrete features.";
   }

   KALDI_VLOG(3) << "HTK header: Num Samples: " << htk_hdr.mNSamples
                 << "; Sample period: " << htk_hdr.mSamplePeriod
                 << "; Sample size: " << htk_hdr.mSampleSize
                 << "; Sample kind: " << htk_hdr.mSampleKind;

   M.Resize(htk_hdr.mNSamples, htk_hdr.mSampleSize / sizeof(float));

   MatrixIndexT i;
   MatrixIndexT j;
   if (sizeof(Real) == sizeof(float)) {
     for (i = 0; i< M.NumRows(); i++) {
       is.read((char*)M.RowData(i), sizeof(float)*M.NumCols());
       if (is.fail()) {
         KALDI_WARN << "Could not read data from HTK feature file ";
         return false;
       }
       if (MachineIsLittleEndian()) {
         MatrixIndexT C = M.NumCols();
         for (j = 0; j < C; j++) {
           KALDI_SWAP4((M(i, j)));  // The HTK standard is big-endian!
         }
       }
     }
   } else {
     float *pmem = new float[M.NumCols()];
     for (i = 0; i < M.NumRows(); i++) {
       is.read((char*)pmem, sizeof(float)*M.NumCols());
       if (is.fail()) {
         KALDI_WARN << "Could not read data from HTK feature file ";
         delete [] pmem;
         return false;
       }
       MatrixIndexT C = M.NumCols();
       for (j = 0; j < C; j++) {
         if (MachineIsLittleEndian())  // HTK standard is big-endian!
           KALDI_SWAP4(pmem[j]);
         M(i, j) = static_cast<Real>(pmem[j]);
       }
     }
     delete [] pmem;
   }
   if (header_ptr) *header_ptr = htk_hdr;
   if (has_checksum) {
     int16 checksum;
     is.read((char*)&checksum, sizeof(checksum));
     if (is.fail())
       KALDI_WARN << "Could not read checksum from HTK feature file ";
     // We ignore the checksum.
   }
   return true;
 }


 template
 bool ReadHtk(std::istream &is, Matrix<float> *M, HtkHeader *header_ptr);

 template
 bool ReadHtk(std::istream &is, Matrix<double> *M, HtkHeader *header_ptr);

 template<typename Real>
 bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr) // header may be derived from a previous call to ReadHtk.  Must be in binary mode.
 {
   KALDI_ASSERT(M.NumRows() == static_cast<MatrixIndexT>(htk_hdr.mNSamples));
   KALDI_ASSERT(M.NumCols() == static_cast<MatrixIndexT>(htk_hdr.mSampleSize) /
                static_cast<MatrixIndexT>(sizeof(float)));

   KALDI_SWAP4(htk_hdr.mNSamples);
   KALDI_SWAP4(htk_hdr.mSamplePeriod);
   KALDI_SWAP2(htk_hdr.mSampleSize);
   KALDI_SWAP2(htk_hdr.mSampleKind);

   os.write((char*)&htk_hdr, sizeof(htk_hdr));
   if (os.fail())  goto bad;

   MatrixIndexT i;
   MatrixIndexT j;
   if (sizeof(Real) == sizeof(float) && !MachineIsLittleEndian()) {
     for (i = 0; i< M.NumRows(); i++) {  // Unlikely to reach here ever!
       os.write((char*)M.RowData(i), sizeof(float)*M.NumCols());
       if (os.fail()) goto bad;
     }
   } else {
     float *pmem = new float[M.NumCols()];

     for (i = 0; i < M.NumRows(); i++) {
       const Real *rowData = M.RowData(i);
       for (j = 0;j < M.NumCols();j++)
         pmem[j] =  static_cast<float> ( rowData[j] );
       if (MachineIsLittleEndian())
         for (j = 0;j < M.NumCols();j++)
           KALDI_SWAP4(pmem[j]);
       os.write((char*)pmem, sizeof(float)*M.NumCols());
       if (os.fail()) {
         delete [] pmem;
         goto bad;
       }
     }
     delete [] pmem;
   }
   return true;
 bad:
   KALDI_WARN << "Could not write to HTK feature file ";
   return false;
 }

 template
 bool WriteHtk(std::ostream &os, const MatrixBase<float> &M, HtkHeader htk_hdr);

 template
 bool WriteHtk(std::ostream &os, const MatrixBase<double> &M, HtkHeader htk_hdr);

 template<class Real>
 bool WriteSphinx(std::ostream &os, const MatrixBase<Real> &M)
 {
   // CMUSphinx mfc file header contains count of the floats, followed
   // by the data in float little endian format.

   int size = M.NumRows() * M.NumCols();
   os.write((char*)&size, sizeof(int));
   if (os.fail())  goto bad;

   MatrixIndexT i;
   MatrixIndexT j;
   if (sizeof(Real) == sizeof(float) && MachineIsLittleEndian()) {
     for (i = 0; i< M.NumRows(); i++) {  // Unlikely to reach here ever!
       os.write((char*)M.RowData(i), sizeof(float)*M.NumCols());
       if (os.fail()) goto bad;
     }
   } else {
     float *pmem = new float[M.NumCols()];

     for (i = 0; i < M.NumRows(); i++) {
       const Real *rowData = M.RowData(i);
       for (j = 0;j < M.NumCols();j++)
         pmem[j] =  static_cast<float> ( rowData[j] );
       if (!MachineIsLittleEndian())
         for (j = 0;j < M.NumCols();j++)
           KALDI_SWAP4(pmem[j]);
       os.write((char*)pmem, sizeof(float)*M.NumCols());
       if (os.fail()) {
         delete [] pmem;
         goto bad;
       }
     }
     delete [] pmem;
   }
   return true;
 bad:
   KALDI_WARN << "Could not write to Sphinx feature file";
   return false;
 }

 template
 bool WriteSphinx(std::ostream &os, const MatrixBase<float> &M);

 template
 bool WriteSphinx(std::ostream &os, const MatrixBase<double> &M);

 template <typename Real>
 Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                     const MatrixBase<Real> &B, MatrixTransposeType transB,
                     const MatrixBase<Real> &C, MatrixTransposeType transC) {
   MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(), BRows = B.NumRows(), BCols = B.NumCols(),
       CRows = C.NumRows(), CCols = C.NumCols();
   if (transA == kTrans) std::swap(ARows, ACols);
   if (transB == kTrans) std::swap(BRows, BCols);
   if (transC == kTrans) std::swap(CRows, CCols);
   KALDI_ASSERT( CCols == ARows && ACols == BRows && BCols == CRows && "TraceMatMatMat: args have mismatched dimensions.");
   if (ARows*BCols < std::min(BRows*CCols, CRows*ACols)) {
     Matrix<Real> AB(ARows, BCols);
     AB.AddMatMat(1.0, A, transA, B, transB, 0.0);  // AB = A * B.
     return TraceMatMat(AB, C, transC);
   } else if ( BRows*CCols < CRows*ACols) {
     Matrix<Real> BC(BRows, CCols);
     BC.AddMatMat(1.0, B, transB, C, transC, 0.0);  // BC = B * C.
     return TraceMatMat(BC, A, transA);
   } else {
     Matrix<Real> CA(CRows, ACols);
     CA.AddMatMat(1.0, C, transC, A, transA, 0.0);  // CA = C * A
     return TraceMatMat(CA, B, transB);
   }
 }

 template
 float TraceMatMatMat(const MatrixBase<float> &A, MatrixTransposeType transA,
                      const MatrixBase<float> &B, MatrixTransposeType transB,
                      const MatrixBase<float> &C, MatrixTransposeType transC);

 template
 double TraceMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
                       const MatrixBase<double> &B, MatrixTransposeType transB,
                       const MatrixBase<double> &C, MatrixTransposeType transC);


 template <typename Real>
 Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
                        const MatrixBase<Real> &B, MatrixTransposeType transB,
                        const MatrixBase<Real> &C, MatrixTransposeType transC,
                        const MatrixBase<Real> &D, MatrixTransposeType transD) {
   MatrixIndexT ARows = A.NumRows(), ACols = A.NumCols(), BRows = B.NumRows(), BCols = B.NumCols(),
       CRows = C.NumRows(), CCols = C.NumCols(), DRows = D.NumRows(), DCols = D.NumCols();
   if (transA == kTrans) std::swap(ARows, ACols);
   if (transB == kTrans) std::swap(BRows, BCols);
   if (transC == kTrans) std::swap(CRows, CCols);
   if (transD == kTrans) std::swap(DRows, DCols);
   KALDI_ASSERT( DCols == ARows && ACols == BRows && BCols == CRows && CCols == DRows && "TraceMatMatMat: args have mismatched dimensions.");
   if (ARows*BCols < std::min(BRows*CCols, std::min(CRows*DCols, DRows*ACols))) {
     Matrix<Real> AB(ARows, BCols);
     AB.AddMatMat(1.0, A, transA, B, transB, 0.0);  // AB = A * B.
     return TraceMatMatMat(AB, kNoTrans, C, transC, D, transD);
   } else if ((BRows*CCols) < std::min(CRows*DCols, DRows*ACols)) {
     Matrix<Real> BC(BRows, CCols);
     BC.AddMatMat(1.0, B, transB, C, transC, 0.0);  // BC = B * C.
     return TraceMatMatMat(BC, kNoTrans, D, transD, A, transA);
   } else if (CRows*DCols < DRows*ACols) {
     Matrix<Real> CD(CRows, DCols);
     CD.AddMatMat(1.0, C, transC, D, transD, 0.0);  // CD = C * D
     return TraceMatMatMat(CD, kNoTrans, A, transA, B, transB);
   } else {
     Matrix<Real> DA(DRows, ACols);
     DA.AddMatMat(1.0, D, transD, A, transA, 0.0);  // DA = D * A
     return TraceMatMatMat(DA, kNoTrans, B, transB, C, transC);
   }
 }

 template
 float TraceMatMatMatMat(const MatrixBase<float> &A, MatrixTransposeType transA,
                         const MatrixBase<float> &B, MatrixTransposeType transB,
                         const MatrixBase<float> &C, MatrixTransposeType transC,
                         const MatrixBase<float> &D, MatrixTransposeType transD);

 template
 double TraceMatMatMatMat(const MatrixBase<double> &A, MatrixTransposeType transA,
                          const MatrixBase<double> &B, MatrixTransposeType transB,
                          const MatrixBase<double> &C, MatrixTransposeType transC,
                          const MatrixBase<double> &D, MatrixTransposeType transD);

 template<typename Real> void  SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
                                    MatrixBase<Real> *Vt, bool sort_on_absolute_value) {
   MatrixIndexT num_singval = s->Dim();
   KALDI_ASSERT(U == NULL || U->NumCols() == num_singval);
   KALDI_ASSERT(Vt == NULL || Vt->NumRows() == num_singval);

   std::vector<std::pair<Real, MatrixIndexT> > vec(num_singval);
   // negative because we want revese order.
   for (MatrixIndexT d = 0; d < num_singval; d++) {
     Real val = (*s)(d),
         sort_val = -(sort_on_absolute_value ? std::abs(val) : val);
     vec[d] = std::pair<Real, MatrixIndexT>(sort_val, d);
   }
   std::sort(vec.begin(), vec.end());
   Vector<Real> s_copy(*s);
   for (MatrixIndexT d = 0; d < num_singval; d++)
     (*s)(d) = s_copy(vec[d].second);
   if (U != NULL) {
     Matrix<Real> Utmp(*U);
     MatrixIndexT dim = Utmp.NumRows();
     for (MatrixIndexT d = 0; d < num_singval; d++) {
       MatrixIndexT oldidx = vec[d].second;
       for (MatrixIndexT e = 0; e < dim; e++)
         (*U)(e, d) = Utmp(e, oldidx);
     }
   }
   if (Vt != NULL) {
     Matrix<Real> Vttmp(*Vt);
     for (MatrixIndexT d = 0; d < num_singval; d++)
       (*Vt).Row(d).CopyFromVec(Vttmp.Row(vec[d].second));
   }
 }

 template
 void SortSvd(VectorBase<float> *s, MatrixBase<float> *U,
              MatrixBase<float> *Vt, bool);

 template
 void SortSvd(VectorBase<double> *s, MatrixBase<double> *U,
              MatrixBase<double> *Vt, bool);

 template<typename Real>
 void CreateEigenvalueMatrix(const VectorBase<Real> &re, const VectorBase<Real> &im,
                             MatrixBase<Real> *D) {
   MatrixIndexT n = re.Dim();
   KALDI_ASSERT(im.Dim() == n && D->NumRows() == n && D->NumCols() == n);

   MatrixIndexT j = 0;
   D->SetZero();
   while (j < n) {
     if (im(j) == 0) {  // Real eigenvalue
       (*D)(j, j) = re(j);
       j++;
     } else {  // First of a complex pair
       KALDI_ASSERT(j+1 < n && ApproxEqual(im(j+1), -im(j))
                    && ApproxEqual(re(j+1), re(j)));
       Real lambda = re(j), mu = im(j);
       // create 2x2 block [lambda, mu; -mu, lambda]
       (*D)(j, j) = lambda;
       (*D)(j, j+1) = mu;
       (*D)(j+1, j) = -mu;
       (*D)(j+1, j+1) = lambda;
       j += 2;
     }
   }
 }

 template
 void CreateEigenvalueMatrix(const VectorBase<float> &re, const VectorBase<float> &im,
                             MatrixBase<float> *D);
 template
 void CreateEigenvalueMatrix(const VectorBase<double> &re, const VectorBase<double> &im,
                             MatrixBase<double> *D);


 template<typename Real>
 bool AttemptComplexPower(Real *x_re, Real *x_im, Real power) {
   // Used in Matrix<Real>::Power().
   // Attempts to take the complex value x to the power "power",
   // assuming that power is fractional (i.e. we don't treat integers as a
   // special case).  Returns false if this is not possible, either
   // because x is negative and real (hence there is no obvious answer
   // that is "closest to 1", and anyway this case does not make sense
   // in the Matrix<Real>::Power() routine);
   // or because power is negative, and x is zero.

   // First solve for r and theta in
   // x_re = r*cos(theta), x_im = r*sin(theta)
   if (*x_re < 0.0 && *x_im == 0.0) return false;  // can't do
   // it for negative real values.
   Real r = std::sqrt((*x_re * *x_re) + (*x_im * *x_im));  // r == radius.
   if (power < 0.0 && r == 0.0) return false;
   Real theta = std::atan2(*x_im, *x_re);
   // Take the power.
   r = std::pow(r, power);
   theta *= power;
   *x_re = r * std::cos(theta);
   *x_im = r * std::sin(theta);
   return true;
 }

 template
 bool AttemptComplexPower(float *x_re, float *x_im, float power);
 template
 bool AttemptComplexPower(double *x_re, double *x_im, double power);


 template <typename Real>
 Real TraceMatMat(const MatrixBase<Real> &A,
                   const MatrixBase<Real> &B,
                   MatrixTransposeType trans) {  // tr(A B), equivalent to sum of each element of A times same element in B'
   MatrixIndexT aStride = A.stride_, bStride = B.stride_;
   if (trans == kNoTrans) {
     KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());
     Real ans = 0.0;
     Real *adata = A.data_, *bdata = B.data_;
     MatrixIndexT arows = A.NumRows(), acols = A.NumCols();
     for (MatrixIndexT row = 0;row < arows;row++, adata+=aStride, bdata++)
       ans += cblas_Xdot(acols, adata, 1, bdata, bStride);
     return ans;
   } else {
     KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
     Real ans = 0.0;
     Real *adata = A.data_, *bdata = B.data_;
     MatrixIndexT arows = A.NumRows(), acols = A.NumCols();
     for (MatrixIndexT row = 0;row < arows;row++, adata+=aStride, bdata+=bStride)
       ans += cblas_Xdot(acols, adata, 1, bdata, 1);
     return ans;
   }
 }


 // Instantiate the template above for float and double.
 template
 float TraceMatMat(const MatrixBase<float> &A,
                   const MatrixBase<float> &B,
                   MatrixTransposeType trans);
 template
 double TraceMatMat(const MatrixBase<double> &A,
                   const MatrixBase<double> &B,
                   MatrixTransposeType trans);


 template<typename Real>
 Real MatrixBase<Real>::LogSumExp(Real prune) const {
   Real sum;
   if (sizeof(sum) == 8) sum = kLogZeroDouble;
   else sum = kLogZeroFloat;
   Real max_elem = Max(), cutoff;
   if (sizeof(Real) == 4) cutoff = max_elem + kMinLogDiffFloat;
   else cutoff = max_elem + kMinLogDiffDouble;
   if (prune > 0.0 && max_elem - prune > cutoff) // explicit pruning...
     cutoff = max_elem - prune;

   double sum_relto_max_elem = 0.0;

   for (MatrixIndexT i = 0; i < num_rows_; i++) {
     for (MatrixIndexT j = 0; j < num_cols_; j++) {
       BaseFloat f = (*this)(i, j);
       if (f >= cutoff)
         sum_relto_max_elem += kaldi::Exp(f - max_elem);
     }
   }
   return max_elem + kaldi::Log(sum_relto_max_elem);
 }

 template<typename Real>
 Real MatrixBase<Real>::ApplySoftMax() {
   Real max = this->Max(), sum = 0.0;
   // the 'max' helps to get in good numeric range.
   for (MatrixIndexT i = 0; i < num_rows_; i++)
     for (MatrixIndexT j = 0; j < num_cols_; j++)
       sum += ((*this)(i, j) = kaldi::Exp((*this)(i, j) - max));
   this->Scale(1.0 / sum);
   return max + kaldi::Log(sum);
 }

 template<typename Real>
 void MatrixBase<Real>::Tanh(const MatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));

   if (num_cols_ == stride_ && src.num_cols_ == src.stride_) {
     SubVector<Real> src_vec(src.data_, num_rows_ * num_cols_),
         dst_vec(this->data_, num_rows_ * num_cols_);
     dst_vec.Tanh(src_vec);
   } else {
     for (MatrixIndexT r = 0; r < num_rows_; r++) {
       SubVector<Real> src_vec(src, r), dest_vec(*this, r);
       dest_vec.Tanh(src_vec);
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::SoftHinge(const MatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));
   int32 num_rows = num_rows_, num_cols = num_cols_;
   for (MatrixIndexT r = 0; r < num_rows; r++) {
     Real *row_data = this->RowData(r);
     const Real *src_row_data = src.RowData(r);
     for (MatrixIndexT c = 0; c < num_cols; c++) {
       Real x = src_row_data[c], y;
       if (x > 10.0) y = x; // avoid exponentiating large numbers; function
       // approaches y=x.
       else y = Log1p(kaldi::Exp(x)); // these defined in kaldi-math.h
       row_data[c] = y;
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
   KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
                src.NumRows() == this->NumRows());
   int group_size = src.NumCols() / this->NumCols(),
     num_rows = this->NumRows(), num_cols = this->NumCols();
   for (MatrixIndexT i = 0; i < num_rows; i++)
     for (MatrixIndexT j = 0; j < num_cols; j++)
       (*this)(i, j) = src.Row(i).Range(j * group_size,  group_size).Norm(power);
 }

 template<typename Real>
 void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
   KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
                src.NumRows() == this->NumRows());
   int group_size = src.NumCols() / this->NumCols(),
       num_rows = this->NumRows(), num_cols = this->NumCols();
   for (MatrixIndexT i = 0; i < num_rows; i++) {
     const Real *src_row_data = src.RowData(i);
     for (MatrixIndexT j = 0; j < num_cols; j++) {
       Real max_val = -1e20;
       for (MatrixIndexT k = 0; k < group_size; k++) {
         Real src_data = src_row_data[j * group_size + k];
         if (src_data > max_val)
           max_val = src_data;
       }
       (*this)(i, j) = max_val;
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
                                 const MatrixIndexT *indices) {
   KALDI_ASSERT(NumRows() == src.NumRows());
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       this_stride = stride_, src_stride = src.stride_;
   Real *this_data = this->data_;
   const Real *src_data = src.data_;
 #ifdef KALDI_PARANOID
   MatrixIndexT src_cols = src.NumCols();
   for (MatrixIndexT i = 0; i < num_cols; i++)
     KALDI_ASSERT(indices[i] >= -1 && indices[i] < src_cols);
 #endif

   // For the sake of memory locality we do this row by row, rather
   // than doing it column-wise using cublas_Xcopy
   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
     const MatrixIndexT *index_ptr = &(indices[0]);
     for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
       if (*index_ptr < 0) this_data[c] = 0;
       else this_data[c] = src_data[*index_ptr];
     }
   }
 }


 template<typename Real>
 void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
                                const MatrixIndexT *indices) {
   KALDI_ASSERT(NumRows() == src.NumRows());
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       this_stride = stride_, src_stride = src.stride_;
   Real *this_data = this->data_;
   const Real *src_data = src.data_;
 #ifdef KALDI_PARANOID
   MatrixIndexT src_cols = src.NumCols();
   for (MatrixIndexT i = 0; i < num_cols; i++)
     KALDI_ASSERT(indices[i] >= -1 && indices[i] < src_cols);
 #endif

   // For the sake of memory locality we do this row by row, rather
   // than doing it column-wise using cublas_Xcopy
   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
     const MatrixIndexT *index_ptr = &(indices[0]);
     for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
       if (*index_ptr >= 0)
         this_data[c] += src_data[*index_ptr];
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
                                 const MatrixIndexT *indices) {
   KALDI_ASSERT(NumCols() == src.NumCols());
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       this_stride = stride_;
   Real *this_data = this->data_;

   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
     MatrixIndexT index = indices[r];
     if (index < 0) memset(this_data, 0, sizeof(Real) * num_cols_);
     else cblas_Xcopy(num_cols, src.RowData(index), 1, this_data, 1);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::CopyRows(const Real *const *src) {
   MatrixIndexT num_rows = num_rows_,
       num_cols = num_cols_, this_stride = stride_;
   Real *this_data = this->data_;

   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
     const Real *const src_data = src[r];
     if (src_data == NULL) memset(this_data, 0, sizeof(Real) * num_cols);
     else cblas_Xcopy(num_cols, src_data, 1, this_data, 1);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::CopyToRows(Real *const *dst) const {
   MatrixIndexT num_rows = num_rows_,
       num_cols = num_cols_, this_stride = stride_;
   const Real *this_data = this->data_;

   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
     Real *const dst_data = dst[r];
     if (dst_data != NULL)
       cblas_Xcopy(num_cols, this_data, 1, dst_data, 1);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddRows(Real alpha,
                                const MatrixBase<Real> &src,
                                const MatrixIndexT *indexes) {
   KALDI_ASSERT(NumCols() == src.NumCols());
   MatrixIndexT num_rows = num_rows_,
       num_cols = num_cols_, this_stride = stride_;
   Real *this_data = this->data_;

   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
     MatrixIndexT index = indexes[r];
     KALDI_ASSERT(index >= -1 && index < src.NumRows());
     if (index != -1)
       cblas_Xaxpy(num_cols, alpha, src.RowData(index), 1, this_data, 1);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddRows(Real alpha, const Real *const *src) {
   MatrixIndexT num_rows = num_rows_,
       num_cols = num_cols_, this_stride = stride_;
   Real *this_data = this->data_;

   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
     const Real *const src_data = src[r];
     if (src_data != NULL)
       cblas_Xaxpy(num_cols, alpha, src_data, 1, this_data, 1);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddToRows(Real alpha,
                                  const MatrixIndexT *indexes,
                                  MatrixBase<Real> *dst) const {
   KALDI_ASSERT(NumCols() == dst->NumCols());
   MatrixIndexT num_rows = num_rows_,
       num_cols = num_cols_, this_stride = stride_;
   Real *this_data = this->data_;

   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
     MatrixIndexT index = indexes[r];
     KALDI_ASSERT(index >= -1 && index < dst->NumRows());
     if (index != -1)
       cblas_Xaxpy(num_cols, alpha, this_data, 1, dst->RowData(index), 1);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::AddToRows(Real alpha, Real *const *dst) const {
   MatrixIndexT num_rows = num_rows_,
       num_cols = num_cols_, this_stride = stride_;
   const Real *this_data = this->data_;

   for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride) {
     Real *const dst_data = dst[r];
     if (dst_data != NULL)
       cblas_Xaxpy(num_cols, alpha, this_data, 1, dst_data, 1);
   }
 }

 template<typename Real>
 void MatrixBase<Real>::Sigmoid(const MatrixBase<Real> &src) {
   KALDI_ASSERT(SameDim(*this, src));

   if (num_cols_ == stride_ && src.num_cols_ == src.stride_) {
     SubVector<Real> src_vec(src.data_, num_rows_ * num_cols_),
         dst_vec(this->data_, num_rows_ * num_cols_);
     dst_vec.Sigmoid(src_vec);
   } else {
     for (MatrixIndexT r = 0; r < num_rows_; r++) {
       SubVector<Real> src_vec(src, r), dest_vec(*this, r);
       dest_vec.Sigmoid(src_vec);
     }
   }
 }

 template<typename Real>
 void MatrixBase<Real>::DiffSigmoid(const MatrixBase<Real> &value,
                                    const MatrixBase<Real> &diff) {
   KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       stride = stride_, value_stride = value.stride_, diff_stride = diff.stride_;
   Real *data = data_;
   const Real *value_data = value.data_, *diff_data = diff.data_;
   for (MatrixIndexT r = 0; r < num_rows; r++) {
     for (MatrixIndexT c = 0; c < num_cols; c++)
       data[c] = diff_data[c] * value_data[c] * (1.0 - value_data[c]);
     data += stride;
     value_data += value_stride;
     diff_data += diff_stride;
   }
 }

 template<typename Real>
 void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
                                    const MatrixBase<Real> &diff) {
   KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
   MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       stride = stride_, value_stride = value.stride_, diff_stride = diff.stride_;
   Real *data = data_;
   const Real *value_data = value.data_, *diff_data = diff.data_;
   for (MatrixIndexT r = 0; r < num_rows; r++) {
     for (MatrixIndexT c = 0; c < num_cols; c++)
       data[c] = diff_data[c] * (1.0 - (value_data[c] * value_data[c]));
     data += stride;
     value_data += value_stride;
     diff_data += diff_stride;
   }
 }


 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
   const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       stride = stride_;
   KALDI_ASSERT(v.Dim() == num_cols);
   if(num_cols <= 64) {
     Real *data = data_;
     const OtherReal *vdata = v.Data();
     for (MatrixIndexT i = 0; i < num_rows; i++, data += stride) {
       for (MatrixIndexT j = 0; j < num_cols; j++)
         data[j] += alpha * vdata[j];
     }

   } else {
     Vector<OtherReal> ones(num_rows);
     ones.Set(1.0);
     this->AddVecVec(alpha, ones, v);
    }
 }

 template void MatrixBase<float>::AddVecToRows(const float alpha,
                                               const VectorBase<float> &v);
 template void MatrixBase<float>::AddVecToRows(const float alpha,
                                               const VectorBase<double> &v);
 template void MatrixBase<double>::AddVecToRows(const double alpha,
                                                const VectorBase<float> &v);
 template void MatrixBase<double>::AddVecToRows(const double alpha,
                                                const VectorBase<double> &v);


 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v) {
   const MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
       stride = stride_;
   KALDI_ASSERT(v.Dim() == num_rows);

   if (num_rows <= 64) {
     Real *data = data_;
     const OtherReal *vdata = v.Data();
     for (MatrixIndexT i = 0; i < num_rows; i++, data += stride) {
       Real to_add = alpha * vdata[i];
       for (MatrixIndexT j = 0; j < num_cols; j++)
         data[j] += to_add;
     }

   } else {
     Vector<OtherReal> ones(num_cols);
     ones.Set(1.0);
     this->AddVecVec(alpha, v, ones);
   }
 }

 template void MatrixBase<float>::AddVecToCols(const float alpha,
                                               const VectorBase<float> &v);
 template void MatrixBase<float>::AddVecToCols(const float alpha,
                                               const VectorBase<double> &v);
 template void MatrixBase<double>::AddVecToCols(const double alpha,
                                                const VectorBase<float> &v);
 template void MatrixBase<double>::AddVecToCols(const double alpha,
                                                const VectorBase<double> &v);

 //Explicit instantiation of the classes
 //Apparently, it seems to be necessary that the instantiation
 //happens at the end of the file. Otherwise, not all the member
 //functions will get instantiated.

 template class Matrix<float>;
 template class Matrix<double>;
 template class MatrixBase<float>;
 template class MatrixBase<double>;
 template class SubMatrix<float>;
 template class SubMatrix<double>;

 } // namespace kaldi
kaldi::CompressedMatrix
Definition: compressed-matrix.h:91

kaldi::SpMatrix::AddMat2
void AddMat2(const Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transM, const Real beta)
rank-N update: if (transM == kNoTrans) (*this) = beta*(*this) + alpha * M * M^T, or (if transM == kTr...
Definition: sp-matrix.cc:1110

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::Exp
double Exp(double x)
Definition: kaldi-math.h:83

kaldi::MatrixBase::Add
void Add(const Real alpha)
Add a scalar to each element.
Definition: kaldi-matrix.cc:1677

kaldi::Matrix::Matrix
Matrix()
Empty constructor.
Definition: kaldi-matrix-inl.h:29

kaldi::MatrixBase::InvertDouble
void InvertDouble(Real *LogDet=NULL, Real *det_sign=NULL, bool inverse_needed=true)
matrix inverse [double].
Definition: kaldi-matrix.cc:2046

kaldi::MatrixBase::Min
Real Min() const
Returns minimum element of matrix.
Definition: kaldi-matrix.cc:1728

kaldi::OtherReal
This class provides a way for switching between double and float types.
Definition: matrix-common.h:84

KALDI_SWAP2
#define KALDI_SWAP2(a)
Definition: kaldi-utils.h:114

kaldi::TraceMatMat
template double TraceMatMat(const MatrixBase< double > &A, const MatrixBase< double > &B, MatrixTransposeType trans)

kaldi::SpMatrix
Packed symetric matrix class.
Definition: matrix-common.h:62

kaldi::MatrixBase::LargestAbsElem
Real LargestAbsElem() const
largest absolute value.
Definition: kaldi-matrix.cc:1937

kaldi::HtkHeader::mSampleSize
int16 mSampleSize
Sample size.
Definition: kaldi-matrix.h:961

kaldi::CreateEigenvalueMatrix
template void CreateEigenvalueMatrix(const VectorBase< double > &re, const VectorBase< double > &im, MatrixBase< double > *D)

kaldi::kUndefined
Definition: matrix-common.h:39

kaldi::cblas_Xsyrk
void cblas_Xsyrk(const MatrixTransposeType trans, const MatrixIndexT dim_c, const MatrixIndexT other_dim_a, const float alpha, const float *A, const MatrixIndexT a_stride, const float beta, float *C, const MatrixIndexT c_stride)
Definition: cblas-wrappers.h:295

kaldi::MatrixBase::IsDiagonal
bool IsDiagonal(Real cutoff=1.0e-05) const
Returns true if matrix is Diagonal.
Definition: kaldi-matrix.cc:1864

kaldi::MatrixResizeType
MatrixResizeType
Definition: matrix-common.h:37

kaldi::Mfcc
OfflineFeatureTpl< MfccComputer > Mfcc
Definition: feature-mfcc.h:147

rnnlm::j
int j
Definition: mikolov-rnnlm-lib.cc:66

kaldi::clapack_Xgetri2
void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, float *p_work, KaldiBlasInt *l_work, KaldiBlasInt *result)
Definition: cblas-wrappers.h:404

sp-matrix.h

kaldi::VectorBase::Sigmoid
void Sigmoid(const VectorBase< Real > &src)
Sets each element of *this to the sigmoid function of the corresponding element of "src"...
Definition: kaldi-vector.cc:938

kaldi::MatrixBase::stride_
MatrixIndexT stride_
< Number of rows
Definition: kaldi-matrix.h:816

kaldi::RandUniform
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
Definition: kaldi-math.h:151

kaldi::MatrixBase::NumCols
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67

kaldi::MatrixBase::Cond
Real Cond() const
Returns condition number by computing Svd.
Definition: kaldi-matrix.cc:1696

kaldi::MatrixBase
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49

kaldi::MatrixBase::Data
const Real * Data() const
Gives pointer to raw data (const).
Definition: kaldi-matrix.h:79

kaldi::ReadBasicType
void ReadBasicType(std::istream &is, bool binary, T *t)
ReadBasicType is the name of the read function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:55

kaldi::SparseVector::GetElement
const std::pair< MatrixIndexT, Real > & GetElement(MatrixIndexT i) const
get an indexed element (0 <= i < NumElements()).
Definition: sparse-matrix.h:77

jama-svd.h

kaldi::VectorBase::Tanh
void Tanh(const VectorBase< Real > &src)
Sets each element of *this to the tanh of the corresponding element of "src".
Definition: kaldi-vector.cc:900

kaldi::MatrixBase::data_
Real * data_
data memory area
Definition: kaldi-matrix.h:808

kaldi::Plp
OfflineFeatureTpl< PlpComputer > Plp
Definition: feature-plp.h:169

kaldi::MatrixBase::Max
Real Max() const
Returns maximum element of matrix.
Definition: kaldi-matrix.cc:1717

kaldi::MatrixBase::ApproxEqual
bool ApproxEqual(const MatrixBase< Real > &other, float tol=0.01) const
Returns true if ((*this)-other).FrobeniusNorm() <= tol * (*this).FrobeniusNorm(). ...
Definition: kaldi-matrix.cc:1915

kaldi::Xgemv_sparsevec
void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows, MatrixIndexT num_cols, Real alpha, const Real *Mdata, MatrixIndexT stride, const Real *xdata, MatrixIndexT incX, Real beta, Real *ydata, MatrixIndexT incY)
Definition: cblas-wrappers.h:199

kaldi::MatrixBase::SymPosSemiDefEig
void SymPosSemiDefEig(VectorBase< Real > *s, MatrixBase< Real > *P, Real check_thresh=0.001)
Uses Svd to compute the eigenvalue decomposition of a symmetric positive semi-definite matrix: (*this...
Definition: kaldi-matrix.cc:1996

kaldi::MatrixBase::Trace
Real Trace(bool check_square=true) const
Returns trace of matrix.
Definition: kaldi-matrix.cc:1709

kaldi-matrix.h

kaldi::MatrixBase::RowData
Real * RowData(MatrixIndexT i)
Returns pointer to data for one row (non-const)
Definition: kaldi-matrix.h:87

kaldi::clapack_Xgetrf2
void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, KaldiBlasInt *result)
Definition: cblas-wrappers.h:392

kaldi::MatrixBase::AddMat
void AddMat(const Real alpha, const MatrixBase< Real > &M, MatrixTransposeType transA=kNoTrans)
*this += alpha * M [or M^T]
Definition: kaldi-matrix.cc:356

kaldi::swap
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
Definition: basic-filebuf.h:275

kaldi::MatrixBase::LapackGesvd
void LapackGesvd(VectorBase< Real > *s, MatrixBase< Real > *U, MatrixBase< Real > *Vt)
Definition: kaldi-matrix.cc:660

kaldi::RandGauss
float RandGauss(struct RandomState *state=NULL)
Definition: kaldi-math.h:155

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

kaldi::MatrixBase::AddToDiag
void AddToDiag(const Real alpha)
Add a scalar to each diagonal element.
Definition: kaldi-matrix.cc:1686

kaldi::ReadToken
void ReadToken(std::istream &is, bool binary, std::string *str)
ReadToken gets the next token and puts it in str (exception on failure).
Definition: io-funcs.cc:154

kaldi::EigenvalueDecomposition
Definition: jama-eig.h:39

kaldi::MatrixBase::Data_workaround
Real * Data_workaround() const
A workaround that allows SubMatrix to get a pointer to non-const data for const Matrix.
Definition: kaldi-matrix.h:803

kaldi::Matrix
A class for storing matrices.
Definition: kaldi-matrix.h:823

kaldi::TraceMatMatMatMat
template double TraceMatMatMatMat(const MatrixBase< double > &A, MatrixTransposeType transA, const MatrixBase< double > &B, MatrixTransposeType transB, const MatrixBase< double > &C, MatrixTransposeType transC, const MatrixBase< double > &D, MatrixTransposeType transD)

kaldi::cblas_Xcopy
void cblas_Xcopy(const int N, const float *X, const int incX, float *Y, const int incY)
Definition: cblas-wrappers.h:37

kaldi::Matrix::Swap
void Swap(Matrix< Real > *other)
Swaps the contents of *this and *other. Shallow swap.
Definition: kaldi-matrix.cc:2255

kaldi::RandomState
Definition: kaldi-math.h:136

kaldi::MatrixBase::CopyFromMat
void CopyFromMat(const MatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given matrix. (no resize is done).
Definition: kaldi-matrix.cc:862

kaldi::PackedMatrix::NumRows
MatrixIndexT NumRows() const
Definition: packed-matrix.h:104

kaldi::clapack_Xgesvd
void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols, KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride, float *sv, float *Vdata, KaldiBlasInt *vstride, float *Udata, KaldiBlasInt *ustride, float *p_work, KaldiBlasInt *l_work, KaldiBlasInt *result)
Definition: cblas-wrappers.h:415

kaldi::CompressedMatrix::Read
void Read(std::istream &is, bool binary)
Definition: compressed-matrix.cc:566

kaldi::UnsignedMatrixIndexT
uint32 UnsignedMatrixIndexT
Definition: matrix-common.h:100

kaldi::mul_elements
void mul_elements(const MatrixIndexT dim, const double *a, double *b)
This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could extend this so...
Definition: cblas-wrappers.h:341

kaldi::Peek
int Peek(std::istream &is, bool binary)
Peek consumes whitespace (if binary == false) and then returns the peek() value of the stream...
Definition: io-funcs.cc:145

data_
uint64 data_
Definition: arpa-lm-compiler.cc:108

kaldi::ReadHtk
template bool ReadHtk(std::istream &is, Matrix< double > *M, HtkHeader *header_ptr)

kaldi::HtkHeader::mSamplePeriod
int32 mSamplePeriod
Sample period.
Definition: kaldi-matrix.h:959

kaldi::TraceMatMatMat
template double TraceMatMatMat(const MatrixBase< double > &A, MatrixTransposeType transA, const MatrixBase< double > &B, MatrixTransposeType transB, const MatrixBase< double > &C, MatrixTransposeType transC)

kaldi::MatrixBase::DestructiveSvd
void DestructiveSvd(VectorBase< Real > *s, MatrixBase< Real > *U, MatrixBase< Real > *Vt)
Singular value decomposition Major limitations: For nonsquare matrices, we assume m>=n (NumRows >= Nu...
Definition: kaldi-matrix.cc:1781

kaldi::kTrans
Definition: matrix-common.h:33

kaldi::SameDim
bool SameDim(const MatrixBase< Real > &M, const MatrixBase< Real > &N)
Definition: kaldi-matrix.h:1111

kaldi::MatrixBase::TestUninitialized
void TestUninitialized() const
Definition: kaldi-matrix.cc:1879

kaldi::cblas_Xdot
float cblas_Xdot(const int N, const float *const X, const int incX, const float *const Y, const int incY)
Definition: cblas-wrappers.h:64

kaldi::MatrixBase::IsZero
bool IsZero(Real cutoff=1.0e-05) const
Returns true if matrix is all zeros.
Definition: kaldi-matrix.cc:1900

kaldi::EigenvalueDecomposition::GetImagEigenvalues
void GetImagEigenvalues(VectorBase< Real > *i_out)
Definition: jama-eig.h:61

kaldi::SparseMatrix::NumCols
MatrixIndexT NumCols() const
Definition: sparse-matrix.cc:297

kaldi::Matrix::Read
void Read(std::istream &in, bool binary, bool add=false)
read from stream.
Definition: kaldi-matrix.cc:1450

kaldi::MatrixBase::IsSymmetric
bool IsSymmetric(Real cutoff=1.0e-05) const
Returns true if matrix is Symmetric.
Definition: kaldi-matrix.cc:1848

kaldi::MatrixBase::Stride
MatrixIndexT Stride() const
Stride (distance in memory between each row). Will be >= NumCols.
Definition: kaldi-matrix.h:70

kaldi::MatrixIndexT
int32 MatrixIndexT
Definition: matrix-common.h:98

kaldi::PackedMatrix::Data
Real * Data()
Definition: packed-matrix.h:102

kaldi::cblas_Xscal
void cblas_Xscal(const int N, const float alpha, float *data, const int inc)
Definition: cblas-wrappers.h:82

kaldi::MatrixBase::Row
const SubVector< Real > Row(MatrixIndexT i) const
Return specific row of matrix [const].
Definition: kaldi-matrix.h:188

kaldi::Log
double Log(double x)
Definition: kaldi-math.h:100

float

kaldi::SparseVector
Definition: sparse-matrix.h:41

kaldi::MatrixBase::num_rows_
MatrixIndexT num_rows_
< Number of columns
Definition: kaldi-matrix.h:813

kaldi::kDefaultStride
Definition: matrix-common.h:45

KALDI_MEMALIGN
#define KALDI_MEMALIGN(align, size, pp_orig)
Definition: kaldi-utils.h:58

kaldi::kMinLogDiffDouble
static const double kMinLogDiffDouble
Definition: kaldi-math.h:124

kaldi::RandGauss2
void RandGauss2(float *a, float *b, RandomState *state)
Definition: kaldi-math.cc:139

rnnlm::n
struct rnnlm::@11::@12 n

kaldi::WriteHtk
template bool WriteHtk(std::ostream &os, const MatrixBase< double > &M, HtkHeader htk_hdr)

kaldi::MatrixStrideType
MatrixStrideType
Definition: matrix-common.h:44

kaldi::Fbank
OfflineFeatureTpl< FbankComputer > Fbank
Definition: feature-fbank.h:143

kaldi::EigenvalueDecomposition::GetV
void GetV(MatrixBase< Real > *V_out)
Definition: jama-eig.h:47

kaldi::MatrixBase::Transpose
void Transpose()
Transpose the matrix.
Definition: kaldi-matrix.cc:2079

kaldi::cblas_Xsymm
void cblas_Xsymm(const float alpha, MatrixIndexT sz, const float *Adata, MatrixIndexT a_stride, const float *Bdata, MatrixIndexT b_stride, const float beta, float *Mdata, MatrixIndexT stride)
Definition: cblas-wrappers.h:256

kaldi::MatrixBase::AddMatMat
void AddMatMat(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const MatrixBase< Real > &B, MatrixTransposeType transB, const Real beta)
Definition: kaldi-matrix.cc:171

kaldi::MachineIsLittleEndian
int MachineIsLittleEndian()
Definition: kaldi-utils.h:83

KALDI_ERR
#define KALDI_ERR
Definition: kaldi-error.h:147

kaldi::kNoTrans
Definition: matrix-common.h:34

KALDI_MEMALIGN_FREE
#define KALDI_MEMALIGN_FREE(x)
Definition: kaldi-utils.h:60

kaldi::HtkHeader::mSampleKind
uint16 mSampleKind
Sample kind.
Definition: kaldi-matrix.h:963

kaldi::MatrixBase::AddMatMatMat
void AddMatMatMat(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const MatrixBase< Real > &B, MatrixTransposeType transB, const MatrixBase< Real > &C, MatrixTransposeType transC, const Real beta)
this <– beta*this + alpha*A*B*C.
Definition: kaldi-matrix.cc:1741

kaldi::TpMatrix
Packed symetric matrix class.
Definition: matrix-common.h:63

kaldi::MatrixBase::num_cols_
MatrixIndexT num_cols_
these attributes store the real matrix size as it is stored in memory including memalignment ...
Definition: kaldi-matrix.h:812

kaldi::SparseVector::NumElements
MatrixIndexT NumElements() const
Returns the number of nonzero elements.
Definition: sparse-matrix.h:74

KALDI_WARN
#define KALDI_WARN
Definition: kaldi-error.h:150

jama-eig.h

kaldi::TraceMatMat
Real TraceMatMat(const MatrixBase< Real > &A, const MatrixBase< Real > &B, MatrixTransposeType trans)
We need to declare this here as it will be a friend function.
Definition: kaldi-matrix.cc:2692

kaldi::VectorBase::Data
Real * Data()
Returns a pointer to the start of the vector&#39;s data.
Definition: kaldi-vector.h:70

kaldi::WriteToken
void WriteToken(std::ostream &os, bool binary, const char *token)
The WriteToken functions are for writing nonempty sequences of non-space characters.
Definition: io-funcs.cc:134

kaldi::cblas_Xgemm
void cblas_Xgemm(const float alpha, MatrixTransposeType transA, const float *Adata, MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride, MatrixTransposeType transB, const float *Bdata, MatrixIndexT b_stride, const float beta, float *Mdata, MatrixIndexT num_rows, MatrixIndexT num_cols, MatrixIndexT stride)
Definition: cblas-wrappers.h:224

kaldi::VectorBase::Dim
MatrixIndexT Dim() const
Returns the dimension of the vector.
Definition: kaldi-vector.h:64

kaldi::MatrixBase::SetZero
void SetZero()
Sets matrix to zero.
Definition: kaldi-matrix.cc:1330

kaldi::VectorBase::Scale
void Scale(Real alpha)
Multiplies all elements by this constant.
Definition: kaldi-vector.cc:963

kaldi::kCopyData
Definition: matrix-common.h:40

kaldi::SparseMatrix
Definition: matrix-common.h:65

kaldi::SortSvd
template void SortSvd(VectorBase< double > *s, MatrixBase< double > *U, MatrixBase< double > *Vt, bool)

kaldi::kMinLogDiffFloat
static const float kMinLogDiffFloat
Definition: kaldi-math.h:125

kaldi::MatrixBase::MulColsVec
void MulColsVec(const VectorBase< Real > &scale)
Equivalent to (*this) = (*this) * diag(scale).
Definition: kaldi-matrix.cc:1319

KALDI_ASSERT_IS_FLOATING_TYPE
#define KALDI_ASSERT_IS_FLOATING_TYPE(F)
Definition: kaldi-utils.h:137

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::EigenvalueDecomposition::GetRealEigenvalues
void GetRealEigenvalues(VectorBase< Real > *r_out)
Definition: jama-eig.h:55

kaldi::Vector
A class representing a vector.
Definition: kaldi-vector.h:406

kaldi::cblas_Xaxpy
void cblas_Xaxpy(const int N, const float alpha, const float *X, const int incX, float *Y, const int incY)
Definition: cblas-wrappers.h:74

kaldi::CompressedMatrix::NumRows
MatrixIndexT NumRows() const
Returns number of rows (or zero for emtpy matrix).
Definition: compressed-matrix.h:146

kaldi::kSetZero
Definition: matrix-common.h:38

KALDI_ASSERT
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

kaldi::MatrixBase::NumRows
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64

kaldi::VectorBase::Set
void Set(Real f)
Set all members of a vector to a specified value.
Definition: kaldi-vector.cc:336

kaldi::MatrixBase::FrobeniusNorm
Real FrobeniusNorm() const
Frobenius norm, which is the sqrt of sum of square elements.
Definition: kaldi-matrix.cc:1910

kaldi::MatrixBase::LogDet
Real LogDet(Real *det_sign=NULL) const
Returns logdet of matrix.
Definition: kaldi-matrix.cc:2038

kaldi::VectorBase::ApplyPow
void ApplyPow(Real power)
Take all elements of vector to a power.
Definition: kaldi-vector.h:179

kaldi::SparseMatrix::NumRows
MatrixIndexT NumRows() const
Definition: sparse-matrix.cc:292

KALDI_STRCASECMP
#define KALDI_STRCASECMP
Definition: kaldi-utils.h:147

kaldi::MatrixBase::AddVecVec
void AddVecVec(const Real alpha, const VectorBase< OtherReal > &a, const VectorBase< OtherReal > &b)
*this += alpha * a * b^T
Definition: kaldi-matrix.cc:129

kaldi::MatrixTransposeType
MatrixTransposeType
Definition: matrix-common.h:32

KALDI_VLOG
#define KALDI_VLOG(v)
Definition: kaldi-error.h:156

kaldi::WriteSphinx
template bool WriteSphinx(std::ostream &os, const MatrixBase< double > &M)

kaldi::MatrixBase::Range
SubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
Return a sub-part of matrix.
Definition: kaldi-matrix.h:202

kaldi::WriteBasicType
void WriteBasicType(std::ostream &os, bool binary, T t)
WriteBasicType is the name of the write function for bool, integer types, and floating-point types...
Definition: io-funcs-inl.h:34

kaldi::MatrixBase::Equal
bool Equal(const MatrixBase< Real > &other) const
Tests for exact equality. It&#39;s usually preferable to use ApproxEqual.
Definition: kaldi-matrix.cc:1925

kaldi::Matrix::Resize
void Resize(const MatrixIndexT r, const MatrixIndexT c, MatrixResizeType resize_type=kSetZero, MatrixStrideType stride_type=kDefaultStride)
Sets matrix to a specified size (zero is OK as long as both r and c are zero).
Definition: kaldi-matrix.cc:819

compressed-matrix.h

kaldi::kLogZeroFloat
const float kLogZeroFloat
Definition: kaldi-math.h:128

kaldi::MatrixBase::Svd
void Svd(VectorBase< Real > *s, MatrixBase< Real > *U, MatrixBase< Real > *Vt) const
Compute SVD (*this) = U diag(s) Vt.
Definition: kaldi-matrix.cc:1825

kaldi::Log1p
double Log1p(double x)
Definition: kaldi-math.h:104

kaldi::MatrixBase::OrthogonalizeRows
void OrthogonalizeRows()
This function orthogonalizes the rows of a matrix using the Gram-Schmidt process. ...
Definition: kaldi-matrix.cc:1948

kaldi::HtkHeader::mNSamples
int32 mNSamples
Number of samples.
Definition: kaldi-matrix.h:957

kaldi::MatrixBase::Invert
void Invert(Real *log_det=NULL, Real *det_sign=NULL, bool inverse_needed=true)
matrix inverse.
Definition: kaldi-matrix.cc:38

sparse-matrix.h

kaldi::VectorBase< float >

kaldi::CompressedMatrix::NumCols
MatrixIndexT NumCols() const
Returns number of columns (or zero for emtpy matrix).
Definition: compressed-matrix.h:150

kaldi::MatrixBase::IsUnit
bool IsUnit(Real cutoff=1.0e-05) const
Returns true if the matrix is all zeros, except for ones on diagonal.
Definition: kaldi-matrix.cc:1890

kaldi::cblas_Xger
void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, float alpha, const float *xdata, MatrixIndexT incX, const float *ydata, MatrixIndexT incY, float *Mdata, MatrixIndexT stride)
Definition: cblas-wrappers.h:275

kaldi::kLogZeroDouble
const double kLogZeroDouble
Definition: kaldi-math.h:129

kaldi::VecVec
Real VecVec(const VectorBase< Real > &a, const VectorBase< Real > &b)
Returns dot product between v1 and v2.
Definition: kaldi-vector.cc:37

kaldi::CompressedMatrix::CopyToMat
void CopyToMat(MatrixBase< Real > *mat, MatrixTransposeType trans=kNoTrans) const
Copies contents to matrix.
Definition: compressed-matrix.cc:614

kaldi::HtkHeader
A structure containing the HTK header.
Definition: kaldi-matrix.h:955

kaldi::SubMatrix
Sub-matrix representation.
Definition: kaldi-matrix.h:988

kaldi::SubVector
Represents a non-allocating general vector which can be defined as a sub-vector of higher-level vecto...
Definition: kaldi-vector.h:501

kaldi::ApproxEqual
static bool ApproxEqual(float a, float b, float relative_tolerance=0.001)
return abs(a - b) <= relative_tolerance * (abs(a)+abs(b)).
Definition: kaldi-math.h:265

KALDI_SWAP4
#define KALDI_SWAP4(a)
Definition: kaldi-utils.h:107

rnnlm::d
double d
Definition: mikolov-rnnlm-lib.cc:64

kaldi::SparseMatrix::Row
const SparseVector< Real > & Row(MatrixIndexT r) const
Definition: sparse-matrix.cc:543

kaldi::AttemptComplexPower
template bool AttemptComplexPower(double *x_re, double *x_im, double power)