Matrix for CUDA computing. More...

#include <matrix-common.h>

Inheritance diagram for CuMatrixBase< Real >:

Collaboration diagram for CuMatrixBase< Real >:

Public Member Functions
void	CopyCols (const CuMatrixBase< Real > &src, const CuArrayBase< MatrixIndexT > &indexes)
	Copies column r from column indexes[r] of src. More...

void	AddCols (const CuMatrixBase< Real > &src, const CuArrayBase< MatrixIndexT > &indices)
	Add column indices[r] of src to column r. More...

void	CopyRows (const CuMatrixBase< Real > &src, const CuArrayBase< MatrixIndexT > &indexes)
	Copies row r from row indexes[r] of src. More...

void	CopyRows (const CuArrayBase< const Real *> &src)
	Copies row r of this matrix from an array of floats at the location given by src[r], where src[r] is assumed to be obtained from the RowData() function of another CuMatrix, or from CuVector::Data() (the point is: the data it points to should be on the GPU if we're using a GPU, and on a CPU otherwise). More...

void	CopyToRows (const CuArrayBase< Real *> &dst) const
	For each row r of this matrix, copies it to the array of floats at the location given by dst[r], where dst[r] is assumed to be obtained from the RowData() function of another CuMatrix, or from CuVector::Data() (i.e. More...

void	AddRows (Real alpha, const CuMatrixBase< Real > &src, const CuArrayBase< MatrixIndexT > &indexes)
	Does for each row r, this.Row(r) += alpha * src.row(indexes[r]). More...

void	MulRows (const CuMatrixBase< Real > &src, const CuArrayBase< MatrixIndexT > &indexes)
	Does for each row r, this.Row(r) *= src.row(indexes[r]), where '*=' is elementwise multiplication. More...

void	AddRows (Real alpha, const CuArrayBase< const Real *> &src)
	Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the beginning of a region of memory representing a vector of floats, of the same length as this.NumCols(). More...

void	AddToRows (Real alpha, const CuArrayBase< MatrixIndexT > &indexes, CuMatrixBase< Real > *dst) const
	For each row i of *this, adds this->Row(i) to dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing. More...

void	AddToRows (Real alpha, const CuArrayBase< Real *> &dst) const
	For each row r of this matrix, adds it (times alpha) to the array of floats at the location given by dst[r], where dst[r] is assumed to be obtained from the RowData() function of another CuMatrix, or from CuVector::Data() (i.e. More...

void	SumColumnRanges (const CuMatrixBase< Real > &src, const CuArrayBase< Int32Pair > &indexes)
	For each row r of this and for each column c, sets (*this)(r, c) to the sum of src(r, j), where j ranges from indexes[c].first through indexes[c].second - 1. More...

void	AddRowRanges (const CuMatrixBase< Real > &src, const CuArrayBase< Int32Pair > &indexes)
	For each row r of this and for each column c, do (*this)(r, c) += src(j, c), where j ranges from indexes[r].first through indexes[r].second - 1. More...

void	AddToDiag (Real value)
	Adds "value" to the diagonal elements of the matrix. More...

MatrixIndexT	NumRows () const
	Dimensions. More...

MatrixIndexT	NumCols () const

MatrixIndexT	Stride () const

::MatrixDim	Dim () const

Real	FrobeniusNorm () const

bool	IsUnit (Real tol=0.001) const

bool	ApproxEqual (const CuMatrixBase< Real > &other, float tol=0.01) const
	True if ((*this)-other).FrobeniusNorm() <= tol * this->FrobeniusNorm() More...

MatrixIndexT	SizeInBytes () const
	Get size of matrix in bytes. More...

template<typename OtherReal >
void	CopyFromMat (const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)

void	CopyFromGeneralMat (const GeneralMatrix &src, MatrixTransposeType trans=kNoTrans)

void	CopyFromMat (const MatrixBase< Real > &src, MatrixTransposeType trans=kNoTrans)

void	CopyFromSp (const CuSpMatrix< Real > &M)

template<typename OtherReal >
void	CopyFromTp (const CuTpMatrix< OtherReal > &M, MatrixTransposeType trans=kNoTrans)

void	CopyRangeFromMatClamped (const CuMatrixBase< Real > &src, int32_t start_range, int32_t end_range, int32_t clamp_low, int32_t clamp_high)

template<typename OtherReal >
void	CopyFromMat (const CuMatrixBase< OtherReal > &M, MatrixTransposeType trans=kNoTrans)

template<typename OtherReal >
void	CopyToMat (MatrixBase< OtherReal > *dst, MatrixTransposeType trans=kNoTrans) const

void	CopyRowsFromVec (const CuVectorBase< Real > &v)
	This function has two modes of operation. More...

void	CopyRowsFromVec (const VectorBase< Real > &v)
	Version of CopyRowsFromVec() that takes a CPU-based vector. More...

void	CopyColsFromVec (const CuVectorBase< Real > &v)
	Copies vector into matrix, column-by-column. More...

void	CopyColFromVec (const CuVectorBase< Real > &v, const MatrixIndexT col)
	Copy vector into specific column of matrix. More...

void	Sigmoid (const CuMatrixBase< Real > &src)
	Set each element to the sigmoid of the corresponding element of "src": element by element, x = 1 / (1 + exp(-x)) More...

void	Heaviside (const CuMatrixBase< Real > &src)
	Set each element to the Heaviside function of the corresponding element of "src", which we define as the function (x > 0 ? 1.0 : 0.0) [note: in general, there are different ways to deal with the situation when x==0. More...

void	Exp (const CuMatrixBase< Real > &src)

void	Log (const CuMatrixBase< Real > &src)

void	Pow (const CuMatrixBase< Real > &src, Real power)

void	PowAbs (const CuMatrixBase< Real > &src, Real power, bool include_sign=false)
	Apply power to the absolute value of each element. More...

void	Floor (const CuMatrixBase< Real > &src, Real floor_val)

void	Ceiling (const CuMatrixBase< Real > &src, Real ceiling_val)

void	ExpLimited (const CuMatrixBase< Real > &src, Real lower_limit, Real upper_limit)
	This is equivalent to running: Floor(src, lower_limit); Ceiling(src, upper_limit); Exp(src) More...

void	ExpSpecial (const CuMatrixBase< Real > &src)
	For each element x of the matrix, set it to (x < 0 ? exp(x) : x + 1). More...

void	SoftMaxPerRow (const CuMatrixBase< Real > &src)
	Softmax nonlinearity Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, with attention to avoiding overflow or underflow. More...

void	LogSoftMaxPerRow (const CuMatrixBase< Real > &src)
	LogSoftmax nonlinearity Y = LogSoftmax(X) : Yij = Xij - log(sum_k(e^Xik)), done to each row, with attention to avoiding overflow or underflow. More...

void	SoftHinge (const CuMatrixBase< Real > &src)
	Apply the function y = log(1 + exp(x)), to each element. More...

void	GroupPnorm (const CuMatrixBase< Real > &src, Real pow)
	Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / power) where G = x.NumCols() / y.NumCols() must be an integer. More...

void	DiffGroupPnorm (const CuMatrixBase< Real > &in_value, const CuMatrixBase< Real > &out_value, const CuMatrixBase< Real > &out_deriv, Real power)
	Differentiate backward through the GroupPnorm function. More...

void	GroupMax (const CuMatrixBase< Real > &src)
	Apply the function y(i) = max_{j = i*G}^{(i+1)*G-1} x_j, where G = x.NumCols() / y.NumCols() must be an integer. More...

void	GroupMaxDeriv (const CuMatrixBase< Real > &input, const CuMatrixBase< Real > &output)
	Calculate derivatives for the GroupMax function above, where "input" is the input to the GroupMax function above (i.e. More...

void	ParametricRelu (const CuMatrixBase< Real > &src, const CuVectorBase< Real > &alpha, const CuVectorBase< Real > &beta)
	Compute the parametric rectified linear unit function; element by element, *this = src * (src > 0 ? alpha : beta) More...

void	DiffParametricRelu (const CuMatrixBase< Real > &value, const CuMatrixBase< Real > &diff, const CuVectorBase< Real > &alpha, const CuVectorBase< Real > &beta)
	Differentiate backward through the parametric relu function. More...

void	Tanh (const CuMatrixBase< Real > &src)
	Compute the hyperbolic tangent (tanh) function; element by element, *this = tanh(src). More...

void	DiffSigmoid (const CuMatrixBase< Real > &value, const CuMatrixBase< Real > &diff)
	Differentiate backward through the sigmoid function. More...

void	DiffTanh (const CuMatrixBase< Real > &value, const CuMatrixBase< Real > &diff)
	Differentiate backward through the tanh function. More...

void	DiffSoftmaxPerRow (const CuMatrixBase< Real > &value, const CuMatrixBase< Real > &diff)
	Differentiate backward through the softmax function. More...

void	DiffLogSoftmaxPerRow (const CuMatrixBase< Real > &out_value, const CuMatrixBase< Real > &out_deriv)
	Differentiate backward through the log softmax function. More...

void	DiffXent (const CuArrayBase< int32 > &tgt, CuVector< Real > *log_post_tgt)
	Differentiate the block [softmax+cross-entropy] : dE/da = posterior_mat - target_mat, 'E' is error function, 'a' is activation on softmax input. More...

void	Cholesky (CuMatrixBase< Real > *inv_cholesky=NULL)
	This function sets *this to the Cholesky factor of *this (i.e. More...

void	SymInvertPosDef ()
	Inversion for positive definite symmetric matrices. More...

void	ApplyPow (Real power)

void	ApplyPowAbs (Real power, bool include_sign=false)

void	ApplyHeaviside ()

void	ApplyFloor (Real floor_val)

void	ApplyCeiling (Real ceiling_val)

void	ApplyExp ()

void	ApplyExpLimited (Real lower_limit, Real upper_limit)

void	ApplyExpSpecial ()

void	ApplySoftMaxPerRow ()

void	ApplyLogSoftMaxPerRow ()

void	ApplyLog ()

void	FindRowMaxId (CuArray< int32 > *id) const
	Find the id of the maximal element for each row (resizes the 'id' array to the appropriate size). More...

void	SetZero ()
	Math operations, some calling kernels. More...

void	Set (Real value)

void	Add (Real value)

void	SetZeroAboveDiag ()
	Zeroes all elements for which col > row. More...

void	Scale (Real value)

void	MulElements (const CuMatrixBase< Real > &A)
	Multiply two matrices elementwise: C = C .* A. More...

void	DivElements (const CuMatrixBase< Real > &A)
	Divide two matrices elementwise: C = C ./ A. More...

void	Max (const CuMatrixBase< Real > &A)
	Do, elementwise, this = max(this, A). More...

void	Min (const CuMatrixBase< Real > &A)
	Do, elementwise, this = min(this, A). More...

void	MulColsVec (const CuVectorBase< Real > &scale)
	scale i'th column by scale[i] More...

void	MulRowsVec (const CuVectorBase< Real > &scale)
	scale i'th row by scale[i] More...

void	MulRowsGroupMat (const CuMatrixBase< Real > &src)
	divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j]. More...

void	DivRowsVec (const CuVectorBase< Real > &div)
	divide i'th row by scale[i] More...

void	InvertElements ()
	invert the matrix by elements. More...

void	AddMat (Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType trans=kNoTrans)
	*this += alpha * A More...

void	AddSmat (Real alpha, const CuSparseMatrix< Real > &A, MatrixTransposeType trans=kNoTrans)
	*this += alpha * A. More...

void	AddSmatMat (Real alpha, const CuSparseMatrix< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, Real beta)
	(*this) = alpha * op(A) * B + beta * (*this), where A is sparse. More...

void	AddMatSmat (Real alpha, const CuMatrixBase< Real > &A, const CuSparseMatrix< Real > &B, MatrixTransposeType transB, Real beta)
	(*this) = alpha * A * op(B) + beta * (*this), where B is sparse and op(B) is either B or trans(B) depending on the 'transB' argument. More...

void	AddToElements (Real alpha, const CuArrayBase< int32 > &elements)
	This is a rather special purpose function; we might generalize it later by adding a transpose-type option. More...

void	AddMatBlocks (Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType trans=kNoTrans)
	This function is like AddMat (it does *this += alpha * src), except that it supports cases where *this and src have different dimension. More...

void	AddVecToCols (Real alpha, const CuVectorBase< Real > &col, Real beta=1.0)
	(for each column c of *this), c = alpha * col + beta * c More...

void	AddVecToRows (Real alpha, const CuVectorBase< Real > &row, Real beta=1.0)
	(for each row r of *this), r = alpha * row + beta * r More...

void	AddMatMat (Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
	C = alpha * A(^T)*B(^T) + beta * C. More...

void	AddVecVec (Real alpha, const CuVectorBase< Real > &x, const CuVectorBase< Real > &y)
	A = alpha * x * y^T + A . More...

void	SetMatMatDivMat (const CuMatrixBase< Real > &A, const CuMatrixBase< Real > &B, const CuMatrixBase< Real > &C)
	*this = a * b / c (by element; when c = 0, *this = a). *this can be an alias of a, b or c safely and get expected result. More...

void	SymAddMat2 (const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transA, Real beta)
	*this = beta * *this + alpha * M * M^T, for symmetric matrices. More...

void	AddMatBlock (Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuBlockMatrix< Real > &B, MatrixTransposeType transB, Real beta)
	This function is like AddMatMat but for where the second argument is of type CuBlockMatrix (a block-diagonal matrix of blocks). More...

void	AddDiagVecMat (const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
	*this = beta * *this + alpha * diag(v) * M [or M^T]. More...

void	AddMatDiagVec (const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, CuVectorBase< Real > &v, Real beta=1.0)

void	AddMatMatElements (const Real alpha, const CuMatrixBase< Real > &A, const CuMatrixBase< Real > &B, const Real beta)
	*this = beta * *this + alpha * A .* B (.* element by element multiplication) More...

void	AddMatSp (const Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuSpMatrix< Real > &B, const Real beta)
	this <-- beta*this + alpha*A*B More...

void	AddSpMat (const Real alpha, const CuSpMatrix< Real > &A, const CuMatrixBase< Real > &B, MatrixTransposeType transB, const Real beta)
	this <-- beta*this + alpha*SpA*B More...

void	AddTpMat (const Real alpha, const CuTpMatrix< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, const Real beta)
	this <-- beta*this + alpha*A*B. More...

void	AddMatTp (const Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuTpMatrix< Real > &B, MatrixTransposeType transB, const Real beta)
	this <-- beta*this + alpha*A*B. More...

void	CopyFromBlock (const CuBlockMatrix< Real > &B, MatrixTransposeType trans=kNoTrans)

void	CopyLowerToUpper ()

void	CopyUpperToLower ()

CuSubMatrix< Real >	Range (const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const

CuSubMatrix< Real >	RowRange (const MatrixIndexT row_offset, const MatrixIndexT num_rows) const

CuSubMatrix< Real >	ColRange (const MatrixIndexT col_offset, const MatrixIndexT num_cols) const

const CuSubVector< Real >	Row (MatrixIndexT i) const

CuSubVector< Real >	Row (MatrixIndexT i)

CuValue< Real >	operator() (MatrixIndexT r, MatrixIndexT c)

Real	operator() (MatrixIndexT r, MatrixIndexT c) const

Real	Sum () const

Real	Max () const

Real	Min () const

Real	Trace (bool check_square=true) const
	Return the trace. If check_square = true, will crash if matrix is not square. More...

void	SetRandn ()

void	SetRandUniform ()

void	Write (std::ostream &os, bool binary) const

void	AddElements (Real alpha, const std::vector< MatrixElement< Real > > &input)

void	AddElements (Real alpha, const CuArrayBase< Int32Pair > &indexes, const Real *input)

void	Lookup (const std::vector< Int32Pair > &indexes, Real *output) const

void	Lookup (const CuArrayBase< Int32Pair > &indexes, Real *output) const

void	EqualElementMask (const CuMatrixBase< Real > &mat, CuMatrix< Real > *mask) const

const Real *	RowData (MatrixIndexT r) const
	Get raw row pointer (const). More...

Real *	RowData (MatrixIndexT r)
	Get raw row pointer. More...

const Real *	Data () const
	Return data pointer (const). More...

Real *	Data ()
	Return data pointer. More...

const MatrixBase< Real > &	Mat () const

MatrixBase< Real > &	Mat ()

Protected Member Functions
	CuMatrixBase ()

	CuMatrixBase (Real *data, MatrixIndexT num_rows, MatrixIndexT num_cols, MatrixIndexT stride)
	This constructor takes the #rows, #cols and stride; it's called from the constructor of CuSubMatrix. More...

Protected Attributes
Real *	data_
	GPU data pointer (or regular matrix data pointer, when the computation runs on the CPU). More...

MatrixIndexT	num_cols_

MatrixIndexT	num_rows_

MatrixIndexT	stride_

Private Member Functions
	KALDI_DISALLOW_COPY_AND_ASSIGN (CuMatrixBase)

Friends
class	CuMatrixBase< float >

class	CuMatrixBase< double >

class	CuVectorBase< float >

class	CuVectorBase< double >

class	VectorBase< Real >

class	CuSpMatrix< Real >

class	CuTpMatrix< float >

class	CuTpMatrix< double >

class	CuVectorBase< Real >

class	CuSubMatrix< Real >

class	CuRand< Real >

class	CuSubVector< Real >

class	CuBlockMatrix< Real >

class	CuSparseMatrix< float >

class	CuSparseMatrix< double >

class	CuSparseMatrix< Real >

Real	TraceMatMat (const CuMatrixBase< Real > &A, const CuMatrixBase< Real > &B, MatrixTransposeType trans)

Real	TraceMatSmat (const CuMatrixBase< Real > &A, const CuSparseMatrix< Real > &B, MatrixTransposeType trans)

void	AddMatMatBatched (const Real alpha, std::vector< CuSubMatrix< Real > * > &C, const std::vector< CuSubMatrix< Real > * > &A, MatrixTransposeType transA, const std::vector< CuSubMatrix< Real > * > &B, MatrixTransposeType transB, const Real beta)
	Does multiple matrix multiplications, executing them in parallel using cuBLAS's gemmBatched if we are using a GPU. More...

Detailed Description

template<typename Real>
class kaldi::CuMatrixBase< Real >

Matrix for CUDA computing.

Does the computation on the CUDA card when CUDA is compiled in and we have a suitable GPU (CuDevice::Instantiate().Enabled() == true); otherwise, does it on the CPU.

Definition at line 69 of file matrix-common.h.

Constructor & Destructor Documentation

◆ CuMatrixBase() [1/2]

CuMatrixBase ( )

inline protected

Definition at line 767 of file cu-matrix.h.

767 : data_(NULL), num_cols_(0), num_rows_(0), stride_(0) { }

kaldi::CuMatrixBase::data_

Real * data_

GPU data pointer (or regular matrix data pointer, when the computation runs on the CPU).

Definition: cu-matrix.h:777

kaldi::CuMatrixBase::stride_

MatrixIndexT stride_

Definition: cu-matrix.h:787

kaldi::CuMatrixBase::num_cols_

MatrixIndexT num_cols_

Definition: cu-matrix.h:785

kaldi::CuMatrixBase::num_rows_

MatrixIndexT num_rows_

Definition: cu-matrix.h:786

◆ CuMatrixBase() [2/2]

CuMatrixBase	(	Real *	data,
		MatrixIndexT	num_rows,
		MatrixIndexT	num_cols,
		MatrixIndexT	stride
	)

inline protected

This constructor takes the #rows, #cols and stride; it's called from the constructor of CuSubMatrix.

Definition at line 771 of file cu-matrix.h.

774 :

775 data_(data), num_cols_(num_cols), num_rows_(num_rows), stride_(stride) { }

kaldi::CuMatrixBase::data_

Real * data_

GPU data pointer (or regular matrix data pointer, when the computation runs on the CPU).

Definition: cu-matrix.h:777

kaldi::CuMatrixBase::stride_

MatrixIndexT stride_

Definition: cu-matrix.h:787

kaldi::CuMatrixBase::num_cols_

MatrixIndexT num_cols_

Definition: cu-matrix.h:785

kaldi::CuMatrixBase::num_rows_

MatrixIndexT num_rows_

Definition: cu-matrix.h:786

Member Function Documentation

◆ Add()

void Add ( Real value )

Definition at line 582 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), BackpropTruncationComponent::Backprop(), TanhComponent::Backprop(), LstmNonlinearityComponent::ConsolidateMemory(), kaldi::CuCompressedMatrixTestNonnegative(), kaldi::CuCompressedMatrixTestSymmetric(), GeneralDropoutComponent::GetMemo(), main(), kaldi::MeanVariance(), DropoutMaskComponent::Propagate(), DropoutComponent::Propagate(), ClipGradientComponent::RepairGradients(), TanhComponent::StoreStats(), kaldi::TestCuMatrixCompObjfAndDeriv(), kaldi::nnet3::TestSimpleComponentPropagateProperties(), kaldi::UnitTestCuMatrixAdd(), kaldi::UnitTestCuMatrixAdd2(), kaldi::UnitTestCuMatrixEqualElementMask(), kaldi::UnitTestCuMatrixObjfDeriv(), kaldi::UnitTestCuMatrixSetRandUniform(), and kaldi::UnitTestCuMatrixTraceMatMat().

                                        {
   // Adds `value` to this matrix (presumably to every element -- the kernel
   // and Mat().Add() are defined elsewhere). Uses the GPU when a CUDA device
   // is enabled and the CPU matrix Mat() otherwise.
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // Nothing to launch for a matrix with no rows.
     if (num_rows_ == 0) return;
     CuTimer tim;
 
     // Grid/block sizes are chosen by the shared helper from the matrix
     // dimensions.
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_add(dimGrid, dimBlock, data_, value, Dim());
     // Surface any error reported for the launch above.
     CU_SAFE_CALL(cudaGetLastError());
 
     // Hand the timer to the device profiler, keyed by this function's name.
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     // CPU fallback; this is the only path when HAVE_CUDA != 1.
     Mat().Add(value);
   }
 }

◆ AddCols()

void AddCols	(	const CuMatrixBase< Real > &	src,
		const CuArrayBase< MatrixIndexT > &	indices
	)

Add column indices[r] of src to column r.

As a special case, if indices[i] == -1, skip column i. indices.size() must equal this->NumCols(), and src.NumRows() must equal this->NumRows().

Definition at line 2701 of file cu-matrix.cc.

Referenced by Convolutional1dComponent::Backprop(), ConvolutionalComponent::BackpropagateFnc(), ConvolutionComponent::InderivPatchesToInderiv(), and MaxpoolingComponent::InderivPatchesToInderiv().

                                                                            {
   // Adds column indices[r] of `src` to column r of *this, on the GPU when a
   // CUDA device is enabled and via the CPU matrices otherwise.
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // NOTE(review): these dimension checks run only on the GPU path; the CPU
     // path relies on whatever Mat().AddCols() checks itself.
     KALDI_ASSERT(indices.Dim() == NumCols());
     KALDI_ASSERT(NumRows() == src.NumRows());
     CuTimer tim;
     // Launch configuration is derived from the dimensions of *this.
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     // src may have a different stride from *this, so it is passed separately
     // from Dim().
     cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(),
                   Dim(), src.Stride());
     // Surface any error reported for the launch above.
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU fallback: delegate to the CPU matrix with the raw index pointer.
     Mat().AddCols(src.Mat(), indices.Data());
   }
 }

◆ AddDiagVecMat()

void AddDiagVecMat	(	const Real	alpha,
		const CuVectorBase< Real > &	v,
		const CuMatrixBase< Real > &	M,
		MatrixTransposeType	transM,
		Real	beta = `1.0`
	)

*this = beta * *this + alpha * diag(v) * M [or M^T].

The same as adding M but scaling each row M_i by v(i).

Definition at line 1382 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), kaldi::nnet3::attention::ApplyScalesToInput(), kaldi::nnet3::attention::ApplyScalesToOutput(), HiddenSoftmax::BackpropagateFnc(), MultiBasisComponent::BackpropagateFnc(), OnlinePreconditioner::ComputeWt1(), OnlineNaturalGradient::ComputeWt1(), kaldi::cu::DiffNormalizePerRow(), CuMatrixBase< float >::DiffSoftmaxPerRow(), MultiBasisComponent::PropagateFnc(), and kaldi::TestCuMatrixAddDiagVecMat().

                {
   // Computes *this = beta * *this + alpha * diag(v) * M (or M^T when
   // transM == kTrans), on the GPU when a CUDA device is enabled and via the
   // CPU matrix otherwise.
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // M (after the optional transpose) must have the same shape as *this.
     if (transM == kNoTrans) {
       KALDI_ASSERT(SameDim(*this, M));
     } else {
       KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
     }
     // One entry of v per row of *this.
     KALDI_ASSERT(v.Dim() == this->NumRows());
 
     CuTimer tim;
     // 2-D launch: grid x covers the columns, grid y covers the rows.
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
                  n_blocks(num_rows_, CU2DBLOCK));
     // For kTrans the row and column strides of M are swapped -- presumably so
     // the kernel can index M as its transpose without a copy.
     MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
     if (transM == kTrans)
       std::swap(M_row_stride, M_col_stride);
     cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha, data_, Dim(),
                           v.Data(), M.Data(), M_row_stride, M_col_stride, beta);
     // Surface any error reported for the launch above.
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU fallback: delegate to the CPU matrix/vector implementations.
     Mat().AddDiagVecMat(alpha, v.Vec(), M.Mat(), transM, beta);
   }
 }

◆ AddElements() [1/2]

void AddElements	(	Real	alpha,
		const std::vector< MatrixElement< Real > > &	input
	)

Definition at line 3277 of file cu-matrix.cc.

Referenced by OnlinePreconditioner::InitOrthonormalSpecial(), OnlineNaturalGradient::InitOrthonormalSpecial(), CuMatrixBase< float >::operator()(), DiscriminativeComputation::ProcessPosteriors(), and kaldi::UnitTestCuMatrixAddElements().

                                                                                    {
   // For each entry of `input`, adds alpha * weight to element (row, column)
   // of this matrix. `input` is a host-side vector; on the GPU path it is
   // first copied to a temporary device buffer.
   // Checks the dimension.
   // (Every index is validated up front, on both the GPU and CPU paths.)
   // NOTE(review): `i` is int32 while input.size() is unsigned, so the loop
   // conditions below compare signed with unsigned.
   MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
   for (int32 i = 0; i < input.size(); ++i) {
     KALDI_ASSERT(input[i].row < num_rows && input[i].row >= 0 &&
                  input[i].column < num_cols && input[i].column >= 0);
   }
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // Stage the elements in a device buffer obtained from CuDevice.
     // NOTE(review): there is no early return for an empty `input`; confirm
     // that a zero-byte Malloc and a zero-block launch are acceptable here.
     void *addr = CuDevice::Instantiate().Malloc(input.size() * sizeof(MatrixElement<Real>));
     CU_SAFE_CALL(cudaMemcpyAsync(addr, input.data(),
                                  input.size() * sizeof(MatrixElement<Real>),
                                  cudaMemcpyHostToDevice, cudaStreamPerThread));
 
     // The timer starts after the host-to-device copy has been issued.
     CuTimer tim;
     // 1-D launch sized by the number of input elements.
     int dimBlock(CU1DBLOCK);
     int dimGrid(n_blocks(input.size(), CU1DBLOCK));
 
     cuda_matrix_add_elements(dimGrid, dimBlock, this->data_, this->Dim(),
                              alpha, (MatrixElement<Real>*)addr, input.size());
     CU_SAFE_CALL(cudaGetLastError());
     // Return the staging buffer to CuDevice.
     CuDevice::Instantiate().Free(addr);
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU fallback: apply each (row, column, weight) entry directly.
     for (int32 i = 0; i < input.size(); i++) {
       (*this)(input[i].row, input[i].column) += alpha * input[i].weight;
     }
   }
 }

◆ AddElements() [2/2]

void AddElements	(	Real	alpha,
		const CuArrayBase< Int32Pair > &	indexes,
		const Real *	input
	)

Definition at line 3311 of file cu-matrix.cc.

                                                         {
   // For each i, adds alpha * input[i] to the element of this matrix at
   // (indexes[i].first, indexes[i].second). `input` must hold at least
   // indexes.Dim() values.
   // An empty index list is a no-op (and `input` may then be NULL).
   if (indexes.Dim() == 0) return;
   KALDI_ASSERT(input != NULL);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // `input` is copied with cudaMemcpyHostToDevice, i.e. it is treated as a
     // host pointer; the values are staged in a temporary CuVector.
     CuVector<Real> tmp_vec(indexes.Dim(), kUndefined);
     CU_SAFE_CALL(cudaMemcpyAsync(tmp_vec.Data(), input,
                                  indexes.Dim() * sizeof(Real),
                                  cudaMemcpyHostToDevice, cudaStreamPerThread));
 
     // 1-D launch sized by the number of index pairs.
     // NOTE(review): unlike the CPU branch below, no bounds check on the
     // index pairs is visible on this path.
     int dimBlock(CU1DBLOCK);
     int dimGrid = n_blocks(indexes.Dim(), CU1DBLOCK);
     cuda_matrix_add_indexed_values(dimGrid, dimBlock, this->Dim(), alpha,
                                    indexes.Data(), tmp_vec.Data(), indexes.Dim(), this->data_);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU fallback: validate each index pair, then accumulate in place.
     MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
     const Int32Pair *index = indexes.Data();
     for (int32 i = 0; i < indexes.Dim(); i++) {
       KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 &&
                    index[i].second < num_cols && index[i].second >= 0);
       (*this)(index[i].first, index[i].second) += alpha * input[i];
     }
   }
 }

◆ AddMat()

void AddMat	(	Real	alpha,
		const CuMatrixBase< Real > &	A,
		MatrixTransposeType	trans = `kNoTrans`
	)

*this += alpha * A

Definition at line 954 of file cu-matrix.cc.

Referenced by RestrictedAttentionComponent::Add(), CuRand< float >::AddGaussNoise(), GeneralMatrix::AddToMat(), CuMatrixBase< float >::ApplyLog(), CuMatrixBase< float >::ApproxEqual(), kaldi::nnet3::attention::AttentionBackward(), kaldi::nnet3::attention::AttentionForward(), SigmoidComponent::Backprop(), Splice::BackpropagateFnc(), AveragePoolingComponent::BackpropagateFnc(), MultiBasisComponent::BackpropagateFnc(), LstmNonlinearityComponent::ConsolidateMemory(), kaldi::nnet3::ConstrainOrthonormalInternal(), kaldi::CuCompressedMatrixTestNonnegative(), kaldi::CuCompressedMatrixTestSymmetric(), CuMatrixBase< float >::DiffLogSoftmaxPerRow(), Xent::Eval(), Mse::Eval(), NnetComputer::ExecuteCommand(), AdditiveNoiseComponent::Propagate(), ClipGradientComponent::RepairGradients(), RestrictedAttentionComponent::StoreStats(), kaldi::nnet3::attention::TestAttentionForwardBackward(), NoOpTransform::TrainingBackward(), kaldi::UnitTestCuMatrixAddMat(), kaldi::UnitTestCuMatrixAddMatBlocks1(), kaldi::UnitTestCuMatrixAddMatBlocks1Trans(), kaldi::UnitTestCuMatrixAddMatBlocks2(), kaldi::UnitTestCuMatrixAddMatDiagVec(), kaldi::UnitTestCuMatrixAddMatMatElements(), kaldi::UnitTestLstmNonlinearity(), and kaldi::nnet3::UnitTestNnetInputDerivatives().

                                                             {
   // Computes *this += alpha * A (or alpha * A^T when transA == kTrans).
   // NOTE(review): the body calls the transpose argument `transA`, while the
   // member listing shows it as `trans` -- presumably the declaration and the
   // definition use different parameter names.
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // A (after the optional transpose) must match the shape of *this.
     // NOTE(review): these checks run only on the GPU path; the CPU path
     // relies on whatever Mat().AddMat() checks itself.
     if (transA == kNoTrans) {
       KALDI_ASSERT(A.NumRows() == num_rows_ && A.NumCols() == num_cols_);
     } else {
       KALDI_ASSERT(A.NumCols() == num_rows_ && A.NumRows() == num_cols_);
     }
     // Nothing to launch for a matrix with no rows.
     if (num_rows_ == 0) return;
     CuTimer tim;
     // This block dimension seems to work better than the
     // one from GetBlockSizesForSimpleMatrixOperation().
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK),
                  n_blocks(NumRows(), CU2DBLOCK));
     // The transpose choice is passed to the kernel as a 0/1 flag.
     cuda_add_mat(dimGrid, dimBlock, alpha, A.data_,
                  data_, Dim(), A.Stride(),
                  (transA == kTrans ? 1 : 0));
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU fallback: delegate to the CPU matrix implementation.
     Mat().AddMat(alpha, A.Mat(), transA);
   }
 }

◆ AddMatBlock()

void AddMatBlock	(	Real	alpha,
		const CuMatrixBase< Real > &	A,
		MatrixTransposeType	transA,
		const CuBlockMatrix< Real > &	B,
		MatrixTransposeType	transB,
		Real	beta
	)

This function is like AddMatMat but for where the second argument is of type CuBlockMatrix (a block-diagonal matrix of blocks).

Definition at line 3205 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and kaldi::UnitTestCuBlockMatrixAddMatBlock().

                {
   // Like AddMatMat, but the second operand B is a block-diagonal
   // CuBlockMatrix. Dimension checks run on both paths; the GPU path uses one
   // kernel launch, the CPU path loops over the blocks of B.
   // Check dimensions
   int32 A_num_rows = A.NumRows(), A_num_cols = A.NumCols(),
       A_row_stride = A.Stride(), A_col_stride = 1,
       B_num_rows = B.NumRows(), B_num_cols = B.NumCols();
   // A transposed operand is modelled by swapping its dimensions (and, for A,
   // its row/column strides) rather than by copying data.
   if (transA == kTrans) {
     std::swap(A_num_rows, A_num_cols);
     std::swap(A_row_stride, A_col_stride);
   }
   if (transB == kTrans) {
     std::swap(B_num_rows, B_num_cols);
   }
   // At this point the {A,B}_{rows,cols} variables are
   // after any transposition.
   KALDI_ASSERT(NumRows() == A_num_rows && NumCols() == B_num_cols);
   KALDI_ASSERT(A_num_cols == B_num_rows);
   int32 B_num_blocks = B.NumBlocks();
 
   // Nothing to do for a matrix with no rows.
   if (num_rows_ == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     MatrixDim this_dim = Dim();
 
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     // (x,y) indices will be (row of *this, block of B)
     dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK),
                  n_blocks(B_num_blocks, CU2DBLOCK));
 
     // caution: the use of x as the row-index is not good, but
     // this code is not much used, so I'm not updating it.
     cuda_add_mat_blockmat(dimGrid, dimBlock, data_, this_dim, A.Data(),
                           A_num_rows, A_num_cols, A_row_stride, A_col_stride,
                           B.CuData(), B_num_blocks, alpha, beta,
                           (transB == kTrans ? 1 : 0));
 
     // Surface any error reported for the launch above.
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // "row_offset" and "col_offset" are offsets into B (or into B^T, if
     // transB == kTrans).
     int32 row_offset = 0, col_offset = 0;
     for (int32 b = 0; b < B_num_blocks; b++) {
       const CuSubMatrix<Real> this_block = B.Block(b);
       int32 this_num_rows = this_block.NumRows(),
           this_num_cols = this_block.NumCols();
       if (transB == kTrans) std::swap(this_num_rows, this_num_cols);
       // Column range of *this that this block of B contributes to.
       CuSubMatrix<Real> this_part(*this, 0, num_rows_,
                                   col_offset, this_num_cols);
       // Matching slice of A: a column range when A is not transposed, a row
       // range when it is (assuming the CuSubMatrix arguments are
       // mat, row_offset, num_rows, col_offset, num_cols -- TODO confirm).
       CuSubMatrix<Real> A_part = (transA == kNoTrans ?
                                   CuSubMatrix<Real>(A, 0, num_rows_,
                                                     row_offset, this_num_rows) :
                                   CuSubMatrix<Real>(A, row_offset, this_num_rows,
                                                     0, num_rows_));
       this_part.AddMatMat(alpha, A_part, transA, this_block, transB, beta);
       row_offset += this_num_rows;
       col_offset += this_num_cols;
     }
     // Note: the values being compared below are all after applying any
     // transposition to B.
     KALDI_ASSERT(row_offset == B_num_rows && col_offset == B_num_cols);
   }
 }

◆ AddMatBlocks()

void AddMatBlocks	(	Real	alpha,
		const CuMatrixBase< Real > &	A,
		MatrixTransposeType	trans = `kNoTrans`
	)

This function is like AddMat (it does *this += alpha * src), except that it supports cases where *this and src have different dimension.

There are two allowed cases:

(1) *this is larger than src; we do a broadcasting operation. *this must have NumRows() == a * src.NumRows() and NumCols() == b * src.NumCols() for integer a >= 1, b >= 1. *this will be treated as being made up of blocks with the same size as src, and to each block we'll add alpha * src. This case does not support trans == kTrans.

(2) *this is smaller than src; we sum. src.NumRows() must == a * this->NumRows(), and src.NumCols() must == b * this->NumCols(), for a >= 1, b >= 1. In this case, src will be treated as being made up of blocks with the same size as *this, and to *this we will add the summation of all of those blocks.

Definition at line 1119 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), SumBlockComponent::Backprop(), SumBlockComponent::Propagate(), kaldi::UnitTestCuMatrixAddMatBlocks1(), kaldi::UnitTestCuMatrixAddMatBlocks1Trans(), kaldi::UnitTestCuMatrixAddMatBlocks2(), ConvolutionComponent::Update(), and Convolutional1dComponent::Update().

                                                                   {
   if (num_rows_ == 0 || num_cols_ == 0) return;
 
   if (A.NumRows() >= (transA == kNoTrans ? num_rows_ : num_cols_) &&
       A.NumCols() >= (transA == kNoTrans ? num_cols_ : num_rows_)) {
     // This is the "summing", not broadcasting, version of AddMatBlocks.
     // It supports both regular and transposed operation.
     int32 num_row_blocks, num_col_blocks;
     if (transA == kNoTrans) {
       KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0);
       num_row_blocks = A.Mat().NumRows() / num_rows_;
       num_col_blocks = A.Mat().NumCols() / num_cols_;
     } else {
       KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0);
       num_row_blocks = A.Mat().NumRows() / num_cols_;
       num_col_blocks = A.Mat().NumCols() / num_rows_;
     }
 #if HAVE_CUDA == 1
     if (CuDevice::Instantiate().Enabled()) {
       CuTimer tim;
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
       cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks,
                           num_col_blocks, data_, Dim(), A.Stride(),
                           (transA == kTrans ? 1 : 0));
       CU_SAFE_CALL(cudaGetLastError());
 
       CuDevice::Instantiate().AccuProfile(__func__, tim);
     } else
 #endif
     {
       int32 nr, nc;
       if (transA == kNoTrans) {
         nr = num_rows_;
         nc = num_cols_;
       } else {
         nr = num_cols_;
         nc = num_rows_;
       }
       for (int32 i = 0; i < num_row_blocks; i++) {
         for (int32 j = 0; j < num_col_blocks; j++) {
           Mat().AddMat(alpha, SubMatrix<Real>(A.Mat(), i * nr, nr, j * nc, nc),
                        transA);
         }
       }
     }
   } else {
     // This is the "broadcasting" version of AddMatBlocks, where
     // *this is larger than src.
     if (transA != kNoTrans)
       KALDI_ERR << "Transposed operation not supported currently.";
     if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0))
       KALDI_ERR << "Invalid sizes of arguments";
 #if HAVE_CUDA == 1
     if (CuDevice::Instantiate().Enabled()) {
       CuTimer tim;
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
       cuda_add_mat_repeated(dimGrid, dimBlock, alpha,
                             A.data_, A.Dim(), data_, Dim());
       CU_SAFE_CALL(cudaGetLastError());
       CuDevice::Instantiate().AccuProfile(__func__, tim);
     } else
 #endif
     {
       const MatrixBase<Real> &src_mat = A.Mat(),
           &this_mat = this->Mat();
       for (int32 row_offset = 0; row_offset < NumRows();
            row_offset += src_mat.NumRows()) {
         for (int32 col_offset = 0; col_offset < NumCols();
              col_offset += src_mat.NumCols()) {
           SubMatrix<Real> this_part(this_mat,
                                     row_offset, src_mat.NumRows(),
                                     col_offset, src_mat.NumCols());
           this_part.AddMat(alpha, src_mat);
         }
       }
     }
   }
 }

◆ AddMatDiagVec()

void AddMatDiagVec	(	const Real	alpha,
		const CuMatrixBase< Real > &	M,
		MatrixTransposeType	transM,
		CuVectorBase< Real > &	v,
		Real	beta = `1.0`
	)

Definition at line 1415 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), BatchNormComponent::Backprop(), LstmNonlinearityComponent::ConsolidateMemory(), SigmoidComponent::RepairGradients(), and TanhComponent::RepairGradients().

                {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (transM == kNoTrans) {
       KALDI_ASSERT(SameDim(*this, M));
     } else {
       KALDI_ASSERT(M.NumRows() == NumCols() && M.NumCols() == NumRows());
     }
     KALDI_ASSERT(v.Dim() == this->NumCols());
 
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
     if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
     cuda_add_mat_diag_vec(dimGrid, dimBlock, alpha, data_, Dim(),
                           M.Data(), M_row_stride, M_col_stride, v.Data(),  beta);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddMatDiagVec(alpha, M.Mat(), transM, v.Vec(), beta);
   }
 }

◆ AddMatMat()

void AddMatMat	(	Real	alpha,
		const CuMatrixBase< Real > &	A,
		MatrixTransposeType	transA,
		const CuMatrixBase< Real > &	B,
		MatrixTransposeType	transB,
		Real	beta
	)

C = alpha * A(^T)*B(^T) + beta * C.

Definition at line 1291 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::AddMatBlock(), CuBlockMatrix< Real >::AddMatMat(), CuMatrixBase< float >::AddMatSp(), CuMatrixBase< float >::AddMatTp(), CuMatrixBase< float >::AddSpMat(), CuMatrixBase< float >::AddTpMat(), CuMatrixBase< float >::ApplyLog(), TdnnComponent::Backprop(), RepeatedAffineComponent::Backprop(), AffineComponent::Backprop(), LinearComponent::Backprop(), FixedLinearComponent::Backprop(), FixedAffineComponent::Backprop(), LinearTransform::BackpropagateFnc(), AffineTransform::BackpropagateFnc(), RecurrentComponent::BackpropagateFnc(), ConvolutionalComponent::BackpropagateFnc(), LstmProjected::BackpropagateFnc(), BlstmProjected::BackpropagateFnc(), ModelCollapser::CollapseComponentsAffine(), OnlinePreconditioner::ComputeWt1(), OnlineNaturalGradient::ComputeWt1(), LstmNonlinearityComponent::ConsolidateMemory(), kaldi::nnet3::ConstrainOrthonormalInternal(), kaldi::nnet3::time_height_convolution::ConvolveBackwardDataInternal(), kaldi::nnet3::time_height_convolution::ConvolveBackwardParamsInternal(), kaldi::nnet3::time_height_convolution::ConvolveForwardInternal(), kaldi::CuVectorUnitTestAddDiagMatMat(), OnlinePreconditioner::InitOrthonormalSpecial(), kaldi::nnet2::PreconditionDirections(), OnlinePreconditioner::PreconditionDirectionsInternal(), OnlineNaturalGradient::PreconditionDirectionsInternal(), TdnnComponent::Propagate(), AffineComponent::Propagate(), LinearComponent::Propagate(), DctComponent::Propagate(), FixedLinearComponent::Propagate(), FixedAffineComponent::Propagate(), KlHmm::PropagateFnc(), LinearTransform::PropagateFnc(), AffineTransform::PropagateFnc(), RecurrentComponent::PropagateFnc(), Rbm::PropagateFnc(), LstmProjected::PropagateFnc(), BlstmProjected::PropagateFnc(), Rbm::Reconstruct(), OnlineNaturalGradient::ReorthogonalizeRt1(), OnlinePreconditioner::ReorthogonalizeXt1(), kaldi::TestCuMatrixMatMat(), kaldi::UnitTestCuBlockMatrixAddMatMat(), kaldi::UnitTestCuCholesky(), kaldi::UnitTestCuMatrixAddMatMat(), 
kaldi::UnitTestCuMatrixSymAddMat2(), kaldi::UnitTestCuMatrixSymInvertPosDef(), kaldi::UnitTestCuSpMatrixInvert(), BlockAffineComponentPreconditioned::Update(), TdnnComponent::UpdateSimple(), and BlockAffineComponent::UpdateSimple().

                                                                         {
 
 
     // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping?
     // keep trans..., just swap A&B matrices: A->B B->A
     MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols());
     MatrixIndexT n = ((transA==kTrans)? A.NumCols() : A.NumRows());
     MatrixIndexT k = ((transB==kTrans)? B.NumCols() : B.NumRows());
     MatrixIndexT k1 = ((transA==kTrans)? A.NumRows() : A.NumCols());
 
     KALDI_ASSERT(m == NumCols());
     KALDI_ASSERT(n == NumRows());
     KALDI_ASSERT(k == k1);
 
     if (m == 0) return;
 
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     CUBLAS_SAFE_CALL(cublas_gemm(GetCublasHandle(),
                              (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
                              (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
                              m, n, k, alpha, B.data_, B.Stride(),
                              A.data_, A.Stride(), beta, data_, Stride()));
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddMatMat(alpha, A.Mat(), transA, B.Mat(), transB, beta);
   }
 }

◆ AddMatMatElements()

void AddMatMatElements	(	const Real	alpha,
		const CuMatrixBase< Real > &	A,
		const CuMatrixBase< Real > &	B,
		const Real	beta
	)

*this = beta * *this + alpha * A .* B (.* element by element multiplication)

Definition at line 1447 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), StatisticsExtractionComponent::Backprop(), LstmNonlinearityComponent::ConsolidateMemory(), StatisticsPoolingComponent::Propagate(), and kaldi::UnitTestCuMatrixSetMatMatDivMat().

                                                                          {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(SameDim(*this, A) && SameDim(A, B));
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_mat_mat_elements(dimGrid, dimBlock, this->data_, A.Data(),
                               B.Data(), Dim(), A.Stride(), B.Stride(), alpha, beta);
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddMatMatElements(alpha, A.Mat(), B.Mat(), beta);
   }
 }

◆ AddMatSmat()

void AddMatSmat	(	Real	alpha,
		const CuMatrixBase< Real > &	A,
		const CuSparseMatrix< Real > &	B,
		MatrixTransposeType	transB,
		Real	beta
	)

(*this) = alpha * A * op(B) + beta * (*this), where B is sparse and op(B) is either B or trans(B) depending on the 'transB' argument.

This is multiplication of a dense by a sparse matrix. See also AddSmatMat.

Definition at line 1080 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and kaldi::UnitTextCuMatrixAddMatSmat().

                                                                            {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (transB == kNoTrans) {
       KALDI_ASSERT(NumRows() == A.NumRows());
       KALDI_ASSERT(NumCols() == B.NumCols());
       KALDI_ASSERT(A.NumCols() == B.NumRows());
     } else {
       KALDI_ASSERT(NumRows() == A.NumRows());
       KALDI_ASSERT(NumCols() == B.NumRows());
       KALDI_ASSERT(A.NumCols() == B.NumCols());
     }
 
     CuTimer tim;
 
     cusparseMatDescr_t descr;
     CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descr));
     CU_SAFE_CALL(
         cusparse_csrmm(
             GetCusparseHandle(),
             transB == kNoTrans ?
                 CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
             B.NumRows(), NumRows(), B.NumCols(), B.NumElements(), &alpha, descr,
             B.CsrVal(), B.CsrRowPtr(), B.CsrColIdx(), A.Data(), A.Stride(),
             &beta, Data(), Stride()));
     CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descr));
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddMatSmat(alpha, A.Mat(), B.Smat(), transB, beta);
   }
 }

◆ AddMatSp()

void AddMatSp	(	const Real	alpha,
		const CuMatrixBase< Real > &	A,
		MatrixTransposeType	transA,
		const CuSpMatrix< Real > &	B,
		const Real	beta
	)

inline

this <– beta*this + alpha*A*B

Definition at line 614 of file cu-matrix.h.

                                  {
     CuMatrix<Real> M(B);
     return AddMatMat(alpha, A, transA, M, kNoTrans, beta);
   }

◆ AddMatTp()

void AddMatTp	(	const Real	alpha,
		const CuMatrixBase< Real > &	A,
		MatrixTransposeType	transA,
		const CuTpMatrix< Real > &	B,
		MatrixTransposeType	transB,
		const Real	beta
	)

inline

this <– beta*this + alpha*A*B.

Definition at line 641 of file cu-matrix.h.

Referenced by kaldi::UnitTestCuMatrixAddMatTp().

                                  {
     CuMatrix<Real> M(B);
     return AddMatMat(alpha, A, transA, M, transB, beta);
   }

◆ AddRowRanges()

void AddRowRanges	(	const CuMatrixBase< Real > &	src,
		const CuArrayBase< Int32Pair > &	indexes
	)

For each row r of this and for each column c, do (*this)(r, c) += src(j, c), where j ranges from indexes[r].first through indexes[r].second - 1.

In general indexes must be >= 0 and < src.NumRows(); but to represent an empty range you may use the pair (-1, -1) or any pair of numbers (i, j) such that i >= j.

Definition at line 2931 of file cu-matrix.cc.

Referenced by StatisticsPoolingComponent::Backprop(), NnetComputer::ExecuteCommand(), StatisticsPoolingComponent::Propagate(), and kaldi::UnitTestCuMatrixAddRowRanges().

                                                                              {
   KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
   KALDI_ASSERT(src.NumCols() == NumCols());
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_row_ranges(dimGrid, dimBlock,
                         data_, Dim(), src.Data(), src.Dim(), indexes.Data());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   { // Implement here for the CPU..
     int32 num_rows = this->num_rows_, num_cols = this->num_cols_,
           this_stride = this->stride_, src_stride = src.stride_;
     Real *data = this->data_;
     const Real *src_data = src.data_;
     const Int32Pair *indexes_data = indexes.Data();
     for (int32 row = 0; row < num_rows; row++) {
       int32 start_row = indexes_data[row].first,
           end_row = indexes_data[row].second;
       for (int32 col = 0; col < num_cols; col++) {
         Real sum = 0.0;
         for (int32 src_row = start_row; src_row < end_row; src_row++)
           sum += src_data[src_row * src_stride + col];
         data[row * this_stride + col] += sum;
       }
     }
   }
 }

◆ AddRows() [1/2]

void AddRows	(	Real	alpha,
		const CuMatrixBase< Real > &	src,
		const CuArrayBase< MatrixIndexT > &	indexes
	)

Does for each row r, this.Row(r) += alpha * src.row(indexes[r]).

If indexes[r] < 0, does not add anything. src.NumCols() must equal this.NumCols()

Definition at line 2766 of file cu-matrix.cc.

Referenced by StatisticsExtractionComponent::Backprop(), and NnetComputer::ExecuteCommand().

                                                                            {
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
     KALDI_ASSERT(src.NumCols() == NumCols());
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_rows(dimGrid, dimBlock, alpha,
                   data_, src.Data(), indexes.Data(), Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddRows(alpha, src.Mat(), indexes.Data());
   }
 }

◆ AddRows() [2/2]

void AddRows	(	Real	alpha,
		const CuArrayBase< const Real *> &	src
	)

Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the beginning of a region of memory representing a vector of floats, of the same length as this.NumCols().

Definition at line 2826 of file cu-matrix.cc.

                                                                                 {
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(static_cast<MatrixIndexT>(src.Dim()) == NumRows());
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_rows(dimGrid, dimBlock, alpha, data_, src.Data(), Dim());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddRows(alpha, src.Data());
   }
 }

◆ AddSmat()

void AddSmat	(	Real	alpha,
		const CuSparseMatrix< Real > &	A,
		MatrixTransposeType	trans = `kNoTrans`
	)

*this += alpha * A.

Definition at line 985 of file cu-matrix.cc.

Referenced by GeneralMatrix::AddToMat(), CuMatrixBase< float >::ApplyLog(), and kaldi::UnitTextCuMatrixAddSmat().

                                                             {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (trans == kNoTrans) {
       KALDI_ASSERT(NumRows() == A.NumRows());
       KALDI_ASSERT(NumCols() == A.NumCols());
     } else {
       KALDI_ASSERT(NumRows() == A.NumCols());
       KALDI_ASSERT(NumCols() == A.NumRows());
     }
 
     CuTimer tim;
 
     // We use warpSize threads per row to access only the nonzero elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows of A.
     const int warpSize = 32;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y));
 
     if (trans == kNoTrans) {
       cuda_add_smat(dimGrid, dimBlock, Data(), Dim(), alpha, A.CsrRowPtr(),
                     A.CsrColIdx(), A.CsrVal());
     } else {
       cuda_add_smat_trans(dimGrid, dimBlock, Data(), Dim(), alpha,
       A.CsrRowPtr(), A.CsrColIdx(), A.CsrVal());
     }
 
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddSmat(alpha, A.Smat(), trans);
   }
 }

◆ AddSmatMat()

void AddSmatMat	(	Real	alpha,
		const CuSparseMatrix< Real > &	A,
		MatrixTransposeType	transA,
		const CuMatrixBase< Real > &	B,
		Real	beta
	)

(*this) = alpha * op(A) * B + beta * (*this), where A is sparse.

Multiplication of sparse with dense matrix. See also AddMatSmat. Note: we recommend, for greatest efficiency, that transA be kNoTrans. Use AddMatSmat() for better efficiency, as 2 dense mat transpose ops are called in this API.

Definition at line 1024 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and kaldi::UnitTextCuMatrixAddSmatMat().

                                                                             {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (transA == kNoTrans) {
       KALDI_ASSERT(NumRows() == A.NumRows());
       KALDI_ASSERT(NumCols() == B.NumCols());
       KALDI_ASSERT(A.NumCols() == B.NumRows());
     } else {
       KALDI_ASSERT(NumRows() == A.NumCols());
       KALDI_ASSERT(NumCols() == B.NumCols());
       KALDI_ASSERT(A.NumRows() == B.NumRows());
     }
 
     CuTimer tim;
 
     // We have op(A) and BT in col-major (B in row-major).
     // We first compute C in col-major (CT in row-major)
     // with C = op(A) * BT^T by cusparse_csrmm2,
     // then transpose CT to get C in row-major
     CuMatrix<Real> CT(*this, kTrans);
 
     cusparseMatDescr_t descr;
     CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descr));
     if (transA == kTrans) {
       // Note: only op(A)=A is supported if op(B)=B^T according to cusparse doc
       // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmm2
       CuSparseMatrix<Real> AT(A, kTrans);
       CU_SAFE_CALL(
           cusparse_csrmm2(GetCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE,
                           CUSPARSE_OPERATION_TRANSPOSE, AT.NumRows(),
                           CT.NumRows(), AT.NumCols(), AT.NumElements(), &alpha,
                           descr, AT.CsrVal(), AT.CsrRowPtr(), AT.CsrColIdx(),
                           B.Data(), B.Stride(), &beta, CT.Data(), CT.Stride()));
     } else {
       CU_SAFE_CALL(
           cusparse_csrmm2(GetCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE,
                           CUSPARSE_OPERATION_TRANSPOSE, A.NumRows(),
                           CT.NumRows(), A.NumCols(), A.NumElements(), &alpha,
                           descr, A.CsrVal(), A.CsrRowPtr(), A.CsrColIdx(),
                           B.Data(), B.Stride(), &beta, CT.Data(), CT.Stride()));
     }
     CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descr));
 
     this->CopyFromMat(CT, kTrans);
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddSmatMat(alpha, A.Smat(), transA, B.Mat(), beta);
   }
 }

◆ AddSpMat()

void AddSpMat	(	const Real	alpha,
		const CuSpMatrix< Real > &	A,
		const CuMatrixBase< Real > &	B,
		MatrixTransposeType	transB,
		const Real	beta
	)

inline

this <– beta*this + alpha*SpA*B

Definition at line 623 of file cu-matrix.h.

                                  {
     CuMatrix<Real> M(A);
     return AddMatMat(alpha, M, kNoTrans, B, transB, beta);
   }

◆ AddToDiag()

void AddToDiag ( Real value )

Adds "value" to the diagonal elements of the matrix.

The matrix *this does not have to be square.

Definition at line 604 of file cu-matrix.cc.

Referenced by kaldi::nnet3::ConstrainOrthonormalInternal(), kaldi::nnet2::PreconditionDirections(), kaldi::TestCuMatrixCholesky(), and kaldi::UnitTestCuMatrixAddToDiag().

                                              {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (num_rows_ == 0) return;
     CuTimer tim;
     // We'll create a fake matrix with "num_diag" rows, one
     // columnn, and a stride of "this_stride".  The y-value of
     // the grid/blocks corresponds to the row, in this kernel.
     MatrixIndexT num_diag = std::min(num_rows_, num_cols_),
         this_stride = stride_ + 1;
     dim3 dimBlock(1, CU1DBLOCK);
     dim3 dimGrid(1, n_blocks(num_diag, CU1DBLOCK));
     ::MatrixDim d = { num_diag, 1, this_stride };
     cuda_add(dimGrid, dimBlock, data_, value, d);
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().AddToDiag(value);
   }
 }

◆ AddToElements()

void AddToElements	(	Real	alpha,
		const CuArrayBase< int32 > &	elements
	)

This is a rather special purpose function; we might generalize it later by adding a transpose-type option.

It expects 'elements.Dim()' to equal NumRows(), and for each elements[i] to be either -1, or 0 <= elements[i] < NumCols(). It adds alpha to each element (*this)(i, elements[i]) for 0 <= i < NumRows(); rows for which elements[i] is -1 are left unchanged.

Definition at line 3344 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and kaldi::UnitTestCuMatrixAddToElements().

                                                                                      {
   KALDI_ASSERT(elements.Dim() == NumRows());
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     dim3 dimBlock(CU1DBLOCK);
     dim3 dimGrid(n_blocks(NumRows(), CU1DBLOCK));
 
     cuda_matrix_add_to_elements(dimGrid, dimBlock, alpha, data_, Dim(), elements.Data());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     MatrixBase<Real> &this_mat = this->Mat();
     const int32* row_to_col = elements.Data();
     for (int32 r = 0; r < this_mat.NumRows(); r++) {
       KALDI_ASSERT(row_to_col[r] >= -1);
       if (row_to_col[r] >= 0)
         this_mat(r, row_to_col[r]) += alpha;
     }
   }
 }

◆ AddToRows() [1/2]

void AddToRows	(	Real	alpha,
		const CuArrayBase< MatrixIndexT > &	indexes,
		CuMatrixBase< Real > *	dst
	)		const

For each row i of *this, adds alpha * this->Row(i) to dst->Row(indexes(i)) if indexes(i) >= 0, else does nothing.

Requires that all the indexes[i] that are >= 0 be distinct, otherwise the behavior is undefined.

Definition at line 2869 of file cu-matrix.cc.

Referenced by NnetComputer::ExecuteCommand(), and kaldi::UnitTestCuMatrixAddToRows().

                                                                   {
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
     KALDI_ASSERT(dst->NumCols() == NumCols());
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_to_rows(dimGrid, dimBlock, alpha, dst->Data(), data_, indexes.Data(), Dim(), dst->Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddToRows(alpha, indexes.Data(), &(dst->Mat()));
   }
 }

◆ AddToRows() [2/2]

void AddToRows	(	Real	alpha,
		const CuArrayBase< Real *> &	dst
	)		const

For each row r of this matrix, adds it (times alpha) to the array of floats at the location given by dst[r], where dst[r] is assumed to be obtained from the RowData() function of another CuMatrix, or from CuVector::Data() (i.e. it should point to memory on the GPU if we're using a GPU, or on the CPU otherwise). If dst[r] is NULL, does not do anything for that row. Requires that none of the memory regions pointed to by the pointers in "dst" overlap (e.g. none of the pointers should be the same).

Definition at line 2847 of file cu-matrix.cc.

                                                                         {
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(static_cast<MatrixIndexT>(dst.Dim()) == NumRows());
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_to_rows(dimGrid, dimBlock, alpha, dst.Data(), data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddToRows(alpha, dst.Data());
   }
 }

◆ AddTpMat()

void AddTpMat	(	const Real	alpha,
		const CuTpMatrix< Real > &	A,
		MatrixTransposeType	transA,
		const CuMatrixBase< Real > &	B,
		MatrixTransposeType	transB,
		const Real	beta
	)

inline

this <– beta*this + alpha*A*B.

Definition at line 632 of file cu-matrix.h.

Referenced by kaldi::UnitTestCuMatrixAddTpMat().

                                  {
     CuMatrix<Real> M(A);
     return AddMatMat(alpha, M, transA, B, transB, beta);
   }

◆ AddVecToCols()

void AddVecToCols	(	Real	alpha,
		const CuVectorBase< Real > &	col,
		Real	beta = `1.0`
	)

(for each column c of *this), c = alpha * col + beta * c

Definition at line 1232 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), KlHmm::PropagateFnc(), and kaldi::UnitTestCuMatrixAddVecToCols().

                                                  {
   if (col.Dim() != NumRows()) {
     KALDI_ERR << "Non matching dimensions: Rows:" << NumRows() << " VectorDim:" << col.Dim();
   }
 
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta,
                          data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     if (beta != 1.0) Mat().Scale(beta);
     Mat().AddVecToCols(alpha, col.Vec());
   }
 }

◆ AddVecToRows()

void AddVecToRows	(	Real	alpha,
		const CuVectorBase< Real > &	row,
		Real	beta = `1.0`
	)

(for each row r of *this), r = alpha * row + beta * r

Definition at line 1261 of file cu-matrix.cc.

Referenced by DecodableNnetLoopedOnlineBase::AdvanceChunk(), DecodableNnetSimpleLooped::AdvanceChunk(), CuMatrixBase< float >::ApplyLog(), BatchNormComponent::Backprop(), SimpleSentenceAveragingComponent::BackpropagateFnc(), ScaleAndOffsetComponent::BackpropInternal(), NnetBatchComputer::Compute(), DecodableNnet2Online::ComputeForFrame(), DecodableNnetSimple::DoNnetComputation(), SingleUtteranceNnet2DecoderThreaded::ProcessLoglikes(), ConvolutionComponent::Propagate(), BatchNormComponent::Propagate(), FixedAffineComponent::Propagate(), FixedBiasComponent::Propagate(), PerElementOffsetComponent::Propagate(), Convolutional1dComponent::Propagate(), SimpleSentenceAveragingComponent::PropagateFnc(), AffineTransform::PropagateFnc(), RecurrentComponent::PropagateFnc(), Rbm::PropagateFnc(), ConvolutionalComponent::PropagateFnc(), AddShift::PropagateFnc(), ScaleAndOffsetComponent::PropagateInternal(), Rbm::Reconstruct(), SigmoidComponent::RepairGradients(), RectifiedLinearComponent::RepairGradients(), PdfPrior::SubtractOnLogpost(), kaldi::UnitTestCuMatrixAddVecToRows(), and SentenceAveragingComponent::Update().

                                                  {
   if (row.Dim() != NumCols()) {
     KALDI_ERR << "Non matching dimensions: Cols:" << NumCols() << " VectorDim:" << row.Dim();
   }
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_add_vec_to_rows(dimGrid, dimBlock, alpha, row.data_, beta, data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     if (beta != 1.0) Mat().Scale(beta);
     Mat().AddVecToRows(alpha, row.Vec());
   }
 }

◆ AddVecVec()

void AddVecVec	(	Real	alpha,
		const CuVectorBase< Real > &	x,
		const CuVectorBase< Real > &	y
	)

A = alpha * x * y^T + A .

Definition at line 1329 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and kaldi::UnitTestCuMatrixAddVecVec().

                                                                           {
 
     MatrixIndexT m = y.Dim();
     MatrixIndexT n = x.Dim();
     KALDI_ASSERT(m == NumCols());
     KALDI_ASSERT(n == NumRows());
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     CUBLAS_SAFE_CALL(cublas_ger(GetCublasHandle(), m, n, alpha,
                      y.Data(), 1, x.Data(), 1, data_, Stride()));
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().AddVecVec(alpha, x.Vec(), y.Vec());
   }
 }

◆ ApplyCeiling()

void ApplyCeiling ( Real ceiling_val )

inline

Definition at line 455 of file cu-matrix.h.

Referenced by ClipGradientComponent::Backprop(), RecurrentComponent::BackpropagateFnc(), and kaldi::UnitTestCuMatrixApplyCeiling().

                                              {
     this -> Ceiling(*this, ceiling_val);
   };

◆ ApplyExp()

void ApplyExp ( )

inline

Definition at line 459 of file cu-matrix.h.

Referenced by DiscriminativeComputation::Compute(), CuMatrixBase< float >::DiffLogSoftmaxPerRow(), and kaldi::UnitTestCuMatrixApplyExp().

                          {
     this -> Exp(*this);
   };

◆ ApplyExpLimited()

void ApplyExpLimited	(	Real	lower_limit,
		Real	upper_limit
	)

inline

Definition at line 464 of file cu-matrix.h.

Referenced by kaldi::UnitTestCuMatrixApplyExpLimited().

                                                                   {
     this -> ExpLimited(*this, lower_limit, upper_limit);
   };

◆ ApplyExpSpecial()

void ApplyExpSpecial ( )

inline

Definition at line 468 of file cu-matrix.h.

Referenced by kaldi::UnitTestCuMatrixApplyExpSpecial().

                                 {
     this -> ExpSpecial(*this);
   };

◆ ApplyFloor()

void ApplyFloor ( Real floor_val )

inline

Definition at line 451 of file cu-matrix.h.

Referenced by ClipGradientComponent::Backprop(), RecurrentComponent::BackpropagateFnc(), DecodableNnet2Online::ComputeForFrame(), main(), SingleUtteranceNnet2DecoderThreaded::ProcessLoglikes(), StatisticsPoolingComponent::Propagate(), RectifiedLinearComponent::Propagate(), SoftmaxComponent::Propagate(), LogSoftmaxComponent::Propagate(), ClipGradientComponent::RepairGradients(), RestrictedAttentionComponent::StoreStats(), kaldi::TestCuMatrixCompObjfAndDeriv(), kaldi::UnitTestCuMatrixApplyFloor(), kaldi::UnitTestCuMatrixObjfDeriv(), and kaldi::UnitTestCuMatrixSetMatMatDivMat().

                                          {
     this -> Floor(*this, floor_val);
   };

◆ ApplyHeaviside()

void ApplyHeaviside ( )

inline

Definition at line 447 of file cu-matrix.h.

Referenced by BackpropTruncationComponent::Backprop(), RectifiedLinearComponent::Backprop(), LstmNonlinearityComponent::ConsolidateMemory(), GeneralDropoutComponent::GetMemo(), DropoutMaskComponent::Propagate(), DropoutComponent::Propagate(), SigmoidComponent::RepairGradients(), TanhComponent::RepairGradients(), ClipGradientComponent::RepairGradients(), kaldi::TestCuMatrixHeaviside(), and kaldi::UnitTestCuMatrixApplyHeaviside().

                                {
     this -> Heaviside(*this);
   };

◆ ApplyLog()

void ApplyLog ( )

inline

Definition at line 480 of file cu-matrix.h.

Referenced by DecodableNnet2Online::ComputeForFrame(), main(), SingleUtteranceNnet2DecoderThreaded::ProcessLoglikes(), RestrictedAttentionComponent::StoreStats(), kaldi::TestCuMatrixCompObjfAndDeriv(), kaldi::UnitTestCuMatrixApplyLog(), and kaldi::UnitTestCuMatrixObjfDeriv().

                          {
     this -> Log(*this);
   };

◆ ApplyLogSoftMaxPerRow()

void ApplyLogSoftMaxPerRow ( )

inline

Definition at line 476 of file cu-matrix.h.

                                       {
     this -> LogSoftMaxPerRow(*this);
   };

◆ ApplyPow()

void ApplyPow ( Real power )

inline

Definition at line 438 of file cu-matrix.h.

Referenced by TanhComponent::Backprop(), LstmNonlinearityComponent::ConsolidateMemory(), kaldi::MeanVariance(), StatisticsExtractionComponent::Propagate(), StatisticsPoolingComponent::Propagate(), TanhComponent::StoreStats(), kaldi::UnitTestCuMatrixApplyPow(), kaldi::UnitTestCuMatrixSetRandn(), and kaldi::UnitTestCuMatrixSetRandUniform().

                                    {
     this -> Pow(*this, power);
   };

◆ ApplyPowAbs()

void ApplyPowAbs	(	Real	power,
		bool	include_sign = `false`
	)

inline

Definition at line 443 of file cu-matrix.h.

Referenced by PowerComponent::Backprop(), PowerComponent::Propagate(), ClipGradientComponent::RepairGradients(), and kaldi::UnitTestCuMatrixApplyPowAbs().

                                                                {
     this -> PowAbs(*this, power, include_sign);
   };

◆ ApplySoftMaxPerRow()

void ApplySoftMaxPerRow ( )

inline

Definition at line 472 of file cu-matrix.h.

                                    {
     this -> SoftMaxPerRow(*this);
   };

◆ ApproxEqual()

bool ApproxEqual	(	const CuMatrixBase< Real > &	other,
		float	tol = `0.01`
	)		const

True if ((*this)-other).FrobeniusNorm() <= tol * this->FrobeniusNorm()

Definition at line 2137 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::FrobeniusNorm(), kaldi::UnitTestCuCholesky(), and kaldi::UnitTestCuCopy().

                                                       {
   CuMatrix<Real> diff(*this);
   diff.AddMat(-1.0, other);
   return (diff.FrobeniusNorm() <= tol * (*this).FrobeniusNorm());
 }

◆ Ceiling()

void Ceiling	(	const CuMatrixBase< Real > &	src,
		Real	ceiling_val
	)

Definition at line 2601 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyCeiling(), and CuMatrixBase< float >::SizeInBytes().

                                                                                 {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_ceiling(dimGrid, dimBlock, this->data_, src.data_, ceiling_val, this->Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().Ceiling(src.Mat(), ceiling_val);
   }
 }

◆ Cholesky()

void Cholesky ( CuMatrixBase< Real > * inv_cholesky = NULL )

This function sets *this to the Cholesky factor of *this (i.e.

the C satisfying *this = C C^T), and sets "inv_cholesky" (if supplied) to its inverse. *this is treated as a symmetric matrix but only the lower triangle is accessed.

Definition at line 1987 of file cu-matrix.cc.

Referenced by CuTpMatrix< Real >::Cholesky(), CuMatrixBase< float >::Cholesky(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixCholesky(), kaldi::UnitTestCholesky(), and kaldi::UnitTestCuCholesky().

                                                                     {
   // Overview of the control flow below:
   //  * *this must be square; an empty matrix returns immediately.
   //  * If no inverse was requested but the matrix has at least 2 * block_size
   //    rows and a GPU is in use, a temporary inverse is allocated and the
   //    function calls itself with it.
   //  * If the matrix has at most block_size rows, or no inverse is wanted, or
   //    there is no GPU, the factorization (and optional inverse) is computed
   //    through the CPU TpMatrix type and copied back.
   //  * Otherwise the matrix is split into a 2x2 block layout and the function
   //    recurses on the diagonal blocks, as described in the long comment.
   KALDI_ASSERT(this->NumRows() == this->NumCols());
   const int32 block_size = 64;  // We can tune this.
 #if HAVE_CUDA == 1
   bool have_gpu = CuDevice::Instantiate().Enabled();
 #else
   bool have_gpu = false;
 #endif
   if (this->NumRows() == 0) {
     return;
   }
   if (inv_cholesky == NULL && this->NumRows() >= block_size * 2 && have_gpu) {
     // Even if the user did not request the inverse Cholesky, for large enough
     // matrices (on GPUs) it's going to be more efficient to compute it anyway
     // as the recursion depends on it.
     CuMatrix<Real> inv(this->NumRows(), this->NumCols());
     Cholesky(&inv);
     return;
   }
   if (this->NumRows() <= block_size || inv_cholesky == NULL || !have_gpu) {
     // Don't recurse: compute the Cholesky (and inverse Cholesky, if requested)
     // directly, on the CPU.
     int32 dim = this->NumRows();
     CuSpMatrix<Real> this_sp(dim, kUndefined);
     this_sp.CopyFromMat(*this, kTakeLower);
     SpMatrix<Real> this_sp_cpu(this_sp);
     TpMatrix<Real> C_cpu(dim);
     C_cpu.Cholesky(this_sp_cpu);
     CuTpMatrix<Real> C(C_cpu);
     this->CopyFromTp(C);
     if (inv_cholesky != NULL) {
       C_cpu.Invert();  // Get inverse Cholesky on CPU.
       C.CopyFromTp(C_cpu);
       inv_cholesky->CopyFromTp(C); // Copy inverse Cholesky from CPU.
     }
     return;
   }
   // At this point, if none of the other cases apply, we recurse.
 
   // The selection of dim1 is a heuristic.  We could also just take half.
   int32 tot_dim = this->NumRows();
   int32 dim1;
   // Break it up into a whole number of blocks, for better memory alignment.
   // The line below, setting dim1 can be decided on a heuristic basis: from
   // the point of view of correctness, it can really be any value
   // 0 < dim1 < tot_dim.
   dim1 = block_size * std::max<int32>(1, tot_dim / (2 * block_size));
 
   int32 dim2 = tot_dim - dim1;
   // Sub-matrix views of the four blocks of *this and of *inv_cholesky; the
   // arguments are (matrix, row offset, num rows, col offset, num cols).
   CuSubMatrix<Real> this_11(*this, 0, dim1, 0, dim1),
       this_12(*this, 0, dim1, dim1, dim2),
       this_21(*this, dim1, dim2, 0, dim1),
       this_22(*this, dim1, dim2, dim1, dim2);
   CuSubMatrix<Real> inv_11(*inv_cholesky, 0, dim1, 0, dim1),
       inv_12(*inv_cholesky, 0, dim1, dim1, dim2),
       inv_21(*inv_cholesky, dim1, dim2, 0, dim1),
       inv_22(*inv_cholesky, dim1, dim2, dim1, dim2);
   /*
     Here is the math on block-wise Cholesky.  We'll use a Matlab-like notation for blocks of a matrix,
     e.g. [ A B; C D ], and also for transposes, e.g. A' is the transpose of A.
     Let A be the input matrix; we want to compute both its Cholesky L and its inverse Cholesky, which
     we'll call M.
     OK. let  L = [ L11 0; L21 L22 ] be the Cholesky factor of A.
     We have A = L L' = [ L11 0; L21 L22 ] * [ L11' L21'; 0 L22' ].  Multiplying it out,
     if A = [ A11 A12; A21 A22 ]; then
     A11 = L11 L11',  A21 = L21 L11', A22 = L21 L21' + L22 L22', and A12 = A21'.
 
     We also want an expression for the inverse of L (we call this M).
     If M = [ M11 0; M21 M22 ], then it's not hard to see that
     M11 = inv(L11), M22 = inv(L22).
     We can work out M21 as follows.  We know that [ L11 0; L21 L22 ] [ M11 0; M21 M22 ] = [ I 0; 0 I ].
     Considering the zero on the bottom of the rhs, we have: L21 M11 + L22 M21 = 0, which gives us:
     M21 = - L22^{-1} L21 M11 = - M22 L21 M11.
 
     Next, we want expressions for L21 and L22.  From the equation A21 = L21 L11', we have:
     L21 = A21 inv(L11') = A21 M11'
     We can compute L22 and M22 recursively by doing Cholesky (and computing the inverse Cholesky)
     on the quantity T = (A22 - L21 L21').   [we give it the name T just for easy reference.]
 
     Computationally, we do this as follows:
     (1) Recurse to get L11 and M11.
     (2) Compute L21 = A21 M11'
     (3) Compute T = A22 - L21 L21'
     (4) Recurse on T to get L22 and M22.
     (5) Compute M21 = -M22 L21 M11.
     Next, we have to consider the in-place nature of the computation, since L overwrites A
     [M has its own storage, in "inv_cholesky"].
     We address this here:
     (1) is in-place [L11 replaces A11, M11 has its own storage].
     (2) L21 gets written where M21 belongs.
     (3) T replaces A22.
     (4) is in-place [L22 replaces T where A22 was, M22 has its own storage]
     (5):(a)  we first compute the transpose of (L21 M11) is done in the upper part of A/L,
     where A12 or L12 would be.  Define a temporary expression
     U = (L21 M11)' = M11' L21'; this goes where A12 or L12 would be.
     (b) copy L21 to where it should be, in *this.
     (c) Compute M21 = -M22 U', in the correct place for M21.
     (d) zero L12 and M12.  */
 
   // (1) compute L11 and M11.
   this_11.Cholesky(&inv_11);
   // (2) compute L21 = A21 M11'.  For now it's in the "wrong place", where M21 should be.
   inv_21.AddMatMat(1.0, this_21, kNoTrans, inv_11, kTrans, 0.0);
   // (3) compute T = A22 - L21 L21'.  Note: only the lower triangle of T will be valid, but
   //      that's OK because Cholesky will ignore the upper part.
   this_22.SymAddMat2(-1.0, inv_21, kNoTrans, 1.0);
   // (4) Recurse to compute L22 and M22.
   this_22.Cholesky(&inv_22);
   // (5)(a) compute U = M11' L21'.  We use the storage of this_12 for this.  Note that L21 is
   //        currently where M21 should be.
   this_12.AddMatMat(1.0, inv_11, kTrans, inv_21, kTrans, 0.0);
   // (5)(b) copy L21 to where it should be.
   this_21.CopyFromMat(inv_21);
   // (5)(c) compute M21 = -M22 U'.
   inv_21.AddMatMat(-1.0, inv_22, kNoTrans, this_12, kTrans, 0.0);
   // (5)(d) zero L12 and M12.
   this_12.SetZero();
   inv_12.SetZero();
 
 }

◆ ColRange()

CuSubMatrix<Real> ColRange	(	const MatrixIndexT	col_offset,
		const MatrixIndexT	num_cols
	)		const

inline

Definition at line 665 of file cu-matrix.h.

                                                                        {
     return CuSubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
   }

◆ CopyColFromVec()

void CopyColFromVec	(	const CuVectorBase< Real > &	v,
		const MatrixIndexT	col
	)

Copy vector into specific column of matrix.

Definition at line 2414 of file cu-matrix.cc.

Referenced by kaldi::cu::NormalizePerRow(), StatisticsExtractionComponent::Propagate(), DropoutMaskComponent::Propagate(), CuMatrixBase< float >::SizeInBytes(), NaturalGradientRepeatedAffineComponent::Update(), and TimeHeightConvolutionComponent::UpdateNaturalGradient().

                                                                 {
   KALDI_ASSERT(v.Dim() == num_rows_ &&
                static_cast<UnsignedMatrixIndexT>(col) <
                static_cast<UnsignedMatrixIndexT>(num_cols_));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     cublas_copy(GetCublasHandle(),
                 v.Dim(), v.Data(), 1,
                 this->data_ + col, this->stride_);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyColFromVec(v.Vec(), col);
   }
 }

◆ CopyCols()

void CopyCols	(	const CuMatrixBase< Real > &	src,
		const CuArrayBase< MatrixIndexT > &	indexes
	)

Copies column r from column indexes[r] of src.

As a special case, if indexes[i] == -1, sets column i to zero. indexes.size() must equal this->NumCols(), and src.NumRows() must equal this->NumRows().

Definition at line 2656 of file cu-matrix.cc.

Referenced by SumGroupComponent::Backprop(), PermuteComponent::Backprop(), kaldi::nnet3::time_height_convolution::ConvolveBackwardParamsInternal(), kaldi::nnet3::time_height_convolution::ConvolveForwardInternal(), ConvolutionComponent::InputToInputPatches(), MaxpoolingComponent::InputToInputPatches(), PermuteComponent::Propagate(), Convolutional1dComponent::Propagate(), and Convolutional1dComponent::Update().

                                                                             {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(indices.Dim() == NumCols());
     KALDI_ASSERT(NumRows() == src.NumRows());
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_copy_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyCols(src.Mat(), indices.Data());
   }
 }

◆ CopyColsFromVec()

void CopyColsFromVec ( const CuVectorBase< Real > & v )

Copies vector into matrix, column-by-column.

Note that v.Dim() must equal either NumRows()*NumCols() or NumRows(); the function has two modes of operation, selected by which of these holds.

Definition at line 2376 of file cu-matrix.cc.

Referenced by DropoutComponent::Propagate(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuMatrixCopyColsFromVec().

                                                                      {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     if (rv.Dim() == num_rows_ * num_cols_) {
       // treat rv as a matrix of the size (num_cols x num_rows_)
       // and use transposed copy to fill *this
       // see CuMatrixBase<Real>::CopyFromMat() for more detail of the impl
       MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ };
       const int32 warpSize = 32;
       dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
       dim3 dimGrid(n_blocks(rv_dim.cols, warpSize),
                    n_blocks(rv_dim.rows, warpSize));
       cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, rv.Data(), Dim(),
                                rv_dim);
       CU_SAFE_CALL(cudaGetLastError());
     } else if (rv.Dim() == num_rows_) {
       // use 2D block (8x32) and large enough grid to cover matrix *this
       // dimBlock.x need to be at least warpSize for coalesced memory access.
       const int32 warpSize = 32;
       dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
       dim3 dimGrid(n_blocks(num_cols_, dimBlock.x),
                    n_blocks(num_rows_, dimBlock.y));
       cuda_copy_cols_from_vec(dimGrid, dimBlock, Data(), Dim(), rv.Data());
       CU_SAFE_CALL(cudaGetLastError());
     } else {
       KALDI_ERR<< "Wrong sized arguments";
     }
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyColsFromVec(rv.Vec());
   }
 }

◆ CopyFromBlock()

void CopyFromBlock	(	const CuBlockMatrix< Real > &	B,
		MatrixTransposeType	trans = `kNoTrans`
	)

Definition at line 161 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::AddMatTp().

                                                                   {
   this->SetZero();
   if (trans == kNoTrans) {
     KALDI_ASSERT(NumRows() == B.NumRows() && NumCols() == B.NumCols());
     int32 row_offset = 0, col_offset = 0;
     for (int32 b = 0; b < B.NumBlocks(); b++) {
       const CuMatrixBase<Real> &block = B.Block(b);
       int32 num_rows = block.NumRows(), num_cols = block.NumCols();
       CuSubMatrix<Real> this_block(*this, row_offset, num_rows,
                                    col_offset, num_cols);
       this_block.CopyFromMat(block);
       row_offset += num_rows;
       col_offset += num_cols;
     }
     KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
   } else {
     KALDI_ASSERT(NumRows() == B.NumCols() && NumCols() == B.NumRows());
     int32 row_offset = 0, col_offset = 0;
     for (int32 b = 0; b < B.NumBlocks(); b++) {
       const CuMatrixBase<Real> &block = B.Block(b);
       int32 num_rows = block.NumCols(), num_cols = block.NumRows();
       CuSubMatrix<Real> this_block(*this, row_offset, num_rows,
                                    col_offset, num_cols);
       this_block.CopyFromMat(block, kTrans);
       row_offset += num_rows;
       col_offset += num_cols;
     }
     KALDI_ASSERT(row_offset == NumRows() && col_offset == NumCols());
   }
 }

◆ CopyFromGeneralMat()

void CopyFromGeneralMat	(	const GeneralMatrix &	src,
		MatrixTransposeType	trans = `kNoTrans`
	)

Definition at line 3096 of file cu-matrix.cc.

Referenced by NnetComputer::AcceptInputs(), kaldi::nnet3::ComputeObjectiveFunction(), and CuMatrixBase< float >::SizeInBytes().

                                                                        {
   switch (src.Type()) {
     case kFullMatrix: {
       const Matrix<BaseFloat> &src_full_mat = src.GetFullMatrix();
       this->CopyFromMat(src_full_mat, trans);
       return;
     }
     case kCompressedMatrix: {
       Matrix<BaseFloat> mat;
       src.GetMatrix(&mat);
       this->CopyFromMat(mat, trans);
       return;
     }
     case kSparseMatrix: {
       const SparseMatrix<BaseFloat> &smat = src.GetSparseMatrix();
 #if HAVE_CUDA == 1
       if (CuDevice::Instantiate().Enabled()) {
         // only take this branch if we're actually using CUDA, or it would
         // entail a wasteful copy of the sparse matrix.
         CuSparseMatrix<BaseFloat> cu_smat(smat);
         cu_smat.CopyToMat(this, trans);
         return;
       }
 #endif
       smat.CopyToMat(&(Mat()), trans);
       return;
     }
     default:
       KALDI_ERR << "Invalid GeneralMatrix type.";
   }
 }

◆ CopyFromMat() [1/3]

void CopyFromMat	(	const MatrixBase< OtherReal > &	src,
		MatrixTransposeType	trans = `kNoTrans`
	)

Definition at line 344 of file cu-matrix.cc.

                                                                 {
   CuMatrix<OtherReal> temp(src);
   this->CopyFromMat(temp, trans);
 }

◆ CopyFromMat() [2/3]

void CopyFromMat	(	const MatrixBase< Real > &	src,
		MatrixTransposeType	trans = `kNoTrans`
	)

Definition at line 314 of file cu-matrix.cc.

                                                                 {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (trans == kNoTrans) {
       KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_);
       CuTimer tim;
 
       MatrixIndexT dst_pitch = stride_*sizeof(Real);
       MatrixIndexT src_pitch = src.Stride()*sizeof(Real);
       MatrixIndexT width = src.NumCols()*sizeof(Real);
       CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, src.Data(), src_pitch,
                                 width, src.NumRows(), cudaMemcpyHostToDevice,
                                 cudaStreamPerThread));
       CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 
       CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)", tim);
     } else {
       CuMatrix<Real> trans_mat(src); // Do the transpose on the GPU board.
       this->CopyFromMat(trans_mat, kTrans);
     }
   } else
 #endif
   {
     Mat().CopyFromMat(src, trans);
   }
 }

◆ CopyFromMat() [3/3]

void CopyFromMat	(	const CuMatrixBase< OtherReal > &	M,
		MatrixTransposeType	trans = `kNoTrans`
	)

Definition at line 208 of file cu-matrix.cc.

                                                                 {
   // Copies (optionally transposed, optionally converting element type) from
   // another CuMatrixBase.  Three GPU strategies are used below: a 2-D memcpy
   // when neither conversion nor transpose is needed, a plain copy kernel when
   // only the element size differs, and a transposing kernel otherwise.
   if (sizeof(Real) == sizeof(OtherReal) &&
       static_cast<const void*>(M.Data()) ==
       static_cast<const void*>(this->Data())) {
     // Source and destination share the same data pointer.
     if (M.Data() == NULL)
       return;
     // CopyFromMat called on same data.  Nothing to do (except sanity checks)
     KALDI_ASSERT(trans == kNoTrans && M.NumRows() == NumRows() &&
                  M.NumCols() == NumCols() && M.Stride() == Stride());
     return;
   }
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // Shape check; for a transposed copy rows and columns swap roles.
     if (trans == kNoTrans) {
       KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_);
     } else {
       KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_);
     }
     if (M.num_rows_ == 0) return; // Nothing to do.
     CuTimer tim;
     if (sizeof(Real) == sizeof(OtherReal) && trans == kNoTrans ) {
       // Same element size, no transpose: strided device-to-device copy.
       // Pitches and width are in bytes.
       MatrixIndexT dst_pitch = stride_ * sizeof(Real);
       MatrixIndexT src_pitch = M.Stride() * sizeof(Real);
       MatrixIndexT width = M.NumCols() * sizeof(Real);
       CU_SAFE_CALL(
         cudaMemcpy2DAsync(data_, dst_pitch, M.data_, src_pitch,
                           width, M.num_rows_, cudaMemcpyDeviceToDevice,
                           cudaStreamPerThread));
     } else {
       if (trans == kNoTrans) {
         // Element sizes differ, no transpose: copy kernel.
         dim3 dimGrid, dimBlock;
         GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                               &dimGrid, &dimBlock);
         cuda_copy_from_mat(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim());
       } else {
         // 2D thread block with warps (blockDim.x) along the row-dim of input M.
         // Each (8x32) thread block will transpose (32x32) data
         const int32 warpSize = 32;
         dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
         dim3 dimGrid(n_blocks(M.NumCols(), warpSize),
             n_blocks(M.NumRows(), warpSize));
         cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(),
             M.Dim());
       }
       // Check the kernel launch from either branch above.
       CU_SAFE_CALL(cudaGetLastError());
     }
     CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from other CuMatrixBase)", tim);
   } else
 #endif
   {
     // CUDA compiled out or not enabled: delegate to Mat().
     Mat().CopyFromMat(M.Mat(), trans);
   }
 }

◆ CopyFromSp()

void CopyFromSp ( const CuSpMatrix< Real > & M )

Definition at line 360 of file cu-matrix.cc.

Referenced by CuMatrix< float >::CuMatrix(), CuSpMatrix< Real >::Invert(), CuMatrixBase< float >::SizeInBytes(), and kaldi::TestCuMatrixCopyFromSp().

                                                              {
   KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
   if (num_rows_ == 0)
     return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK),
                  n_blocks(NumRows(), CU2DBLOCK));
     cuda_copy_from_sp(dimGrid, dimBlock, M.Data(), data_, Dim());
     CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromSp", tim);
   } else
 #endif
   {
     Mat().CopyFromSp(M.Mat());
   }
 }

◆ CopyFromTp()

template void CopyFromTp	(	const CuTpMatrix< OtherReal > &	M,
		MatrixTransposeType	trans = `kNoTrans`
	)

Definition at line 280 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::Cholesky(), CuMatrix< float >::CuMatrix(), CuTpMatrix< Real >::Invert(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixCopyFromTp(), and kaldi::UnitTestCuMatrixCopyFromTp().

                                                                {
   KALDI_ASSERT(num_rows_ == M.NumRows() && num_cols_ == num_rows_);
   if (num_rows_ == 0)
     return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK),
                  n_blocks(num_rows_, CU2DBLOCK));
     if (trans == kNoTrans) {
       cuda_copy_from_tp(dimGrid, dimBlock, data_, M.Data(), Dim());
     } else {
       cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim());
     }
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyFromTp(M.Mat(), trans);
   }
 }

◆ CopyLowerToUpper()

void CopyLowerToUpper ( )

Definition at line 2969 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::AddMatTp(), kaldi::nnet3::ConstrainOrthonormalInternal(), kaldi::nnet2::PreconditionDirections(), kaldi::TestCuMatrixCopyLowerToUpper(), kaldi::UnitTestCuCholesky(), and kaldi::UnitTestCuMatrixCopyLowerToUpper().

                                           {
   KALDI_ASSERT(num_cols_ == num_rows_);
   if (num_rows_ == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     int32 dim = num_rows_;
     dim3 dimGrid(n_blocks(dim, CU2DBLOCK),
                  n_blocks(dim, CU2DBLOCK));
     cuda_copy_low_upp(dimGrid, dimBlock, data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyLowerToUpper();
   }
 }

◆ CopyRangeFromMatClamped()

void CopyRangeFromMatClamped	(	const CuMatrixBase< Real > &	src,
		int32_t	start_range,
		int32_t	end_range,
		int32_t	clamp_low,
		int32_t	clamp_high
	)

Definition at line 419 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::SizeInBytes().

                                              {
 
   KALDI_ASSERT(NumCols() == this->NumCols());
   KALDI_ASSERT(NumRows() == end_range-start_range);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     cuda_mat_copy_range_clamped(start_range, end_range, NumCols(),
       src.Data(), src.Stride(), clamp_low, clamp_high,
       Data(), Stride());
   } else
 #endif
   {
     for (int32 t = start_range; t < end_range; t++) {
       int32 t_clamped = t;
       if (t_clamped < clamp_low) t_clamped = clamp_low;
       if (t_clamped >= clamp_high) t_clamped = clamp_high;
       CuSubVector<Real> dest_row=this->Row(t - start_range);
       const CuSubVector<Real> src_row=src.Row(t_clamped);
       dest_row.CopyFromVec(src_row);
     }
   }
 }

◆ CopyRows() [1/2]

void CopyRows	(	const CuMatrixBase< Real > &	src,
		const CuArrayBase< MatrixIndexT > &	indexes
	)

Copies row r from row indexes[r] of src.

As a special case, if indexes[i] < 0, sets row i to zero. src.NumCols() must equal this.NumCols()

Definition at line 2678 of file cu-matrix.cc.

Referenced by StatisticsExtractionComponent::Backprop(), SpliceComponent::Backprop(), NnetComputer::ExecuteCommand(), main(), DistributeComponent::Propagate(), and SpliceMaxComponent::Propagate().

                                                                             {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(static_cast<MatrixIndexT>(indices.Dim()) == NumRows());
     KALDI_ASSERT(NumCols() == src.NumCols());
 
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), indices.Data(),
                    Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyRows(src.Mat(), indices.Data());
   }
 }

◆ CopyRows() [2/2]

void CopyRows ( const CuArrayBase< const Real *> & src )

Copies row r of this matrix from an array of floats at the location given by src[r], where src[r] is assumed to be obtained from the RowData() function of another CuMatrix, or from CuVector::Data() (the point is: the data it points to should be on the GPU if we're using a GPU, and on a CPU otherwise).

src.size() must equal this.NumRows(), and if any src[r] is NULL then this.Row(r) will be set to zero.

Definition at line 2723 of file cu-matrix.cc.

                                                                      {
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(static_cast<MatrixIndexT>(src.Dim()) == NumRows());
     CuTimer tim;
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
                  n_blocks(num_rows_, CU2DBLOCK));
     cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), Dim());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyRows(src.Data());
   }
 }

◆ CopyRowsFromVec() [1/2]

void CopyRowsFromVec ( const CuVectorBase< Real > & v )

This function has two modes of operation.

If v.Dim() == NumRows() * NumCols(), then treats the vector as a row-by-row concatenation of a matrix and copies to *this. If v.Dim() == NumCols(), it sets each row of *this to a copy of v.

Definition at line 2301 of file cu-matrix.cc.

Referenced by kaldi::CuVectorUnitTestCopyFromMat(), NnetOnlineComputer::Flush(), NnetRescaler::FormatInput(), TimeHeightConvolutionComponent::Propagate(), TdnnComponent::Propagate(), RepeatedAffineComponent::Propagate(), ConstantComponent::Propagate(), AffineComponent::Propagate(), FixedAffineComponent::Propagate(), BlockAffineComponent::Propagate(), ConstantFunctionComponent::Propagate(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuMatrixCopyRowsFromVec().

                                                                     {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     if (v.Dim() == num_rows_*num_cols_) {
       if (stride_ == num_cols_) {
         const Real* v_data = v.Data();
         CU_SAFE_CALL(
           cudaMemcpyAsync(data_, v_data, sizeof(Real)*num_rows_*num_cols_,
                           cudaMemcpyDeviceToDevice, cudaStreamPerThread));
       } else {
         CU_SAFE_CALL(
           cudaMemcpy2DAsync(data_, stride_ * sizeof(Real), v.Data(),
                             num_cols_*sizeof(Real), num_cols_*sizeof(Real),
                             num_rows_, cudaMemcpyDeviceToDevice,
                             cudaStreamPerThread));
       }
     } else if (v.Dim() == num_cols_) {
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
       cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data());
       CU_SAFE_CALL(cudaGetLastError());
     } else {
       KALDI_ERR << "Wrong sized arguments";
     }
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyRowsFromVec(v.Vec());
   }
 }

◆ CopyRowsFromVec() [2/2]

void CopyRowsFromVec ( const VectorBase< Real > & v )

Version of CopyRowsFromVec() that takes a CPU-based vector.

Definition at line 2336 of file cu-matrix.cc.

                                                                   {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     if (v.Dim() == num_rows_*num_cols_) {
       if (stride_ == num_cols_) {
         const Real* v_data = v.Data();
         CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data,
                                      sizeof(Real)*num_rows_*num_cols_,
                                      cudaMemcpyHostToDevice,
                                      cudaStreamPerThread));
       } else {
         const Real *v_data = v.Data();
         for (MatrixIndexT r = 0; r < num_rows_; r++) {
           Real *row_data = RowData(r);
           CU_SAFE_CALL(cudaMemcpyAsync(row_data, v_data, sizeof(Real)*num_cols_,
                                        cudaMemcpyHostToDevice,
                                        cudaStreamPerThread));
           v_data += num_cols_;
         }
       }
       CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
     } else if (v.Dim() == num_cols_) {
       dim3 dimGrid, dimBlock;
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
       cuda_copy_rows_from_vec(dimGrid, dimBlock, this->data_, this->Dim(), v.Data());
       CU_SAFE_CALL(cudaGetLastError());
     } else {
       KALDI_ERR << "Wrong sized arguments";
     }
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyRowsFromVec(v);
   }
 }

◆ CopyToMat()

template void CopyToMat	(	MatrixBase< OtherReal > *	dst,
		MatrixTransposeType	trans = `kNoTrans`
	)		const

Definition at line 447 of file cu-matrix.cc.

                                                                     {
   // Copies *this into the CPU matrix *dst, optionally transposed and
   // optionally converting between Real and OtherReal.
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (trans == kTrans || sizeof(OtherReal) != sizeof(Real)) {
       // A transpose or a change of element size cannot be expressed as a
       // plain memory copy: build a temporary CuMatrix<OtherReal> from *this
       // with 'trans' applied, then copy that one out without transposing.
       CuMatrix<OtherReal> this_trans(*this, trans);
       this_trans.CopyToMat(dst, kNoTrans);
     } else {
       KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols());
       if (num_rows_ == 0) return;
       CuTimer tim;
 
       // All pitches are in bytes.  Using sizeof(Real) for dst is valid here
       // because this branch is only reached when
       // sizeof(OtherReal) == sizeof(Real).
       MatrixIndexT src_pitch = stride_*sizeof(Real);
       MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real);
       MatrixIndexT width = NumCols()*sizeof(Real);
       CU_SAFE_CALL(cudaMemcpy2DAsync(dst->Data(), dst_pitch, this->data_,
                                      src_pitch, width, this->num_rows_,
                                      cudaMemcpyDeviceToHost, cudaStreamPerThread));
       // Wait for the copy so that *dst is fully written on return.
       CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
       CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToMatD2H", tim);
     }
   } else
   #endif
   {
     // CPU fallback: delegate to the CPU matrix copy.
     dst->CopyFromMat(Mat(), trans);
   }
 }

◆ CopyToRows()

void CopyToRows ( const CuArrayBase< Real *> & dst ) const

For each row r of this matrix, copies it to the array of floats at the location given by dst[r], where dst[r] is assumed to be obtained from the RowData() function of another CuMatrix, or from CuVector::Data() (i.e. it should point to memory on the GPU if we're using a GPU, or on the CPU otherwise). If dst[r] is NULL, does not copy anywhere. Requires that none of the memory regions pointed to by the pointers in "dst" overlap (e.g. none of the pointers should be the same).

Definition at line 2744 of file cu-matrix.cc.

Referenced by DistributeComponent::Backprop(), NnetComputer::ExecuteCommand(), and kaldi::UnitTestCuMatrixCopyToRows().

                                                                        {
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(static_cast<MatrixIndexT>(dst.Dim()) == NumRows());
 
     CuTimer tim;
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
                  n_blocks(num_rows_, CU2DBLOCK));
     cuda_copy_to_rows(dimGrid, dimBlock, dst.Data(), data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyToRows(dst.Data());
   }
 }

◆ CopyUpperToLower()

void CopyUpperToLower ( )

Definition at line 2990 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::AddMatTp(), kaldi::TestCuMatrixCopyUpperToLower(), and kaldi::UnitTestCuMatrixCopyUpperToLower().

                                           {
   KALDI_ASSERT(num_cols_ == num_rows_);
   if (num_rows_ == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     int32 dim = this->num_rows_;
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(dim, CU2DBLOCK),
                  n_blocks(dim, CU2DBLOCK));
     cuda_copy_upp_low(dimGrid, dimBlock, data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().CopyUpperToLower();
   }
 }

◆ Data() [1/2]

const Real* Data ( ) const

inline

Return data pointer (const).

Warning: may return a pointer to GPU memory. Use at your own risk.

Definition at line 746 of file cu-matrix.h.

746 { return data_; }

kaldi::CuMatrixBase::data_

Real * data_

GPU data pointer (or regular matrix data pointer, if CUDA is not in use).

Definition: cu-matrix.h:777

◆ Data() [2/2]

Real* Data ( )

inline

Return data pointer.

Warning: may return a pointer to GPU memory. Use at your own risk.

Definition at line 749 of file cu-matrix.h.

749 { return data_; }

kaldi::CuMatrixBase::data_

Real * data_

GPU data pointer (or regular matrix data pointer, if CUDA is not in use).

Definition: cu-matrix.h:777

◆ DiffGroupPnorm()

void DiffGroupPnorm	(	const CuMatrixBase< Real > &	in_value,
		const CuMatrixBase< Real > &	out_value,
		const CuMatrixBase< Real > &	out_deriv,
		Real	power
	)

Differentiate backward through the GroupPnorm function.

It is a combination of GroupPnormDeriv and MulRowsGroupMat.

Definition at line 841 of file cu-matrix.cc.

Referenced by PnormComponent::Backprop(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuMatrixDiffGroupPnorm().

                                                     {
   KALDI_ASSERT(out_value.NumCols() > 0);
   KALDI_ASSERT(out_value.NumCols() == out_deriv.NumCols());
   int group_size = this->NumCols() / out_value.NumCols();
   KALDI_ASSERT(this->NumCols() == out_value.NumCols() * group_size);
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     const int kWarpSize = 32;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     dim3 dimGrid(n_blocks(NumCols(), dimBlock.x),
                  n_blocks(NumRows(), dimBlock.y));
     if (dimGrid.x * dimGrid.y > 1024) {
       dimGrid.y = std::max(1024 / dimGrid.x, unsigned(1));
     }
     cuda_diff_group_pnorm(dimGrid, dimBlock, this->data_, in_value.Data(),
                           out_value.Data(), out_deriv.Data(), Dim(),
                           in_value.Stride(), out_value.Stride(),
                           out_deriv.Stride(), group_size, power);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().GroupPnormDeriv(in_value.Mat(), out_value.Mat(), power);
     MulRowsGroupMat(out_deriv);
   }
 }

◆ DiffLogSoftmaxPerRow()

void DiffLogSoftmaxPerRow	(	const CuMatrixBase< Real > &	out_value,
		const CuMatrixBase< Real > &	out_deriv
	)

Differentiate backward through the log softmax function.

Here, "out_value" is the log softmax output. Does, for each row i, *this(i) = out_deriv(i) - sum(out_deriv(i)) .* exp(out_value(i)), where xxxx(i) denotes row i of xxxx as a row vector. Supports in-place operation, this == &out_deriv.

Definition at line 1903 of file cu-matrix.cc.

Referenced by LogSoftmaxComponent::Backprop(), CuMatrixBase< float >::DiffLogSoftmaxPerRow(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuDiffLogSoftmax().

                                                                               {
   // Backward pass through log-softmax, one row at a time.  out_value is the
   // log-softmax output and out_deriv the derivative at that output; the
   // result is written to *this.
 
   // All three matrices must have the same dimensions, and *this must not
   // alias out_value (aliasing out_deriv is allowed).
   KALDI_ASSERT(SameDim(out_value, out_deriv) && SameDim(out_value, *this) &&
                this != &out_value);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     // CUDA thread layout: one thread block per matrix-row.
     dim3 dimBlock(CU1DBLOCK);
     dim3 dimGrid(num_rows_);
     cuda_diff_log_softmax(dimGrid, dimBlock, this->Dim(), out_value.Data(),
                           out_value.Stride(), out_deriv.Data(),
                           out_deriv.Stride(), data_);
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     if (this == &out_deriv) {
       // the code below doesn't work for in-place, so make a copy and recurse.
       CuMatrix<Real> temp(NumRows(), NumCols(), kUndefined);
       temp.DiffLogSoftmaxPerRow(out_value, out_deriv);
       CopyFromMat(temp);
       return;
     }
     /*
      Let the output be y, then
      y_i = x_i - log(sum_i exp(x_i))
      where x_i is the input to the component. The Jacobian matrix of this
      function is
      J = I - 1 exp(y^T)
      where 1 is a vector of ones. Let the derivative vector at the output be e,
      and at the input be d, then we have
      d = e - exp(y) Sum(e)
      d_i = e_i - exp(y_i) Sum(e)
      */
     // Short aliases matching the notation of the derivation above.
     const CuMatrixBase<Real> &Y(out_value), &E(out_deriv);
     CuMatrixBase<Real> &D(*this);
 
     // D is overwritten first, which is why the in-place case (D aliasing E)
     // is handled separately above.
     D.CopyFromMat(Y);
     D.ApplyExp();                           // exp(y)
     CuVector<Real> E_sum(D.NumRows()); // Initializes to zero
     E_sum.AddColSumMat(1.0, E);             // Sum(e)
     D.MulRowsVec(E_sum);                    // exp(y) Sum(e)
     D.Scale(-1.0);                          // - exp(y) Sum(e)
     D.AddMat(1.0, E, kNoTrans);             // e - exp(y_i) Sum(e)
   }
 }

◆ DiffParametricRelu()

void DiffParametricRelu	(	const CuMatrixBase< Real > &	value,
		const CuMatrixBase< Real > &	diff,
		const CuVectorBase< Real > &	alpha,
		const CuVectorBase< Real > &	beta
	)

Differentiate backward through the parametric relu function.

Here the "value" is the Relu input. Does, element-by-element, *this = diff * (value > 0 ? alpha : beta).

Definition at line 1501 of file cu-matrix.cc.

Referenced by ParametricRelu::BackpropagateFnc(), and CuMatrixBase< float >::SizeInBytes().

                                      {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK));
 
     cuda_diff_parametric_relu(dimGrid, dimBlock, data_, diff.data_, value.data_,
                               Dim(), diff.Stride(), value.Stride(),
                               alpha.data_, beta.data_);
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // Do it on CPU,
     for (MatrixIndexT r = 0; r < NumRows(); r++) {
       for (MatrixIndexT c = 0; c < NumCols(); c++) {
         Real value_elem = value.Mat()(r,c);
         this->Mat()(r,c) = diff.Mat()(r,c) *
           (value_elem >= 0.0 ? alpha.Vec()(c) : beta.Vec()(c));
       }
     }
   }
 }

◆ DiffSigmoid()

void DiffSigmoid	(	const CuMatrixBase< Real > &	value,
		const CuMatrixBase< Real > &	diff
	)

Differentiate backward through the sigmoid function.

Here, "value" is the sigmoid output. Does, element-by-element, *this = diff * value * (1 - value).

Definition at line 1764 of file cu-matrix.cc.

Referenced by SigmoidComponent::Backprop(), Sigmoid::BackpropagateFnc(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuDiffSigmoid().

                                                                      {
   KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_diff_sigmoid(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().DiffSigmoid(value.Mat(), diff.Mat());
   }
 }

◆ DiffSoftmaxPerRow()

void DiffSoftmaxPerRow	(	const CuMatrixBase< Real > &	value,
		const CuMatrixBase< Real > &	diff
	)

Differentiate backward through the softmax function.

Here, "value" is the softmax output. Does, for each row i, *this(i) = diff(i) * diag(value(i)) - diff(i) * (value(i)^T * value(i)), where xxxx(i) denotes row i of xxxx as a row vector; '*' and '-' are matrix operations. Supports in-place operation, this == &diff.

Definition at line 1868 of file cu-matrix.cc.

Referenced by kaldi::nnet3::attention::AttentionBackward(), SoftmaxComponent::Backprop(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuDiffSoftmax().

                                                                            {
   // Backward pass through softmax, one row at a time.  value is the softmax
   // output and diff the derivative at that output; the result is written
   // to *this.
 
   // All three matrices must have the same dimensions, and *this must not
   // alias value (aliasing diff is allowed).
   KALDI_ASSERT(SameDim(value, diff) && SameDim(value, *this) &&
                this != &value);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     // CUDA thread layout: one thread block per matrix-row.
     dim3 dimBlock(CU1DBLOCK);
     dim3 dimGrid(num_rows_);
     cuda_diff_softmax(dimGrid, dimBlock, data_, this->Dim(), value.Data(),
         value.Stride(), diff.Data(), diff.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // Short aliases: P = softmax output, E = incoming derivative,
     // D = result.
     const CuMatrixBase<Real> &P(value), &E(diff);
     CuMatrixBase<Real> &D(*this);
 
     // pe_vec is computed from E before D is written, so the in-place case
     // (D aliasing E) still reads the original derivative here.
     CuVector<Real> pe_vec(D.NumRows()); // For each row i, the dot product (p_t . e_t).
     pe_vec.AddDiagMatMat(1.0, P, kNoTrans, E, kTrans, 0.0);
 
     D.CopyFromMat(E);
     D.MulElements(P);
     // At this point, D = P .* E (in matlab notation)
     D.AddDiagVecMat(-1.0, pe_vec, P, kNoTrans, 1.0); // does D -= diag(pe_vec) * P.
   }
 }

◆ DiffTanh()

void DiffTanh	(	const CuMatrixBase< Real > &	value,
		const CuMatrixBase< Real > &	diff
	)

Differentiate backward through the tanh function.

Here, "value" is the tanh output. Does, element-by-element, *this = diff * (1 - value^2).

Definition at line 1809 of file cu-matrix.cc.

Referenced by TanhComponent::Backprop(), RecurrentComponent::BackpropagateFnc(), Tanh::BackpropagateFnc(), LstmNonlinearityComponent::ConsolidateMemory(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuDiffTanh().

                                                                   {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_diff_tanh(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().DiffTanh(value.Mat(), diff.Mat());
   }
 }

◆ DiffXent()

void DiffXent	(	const CuArrayBase< int32 > &	tgt,
		CuVector< Real > *	log_post_tgt
	)

Differentiate the block [softmax+cross-entropy] : dE/da = posterior_mat - target_mat, 'E' is error function, 'a' is activation on softmax input.

Interface: tgt: index vector, encodes the matrix of targets; net_out_or_diff (i.e. *this): before invocation the net output, after invocation the diff dE/da; log_post_tgt: per-frame statistics for cross-entropy computations: log(sum_row(posterior_mat .* target_mat)).

Definition at line 1957 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuDiffXent().

                                                                 {
 
   KALDI_ASSERT(tgt.Dim() == num_rows_);
   log_post_tgt->Resize(tgt.Dim());
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimBlock(1, CU2DBLOCK*8);
     dim3 dimGrid(1, n_blocks(tgt.Dim(), CU2DBLOCK*8));
     cuda_diff_xent(dimGrid, dimBlock, tgt.Data(), data_,
                    log_post_tgt->data_, Dim());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     MatrixIndexT num_rows = num_rows_;
     for(int32 r = 0; r < num_rows; r++) {
       int32 col_tgt = tgt.Data()[r];
       Real &value = Mat()(r, col_tgt);
       log_post_tgt->Vec()(r) = kaldi::Log(value);
       value -= 1.0;
     }
   }
 }

◆ Dim()

::MatrixDim Dim ( ) const

inline

Definition at line 221 of file cu-matrix.h.

Referenced by CuVectorBase< float >::AddColSumMat(), CuVectorBase< float >::AddDiagMatMat(), CuMatrixBase< float >::AddMatBlocks(), CuMatrixBase< float >::AddRowRanges(), CuVectorBase< float >::AddRowSumMat(), CuMatrix< float >::CompObjfAndDeriv(), LstmNonlinearityComponent::ConsolidateMemory(), kaldi::cu::Copy(), CuVectorBase< float >::CopyColFromMat(), CuTpMatrix< Real >::CopyFromMat(), CuCompressedMatrix::CopyFromMat(), CuSpMatrix< Real >::CopyFromMat(), CuMatrixBase< float >::CopyFromMat(), CuSparseMatrix< Real >::CopyToMat(), CuCompressedMatrix::CopyToMat(), kaldi::cu::DiffNormalizePerRow(), kaldi::cu::EnsureNonzero(), CuTpMatrix< Real >::Invert(), kaldi::cu::NormalizePerRow(), kaldi::cu::Randomize(), kaldi::cu::RegularizeL1(), CuBlockMatrix< Real >::SetCudaData(), kaldi::cu::Splice(), NonlinearComponent::StoreStatsInternal(), CuMatrixBase< float >::SumColumnRanges(), kaldi::TraceMatMat(), kaldi::TraceMatSmat(), and NonlinearComponent::UpdateStats().

                         {
     ::MatrixDim d = { num_rows_, num_cols_, stride_ };
     return d;
   }

◆ DivElements()

void DivElements ( const CuMatrixBase< Real > & A )

Divide *this elementwise by A: *this = *this ./ A.

Definition at line 691 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), CuVectorBase< float >::DivElements(), kaldi::UnitTestCuMatrixDivElements(), and kaldi::UnitTestCuMatrixSetMatMatDivMat().

                                                                 {
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     KALDI_ASSERT(num_cols_ == A.NumCols());
     KALDI_ASSERT(num_rows_ == A.NumRows());
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_div_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().DivElements(A.Mat());
   }
 }

◆ DivRowsVec()

void DivRowsVec ( const CuVectorBase< Real > & div )

Divide the i'th row by div[i].

Definition at line 899 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), StatisticsPoolingComponent::Backprop(), StatisticsPoolingComponent::Propagate(), kaldi::TestCuMatrixDivRowsVec(), and kaldi::UnitTestCuMatrixDivRowsVec().

                                                                  {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     KALDI_ASSERT(div.Dim() == NumRows());
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     // For large matrix we do more work per thread by limiting the
     // the grid size to reduce the block launching overhead.
     if (dimGrid.x * dimGrid.y > 1024) {
       dimGrid.x = 1024 / dimGrid.y;
       if (dimGrid.x == 0) {
         dimGrid.x = 1;
       }
     }
     cuda_div_rows_vec(dimGrid, dimBlock, data_, div.data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Vector<Real> temp(div.Vec()); // will copy.
     temp.InvertElements();
     Mat().MulRowsVec(temp);
   }
 }

◆ EqualElementMask()

void EqualElementMask	(	const CuMatrixBase< Real > &	mat,
		CuMatrix< Real > *	mask
	)		const

Definition at line 3429 of file cu-matrix.cc.

Referenced by MaxpoolingComponent::Backprop(), MaxPoolingComponent::BackpropagateFnc(), and CuMatrixBase< float >::operator()().

                                                                                                    {
   // Check the inputs:
   KALDI_ASSERT(mat.NumRows() == NumRows() && mat.NumCols() == NumCols());
   KALDI_ASSERT(mask != NULL);
   // Resizes the output matrix:
   mask->Resize(NumRows(), NumCols(), kSetZero);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(),
                             mask->Data(), this->Dim(), mat.Stride(),
                             mask->Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     for (int32 r = 0; r < NumRows(); r++) {
       for (int32 c = 0; c < NumCols(); c++) {
         (*mask)(r,c) = ((*this)(r,c) ==  mat(r,c) ? 1.0 : 0.0);
       }
     }
   }
 }

◆ Exp()

void Exp ( const CuMatrixBase< Real > & src )

Definition at line 2456 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyExp(), and CuMatrixBase< float >::SizeInBytes().

                                                           {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_exp(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
              src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Exp(src.Mat());
   }
 }

◆ ExpLimited()

void ExpLimited	(	const CuMatrixBase< Real > &	src,
		Real	lower_limit,
		Real	upper_limit
	)

This is equivalent to running: Floor(src, lower_limit); Ceiling(src, upper_limit); Exp(src)

Definition at line 2541 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyExpLimited(), and CuMatrixBase< float >::SizeInBytes().

                                                                                                      {
   KALDI_ASSERT(SameDim(*this, src));
   KALDI_ASSERT(upper_limit > lower_limit);
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_exp_limited(dimGrid, dimBlock, this->data_, src.data_, lower_limit, upper_limit,
                      this->Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().ExpLimited(src.Mat(), lower_limit, upper_limit);
   }
 }

◆ ExpSpecial()

void ExpSpecial ( const CuMatrixBase< Real > & src )

For each element x of the matrix, set it to (x < 0 ? exp(x) : x + 1).

This function is used in our RNNLM training.

Definition at line 2563 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyExpSpecial(), and CuMatrixBase< float >::SizeInBytes().

                                                                  {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_exp_special(dimGrid, dimBlock, this->data_, src.data_, Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().ExpSpecial(src.Mat());
   }
 }

◆ FindRowMaxId()

void FindRowMaxId ( CuArray< int32 > * id ) const

Find the id of the maximal element for each row (resizes the 'id' array to the appropriate size).

Definition at line 1829 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), kaldi::nnet3::ComputeAccuracy(), NnetUpdater::ComputeTotAccuracy(), Xent::Eval(), kaldi::TestCuFindRowMaxId(), and kaldi::UnitTestCuFindRowMaxId().

                                                               {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     id->Resize(num_rows_);
     MatrixDim d = Dim();
 
     // CUDA thread layout: one thread block per matrix-row.
     dim3 dimBlock(CU1DBLOCK);
     dim3 dimGrid(num_rows_);
     cuda_find_row_max_id(dimGrid, dimBlock, data_, NULL, id->Data(), d);
     CU_SAFE_CALL(cudaGetLastError());
 
     // now we have the indices!
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // allocate index buffer
     id->Resize(num_rows_);
     id->Set(-1);
     // find maxima
     MatrixIndexT num_rows = num_rows_, num_cols = num_cols_;
     for (MatrixIndexT r = 0; r < num_rows; r++) {
       Real max = -1e21;
       int32 max_id = -1;
       const Real *row_data = Mat().RowData(r);
       for (MatrixIndexT c = 0; c < num_cols; c++) {
         if (max < row_data[c]) {
           max = row_data[c];
           max_id = c;
         }
       }
       id->Data()[r] = max_id;
     }
   }
 }

◆ Floor()

void Floor	(	const CuMatrixBase< Real > &	src,
		Real	floor_val
	)

Definition at line 2582 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyFloor(), and CuMatrixBase< float >::SizeInBytes().

                                                                             {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_floor(dimGrid, dimBlock, data_, src.data_, floor_val, this->Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().Floor(src.Mat(), floor_val);
   }
 }

◆ FrobeniusNorm()

Real FrobeniusNorm ( ) const

inline

Definition at line 226 of file cu-matrix.h.

Referenced by CuMatrixBase< float >::ApproxEqual(), kaldi::nnet3::ConstrainOrthonormalInternal(), and kaldi::UnitTestCuSparseMatrixFrobeniusNorm().

226 { return sqrt(TraceMatMat(*this, *this, kTrans)); }

kaldi::kTrans

Definition: matrix-common.h:33

kaldi::CuMatrixBase::TraceMatMat

friend Real TraceMatMat(const CuMatrixBase< Real > &A, const CuMatrixBase< Real > &B, MatrixTransposeType trans)

Definition: cu-matrix.cc:2145

◆ GroupMax()

void GroupMax ( const CuMatrixBase< Real > & src )

Apply the function y(i) = max_{j = i*G}^{(i+1)*G-1} x_j, where G = x.NumCols() / y.NumCols() must be an integer.

[Note: y corresponds to *this and x to src, so src.NumCols() / this->NumCols() must be an integer.]

Definition at line 1617 of file cu-matrix.cc.

Referenced by MaxoutComponent::Propagate(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixGroupMax(), kaldi::TestCuMatrixGroupMaxAllGroupSizes(), and kaldi::UnitTestCuMatrixGroupMax().

                                                                {
   int group_size = src.NumCols() / this->NumCols();
   KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size &&
                this->NumRows() == src.NumRows());
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // One thread block per row.
     // Use 2D block for small group size to simplify the calculation.
     // Each group is reduced by threads_per_group threads.
     // threads_per_group should be a power of 2 for fast tree reduction.
     //        group size: 1 2 3 4 5 6 7 .. 12 13 .. 24 25 .. 48 ...
     // threads_per_group: 1 1 1 2 2 2 4 ..  4  8 ..  8 16 .. 16 ...
     int threads_per_group = CU1DBLOCK;
     while (threads_per_group * 3 / 2 >= group_size) {
       threads_per_group >>= 1;
     }
     if (group_size == 1) {
       threads_per_group = 1;
     }
     dim3 dimBlock(threads_per_group, CU1DBLOCK / threads_per_group);
     dim3 dimGrid(NumRows());
     cuda_group_max(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
         src.Stride(), group_size);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().GroupMax(src.Mat());
   }
 }

◆ GroupMaxDeriv()

void GroupMaxDeriv	(	const CuMatrixBase< Real > &	input,
		const CuMatrixBase< Real > &	output
	)

Calculate derivatives for the GroupMax function above, where "input" is the input to the GroupMax function above (i.e. the "src" variable), and "output" is the result of the computation (i.e. the "this" of that function call), and *this must have the same dimension as "input". Each element of *this will be set to 1 if the corresponding input equals the output of the group, and 0 otherwise. This equals the function derivative where it is defined (it's not defined where multiple inputs in the group are equal to the output).

Definition at line 874 of file cu-matrix.cc.

Referenced by MaxoutComponent::Backprop(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixGroupMaxDeriv(), and kaldi::UnitTestCuMatrixGroupMaxDeriv().

                                                                        {
   KALDI_ASSERT(src2.NumCols() > 0);
   int group_size = this->NumCols() / src2.NumCols();
   KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size);
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK),
                  n_blocks(NumRows(), CU2DBLOCK));
     cuda_calc_group_max_deriv(dimGrid, dimBlock, this->data_, src1.Data(),
                               src2.Data(), Dim(), src1.Stride(), src2.Stride(),
                               group_size);
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().GroupMaxDeriv(src1.Mat(), src2.Mat());
   }
 }

◆ GroupPnorm()

void GroupPnorm	(	const CuMatrixBase< Real > &	src,
		Real	pow
	)

Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j ^ (power)) ^ (1 / power), where G = x.NumCols() / y.NumCols() must be an integer.

[Note: y corresponds to *this and x to src, so src.NumCols() / this->NumCols() must be an integer.]

Definition at line 1576 of file cu-matrix.cc.

Referenced by PnormComponent::Propagate(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixDiffGroupPnorm(), kaldi::TestCuMatrixGroupPnorm(), and kaldi::UnitTestCuMatrixGroupPnorm().

                                                                              {
   int group_size = src.NumCols() / this->NumCols();
   KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size &&
                this->NumRows() == src.NumRows());
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     if (power == Real(0) || power == Real(1) || power == Real(2)
         || power == std::numeric_limits<Real>::infinity()) {
       // One thread block per row.
       // Use 2D block for small group size to simplify the calculation
       // Each group is reduced by threads_per_group threads.
       // threads_per_group should be a power of 2 for fast tree reduction.
       int threads_per_group = CU1DBLOCK;
       while (threads_per_group * 3 / 2 >= group_size) {
         threads_per_group >>= 1;
       }
       if (group_size == 1) {
         threads_per_group = 1;
       }
       dim3 dimBlock(threads_per_group, CU1DBLOCK / threads_per_group);
       dim3 dimGrid(NumRows());
       cuda_group_spec_pnorm(dimGrid, dimBlock, this->data_, src.data_,
                             this->Dim(), src.Stride(), group_size, power);
     } else {
       dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
       dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK),
                    n_blocks(NumRows(), CU2DBLOCK));
       cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
                        src.Stride(), group_size, power);
     }
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().GroupPnorm(src.Mat(), power);
   }
 }

◆ Heaviside()

void Heaviside ( const CuMatrixBase< Real > & src )

Set each element to the Heaviside function of the corresponding element of "src", which we define as the function (x > 0 ? 1.0 : 0.0). [Note: in general, there are different ways to deal with the situation when x == 0; with the definition used here the result for x == 0 is 0.0.]

Definition at line 2435 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyHeaviside(), RectifiedLinearComponent::Backprop(), CuRand< float >::BinarizeProbs(), kaldi::CuCompressedMatrixTestSign(), CuMatrixBase< float >::SizeInBytes(), RectifiedLinearComponent::StoreStats(), and kaldi::UnitTestCuMatrixHeaviside().

                                                                 {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
                    src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Heaviside(src.Mat());
   }
 }

◆ InvertElements()

void InvertElements ( )

Inverts the matrix element by element (each element is inverted individually; this is not a matrix inverse).

Definition at line 932 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), kaldi::TestCuMatrixCompObjfAndDeriv(), NnetEnsembleTrainer::TrainOneMinibatch(), kaldi::UnitTestCuMatrixInvertElements(), and kaldi::UnitTestCuMatrixObjfDeriv().

                                         {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_invert_elements(dimGrid, dimBlock, data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().InvertElements();
   }
 }

◆ IsUnit()

bool IsUnit ( Real tol = 0.001 ) const

Definition at line 629 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::FrobeniusNorm(), OnlinePreconditioner::InitOrthonormalSpecial(), kaldi::UnitTestCuMatrixSymInvertPosDef(), and kaldi::UnitTestCuSpMatrixInvert().

                                               {
   // Tests whether *this is close to the identity matrix I.
   // The left-hand side of the comparison below is the expansion of the
   // squared Frobenius norm of (*this - I):
   //   trace((*this - I) * (*this - I)^T)
   //     = trace(*this * (*this)^T) - 2 * trace(*this) + trace(I)
   //     = TraceMatMat(*this, *this, kTrans) - 2 * this->Trace() + NumRows()
   // (this reads TraceMatMat(A, B, kTrans) as trace(A * B^T) -- TODO confirm).
   // NOTE(review): the squared norm is compared directly against
   // tol * NumRows(), not against a squared threshold, so "tol" effectively
   // bounds the mean squared deviation per row rather than the norm itself.
   // The matrix must be square.
   KALDI_ASSERT(this->NumRows() == this->NumCols());
   return (TraceMatMat(*this, *this, kTrans) + this->NumRows() - 2.0 * this->Trace() <=
           tol * this->NumRows());
 }

◆ KALDI_DISALLOW_COPY_AND_ASSIGN()

KALDI_DISALLOW_COPY_AND_ASSIGN ( CuMatrixBase< Real > )

private

◆ Log()

void Log ( const CuMatrixBase< Real > & src )

Definition at line 2477 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and CuMatrixBase< float >::SizeInBytes().

                                                           {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (num_rows_ == 0) return;
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_log(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
              src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Log(src.Mat());
   }
 }

◆ LogSoftMaxPerRow()

void LogSoftMaxPerRow ( const CuMatrixBase< Real > & src )

LogSoftmax nonlinearity Y = LogSoftmax(X) : Yij = Xij - log(sum_k(e^Xik)), done to each row, with attention to avoiding overflow or underflow.

Supports in-place operation (i.e. this == &src).

Definition at line 1740 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLogSoftMaxPerRow(), LogSoftmaxComponent::Propagate(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixLogSoftmax(), and kaldi::UnitTestCuLogSoftmax().

                                                                        {
   // Writes the row-wise log-softmax of src into *this.
   // *this and src must have identical dimensions.
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // Launch configuration: one block of CU1DBLOCK threads per row of src.
     size_t dimBlock = CU1DBLOCK;
     size_t dimGrid = src.num_rows_;
     cuda_log_softmax_reduce(dimGrid, dimBlock,
                             data_, src.data_, Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU fallback: copy src into *this, then apply the log-softmax to each
     // row of the copy in place.
     MatrixBase<Real> &mat(this->Mat());
     mat.CopyFromMat(src.Mat());
     for(MatrixIndexT r = 0; r < mat.NumRows(); r++) {
       mat.Row(r).ApplyLogSoftMax();
     }
   }
 }

◆ Lookup() [1/2]

void Lookup	(	const std::vector< Int32Pair > &	indexes,
		Real *	output
	)		const

Definition at line 3370 of file cu-matrix.cc.

Referenced by NnetDiscriminativeUpdater::LatticeComputations(), CuMatrixBase< float >::operator()(), kaldi::TestCuMatrixLookup(), and kaldi::UnitTestCuMatrixLookup().

                                                     {
   // Checks the dimension.
   MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
   for (int32 i = 0; i < indices.size(); ++i) {
     KALDI_ASSERT(indices[i].first < num_rows && indices[i].first >= 0 &&
                  indices[i].second < num_cols && indices[i].second >= 0);
   }
   if (indices.size() == 0) return;
   KALDI_ASSERT(output != NULL);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuArray<Int32Pair> cuda_indices(indices);
     Lookup(cuda_indices, output);
   } else
 #endif
   {
     for (int32 i = 0; i < indices.size(); i++) {
       output[i] = (*this)(indices[i].first, indices[i].second);
     }
   }
 }

◆ Lookup() [2/2]

void Lookup	(	const CuArrayBase< Int32Pair > &	indexes,
		Real *	output
	)		const

Definition at line 3395 of file cu-matrix.cc.

                                                     {
   // Fetches (*this)(index.first, index.second) for every pair in "indices"
   // and writes the values to output[0 .. indices.Dim() - 1].
   int32 num_elements = indices.Dim();
   // With no indices there is nothing to write, so output may be NULL.
   if (num_elements == 0) return;
   KALDI_ASSERT(output != NULL);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // The kernel writes into a temporary CuArray, which is then copied to
     // "output" with CopyToHost().
     // NOTE(review): unlike the CPU branch below, this branch performs no
     // range check on the index pairs.
     CuArray<Real> cuda_output(num_elements);
     CuTimer tim;
     dim3 dimBlock(CU1DBLOCK, 1);
     dim3 dimGrid(n_blocks(num_elements, CU1DBLOCK), 1);
 
     cuda_matrix_lookup(dimGrid, dimBlock, this->data_, this->Dim(),
                        indices.Data(), num_elements, cuda_output.Data());
     CU_SAFE_CALL(cudaGetLastError());
 
     cuda_output.CopyToHost(output);
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU fallback: read the index array directly, range-check each pair and
     // copy the element.
     MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_;
     const Int32Pair *index = indices.Data();
     for (int32 i = 0; i < num_elements; i++) {
       KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 &&
                    index[i].second < num_cols && index[i].second >= 0);
       output[i] = (*this)(index[i].first, index[i].second);
     }
   }
 }

◆ Mat() [1/2]

const MatrixBase<Real>& Mat ( ) const

inline

Definition at line 755 of file cu-matrix.h.

                                              {
     return *(reinterpret_cast<const MatrixBase<Real>* >(this));
   }

◆ Mat() [2/2]

MatrixBase<Real>& Mat ( )

inline

Definition at line 758 of file cu-matrix.h.

                                  {
     return *(reinterpret_cast<MatrixBase<Real>* >(this));
   }

◆ Max() [1/2]

void Max ( const CuMatrixBase< Real > & A )

Do, elementwise, *this = max(*this, A).

Definition at line 715 of file cu-matrix.cc.

Referenced by kaldi::CuCompressedMatrixTestNonnegative(), kaldi::CuCompressedMatrixTestSymmetric(), main(), MaxpoolingComponent::Propagate(), SpliceMaxComponent::Propagate(), kaldi::TestCuMatrixMax(), kaldi::UnitTestCuMatrixMax(), and kaldi::UnitTestCuMatrixReduceMax().

                                                         {
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     KALDI_ASSERT(num_cols_ == A.NumCols());
     KALDI_ASSERT(num_rows_ == A.NumRows());
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_max(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Max(A.Mat());
   }
 }

◆ Max() [2/2]

Real Max ( ) const

Definition at line 3033 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and CuMatrixBase< float >::operator()().

                                    {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
     CuTimer tim;
 
     CuVector<Real> col_max(num_rows_, kUndefined);
     cuda_max_mat_cols(num_rows_, CU1DBLOCK, col_max.Data(), data_, Dim());
     Real ans = col_max.Max();
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
     return ans;
   } else
 #endif
   {
     return Mat().Max();
   }
 }

◆ Min() [1/2]

void Min ( const CuMatrixBase< Real > & A )

Do, elementwise, *this = min(*this, A).

Definition at line 740 of file cu-matrix.cc.

Referenced by kaldi::CuCompressedMatrixTestNonnegative(), kaldi::CuCompressedMatrixTestSymmetric(), main(), kaldi::TestCuMatrixMin(), kaldi::UnitTestCuMatrixMin(), and kaldi::UnitTestCuMatrixReduceMin().

                                                         {
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     KALDI_ASSERT(num_cols_ == A.NumCols());
     KALDI_ASSERT(num_rows_ == A.NumRows());
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_min(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Min(A.Mat());
   }
 }

◆ Min() [2/2]

Real Min ( ) const

Definition at line 3054 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and CuMatrixBase< float >::operator()().

                                    {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
     CuTimer tim;
 
     CuVector<Real> col_min(num_rows_, kUndefined);
     cuda_min_mat_cols(num_rows_, CU1DBLOCK, col_min.Data(), data_, Dim());
     Real ans = col_min.Min();
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
     return ans;
   } else
 #endif
   {
     return Mat().Min();
   }
 }

◆ MulColsVec()

void MulColsVec ( const CuVectorBase< Real > & scale )

scale i'th column by scale[i]

Definition at line 765 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), BatchNormComponent::Backprop(), FixedScaleComponent::Backprop(), PerElementScaleComponent::Backprop(), Rescale::BackpropagateFnc(), ScaleAndOffsetComponent::BackpropInternal(), LstmNonlinearityComponent::ConsolidateMemory(), ModelCollapser::PreMultiplyAffineParameters(), BatchNormComponent::Propagate(), FixedScaleComponent::Propagate(), PerElementScaleComponent::Propagate(), Rescale::PropagateFnc(), ScaleAndOffsetComponent::PropagateInternal(), kaldi::UnitTestCuMatrixAddMatDiagVec(), and kaldi::UnitTestCuMatrixMulColsVec().

                                                                    {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     KALDI_ASSERT(scale.Dim() == NumCols());
 
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_mul_cols_vec(dimGrid, dimBlock, data_, scale.data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().MulColsVec(scale.Vec());
   }
 }

◆ MulElements()

void MulElements ( const CuMatrixBase< Real > & A )

Multiply two matrices elementwise: C = C .* A.

Definition at line 667 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), ElementwiseProductComponent::Backprop(), BackpropTruncationComponent::Backprop(), MaxpoolingComponent::Backprop(), SigmoidComponent::Backprop(), TanhComponent::Backprop(), PowerComponent::Backprop(), RectifiedLinearComponent::Backprop(), SoftHingeComponent::Backprop(), HiddenSoftmax::BackpropagateFnc(), Dropout::BackpropagateFnc(), ScaleAndOffsetComponent::BackpropInternal(), kaldi::nnet1::ComputeStdDev(), LstmNonlinearityComponent::ConsolidateMemory(), CuMatrixBase< float >::DiffSoftmaxPerRow(), Mse::Eval(), ElementwiseProductComponent::Propagate(), DropoutComponent::Propagate(), KlHmm::PropagateFnc(), LengthNormComponent::PropagateFnc(), Dropout::PropagateFnc(), ClipGradientComponent::RepairGradients(), NnetEnsembleTrainer::TrainOneMinibatch(), kaldi::UnitTestCuMatrixAddMatMatElements(), kaldi::UnitTestCuMatrixMulElements(), kaldi::nnet1::UnitTestLengthNorm(), AffineTransform::Update(), FramePoolingComponent::Update(), ConvolutionalComponent::Update(), Rescale::Update(), and NaturalGradientPerElementScaleComponent::Update().

                                                                 {
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     KALDI_ASSERT(num_cols_ == A.NumCols());
     KALDI_ASSERT(num_rows_ == A.NumRows());
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_mul_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().MulElements(A.Mat());
   }
 }

◆ MulRows()

void MulRows	(	const CuMatrixBase< Real > &	src,
		const CuArrayBase< MatrixIndexT > &	indexes
	)

Does for each row r, this.Row(r) *= src.row(indexes[r]), where '*=' is elementwise multiplication.

If indexes[r] < 0, row r is left unchanged. src.NumCols() must equal this.NumCols(), and indexes.Dim() must equal this.NumRows().

Definition at line 2790 of file cu-matrix.cc.

Referenced by GeneralDropoutComponent::Backprop(), and GeneralDropoutComponent::Propagate().

                                                                            {
   if (NumRows() == 0) return;
   KALDI_ASSERT(static_cast<MatrixIndexT>(indexes.Dim()) == NumRows());
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(src.NumCols() == NumCols());
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_mul_rows(dimGrid, dimBlock,
                   data_, src.Data(), indexes.Data(), Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     MatrixBase<Real> &this_mat(Mat());
     const MatrixBase<Real> &src_mat(src.Mat());
     int32 num_rows = NumRows();
     const MatrixIndexT *index_ptr = indexes.Data();
     for (int32 r = 0; r < num_rows; r++) {
       int32 src_r = index_ptr[r];
       if (src_r < 0)
         continue;
       SubVector<Real> this_row(this_mat, r),
           src_row(src_mat, src_r);
       this_row.MulElements(src_row);
     }
   }
 }

◆ MulRowsGroupMat()

void MulRowsGroupMat ( const CuMatrixBase< Real > & src )

Divides each row into src.NumCols() groups of equal size, and then scales the j'th group of elements of the i'th row by src(i, j).

Definition at line 816 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), MaxoutComponent::Backprop(), and kaldi::UnitTestCuMatrixMulRowsGroupMat().

                                                                       {
   KALDI_ASSERT(src.NumCols() > 0);
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     int group_size = this->NumCols() / src.NumCols();
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_mul_rows_group_mat(dimGrid, dimBlock, this->data_, src.data_,
                             this->Dim(), src.Stride(), group_size);
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().MulRowsGroupMat(src.Mat());
   }
 }

◆ MulRowsVec()

void MulRowsVec ( const CuVectorBase< Real > & scale )

scale i'th row by scale[i]

Definition at line 792 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), BackpropTruncationComponent::Backprop(), SpecAugmentTimeMaskComponent::Backprop(), ClipGradientComponent::Backprop(), BlockSoftmax::BackpropagateFnc(), LengthNormComponent::BackpropagateFnc(), CuMatrixBase< float >::DiffLogSoftmaxPerRow(), kaldi::cu::DiffNormalizePerRow(), Xent::Eval(), Mse::Eval(), kaldi::nnet2::PreconditionDirections(), NnetChainTrainer::ProcessOutputs(), NnetDiscriminativeTrainer::ProcessOutputs(), SpecAugmentTimeMaskComponent::Propagate(), LengthNormComponent::PropagateFnc(), OnlineNaturalGradient::ReorthogonalizeRt1(), OnlinePreconditioner::ReorthogonalizeXt1(), kaldi::UnitTestCuMatrixMulRowsVec(), and kaldi::nnet3::time_height_convolution::ZeroBlankRows().

                                                                    {
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     KALDI_ASSERT(scale.Dim() == NumRows());
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_mul_rows_vec(dimGrid, dimBlock, data_, scale.data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().MulRowsVec(scale.Vec());
   }
 }

◆ NumCols()

MatrixIndexT NumCols ( ) const

inline

Definition at line 216 of file cu-matrix.h.

216 { return num_cols_; }

kaldi::CuMatrixBase::num_cols_

MatrixIndexT num_cols_

Definition: cu-matrix.h:785

◆ NumRows()

MatrixIndexT NumRows ( ) const

inline

Dimensions.

Definition at line 215 of file cu-matrix.h.

215 { return num_rows_; }

kaldi::CuMatrixBase::num_rows_

MatrixIndexT num_rows_

Definition: cu-matrix.h:786

◆ operator()() [1/2]

CuValue<Real> operator()	(	MatrixIndexT	r,
		MatrixIndexT	c
	)

inline

Definition at line 682 of file cu-matrix.h.

                                                                    {
     KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                           static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                           static_cast<UnsignedMatrixIndexT>(c) <
                           static_cast<UnsignedMatrixIndexT>(num_cols_));
     return CuValue<Real>(data_ + r * stride_ + c);
   }

◆ operator()() [2/2]

Real operator()	(	MatrixIndexT	r,
		MatrixIndexT	c
	)		const

inline

Definition at line 690 of file cu-matrix.h.

                                                                 {
     KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
                           static_cast<UnsignedMatrixIndexT>(num_rows_) &&
                           static_cast<UnsignedMatrixIndexT>(c) <
                           static_cast<UnsignedMatrixIndexT>(num_cols_));
     return CuValue<Real>(data_ + r * stride_ + c);  // will be casted to Real.
   }

◆ ParametricRelu()

void ParametricRelu	(	const CuMatrixBase< Real > &	src,
		const CuVectorBase< Real > &	alpha,
		const CuVectorBase< Real > &	beta
	)

Compute the parametric rectified linear unit function; element by element, *this = src * (src > 0 ? alpha : beta)

Definition at line 1467 of file cu-matrix.cc.

Referenced by ParametricRelu::PropagateFnc(), and CuMatrixBase< float >::SizeInBytes().

                                      {
   // Element-wise parametric ReLU: each element of src is multiplied by a
   // per-column slope, alpha(c) or beta(c), chosen by the element's sign, and
   // the product is stored in *this.
   // src must match *this in shape; alpha and beta need one entry per column.
   KALDI_ASSERT(src.NumRows() == this->NumRows());
   KALDI_ASSERT(src.NumCols() == this->NumCols());
   KALDI_ASSERT(alpha.Dim() == this->NumCols());
   KALDI_ASSERT(beta.Dim() == this->NumCols());
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     // CU2DBLOCK x CU2DBLOCK thread blocks tiled over columns (x) and rows (y).
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK));
 
     cuda_parametric_relu(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
                          src.Stride(), alpha.data_, beta.data_);
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // Do it on CPU,
     for (MatrixIndexT r = 0; r < NumRows(); r++) {
       for (MatrixIndexT c = 0; c < NumCols(); c++) {
         Real src_elem = src.Mat()(r,c);
         // alpha is used for src_elem >= 0 and beta otherwise. For
         // src_elem == 0 the product is zero for any finite slope, so ">="
         // versus ">" only matters for non-finite alpha or beta.
         this->Mat()(r,c) = src_elem * (src_elem >= 0.0 ? alpha.Vec()(c) : beta.Vec()(c));
       }
     }
   }
 }

◆ Pow()

void Pow	(	const CuMatrixBase< Real > &	src,
		Real	power
	)

Definition at line 2500 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyPow(), and CuMatrixBase< float >::SizeInBytes().

                                                                       {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_pow(dimGrid, dimBlock, this->data_, src.data_, power, this->Dim(),
              src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Pow(src.Mat(), power);
   }
 }

◆ PowAbs()

void PowAbs	(	const CuMatrixBase< Real > &	src,
		Real	power,
		bool	include_sign = `false`
	)

Apply power to the absolute value of each element.

If include_sign is true, the result will be multiplied by the sign of the input value. If the power is negative and the input to the power is zero, the output will be set to zero.

Definition at line 2521 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyPowAbs(), and CuMatrixBase< float >::SizeInBytes().

                                                                                             {
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_pow_abs(dimGrid, dimBlock, this->data_, src.data_, power, include_sign,
                  this->Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().PowAbs(src.Mat(), power, include_sign);
   }
 }

◆ Range()

CuSubMatrix<Real> Range	(	const MatrixIndexT	row_offset,
		const MatrixIndexT	num_rows,
		const MatrixIndexT	col_offset,
		const MatrixIndexT	num_cols
	)		const

inline

Definition at line 653 of file cu-matrix.h.

Referenced by CuBlockMatrix< Real >::AddMatMat(), Splice::BackpropagateFnc(), ModelCollapser::CollapseComponentsAffine(), NnetOnlineComputer::Compute(), ConvolutionComponent::Init(), AffineComponent::Init(), AffineComponentPreconditioned::Init(), AffineComponentPreconditionedOnline::Init(), FixedAffineComponent::Init(), Convolutional1dComponent::Init(), NaturalGradientAffineComponent::InitFromConfig(), kaldi::nnet2::NnetComputationChunked(), NnetComputer::NnetComputer(), OnlinePreconditioner::PreconditionDirections(), OnlineNaturalGradient::PreconditionDirections(), NnetOnlineComputer::Propagate(), kaldi::UnitTestCuMatrixAddMat(), kaldi::UnitTestCuSubMatrix(), NaturalGradientAffineComponent::Update(), AffineComponentPreconditioned::Update(), and AffineComponentPreconditionedOnline::Update().

                                                                     {
     return CuSubMatrix<Real>(*this, row_offset, num_rows,
                              col_offset, num_cols);
   }

◆ Row() [1/2]

const CuSubVector<Real> Row ( MatrixIndexT i ) const

inline

Definition at line 670 of file cu-matrix.h.

Referenced by kaldi::nnet3::utterance_splitting::AddOnlineIvectorsToTasks(), DecodableNnetSimpleLooped::AdvanceChunk(), MultiBasisComponent::BackpropagateFnc(), NnetOnlineComputer::Compute(), CuMatrixBase< float >::CopyRangeFromMatClamped(), DecodableNnetSimple::DoNnetComputation(), NnetBatchComputer::FormatInputs(), kaldi::nnet2::NnetComputationChunked(), NnetComputer::NnetComputer(), RecurrentComponent::PropagateFnc(), kaldi::nnet3::RunNnetComputation(), kaldi::UnitTestCuMatrixAddDiagVecMat(), kaldi::nnet2::UnitTestGenericComponentInternal(), kaldi::UnitTestLstmNonlinearity(), kaldi::nnet1::UnitTestSimpleSentenceAveragingComponent(), NaturalGradientRepeatedAffineComponent::Update(), and TimeHeightConvolutionComponent::UpdateNaturalGradient().

                                                            {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
     return CuSubVector<Real>(data_ + (i * stride_), NumCols());
   }

◆ Row() [2/2]

CuSubVector<Real> Row ( MatrixIndexT i )

inline

Definition at line 676 of file cu-matrix.h.

                                                {
     KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
                  static_cast<UnsignedMatrixIndexT>(num_rows_));
     return CuSubVector<Real>(data_ + (i * stride_), NumCols());
   }

◆ RowData() [1/2]

const Real* RowData ( MatrixIndexT r ) const

inline

Get raw row pointer (const).

Warning: may return a pointer to GPU memory. Use at your own risk.

Definition at line 740 of file cu-matrix.h.

Referenced by CuVectorBase< float >::CopyRowsFromMat(), VectorBase< float >::CopyRowsFromMat(), CuSubVector< Real >::CuSubVector(), kaldi::cu::EnsureNonzero(), RectifiedLinearComponent::RepairGradients(), kaldi::TestCuMatrixAddRows2(), kaldi::TestCuMatrixAddToRows(), kaldi::TestCuMatrixCopyRows2(), kaldi::TestCuMatrixCopyToRows(), kaldi::UnitTestCuMatrixAddRows(), kaldi::UnitTestCuMatrixCopyRows(), and kaldi::UnitTestCuMatrixMulRows().

740 { return data_ + r * stride_; }

kaldi::CuMatrixBase::data_

Real * data_

GPU data pointer (or regular matrix data pointer, if CUDA was not compiled in or the device is not enabled).

Definition: cu-matrix.h:777

kaldi::CuMatrixBase::stride_

MatrixIndexT stride_

Definition: cu-matrix.h:787

◆ RowData() [2/2]

Real* RowData ( MatrixIndexT r )

inline

Get raw row pointer.

Warning: may return a pointer to GPU memory. Use at your own risk.

Definition at line 743 of file cu-matrix.h.

743 { return data_ + r * stride_; }

kaldi::CuMatrixBase::data_

Real * data_

GPU data pointer (or regular matrix data pointer, if CUDA was not compiled in or the device is not enabled).

Definition: cu-matrix.h:777

kaldi::CuMatrixBase::stride_

MatrixIndexT stride_

Definition: cu-matrix.h:787

◆ RowRange()

CuSubMatrix<Real> RowRange	(	const MatrixIndexT	row_offset,
		const MatrixIndexT	num_rows
	)		const

inline

Definition at line 660 of file cu-matrix.h.

Referenced by Splice::BackpropagateFnc(), RecurrentComponent::BackpropagateFnc(), LstmProjected::BackpropagateFnc(), BlstmProjected::BackpropagateFnc(), NnetBatchComputer::FormatInputs(), NnetBatchComputer::FormatOutputs(), kaldi::nnet3::MergeTaskOutput(), NnetOnlineComputer::Propagate(), RecurrentComponent::PropagateFnc(), LstmProjected::PropagateFnc(), BlstmProjected::PropagateFnc(), and TimeHeightConvolutionComponent::UpdateNaturalGradient().

                                                                        {
     return CuSubMatrix<Real>(*this, row_offset, num_rows,
                              0, num_cols_);
   }

◆ Scale()

void Scale ( Real value )

Definition at line 644 of file cu-matrix.cc.

Referenced by DecodableNnetLoopedOnlineBase::AdvanceChunk(), DecodableNnetSimpleLooped::AdvanceChunk(), CuMatrixBase< float >::ApplyLog(), BackpropTruncationComponent::Backprop(), TanhComponent::Backprop(), PowerComponent::Backprop(), ScaleComponent::Backprop(), NoOpComponent::Backprop(), AveragePoolingComponent::BackpropagateFnc(), MultiBasisComponent::BackpropagateFnc(), Dropout::BackpropagateFnc(), NnetBatchComputer::Compute(), DecodableNnet2Online::ComputeForFrame(), LstmNonlinearityComponent::ConsolidateMemory(), kaldi::CuCompressedMatrixTestNonnegative(), kaldi::CuCompressedMatrixTestSymmetric(), CuMatrixBase< float >::DiffLogSoftmaxPerRow(), DecodableNnetSimple::DoNnetComputation(), NnetComputer::ExecuteCommand(), GeneralDropoutComponent::GetMemo(), main(), kaldi::nnet2::PreconditionDirectionsAlphaRescaled(), SingleUtteranceNnet2DecoderThreaded::ProcessLoglikes(), NnetChainTrainer::ProcessOutputs(), NnetDiscriminativeTrainer::ProcessOutputs(), BackpropTruncationComponent::Propagate(), ScaleComponent::Propagate(), DropoutMaskComponent::Propagate(), DropoutComponent::Propagate(), KlHmm::PropagateFnc(), Dropout::PropagateFnc(), ClipGradientComponent::RepairGradients(), RestrictedAttentionComponent::Scale(), TanhComponent::StoreStats(), kaldi::nnet3::attention::TestAttentionForwardBackward(), kaldi::UnitTestCuMatrixAddMatDiagVec(), kaldi::UnitTestCuMatrixAddMatMatElements(), kaldi::UnitTestCuMatrixScale(), and kaldi::UnitTestLstmNonlinearity().

                                          {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (num_rows_ == 0) return;
     CuTimer tim;
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_scale(dimGrid, dimBlock, data_, value, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().Scale(value);
   }
 }

◆ Set()

void Set ( Real value )

Definition at line 531 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), BackpropTruncationComponent::Backprop(), SigmoidComponent::Backprop(), NnetComputer::ExecuteCommand(), MaxpoolingComponent::Propagate(), DropoutMaskComponent::Propagate(), MaxPoolingComponent::PropagateFnc(), SigmoidComponent::StoreStats(), kaldi::UnitTestCuMatrixObjfDeriv(), kaldi::UnitTestCuMatrixSet(), kaldi::nnet1::UnitTestDropoutComponent(), and kaldi::nnet1::UnitTestMaxPoolingComponent().

                                        {
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (num_rows_ == 0) return;
     CuTimer tim;
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_set_const(dimGrid, dimBlock, data_, value, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Set(value);
   }
 }

◆ SetMatMatDivMat()

void SetMatMatDivMat	(	const CuMatrixBase< Real > &	A,
		const CuMatrixBase< Real > &	B,
		const CuMatrixBase< Real > &	C
	)

*this = a * b / c (element-wise; when c = 0, *this = a). *this may safely alias a, b or c and still gives the expected result.

dst = a * b / c (by element; when c = 0, dst = a) dst can be an alias of a, b or c safely and get expected result.

Definition at line 1206 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), and DropoutComponent::Backprop().

                                                                             {
   // Body of SetMatMatDivMat(A, B, C): fills *this from A, B and C using
   // cuda_set_mat_mat_div_mat on the GPU or Mat().SetMatMatDivMat() on the CPU.
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
 
     // A, B and C must each have exactly the dimensions of *this.
     // NOTE(review): these checks are made only on the GPU path; the CPU path
     // relies on whatever Mat().SetMatMatDivMat() verifies.
     KALDI_ASSERT(num_rows_ == A.num_rows_ && num_cols_ == A.num_cols_);
     KALDI_ASSERT(num_rows_ == B.num_rows_ && num_cols_ == B.num_cols_);
     KALDI_ASSERT(num_rows_ == C.num_rows_ && num_cols_ == C.num_cols_);
     // Nothing to launch for a matrix with zero rows.
     if (num_rows_ == 0) return;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     // Each input is passed with its own stride; the output uses Dim().
     cuda_set_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_,
                              data_, Dim(), A.Stride(), B.Stride(), C.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().SetMatMatDivMat(A.Mat(), B.Mat(), C.Mat());
   }
 }

◆ SetRandn()

void SetRandn ( )

Definition at line 3132 of file cu-matrix.cc.

                                   {
   // Body of SetRandn(): fills the matrix with random values, via
   // CuRand<Real>::RandGaussian on the GPU or Mat().SetRandn() on the CPU.
   // An empty matrix (zero rows) is left untouched on both paths.
   if (num_rows_ == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // A temporary generator object is created for this single call.
     CuRand<Real> tmp;
     tmp.RandGaussian(this);
   } else
 #endif
   {
     Mat().SetRandn();
   }
 }

◆ SetRandUniform()

void SetRandUniform ( )

Definition at line 3146 of file cu-matrix.cc.

Referenced by kaldi::CuCompressedMatrixTestNonnegative(), kaldi::CuCompressedMatrixTestSymmetric(), CuMatrixBase< float >::operator()(), kaldi::UnitTestCuMatrixEqualElementMask(), kaldi::UnitTestCuMatrixSetRandUniform(), and kaldi::UnitTestCuMatrixTraceMatMat().

                                         {
   // Body of SetRandUniform(): fills the matrix with random values, via
   // CuRand<Real>::RandUniform on the GPU or Mat().SetRandUniform() on the CPU.
   // An empty matrix (zero rows) is left untouched on both paths.
   if (num_rows_ == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // A temporary generator object is created for this single call.
     CuRand<Real> tmp;
     tmp.RandUniform(this);
   } else
 #endif
   {
     Mat().SetRandUniform();
   }
 }

◆ SetZero()

void SetZero ( )

Math operations, some calling kernels.

Definition at line 509 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), DistributeComponent::Backprop(), StatisticsExtractionComponent::Backprop(), ClipGradientComponent::Backprop(), MaxPoolingComponent::BackpropagateFnc(), AveragePoolingComponent::BackpropagateFnc(), CopyComponent::BackpropagateFnc(), LstmNonlinearityComponent::ConsolidateMemory(), CuSparseMatrix< Real >::CopyToMat(), NnetComputer::ExecuteCommand(), OnlinePreconditioner::InitOrthonormalSpecial(), OnlineNaturalGradient::InitOrthonormalSpecial(), StatisticsExtractionComponent::Propagate(), StatisticsPoolingComponent::Propagate(), AveragePoolingComponent::PropagateFnc(), FramePoolingComponent::PropagateFnc(), kaldi::nnet3::attention::TestAttentionForwardBackward(), kaldi::TestCuMatrixCompObjfAndDeriv(), kaldi::UnitTestCuMatrixSetMatMatDivMat(), and RestrictedAttentionComponent::ZeroStats().

                                  {
   // Body of SetZero(): sets every element of the matrix to zero.
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // 2-D memset: rows are stride_ * sizeof(Real) bytes apart, and only the
     // first num_cols_ * sizeof(Real) bytes of each of the num_rows_ rows are
     // cleared. The call is asynchronous, issued on cudaStreamPerThread.
     CU_SAFE_CALL(cudaMemset2DAsync(data_, stride_ * sizeof(Real), 0,
                               num_cols_ * sizeof(Real), num_rows_ ,
                               cudaStreamPerThread));
     // Unlike most sibling functions, this uses a literal profiling label
     // rather than __func__.
     CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim);
   } else
 #endif
   {
     Mat().SetZero();
   }
 }

◆ SetZeroAboveDiag()

void SetZeroAboveDiag ( )

Zeroes all elements for which col > row.

Definition at line 554 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), kaldi::TestCuMatrixSetZeroAboveDiag(), kaldi::UnitTestCuMatrixSetZeroAboveDiag(), and kaldi::UnitTestSetZeroAboveDiag().

                                           {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     if (num_rows_ == 0) return;
     CuTimer tim;
 
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     cuda_set_zero_above_diag(dimGrid, dimBlock, data_, Dim());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     MatrixBase<Real> &mat = Mat();
     int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
     for (int32 r = 0; r + 1 < num_rows; r++) {
       SubVector<Real> vec(mat, r),
           vec_part(vec, r + 1, num_cols - (r + 1));
       vec_part.SetZero();
     }
   }
 }

◆ Sigmoid()

void Sigmoid ( const CuMatrixBase< Real > & src )

Set each element to the sigmoid of the corresponding element of "src": element by element, (*this)(i, j) = 1 / (1 + exp(-src(i, j))).

Definition at line 1534 of file cu-matrix.cc.

Referenced by SoftHingeComponent::Backprop(), SigmoidComponent::Propagate(), Sigmoid::PropagateFnc(), Rbm::PropagateFnc(), Rbm::Reconstruct(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixSigmoid(), kaldi::UnitTestCuMatrixSigmoid(), and kaldi::UnitTestCuSigmoid().

                                                               {
   // Body of Sigmoid(src): writes into *this the result of applying
   // cuda_sigmoid (GPU) or Mat().Sigmoid() (CPU) to src.
   // *this and src must have the same dimensions.
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     // The output is described by this->Dim(); src contributes only its
     // data pointer and stride.
     cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
                  src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().Sigmoid(src.Mat());
   }
 }

◆ SizeInBytes()

MatrixIndexT SizeInBytes ( ) const

inline

Get size of matrix in bytes.

Definition at line 234 of file cu-matrix.h.

234 { return num_rows_*stride_*sizeof(Real); }

kaldi::CuMatrixBase::stride_

MatrixIndexT stride_

Definition: cu-matrix.h:787

kaldi::CuMatrixBase::num_rows_

MatrixIndexT num_rows_

Definition: cu-matrix.h:786

◆ SoftHinge()

void SoftHinge ( const CuMatrixBase< Real > & src )

Apply the function y = log(1 + exp(x)) to each element x of "src", storing the result y in the corresponding element of *this.

Note: the derivative of this function is the sigmoid function. This is like a soft ReLU.

Definition at line 1555 of file cu-matrix.cc.

Referenced by SoftHingeComponent::Propagate(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuMatrixSoftHinge().

                                                                 {
   // Body of SoftHinge(src): writes into *this the result of applying
   // cuda_soft_hinge (GPU) or Mat().SoftHinge() (CPU) to src.
   // *this and src must have the same dimensions.
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     // The output is described by this->Dim(); src contributes only its
     // data pointer and stride.
     cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(),
                     src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     Mat().SoftHinge(src.Mat());
   }
 }

◆ SoftMaxPerRow()

void SoftMaxPerRow ( const CuMatrixBase< Real > & src )

Softmax nonlinearity Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, with attention to avoiding overflow or underflow.

Supports in-place operation (i.e. this == &src).

Definition at line 1717 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplySoftMaxPerRow(), kaldi::nnet3::attention::AttentionForward(), SoftmaxComponent::Propagate(), Softmax::PropagateFnc(), HiddenSoftmax::PropagateFnc(), BlockSoftmax::PropagateFnc(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestCuMatrixSoftmax(), and kaldi::UnitTestCuSoftmax().

                                                                     {
   // Body of SoftMaxPerRow(src): row-wise softmax of src, stored in *this.
   // *this and src must have the same dimensions.
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // Launch configuration: src.num_rows_ blocks of CU1DBLOCK threads
     // (presumably one block per row -- confirm against the kernel).
     size_t dimBlock = CU1DBLOCK;
     size_t dimGrid = src.num_rows_;
     cuda_softmax_reduce(dimGrid, dimBlock, data_, src.data_, Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
   #endif
   {
     // CPU path: copy src into *this, then apply softmax to each row in place.
     MatrixBase<Real> &mat(this->Mat());
     mat.CopyFromMat(src.Mat());
     for(MatrixIndexT r = 0; r < mat.NumRows(); r++) {
       mat.Row(r).ApplySoftMax();
     }
   }
 }

◆ Stride()

MatrixIndexT Stride ( ) const

inline

Definition at line 217 of file cu-matrix.h.

217 { return stride_; }

kaldi::CuMatrixBase::stride_

MatrixIndexT stride_

Definition: cu-matrix.h:787

◆ Sum()

Real Sum ( ) const

Definition at line 3012 of file cu-matrix.cc.

                                    {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     KALDI_ASSERT(num_rows_ > 0 && num_cols_ > 0);
     CuTimer tim;
 
     CuVector<Real> col_sum(num_rows_, kUndefined);
     cuda_sum_mat_cols(num_rows_, CU1DBLOCK, col_sum.Data(), data_, Dim());
     Real ans = col_sum.Sum();
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
     return ans;
   } else
 #endif
   {
     return Mat().Sum();
   }
 }

◆ SumColumnRanges()

void SumColumnRanges	(	const CuMatrixBase< Real > &	src,
		const CuArrayBase< Int32Pair > &	indexes
	)

For each row r of this and for each column c, sets (*this)(r, c) to the sum of src(r, j), where j ranges from indexes[c].first through indexes[c].second - 1.

Definition at line 2893 of file cu-matrix.cc.

Referenced by SumGroupComponent::Propagate(), and kaldi::UnitTestCuMatrixSumColumnRanges().

                                                                               {
   // Body of SumColumnRanges(src, indices): each output column is the sum of
   // a range of columns of src, within the same row.
   // One (first, second) range is required per column of *this, and src must
   // have the same number of rows as *this.
   KALDI_ASSERT(static_cast<MatrixIndexT>(indices.Dim()) == NumCols());
   KALDI_ASSERT(NumRows() == src.NumRows());
   if (NumRows() == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
     cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(),
                            src.Dim(), indices.Data());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU path: index the raw buffers directly; each matrix is addressed as
     // row * stride + column using its own stride.
     int32 num_rows = this->num_rows_, num_cols = this->num_cols_,
        this_stride = this->stride_, src_stride = src.stride_;
     Real *data = this->data_;
     const Real *src_data = src.data_;
     const Int32Pair *indices_data = indices.Data();
     for (int32 row = 0; row < num_rows; row++) {
       for (int32 col = 0; col < num_cols; col++) {
         // Half-open range [first, second) of source columns for this output.
         int32 start_col = indices_data[col].first,
                 end_col = indices_data[col].second;
         Real sum = 0.0;
         // An empty range (end_col <= start_col) leaves the sum at zero.
         for (int32 src_col = start_col; src_col < end_col; src_col++)
           sum += src_data[row * src_stride + src_col];
         data[row * this_stride + col] = sum;
       }
     }
   }
 }

◆ SymAddMat2()

void SymAddMat2	(	const Real	alpha,
		const CuMatrixBase< Real > &	M,
		MatrixTransposeType	transA,
		Real	beta
	)

*this = beta * *this + alpha * M M^T (or alpha * M^T M when transA == kTrans), for symmetric matrices.

It only updates the lower triangle of *this. It will leave the matrix asymmetric; if you need it symmetric as a regular matrix, do CopyLowerToUpper().

Definition at line 1353 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::ApplyLog(), CuMatrixBase< float >::Cholesky(), kaldi::nnet3::ConstrainOrthonormalInternal(), kaldi::nnet2::PreconditionDirections(), OnlineNaturalGradient::ReorthogonalizeRt1(), OnlinePreconditioner::ReorthogonalizeXt1(), kaldi::TestSymInvertPosDef(), kaldi::UnitTestCuCholesky(), kaldi::UnitTestCuMatrixSymAddMat2(), and kaldi::UnitTestCuMatrixSymInvertPosDef().

                {
   // Body of SymAddMat2(alpha, A, transA, beta): symmetric rank-k style
   // update of *this from A, via cublas_syrk (GPU) or Mat().SymAddMat2() (CPU).
   // *this must be square, and its size must match A's row count (kNoTrans)
   // or A's column count (kTrans).
   KALDI_ASSERT(num_rows_ == num_cols_ &&
                ((transA == kNoTrans && A.num_rows_ == num_rows_) ||
                 (transA == kTrans && A.num_cols_ == num_cols_)));
   if (num_rows_ == 0) return;
   // A must not share its data pointer with *this.
   KALDI_ASSERT(A.data_ != data_);
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // NOTE(review): the operation flag is deliberately the opposite of
     // transA, and the fill mode is UPPER although the documented effect is on
     // the lower triangle -- presumably because cuBLAS is column-major while
     // this matrix is stored row-major; confirm before changing either.
     cublasOperation_t trans = (transA == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T);
     // The dimension of A that is summed over.
     MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_);
     CUBLAS_SAFE_CALL(cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER,
                                  trans, num_rows_, A_other_dim,
                                  alpha, A.Data(), A.Stride(),
                                  beta, this->data_, this->stride_));
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().SymAddMat2(alpha, A.Mat(), transA, beta);
   }
 }

◆ SymInvertPosDef()

void SymInvertPosDef ( )

Inversion for positive definite symmetric matrices.

Treats the input as symmetric but only reads the lower triangle. The output is symmetric.

Definition at line 2111 of file cu-matrix.cc.

Referenced by CuSpMatrix< Real >::Invert(), kaldi::nnet2::PreconditionDirections(), CuMatrixBase< float >::SizeInBytes(), kaldi::TestSymInvertPosDef(), kaldi::UnitInvert(), kaldi::UnitTestCuMatrixSymInvertPosDef(), and kaldi::UnitTestInvert().

                                          {
   // Body of SymInvertPosDef(): in-place inversion of a symmetric
   // positive-definite matrix. The matrix must be square; an empty matrix is
   // a no-op.
   KALDI_ASSERT(num_rows_ == num_cols_);
   if (num_rows_ == 0) return;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // Scratch matrix handed to Cholesky(); judging by its name it receives
     // the inverse of the Cholesky factor -- TODO confirm against Cholesky().
     CuMatrix<Real> inv_cholesky(num_rows_, num_rows_);
     this->Cholesky(&inv_cholesky);
     // note: SymAddMat2 only updates lower part of *this.
     // With alpha = 1 and beta = 0 the previous contents of *this are
     // replaced by the product formed from inv_cholesky.
     this->SymAddMat2(1.0, inv_cholesky, kTrans, 0.0);
     // Mirror the lower triangle so the result is fully symmetric.
     this->CopyLowerToUpper();
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU path: take the lower triangle as a packed symmetric matrix,
     // Cholesky-factorize it, invert the triangular factor, rebuild the
     // symmetric result from that factor, and copy it back into *this.
     SpMatrix<Real> temp_sp(this->Mat(), kTakeLower);
     TpMatrix<Real> C(temp_sp.NumRows(), kUndefined);
     C.Cholesky(temp_sp);
     C.Invert();
     temp_sp.AddTp2(1.0, C, kTrans, 0.0);
     this->Mat().CopyFromSp(temp_sp);
     // was previously just: CuSpMatrix::Invert().
   }
 }

◆ Tanh()

void Tanh ( const CuMatrixBase< Real > & src )

Compute the hyperbolic tangent (tanh) function; element by element, *this = tanh(src).

Definition at line 1786 of file cu-matrix.cc.

Referenced by LstmNonlinearityComponent::ConsolidateMemory(), TanhComponent::Propagate(), Tanh::PropagateFnc(), CuMatrixBase< float >::SizeInBytes(), and kaldi::UnitTestCuTanh().

                                                            {
   // Body of Tanh(src): writes into *this the result of applying cuda_tanh
   // (GPU) or Mat().Tanh() (CPU) to src.
   // *this and src must have the same dimensions.
   KALDI_ASSERT(SameDim(*this, src));
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     dim3 dimGrid, dimBlock;
     GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                           &dimGrid, &dimBlock);
 
     // The output is described by this->Dim(); src contributes only its
     // data pointer and stride.
     cuda_tanh(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride());
     CU_SAFE_CALL(cudaGetLastError());
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     Mat().Tanh(src.Mat());
   }
 }

◆ Trace()

Real Trace ( bool check_square = true ) const

Return the trace. If check_square = true, will crash if matrix is not square.

Definition at line 3075 of file cu-matrix.cc.

Referenced by kaldi::nnet3::ConstrainOrthonormalInternal(), kaldi::CuVectorUnitTestCopyDiagFromMat(), and CuMatrixBase< float >::operator()().

                                                       {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     if (check_square) KALDI_ASSERT(this->num_rows_ == this->num_cols_);
     MatrixIndexT dim = std::min(this->num_rows_, this->num_cols_);
     CuVector<Real> tmp(1, kUndefined); // for result.
     int dimBlock(CU1DBLOCK);
     int dimGrid = 1;// only 1 block here. we have loops in each thread  //(n_blocks(dim_, CU1DBLOCK));
     cuda_vec_sum(dimGrid, dimBlock, data_, tmp.Data(), dim, Stride() + 1);
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim);
     return tmp(0);
   } else
 #endif
   {
     return Mat().Trace(check_square);
   }
 }

◆ Write()

void Write	(	std::ostream &	os,
		bool	binary
	)		const

Definition at line 502 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::operator()(), kaldi::UnitTestCuMatrixIO(), and RestrictedAttentionComponent::Write().

                                                                   {
   // Body of Write(os, binary): copies the matrix into a temporary CPU
   // Matrix<Real> of the same dimensions and delegates to its Write().
   // kUndefined: the temporary is fully overwritten by CopyToMat() below.
   Matrix<Real> temp(this->num_rows_, this->num_cols_, kUndefined);
   this->CopyToMat(&temp);
   temp.Write(os, binary);
 }

Friends And Related Function Documentation

◆ AddMatMatBatched

void AddMatMatBatched	(	const Real	alpha,
		std::vector< CuSubMatrix< Real > * > &	C,
		const std::vector< CuSubMatrix< Real > * > &	A,
		MatrixTransposeType	transA,
		const std::vector< CuSubMatrix< Real > * > &	B,
		MatrixTransposeType	transB,
		const Real	beta
	)

friend

Does multiple matrix multiplications, executing them in parallel using cuBLAS's gemmBatched if we are using a GPU.

Vectors A, B and C must have the same length; for each i, this function executes the matrix operation C[i] = alpha * A[i](^T)*B[i](^T) + beta * C[i].

Definition at line 2207 of file cu-matrix.cc.

                                                                  {
   // Body of AddMatMatBatched(alpha, C, A, transA, B, transB, beta): for
   // each i, C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i], done with one
   // cublas_gemmBatched call on the GPU or a per-element loop on the CPU.
   KALDI_ASSERT(A.size() == B.size() && B.size() == C.size());
   int32 size = A.size();
 
   if (size == 0) return;
 
   // all elements must have the same num-rows, num-cols and stride
   for (int32 i = 0; i + 1 < size; i++) {
     KALDI_ASSERT(A[i]->NumRows() == A[i+1]->NumRows());
     KALDI_ASSERT(A[i]->NumCols() == A[i+1]->NumCols());
     KALDI_ASSERT(A[i]->Stride() == A[i+1]->Stride());
     KALDI_ASSERT(B[i]->NumRows() == B[i+1]->NumRows());
     KALDI_ASSERT(B[i]->NumCols() == B[i+1]->NumCols());
     KALDI_ASSERT(B[i]->Stride() == B[i+1]->Stride());
     KALDI_ASSERT(C[i]->NumRows() == C[i+1]->NumRows());
     KALDI_ASSERT(C[i]->NumCols() == C[i+1]->NumCols());
     KALDI_ASSERT(C[i]->Stride() == C[i+1]->Stride());
   }
   // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping?
   // keep trans..., just swap A&B matrices: A->B B->A
   // m, n: columns and rows of the result; k and k1: the inner dimension as
   // seen from B and from A respectively, which must agree.
   MatrixIndexT m = ((transB==kTrans)? B[0]->NumRows() : B[0]->NumCols());
   MatrixIndexT n = ((transA==kTrans)? A[0]->NumCols() : A[0]->NumRows());
   MatrixIndexT k = ((transB==kTrans)? B[0]->NumCols() : B[0]->NumRows());
   MatrixIndexT k1 = ((transA==kTrans)? A[0]->NumRows() : A[0]->NumCols());
 
   KALDI_ASSERT(m == C[0]->NumCols());
   KALDI_ASSERT(n == C[0]->NumRows());
   KALDI_ASSERT(k == k1);
 
   // Empty output matrices: nothing to compute.
   if (m == 0) return;
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
     // One device allocation of 3 * size pointers, partitioned into the
     // A pointers, then the B pointers, then the C pointers.
     Real **device_abc_array =
         static_cast<Real**>(CuDevice::Instantiate().Malloc(3 * size * sizeof(Real*)));
     const Real **device_a_array = const_cast<const Real**>(device_abc_array);
     const Real **device_b_array = const_cast<const Real**>(device_abc_array) + size;
     Real **device_c_array = device_abc_array + 2 * size;
     // Host-side mirror with the same three-way layout.
     const Real **host_abc_array = new const Real*[3*size];
     const Real **host_a_array = host_abc_array;
     const Real **host_b_array = host_abc_array + size;
     const Real **host_c_array = host_abc_array + 2 * size;
 
     for (int32 i = 0; i < size; i++) {
       host_a_array[i] = A[i]->data_;
       host_b_array[i] = B[i]->data_;
       host_c_array[i] = C[i]->data_;
     }
 
     // Upload all three pointer tables with a single asynchronous copy.
     CU_SAFE_CALL(cudaMemcpyAsync(device_abc_array, host_abc_array,
                                  3*size*sizeof(Real*), cudaMemcpyHostToDevice,
                                  cudaStreamPerThread));
 
     // B and its transpose flag are passed first, then A, per the
     // row-major/col-major swap described above.
     CUBLAS_SAFE_CALL(cublas_gemmBatched(GetCublasHandle(),
                                         (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
                                         (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N),
                                         m, n, k, alpha, device_b_array,
                                         B[0]->Stride(), device_a_array,
                                         A[0]->Stride(), beta, device_c_array,
                                         C[0]->Stride(), size));
 
     // NOTE(review): both pointer tables are released right after the
     // asynchronous copy and the batched call are issued, with no explicit
     // synchronization in this block -- confirm that the allocator and the
     // pageable-memory copy semantics make this safe.
     CuDevice::Instantiate().Free(device_abc_array);
     delete[] host_abc_array;
 
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     // CPU path: one ordinary AddMatMat per batch element.
     for (int32 i = 0; i < size; i++) {
       C[i]->Mat().AddMatMat(alpha, A[i]->Mat(), transA, B[i]->Mat(), transB, beta);
     }
   }
 }

◆ CuBlockMatrix< Real >

friend class CuBlockMatrix< Real >

friend

Definition at line 93 of file cu-matrix.h.

◆ CuMatrixBase< double >

friend class CuMatrixBase< double >

friend

Definition at line 82 of file cu-matrix.h.

◆ CuMatrixBase< float >

friend class CuMatrixBase< float >

friend

Definition at line 81 of file cu-matrix.h.

◆ CuRand< Real >

friend class CuRand< Real >

friend

Definition at line 91 of file cu-matrix.h.

◆ CuSparseMatrix< double >

friend class CuSparseMatrix< double >

friend

Definition at line 95 of file cu-matrix.h.

◆ CuSparseMatrix< float >

friend class CuSparseMatrix< float >

friend

Definition at line 94 of file cu-matrix.h.

◆ CuSparseMatrix< Real >

friend class CuSparseMatrix< Real >

friend

Definition at line 96 of file cu-matrix.h.

◆ CuSpMatrix< Real >

friend class CuSpMatrix< Real >

friend

Definition at line 86 of file cu-matrix.h.

◆ CuSubMatrix< Real >

friend class CuSubMatrix< Real >

friend

Definition at line 90 of file cu-matrix.h.

◆ CuSubVector< Real >

friend class CuSubVector< Real >

friend

Definition at line 92 of file cu-matrix.h.

◆ CuTpMatrix< double >

friend class CuTpMatrix< double >

friend

Definition at line 88 of file cu-matrix.h.

◆ CuTpMatrix< float >

friend class CuTpMatrix< float >

friend

Definition at line 87 of file cu-matrix.h.

◆ CuVectorBase< double >

friend class CuVectorBase< double >

friend

Definition at line 84 of file cu-matrix.h.

◆ CuVectorBase< float >

friend class CuVectorBase< float >

friend

Definition at line 83 of file cu-matrix.h.

◆ CuVectorBase< Real >

friend class CuVectorBase< Real >

friend

Definition at line 89 of file cu-matrix.h.

◆ TraceMatMat

Real TraceMatMat	(	const CuMatrixBase< Real > &	A,
		const CuMatrixBase< Real > &	B,
		MatrixTransposeType	trans
	)

friend

Definition at line 2145 of file cu-matrix.cc.

Referenced by CuMatrixBase< float >::FrobeniusNorm().

                                           {
   // Body of TraceMatMat(A, B, trans): returns a scalar computed from A and
   // B by cuda_trace_mat_mat / cuda_trace_mat_mat_trans on the GPU, or by the
   // CPU TraceMatMat() overload otherwise.
   // If A has no rows, B must have none either and the result is 0.
   if (A.num_rows_ == 0) {
     KALDI_ASSERT(B.num_rows_ == 0);
     return 0.0;
   }
   Real result = 0;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // kNoTrans: B must have the dimensions of A transposed;
     // otherwise B must have the same dimensions as A.
     if (trans == kNoTrans) {
       KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());
     } else {
       KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
     }
     CuTimer tim;
     // 2D blocks: each (8x32) block sums up (32x32) elements.
     // 2D grid: try to cover all the matrix A unless it is too big.
     // Kernel will reduce to ~256 elements with good performance,
     // if the matrix is not in a very bad shape.
     // (wider or taller than 32x8192)
     // CPU will then reduce to 1 element.
     const int kWarpSize = 32;
     dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize);
     dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize),
         n_blocks(A.NumRows(), kWarpSize));
     // Cap the grid at roughly 256 blocks by shrinking its y extent, but
     // never below one block in y.
     if (dimGrid.x * dimGrid.y > 256) {
       dimGrid.y = 256 / dimGrid.x;
       if (dimGrid.y == 0) {
         dimGrid.y = 1;
       }
     }
     // One partial result per block.
     CuVector<Real> result_vec(dimGrid.x * dimGrid.y, kUndefined);
     if (trans == kNoTrans) {
       cuda_trace_mat_mat(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(),
           B.Stride(), result_vec.Data());
     } else {
       cuda_trace_mat_mat_trans(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(),
           B.Stride(), result_vec.Data());
     }
     CU_SAFE_CALL(cudaGetLastError());
     Vector<Real> result_cpu(result_vec); // copying from CUDA faster than summing in CUDA.
     result = result_cpu.Sum();
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     result = TraceMatMat(A.Mat(), B.Mat(), trans);
   }
   return result;
 }

◆ TraceMatSmat

Real TraceMatSmat	(	const CuMatrixBase< Real > &	A,
		const CuSparseMatrix< Real > &	B,
		MatrixTransposeType	trans
	)

friend

Definition at line 524 of file cu-sparse-matrix.cc.

                                            {
   // Body of TraceMatSmat(A, B, trans): returns a scalar computed from the
   // dense matrix A and the sparse matrix B, via cuda_trace_mat_smat /
   // cuda_trace_mat_smat_trans on the GPU or the CPU TraceMatSmat() overload.
   // If A has no columns, B must have none either and the result is 0.
   if (A.NumCols() == 0) {
     KALDI_ASSERT(B.NumCols() == 0);
     return 0.0;
   }
   // A sparse matrix with no stored elements contributes nothing.
   if (B.NumElements() == 0) {
     return 0.0;
   }
   Real result = 0;
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     // kTrans: B must have the same dimensions as A;
     // otherwise B must have the dimensions of A transposed.
     if (trans == kTrans) {
       KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
     } else {
       KALDI_ASSERT(A.NumCols() == B.NumRows() && A.NumRows() == B.NumCols());
     }
 
     // The Sum() method in CuVector handles a bunch of logic, we use that to
     // compute the trace.
     // One slot per stored element of B.
     CuVector<Real> sum_vec(B.NumElements());
     CuTimer tim;
 
     // We use warpSize threads per row to access only the nnz elements.
     // Every CU1DBLOCK/warpSize rows share one thread block.
     // 1D grid to cover all rows of B.
     const int warpSize = 32;
     dim3 dimBlock(warpSize, CU1DBLOCK / warpSize);
     dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y));
 
     // NOTE(review): unlike the dense kernels, no cudaGetLastError() check
     // follows these launches -- confirm whether that is intentional.
     if (trans == kNoTrans) {
       cuda_trace_mat_smat(dimGrid, dimBlock, A.Data(), A.Dim(), B.CsrRowPtr(),
                           B.CsrColIdx(), B.CsrVal(), sum_vec.Data());
     } else {
       cuda_trace_mat_smat_trans(dimGrid, dimBlock, A.Data(), A.Dim(),
                                 B.CsrRowPtr(), B.CsrColIdx(), B.CsrVal(),
                                 sum_vec.Data());
     }
     result = sum_vec.Sum();
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
 #endif
   {
     result = TraceMatSmat(A.Mat(), B.Smat(), trans);
   }
   return result;
 }

◆ VectorBase< Real >

friend class VectorBase< Real >

friend

Definition at line 85 of file cu-matrix.h.

Member Data Documentation

◆ data_

Real* data_

protected

GPU data pointer (or regular matrix data pointer, if either CUDA was not compiled in or we could not acquire the device).

Definition at line 777 of file cu-matrix.h.

◆ num_cols_

MatrixIndexT num_cols_

protected

Definition at line 785 of file cu-matrix.h.

Referenced by CuSubMatrix< Real >::CuSubMatrix(), CuMatrixBase< float >::Dim(), CuMatrixBase< float >::NumCols(), CuMatrixBase< float >::operator()(), CuMatrixBase< float >::RowRange(), CuMatrixBase< float >::SetMatMatDivMat(), CuMatrix< float >::Swap(), and CuMatrixBase< float >::SymAddMat2().

◆ num_rows_

MatrixIndexT num_rows_

protected

Definition at line 786 of file cu-matrix.h.

◆ stride_

MatrixIndexT stride_

protected

Definition at line 787 of file cu-matrix.h.

Referenced by CuMatrixBase< float >::AddRowRanges(), CuSubMatrix< Real >::CuSubMatrix(), CuMatrixBase< float >::Dim(), CuMatrixBase< float >::operator()(), CuMatrixBase< float >::Row(), CuMatrixBase< float >::RowData(), CuMatrixBase< float >::SizeInBytes(), CuMatrixBase< float >::Stride(), CuMatrixBase< float >::SumColumnRanges(), and CuMatrix< float >::Swap().

The documentation for this class was generated from the following files:

matrix/matrix-common.h
cudamatrix/cu-matrix.h
cudamatrix/cu-matrix.cc

Public Member Functions

Protected Member Functions

Protected Attributes

Private Member Functions

Friends

Detailed Description

template<typename Real> class kaldi::CuMatrixBase< Real >

Constructor & Destructor Documentation

◆ CuMatrixBase() [1/2]

◆ CuMatrixBase() [2/2]

Member Function Documentation

◆ Add()

◆ AddCols()

◆ AddDiagVecMat()

◆ AddElements() [1/2]

◆ AddElements() [2/2]

◆ AddMat()

◆ AddMatBlock()

◆ AddMatBlocks()

◆ AddMatDiagVec()

◆ AddMatMat()

◆ AddMatMatElements()

◆ AddMatSmat()

◆ AddMatSp()

◆ AddMatTp()

◆ AddRowRanges()

◆ AddRows() [1/2]

◆ AddRows() [2/2]

◆ AddSmat()

◆ AddSmatMat()

◆ AddSpMat()

◆ AddToDiag()

◆ AddToElements()

◆ AddToRows() [1/2]

◆ AddToRows() [2/2]

◆ AddTpMat()

◆ AddVecToCols()

◆ AddVecToRows()

◆ AddVecVec()

◆ ApplyCeiling()

◆ ApplyExp()

◆ ApplyExpLimited()

◆ ApplyExpSpecial()

◆ ApplyFloor()

◆ ApplyHeaviside()

◆ ApplyLog()

◆ ApplyLogSoftMaxPerRow()

◆ ApplyPow()

◆ ApplyPowAbs()

◆ ApplySoftMaxPerRow()

◆ ApproxEqual()

◆ Ceiling()

◆ Cholesky()

◆ ColRange()

◆ CopyColFromVec()

◆ CopyCols()

◆ CopyColsFromVec()

◆ CopyFromBlock()

◆ CopyFromGeneralMat()

◆ CopyFromMat() [1/3]

◆ CopyFromMat() [2/3]

◆ CopyFromMat() [3/3]

◆ CopyFromSp()

◆ CopyFromTp()

◆ CopyLowerToUpper()

◆ CopyRangeFromMatClamped()

◆ CopyRows() [1/2]

◆ CopyRows() [2/2]

◆ CopyRowsFromVec() [1/2]

◆ CopyRowsFromVec() [2/2]

◆ CopyToMat()

◆ CopyToRows()

◆ CopyUpperToLower()

◆ Data() [1/2]

◆ Data() [2/2]

◆ DiffGroupPnorm()

◆ DiffLogSoftmaxPerRow()

◆ DiffParametricRelu()

◆ DiffSigmoid()

◆ DiffSoftmaxPerRow()

template<typename Real>
class kaldi::CuMatrixBase< Real >