28 rank_(40), update_period_(1), num_samples_history_(2000.0),
29 num_minibatches_history_(0.0), alpha_(4.0),
30 epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0),
31 self_debug_(false), rho_t_(-1.0e+10) { }
50 std::vector<MatrixElement<BaseFloat> > elems;
51 elems.reserve(num_cols);
53 for (
int32 r = 0; r < num_rows; r++) {
54 std::vector<int32> cols;
55 for (
int32 c = r; c < num_cols; c += num_rows)
57 BaseFloat normalizer = 1.0 / sqrt(first_elem * first_elem +
59 for (
size_t i = 0;
i < cols.size();
i++) {
62 normalizer * (
i == 0 ? first_elem :
73 KALDI_WARN <<
"Rank " <<
rank_ <<
" of online preconditioner is >= dim " << D
75 << (D - 1) <<
" (but this is probably still too high)";
118 W_t_.Scale(sqrt(E_tii));
140 int32 num_init_iters;
148 for (
int32 i = 0;
i < num_init_iters;
i++) {
150 X0_copy.CopyFromMat(X0);
178 WJKL_t.
Range(0, R, 0, D).CopyFromMat(
W_t_);
188 updating, d_t, &WJKL_t, X_t);
191 if (initial_product <= 0.0) {
195 *scale = sqrt(initial_product / final_product);
215 ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
225 O(
i,
j) *= i_factor * j_factor;
228 if (O.
IsUnit(threshold)) {
230 KALDI_WARN <<
"Not reorthogonalizing since already orthognoal: " << O;
235 bool cholesky_ok =
true;
240 if (!(C.
Max() < 100.0)) {
241 KALDI_WARN <<
"Cholesky out of expected range, " 242 <<
"reorthogonalizing with Gram-Schmidt";
248 KALDI_WARN <<
"Cholesky or Invert() failed while re-orthogonalizing R_t. " 249 <<
"Re-orthogonalizing on CPU.";
269 C(i,
j) *= i_factor * j_factor;
299 O(
i,
j) *= i_factor * j_factor;
302 if (!O.
IsUnit(1.0e-04) || O(0, 0) != O(0, 0)) {
304 int32 worst_i = 0, worst_j = 0;
308 BaseFloat error = fabs(elem - (
i ==
j ? 1.0 : 0.0));
309 if (error > worst_error || error != error) {
316 if (worst_error > 1.0e-02 || worst_error != worst_error) {
317 KALDI_WARN <<
"Failed to verify W_t (worst error: O[" << worst_i <<
',' 318 << worst_j <<
"] = " << O(worst_i, worst_j)
319 <<
", d_t = " <<
d_t_;
343 L_t(*WJKL_t, 0, R, D, R),
344 K_t(*WJKL_t, R, R, D, R),
345 WJ_t(*WJKL_t, 0, 2 * R, 0, D),
346 LK_t(*WJKL_t, 0, 2 * R, D, R);
359 bool compute_lk_together = (N > D);
361 if (compute_lk_together) {
370 K_t.SymAddMat2(1.0, J_t,
kNoTrans, 0.0);
371 L_t.SymAddMat2(1.0, H_t,
kTrans, 0.0);
376 K_t_cpu(LK_cpu, R, R, 0, R);
377 if (!compute_lk_together) {
379 L_t_cpu.CopyLowerToUpper();
386 ComputeEt(d_t, beta_t, &e_t, &sqrt_e_t, &inv_sqrt_e_t);
392 ComputeZt(N, rho_t, d_t, inv_sqrt_e_t, K_t_cpu, L_t_cpu, &Z_t_double);
394 Z_t_double.
Scale(1.0 / z_t_scale);
400 Z_t_scaled.
Eig(&c_t, &U_t);
402 c_t.
Scale(z_t_scale);
404 const BaseFloat condition_threshold = 1.0e+06;
408 bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1));
410 BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2);
414 must_reorthogonalize =
true;
416 KALDI_WARN <<
"Floored " << nf <<
" elements of C_t.";
425 BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_X_Xt
426 + (1-eta)*(D * rho_t + d_t.
Sum())
432 if (rho_t1 < floor_val)
437 ComputeWt1(N, d_t, d_t1, rho_t, rho_t1, U_t, sqrt_c_t, inv_sqrt_e_t,
440 if (must_reorthogonalize) {
452 d_t_.CopyFromVec(d_t1);
462 const int num_initial_updates = 10;
465 (
t_ <= num_initial_updates ||
479 if (ans > 0.9) ans = 0.9;
504 ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1);
510 w_t_coeff(
i) = (1.0 - eta) / (eta/N) * (d_t(
i) + rho_t);
518 BaseFloat i_factor = (eta / N) * sqrt_e_t1(
i) * inv_sqrt_c_t(
i);
521 A_t(
i,
j) *= i_factor * j_factor;
540 d_t_rho_t.
Add(rho_t);
541 double etaN = eta / N, eta1 = 1.0 - eta,
542 etaN_sq = etaN * etaN, eta1_sq = eta1 * eta1,
543 etaN_eta1 = etaN * eta1;
546 double inv_sqrt_e_t_i = inv_sqrt_e_t(
i), d_t_rho_t_i = d_t_rho_t(
i);
548 double inv_sqrt_e_t_j = inv_sqrt_e_t(
j), d_t_rho_t_j = d_t_rho_t(
j),
549 L_t_i_j = 0.5 * (L_t(
i,
j) + L_t(
j,
i)),
550 K_t_i_j = 0.5 * (K_t(
i,
j) + K_t(
j,
i));
552 (*Z_t)(
i,
j) = etaN_sq * inv_sqrt_e_t_i * K_t_i_j * inv_sqrt_e_t_j
553 + etaN_eta1 * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j * d_t_rho_t_j
554 + etaN_eta1 * d_t_rho_t_i * inv_sqrt_e_t_i * L_t_i_j * inv_sqrt_e_t_j
555 + (i ==
j ? eta1_sq * d_t_rho_t_i * d_t_rho_t_i : 0.0);
570 e[
i] = 1.0 / (beta_t / d[
i] + 1);
614 num_samples_history < 1.0e+6);
void CopyFromMat(const MatrixBase< OtherReal > &src, MatrixTransposeType trans=kNoTrans)
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
static void InitOrthonormalSpecial(CuMatrixBase< BaseFloat > *R)
This function creates a matrix with orthonormal rows that is like the following two-row matrix (rows separated by ';'), except with each row normalized to have unit 2-norm: [ 1.1 0 1 0 1 0 ; 0 1.1 0 1 0 1 ]. The reason why the first element in each row is 1.1 and not 1, is for symmetry-breaking...
bool IsUnit(Real cutoff=1.0e-05) const
CuMatrix< BaseFloat > W_t_
BaseFloat Eta(int32 N) const
BaseFloat num_minibatches_history_
Packed symmetric matrix class.
void InitDefault(int32 D)
void ReorthogonalizeRt1(const VectorBase< BaseFloat > &d_t1, BaseFloat rho_t1, CuMatrixBase< BaseFloat > *W_t1, CuMatrixBase< BaseFloat > *temp_W, CuMatrixBase< BaseFloat > *temp_O)
Base class which provides matrix operations not involving resizing or allocation. ...
void SetUpdatePeriod(int32 update_period)
void SetNumSamplesHistory(BaseFloat num_samples_history)
OnlineNaturalGradient & operator=(const OnlineNaturalGradient &other)
void AddElements(Real alpha, const std::vector< MatrixElement< Real > > &input)
CuSubMatrix< Real > Range(const MatrixIndexT row_offset, const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols) const
void swap(basic_filebuf< CharT, Traits > &x, basic_filebuf< CharT, Traits > &y)
void Eig(VectorBase< Real > *s, MatrixBase< Real > *P=NULL) const
Solves the symmetric eigenvalue problem: at end we should have (*this) = P * diag(s) * P^T...
Keywords for search: natural gradient, naturalgradient, NG-SGD.
This class represents a matrix that's stored on the GPU if we have one, and in memory if not...
void ApplyFloor(Real floor_val, MatrixIndexT *floored_count=nullptr)
Applies floor to all elements.
void ComputeWt1(int32 N, const VectorBase< BaseFloat > &d_t, const VectorBase< BaseFloat > &d_t1, BaseFloat rho_t, BaseFloat rho_t1, const MatrixBase< BaseFloat > &U_t, const VectorBase< BaseFloat > &sqrt_c_t, const VectorBase< BaseFloat > &inv_sqrt_e_t, const CuMatrixBase< BaseFloat > &W_t, CuMatrixBase< BaseFloat > *J_t, CuMatrixBase< BaseFloat > *W_t1) const
void CopyFromVec(const VectorBase< Real > &v)
Copy data from another vector (must match own size).
void Cholesky(const SpMatrix< Real > &orig)
void SetZero()
Math operations, some calling kernels.
void SymAddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transA, Real beta)
*this = beta * *this + alpha * M M^T, for symmetric matrices.
void CopyFromTp(const TpMatrix< OtherReal > &M, MatrixTransposeType trans=kNoTrans)
Copy given tpmatrix. (no resize is done).
Real Max() const
Returns the maximum value of any element, or -infinity for the empty vector.
void PreconditionDirections(CuMatrixBase< BaseFloat > *X, BaseFloat *scale)
This call implements the main functionality of this class.
Packed symmetric matrix class.
void AddMatMat(Real alpha, const CuMatrixBase< Real > &A, MatrixTransposeType transA, const CuMatrixBase< Real > &B, MatrixTransposeType transB, Real beta)
C = alpha * A(^T)*B(^T) + beta * C.
Real TraceMatMat(const MatrixBase< Real > &A, const MatrixBase< Real > &B, MatrixTransposeType trans)
We need to declare this here as it will be a friend function.
This class is used for a piece of a CuMatrix.
Real * Data()
Returns a pointer to the start of the vector's data.
MatrixIndexT Dim() const
Returns the dimension of the vector.
void ComputeEt(const VectorBase< BaseFloat > &d_t, BaseFloat beta_t, VectorBase< BaseFloat > *e_t, VectorBase< BaseFloat > *sqrt_e_t, VectorBase< BaseFloat > *inv_sqrt_e_t) const
void Scale(Real alpha)
Multiplies all elements by this constant.
void Swap(OnlineNaturalGradient *other)
Real Sum() const
Returns sum of the elements.
void SetAlpha(BaseFloat alpha)
Matrix for CUDA computing.
MatrixIndexT NumCols() const
A class representing a vector.
void InvertElements()
Invert all elements.
#define KALDI_ASSERT(cond)
void ApplyPow(Real power)
Take all elements of vector to a power.
void ComputeZt(int32 N, BaseFloat rho_t, const VectorBase< BaseFloat > &d_t, const VectorBase< BaseFloat > &inv_sqrt_e_t, const MatrixBase< BaseFloat > &K_t, const MatrixBase< BaseFloat > &L_t, SpMatrix< double > *Z_t) const
void SetNumMinibatchesHistory(BaseFloat num_minibatches_history)
BaseFloat num_samples_history_
void OrthogonalizeRows()
This function orthogonalizes the rows of a matrix using the Gram-Schmidt process. ...
MatrixIndexT NumRows() const
Dimensions.
Provides a vector abstraction class.
void Add(Real c)
Add a constant to each element of a vector.
void Init(const CuMatrixBase< BaseFloat > &R0)
void PreconditionDirectionsInternal(const BaseFloat rho_t, const BaseFloat tr_X_Xt, bool updating, const Vector< BaseFloat > &d_t, CuMatrixBase< BaseFloat > *WJKL_t, CuMatrixBase< BaseFloat > *X_t)
void MulRowsVec(const CuVectorBase< Real > &scale)
scale i'th row by scale[i]
Sub-matrix representation.
void AddMat2(const Real alpha, const CuMatrixBase< Real > &M, MatrixTransposeType transM, const Real beta)
void SortSvd(VectorBase< Real > *s, MatrixBase< Real > *U, MatrixBase< Real > *Vt, bool sort_on_absolute_value)
Function to ensure that SVD is sorted.
void CopyLowerToUpper()
Copy lower triangle to upper triangle (symmetrize)
void AddDiagVecMat(const Real alpha, const CuVectorBase< Real > &v, const CuMatrixBase< Real > &M, MatrixTransposeType transM, Real beta=1.0)
*this = beta * *this + alpha * diag(v) * M [or M^T].