Functions

void GetAttentionDotProductsSimple (BaseFloat alpha, const CuMatrixBase< BaseFloat > &A, const CuMatrixBase< BaseFloat > &B, CuMatrixBase< BaseFloat > *C)

void ApplyScalesToOutputSimple (BaseFloat alpha, const CuMatrixBase< BaseFloat > &B, const CuMatrixBase< BaseFloat > &C, CuMatrixBase< BaseFloat > *A)

void ApplyScalesToInputSimple (BaseFloat alpha, const CuMatrixBase< BaseFloat > &A, const CuMatrixBase< BaseFloat > &C, CuMatrixBase< BaseFloat > *B)

void UnitTestAttentionDotProductAndAddScales ()

void TestAttentionForwardBackward ()

void UnitTestAttention ()

void GetAttentionDotProducts (BaseFloat alpha, const CuMatrixBase< BaseFloat > &A, const CuMatrixBase< BaseFloat > &B, CuMatrixBase< BaseFloat > *C)
    This function is a utility function that is at the core of how we implement attention.

void ApplyScalesToOutput (BaseFloat alpha, const CuMatrixBase< BaseFloat > &B, const CuMatrixBase< BaseFloat > &C, CuMatrixBase< BaseFloat > *A)
    This function is related to GetAttentionDotProducts(); it is used in scaling the values by the softmax scales, and in backprop.

void ApplyScalesToInput (BaseFloat alpha, const CuMatrixBase< BaseFloat > &A, const CuMatrixBase< BaseFloat > &C, CuMatrixBase< BaseFloat > *B)
    This function is related to GetAttentionDotProducts(); it is used in backprop.

void AttentionForward (BaseFloat key_scale, const CuMatrixBase< BaseFloat > &keys, const CuMatrixBase< BaseFloat > &queries, const CuMatrixBase< BaseFloat > &values, CuMatrixBase< BaseFloat > *c, CuMatrixBase< BaseFloat > *output)
    This is a higher-level interface to the attention code.

void AttentionBackward (BaseFloat key_scale, const CuMatrixBase< BaseFloat > &keys, const CuMatrixBase< BaseFloat > &queries, const CuMatrixBase< BaseFloat > &values, const CuMatrixBase< BaseFloat > &c, const CuMatrixBase< BaseFloat > &output_deriv, CuMatrixBase< BaseFloat > *keys_deriv, CuMatrixBase< BaseFloat > *queries_deriv, CuMatrixBase< BaseFloat > *values_deriv)
    Performs the backward pass corresponding to 'AttentionForward', propagating the derivative back to the keys, queries and values.
void ApplyScalesToInput (BaseFloat alpha,
                         const CuMatrixBase< BaseFloat > &A,
                         const CuMatrixBase< BaseFloat > &C,
                         CuMatrixBase< BaseFloat > *B)
This function is related to GetAttentionDotProducts(); it is used in backprop.

We have put A, B and C in an unusual order here to make the relationship with GetAttentionDotProducts() clearer: the matrices have the same dimensional relationship as A, B and C do in GetAttentionDotProducts().

This function implements:

B->Row(i + j * row_shift) += alpha * C(i, j) * A.Row(i).
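For illustration, here is a minimal CPU reference sketch of that operation, in the spirit of the *Simple reference versions in attention-test.cc. The name ApplyScalesToInputRef is invented for this example, and it uses plain Matrix< BaseFloat > (which permits elementwise access) rather than CuMatrixBase; the actual implementation expresses the inner loop via AddDiagVecMat().

    #include "matrix/kaldi-matrix.h"

    namespace kaldi {

    // Hypothetical CPU reference version (name invented for this example).
    // Implements: B->Row(i + j * row_shift) += alpha * C(i, j) * A.Row(i).
    void ApplyScalesToInputRef(BaseFloat alpha,
                               const MatrixBase<BaseFloat> &A,
                               const MatrixBase<BaseFloat> &C,
                               MatrixBase<BaseFloat> *B) {
      KALDI_ASSERT(A.NumCols() == B->NumCols() && A.NumRows() == C.NumRows());
      int32 num_extra_rows = B->NumRows() - A.NumRows();
      KALDI_ASSERT(num_extra_rows > 0 &&
                   num_extra_rows % (C.NumCols() - 1) == 0);
      int32 row_shift = num_extra_rows / (C.NumCols() - 1);
      for (int32 i = 0; i < A.NumRows(); i++)
        for (int32 j = 0; j < C.NumCols(); j++)
          B->Row(i + j * row_shift).AddVec(alpha * C(i, j), A.Row(i));
    }

    }  // namespace kaldi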
Definition at line 76 of file attention.cc.
References CuMatrixBase< Real >::AddDiagVecMat(), KALDI_ASSERT, kaldi::kNoTrans, kaldi::kTrans, CuMatrixBase< Real >::NumCols(), and CuMatrixBase< Real >::NumRows().
Referenced by AttentionBackward(), and UnitTestAttentionDotProductAndAddScales().
void kaldi::nnet3::attention::ApplyScalesToInputSimple (BaseFloat alpha,
        const CuMatrixBase< BaseFloat > &A,
        const CuMatrixBase< BaseFloat > &C,
        CuMatrixBase< BaseFloat > *B)
Definition at line 72 of file attention-test.cc.
References rnnlm::i, rnnlm::j, KALDI_ASSERT, CuMatrixBase< Real >::NumCols(), and CuMatrixBase< Real >::NumRows().
Referenced by UnitTestAttentionDotProductAndAddScales().
void ApplyScalesToOutput (BaseFloat alpha,
                          const CuMatrixBase< BaseFloat > &B,
                          const CuMatrixBase< BaseFloat > &C,
                          CuMatrixBase< BaseFloat > *A)
This function is related to GetAttentionDotProducts(); it is used in scaling the values by the softmax scales, and in backprop.

We have put A, B and C in an unusual order here to make the relationship with GetAttentionDotProducts() clearer: the matrices have the same dimensional relationship as A, B and C do in GetAttentionDotProducts().

This function implements:

A->Row(i) += alpha * C(i, j) * B.Row(i + j * row_shift).
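As with ApplyScalesToInput(), a minimal CPU reference sketch may clarify the operation. ApplyScalesToOutputRef is an invented name; the actual implementation works on CuMatrixBase and uses AddDiagVecMat() instead of explicit loops.

    #include "matrix/kaldi-matrix.h"

    namespace kaldi {

    // Hypothetical CPU reference version (name invented for this example).
    // Implements: A->Row(i) += alpha * C(i, j) * B.Row(i + j * row_shift).
    void ApplyScalesToOutputRef(BaseFloat alpha,
                                const MatrixBase<BaseFloat> &B,
                                const MatrixBase<BaseFloat> &C,
                                MatrixBase<BaseFloat> *A) {
      KALDI_ASSERT(A->NumCols() == B.NumCols() && A->NumRows() == C.NumRows());
      int32 num_extra_rows = B.NumRows() - A->NumRows();
      KALDI_ASSERT(num_extra_rows > 0 &&
                   num_extra_rows % (C.NumCols() - 1) == 0);
      int32 row_shift = num_extra_rows / (C.NumCols() - 1);
      for (int32 i = 0; i < A->NumRows(); i++)
        for (int32 j = 0; j < C.NumCols(); j++)
          A->Row(i).AddVec(alpha * C(i, j), B.Row(i + j * row_shift));
    }

    }  // namespace kaldi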
Definition at line 55 of file attention.cc.
References CuMatrixBase< Real >::AddDiagVecMat(), KALDI_ASSERT, kaldi::kNoTrans, kaldi::kTrans, CuMatrixBase< Real >::NumCols(), and CuMatrixBase< Real >::NumRows().
Referenced by AttentionBackward(), AttentionForward(), and UnitTestAttentionDotProductAndAddScales().
void kaldi::nnet3::attention::ApplyScalesToOutputSimple (BaseFloat alpha,
        const CuMatrixBase< BaseFloat > &B,
        const CuMatrixBase< BaseFloat > &C,
        CuMatrixBase< BaseFloat > *A)
Definition at line 52 of file attention-test.cc.
References rnnlm::i, rnnlm::j, KALDI_ASSERT, CuMatrixBase< Real >::NumCols(), and CuMatrixBase< Real >::NumRows().
Referenced by UnitTestAttentionDotProductAndAddScales().
void AttentionBackward (BaseFloat key_scale,
                        const CuMatrixBase< BaseFloat > &keys,
                        const CuMatrixBase< BaseFloat > &queries,
                        const CuMatrixBase< BaseFloat > &values,
                        const CuMatrixBase< BaseFloat > &c,
                        const CuMatrixBase< BaseFloat > &output_deriv,
                        CuMatrixBase< BaseFloat > *keys_deriv,
                        CuMatrixBase< BaseFloat > *queries_deriv,
                        CuMatrixBase< BaseFloat > *values_deriv)
Performs the backward pass corresponding to 'AttentionForward', propagating the derivative back to the keys, queries and values.
The interface should be easy to understand with reference to AttentionForward(), so we won't document it, except to note that 'keys_deriv', 'queries_deriv' and 'values_deriv' are *added to*, not set, by this function.
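Because the derivative matrices are accumulated into rather than set, a caller that wants plain derivatives must start from zeroed matrices. Below is a minimal, hypothetical usage sketch (the wrapper name BackwardSketch and the key_scale value are invented for illustration); it relies on the CuMatrix constructor zero-initializing its contents.

    #include "nnet3/attention.h"

    namespace kaldi {
    namespace nnet3 {
    namespace attention {

    // Hypothetical helper showing one backward call; dimensions must match
    // those used in the corresponding AttentionForward() call.
    void BackwardSketch(const CuMatrixBase<BaseFloat> &keys,
                        const CuMatrixBase<BaseFloat> &queries,
                        const CuMatrixBase<BaseFloat> &values,
                        const CuMatrixBase<BaseFloat> &c,
                        const CuMatrixBase<BaseFloat> &output_deriv) {
      BaseFloat key_scale = 0.5;  // Must match the forward pass.
      // CuMatrix construction zero-initializes, which matters here because
      // the derivatives are added to, not set.
      CuMatrix<BaseFloat> keys_deriv(keys.NumRows(), keys.NumCols()),
          queries_deriv(queries.NumRows(), queries.NumCols()),
          values_deriv(values.NumRows(), values.NumCols());
      AttentionBackward(key_scale, keys, queries, values, c, output_deriv,
                        &keys_deriv, &queries_deriv, &values_deriv);
    }

    }  // namespace attention
    }  // namespace nnet3
    }  // namespace kaldi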
Definition at line 154 of file attention.cc.
References CuMatrixBase< Real >::AddMat(), ApplyScalesToInput(), ApplyScalesToOutput(), CuMatrixBase< Real >::DiffSoftmaxPerRow(), GetAttentionDotProducts(), KALDI_ASSERT, kaldi::kUndefined, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and kaldi::SameDim().
Referenced by RestrictedAttentionComponent::BackpropOneHead(), and TestAttentionForwardBackward().
void AttentionForward (BaseFloat key_scale,
                       const CuMatrixBase< BaseFloat > &keys,
                       const CuMatrixBase< BaseFloat > &queries,
                       const CuMatrixBase< BaseFloat > &values,
                       CuMatrixBase< BaseFloat > *c,
                       CuMatrixBase< BaseFloat > *output)
This is a higher-level interface to the attention code.
Read the extended comment in the file nnet3/attention.h for context.
Parameters
    [in]  key_scale   Scale on the non-context part of the keys.
    [in]  keys        Matrix whose rows contain the keys; dimension is num-input-rows by key-dim.
    [in]  queries     Matrix whose rows contain the queries; dimension is num-output-rows by query-dim, where query-dim == key-dim + context-dim. num-input-rows - num-output-rows must be a multiple of context-dim - 1 (we'll 'shift' the keys by multiples of 0, n, 2n, ..., (context-dim - 1) * n).
    [in]  values      Values to average at the output, of dimension num-input-rows by value-dim. (We may append context information to these averages if required; see the comment for 'output'.)
    [out] c           Expected to be finite at entry (no infs or NaNs); at exit this will contain the output of the softmax. Must be of dimension num-output-rows by context-dim.
    [out] output      The output of the attention mechanism will be *added* to this location. Dimension must be num-output-rows by either value-dim or value-dim + context-dim. The weighted combination of 'values' is added to the first value-dim columns, weighted by the corresponding weights in 'c' (e.g. the first column of 'c' scales the first num-output-rows rows of 'values', the next column of 'c' scales the submatrix of 'values' shifted by 'n', and so on). If output->NumCols() is value-dim + context-dim, 'c' will be added to the remaining columns of 'output'.
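Below is a hedged usage sketch with concrete dimensions (all values invented for illustration): with context-dim = 3 and a row shift of n = 2, the keys and values have (context-dim - 1) * n = 4 more rows than the queries, and the output has value-dim + context-dim columns so that 'c' is appended to it.

    #include "nnet3/attention.h"

    namespace kaldi {
    namespace nnet3 {
    namespace attention {

    // Hypothetical example (name and dimensions chosen for illustration).
    void ForwardSketch() {
      int32 context_dim = 3, n = 2, num_output_rows = 10,
          key_dim = 8, value_dim = 5;
      int32 num_input_rows = num_output_rows + (context_dim - 1) * n;  // = 14
      int32 query_dim = key_dim + context_dim;                         // = 11
      CuMatrix<BaseFloat> keys(num_input_rows, key_dim),
          queries(num_output_rows, query_dim),
          values(num_input_rows, value_dim),
          c(num_output_rows, context_dim),  // zero at entry, hence finite.
          // value_dim + context_dim columns, so 'c' is appended to output.
          output(num_output_rows, value_dim + context_dim);
      keys.SetRandn();
      queries.SetRandn();
      values.SetRandn();
      AttentionForward(0.5, keys, queries, values, &c, &output);
    }

    }  // namespace attention
    }  // namespace nnet3
    }  // namespace kaldi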
Definition at line 97 of file attention.cc.
References CuMatrixBase< Real >::AddMat(), ApplyScalesToOutput(), CuMatrixBase< Real >::CopyFromMat(), GetAttentionDotProducts(), KALDI_ASSERT, CuMatrixBase< Real >::NumCols(), CuMatrixBase< Real >::NumRows(), and CuMatrixBase< Real >::SoftMaxPerRow().
Referenced by RestrictedAttentionComponent::PropagateOneHead(), and TestAttentionForwardBackward().
void GetAttentionDotProducts (BaseFloat alpha,
                              const CuMatrixBase< BaseFloat > &A,
                              const CuMatrixBase< BaseFloat > &B,
                              CuMatrixBase< BaseFloat > *C)
This function is a utility function that is at the core of how we implement attention.
It may in the future need to be renamed, and possibly moved into the cudamatrix directory and implemented in CUDA, since the current implementation is quite inefficient. We could also consider a complete redesign of how the implementation works, such that this function doesn't exist at all; or we could have a batched version of this function that would operate on a batch of A, B and C at once (or a "strided, batched" version where the difference between the members of the batch is expressed as a stride).
This function implements a special operation that you could view as some kind of matrix multiplication where only a band of the product is retained.
The inputs A and B must have the same number of columns (A.NumCols() == B.NumCols()), and A and C must have the same number of rows (A.NumRows() == C->NumRows()). The number of rows of B must exceed the number of rows of A. Define num_extra_rows = B.NumRows() - A.NumRows(). Then C.NumCols() - 1 must divide num_extra_rows. Define row_shift = num_extra_rows / (C.NumCols() - 1).
This function implements: (*C)(i, j) = alpha * VecVec(A.Row(i), B.Row(i + j * row_shift))
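A minimal CPU reference sketch of this operation, analogous to GetAttentionDotProductsSimple() in attention-test.cc (the name GetAttentionDotProductsRef is invented; the actual implementation uses AddDiagMatMat() on shifted submatrices):

    #include "matrix/kaldi-matrix.h"

    namespace kaldi {

    // Hypothetical CPU reference version (name invented for this example).
    // Implements: (*C)(i, j) = alpha * VecVec(A.Row(i), B.Row(i + j * row_shift)).
    void GetAttentionDotProductsRef(BaseFloat alpha,
                                    const MatrixBase<BaseFloat> &A,
                                    const MatrixBase<BaseFloat> &B,
                                    MatrixBase<BaseFloat> *C) {
      KALDI_ASSERT(A.NumCols() == B.NumCols() && A.NumRows() == C->NumRows());
      int32 num_extra_rows = B.NumRows() - A.NumRows();
      KALDI_ASSERT(num_extra_rows > 0 &&
                   num_extra_rows % (C->NumCols() - 1) == 0);
      int32 row_shift = num_extra_rows / (C->NumCols() - 1);
      for (int32 i = 0; i < C->NumRows(); i++)
        for (int32 j = 0; j < C->NumCols(); j++)
          (*C)(i, j) = alpha * VecVec(A.Row(i), B.Row(i + j * row_shift));
    }

    }  // namespace kaldi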
Definition at line 32 of file attention.cc.
References CuVectorBase< Real >::AddDiagMatMat(), CuMatrixBase< Real >::CopyFromMat(), KALDI_ASSERT, kaldi::kNoTrans, kaldi::kTrans, CuMatrixBase< Real >::NumCols(), and CuMatrixBase< Real >::NumRows().
Referenced by AttentionBackward(), AttentionForward(), and UnitTestAttentionDotProductAndAddScales().
void kaldi::nnet3::attention::GetAttentionDotProductsSimple (BaseFloat alpha,
        const CuMatrixBase< BaseFloat > &A,
        const CuMatrixBase< BaseFloat > &B,
        CuMatrixBase< BaseFloat > *C)
Definition at line 30 of file attention-test.cc.
References rnnlm::i, rnnlm::j, KALDI_ASSERT, CuMatrixBase< Real >::NumCols(), and CuMatrixBase< Real >::NumRows().
Referenced by UnitTestAttentionDotProductAndAddScales().
void kaldi::nnet3::attention::TestAttentionForwardBackward ()
Definition at line 120 of file attention-test.cc.
References CuMatrixBase< Real >::AddMat(), AttentionBackward(), AttentionForward(), rnnlm::i, KALDI_ASSERT, KALDI_LOG, kaldi::kTrans, CuMatrixBase< Real >::NumCols(), kaldi::RandInt(), CuMatrixBase< Real >::Scale(), CuMatrixBase< Real >::SetRandn(), CuMatrixBase< Real >::SetZero(), and kaldi::TraceMatMat().
Referenced by UnitTestAttention().
void kaldi::nnet3::attention::UnitTestAttention ()
Definition at line 229 of file attention-test.cc.
References TestAttentionForwardBackward(), and UnitTestAttentionDotProductAndAddScales().
Referenced by main().
void kaldi::nnet3::attention::UnitTestAttentionDotProductAndAddScales ()
Definition at line 91 of file attention-test.cc.
References ApplyScalesToInput(), ApplyScalesToInputSimple(), ApplyScalesToOutput(), ApplyScalesToOutputSimple(), kaldi::AssertEqual(), GetAttentionDotProducts(), GetAttentionDotProductsSimple(), kaldi::RandInt(), and CuMatrixBase< Real >::SetRandn().
Referenced by UnitTestAttention().