convolution.h
Go to the documentation of this file.
1 // nnet3/convolution.h
2 
3 // Copyright 2017 Johns Hopkins University (author: Daniel Povey)
4 
5 // See ../../COPYING for clarification regarding multiple authors
6 //
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 //
11 // http://www.apache.org/licenses/LICENSE-2.0
12 //
13 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
15 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
16 // MERCHANTABLITY OR NON-INFRINGEMENT.
17 // See the Apache 2 License for the specific language governing permissions and
18 // limitations under the License.
19 
20 #ifndef KALDI_NNET3_NNET_CONVOLUTION_H_
21 #define KALDI_NNET3_NNET_CONVOLUTION_H_
22 
23 #include "base/kaldi-common.h"
24 #include "util/common-utils.h"
25 #include "itf/options-itf.h"
26 #include "matrix/matrix-lib.h"
28 #include "nnet3/nnet-common.h"
29 
30 #include <iostream>
31 
32 namespace kaldi {
33 namespace nnet3 {
34 
50 
51 
52 namespace time_height_convolution {
53 
126  int32 num_filters_in; // number of input filters, e.g. 128.
127  int32 num_filters_out; // number of output filters, e.g. 256.
128  int32 height_in; // image height in, e.g. 40.
129  int32 height_out; // image height out, e.g. 40 (no subsampling or zero
130  // padding), 38 (with zero padding) (or for an example with
131  // 2x subsampling and no zero-padding: maybe 20).
132  int32 height_subsample_out; // subsampling factor for height. In the 3
133  // examples given for height_out above, would be
134  // 1, 1 and 2 respectively.
135  struct Offset {
138  // Strict lexicographic ordering on (time_offset, height_offset): time_offset is compared first; height_offset only breaks ties.
139  inline bool operator < (const Offset &other) const {
140  if (time_offset < other.time_offset) return true;  // smaller time sorts first
141  else if (time_offset > other.time_offset) return false;  // larger time sorts last
142  else return height_offset < other.height_offset;  // same time: decide on height
143  }
144  inline bool operator <= (const Offset &other) const {  // non-strict form of the (time_offset, height_offset) lexicographic comparison
145  if (time_offset < other.time_offset) return true;
146  else if (time_offset > other.time_offset) return false;
147  else return height_offset <= other.height_offset;  // same time: also true when the heights are equal
148  }
149  inline bool operator == (const Offset &other) const {  // equal iff both the time offset and the height offset match
150  return time_offset == other.time_offset &&
151  height_offset == other.height_offset;
152  }
153  };
154  // For a 3x3 patch, the 'offsets' vector would be a list of 9 elements. It's
155  // always unique and sorted in lexicographic order. See the extended comment
156  // for struct ConvolutionModel for an explanation.
157  std::vector<Offset> offsets;
158 
159  // This set, 'required_time_offsets', relates to zero-padding on the time
160  // axis. It should consist of a nonempty subset of the time-offset values
161  // that have been seen in offsets[*].time_offset. If there is no zero-padding
162  // on the time (width) axis it would be that entire set. If there is
163  // zero-padding it would in most circumstances contain just the middle one,
164  // e.g. of {0,1,2} we'd keep just {1}, or of {-3,0,3} we'd keep just {0}. The
165  // way to understand it is that all the time-offsets define dependencies in
166  // the computation, but the list of 'required' offsets determines when a
167  // computation can proceed when some of the dependencies are not present (any
168  // non-required dependencies that were not present default to zero).
169  std::set<int32> required_time_offsets;
170 
171  // This variable, which is derived from 'offsets', stores all the time offsets
172  // that are present there, i.e. all the values of 'offsets[*].time_offset'
173  std::set<int32> all_time_offsets;
174 
175  // This variable, which is derived from 'offsets', is the greatest common
176  // divisor of the differences between the members of 'all_time_offsets';
177  // e.g. if 'all_time_offsets' is {1,3,5} it would be 2. It is used to figure
178  // out what grid structure the input to the computation should have. It is
179  // set to zero if all_time_offsets.size() == 1.
181 
182 
183  // Computes the derived parameters 'all_time_offsets' and
184  // 'time_offsets_modulus'.
185  void ComputeDerived();
186 
187  // You'll notice that there is nothing here that explicitly specifies the
188  // padding. At this level, any padding on the height axis is implicit. For
189  // example, suppose there is a height-offset of -1, that implies we must be
190  // padding at the bottom by at least 1, because the output height-index starts
191  // from 0, and it would require the input at height -1, whereas the input
192  // height-index starts from 0. All padding is implicitly zero-padding.
193  // Padding in the height dimension depends on (height_in, height_out,
194  // height_subsample_out) and the 'height_offset' members of 'offsets'; padding
195  // in the time dimension depends on 'required_time_offsets'
196  // vs. 'all_time_offsets'.
197 
198  // the InputDim() and OutputDim() really relate to its behavior in a
199  // neural-net component, they are the input-dim and output-dim of the features
200  // that the component has as input/output; physically, this is the column
201  // dimension at the input and output of the component. The time dimension
202  // corresponds to the row-index of those features.
203  int32 InputDim() const { return num_filters_in * height_in; }  // input column dim: input filters times input height
204  int32 OutputDim() const { return num_filters_out * height_out; }  // output column dim: output filters times output height
205  // Number of rows in the parameter matrix: one row per output filter.
206  int32 ParamRows() const { return num_filters_out; }
207  // Number of cols in the parameter matrix: num_filters_in columns for each entry of 'offsets' (the cast narrows the vector's size() to int32).
208  int32 ParamCols() const { return num_filters_in * static_cast<int32>(offsets.size()); }
209 
211 
212  bool operator == (const ConvolutionModel &other) const;
213 
214  /*
215  Checks that this model makes sense, and returns true if so; if not, returns
216  false (and if it's for certain less-obvious reasons, prints a warning first
217  explaining why).
218 
219  @param [in] check_heights_used If true, part of the check is that all
220  height-values at the input are used at some point (if they
221  are not, this model is probably not what you intended).
222  @param [in] allow_height_padding If true, the checking code assumes that
223  zero-padding on the height axis is permitted.
224  @return Returns true if the check passed, false otherwise.
225  */
226  bool Check(bool check_heights_used = true,
227  bool allow_height_padding = true) const;
228 
229  // Returns an info-string that describes the model; it looks like
230  // "num-filters-in=32, num-filters-out=64, height-in=40, height-out=40, ... ".
231  // It's suitable for use in the 'info' output of the convolutional component.
232  std::string Info() const;
233 
234  void Write(std::ostream &os, bool binary) const;
235  void Read(std::istream &is, bool binary);
236 };
237 
238 
253  // num_filters_in and num_filters_out will be the same as in the model.
255  // height_out will be the same as in the model, but height_in may be
256  // affected by reshaping (may be larger than the model's height_in).
258  // num_t_in and num_t_out are the number of rows in the input and output
259  // matrices, but num_t_in may be affected by reshaping (may be smaller
260  // than the model's num_t_in).
261  // num_t_in will be >= num_t_out, and if it's greater it will be greater by a
262  // small additive term, not by a multiplicative factor.
263  int32 num_t_in, num_t_out;
264  // num_images is the number of (n,x) pairs present in the input/output
265  // indexes (although in most setups the x values will all be zero and
266  // they will only vary in n).
268 
269  // temp_rows and temp_cols define the size of a temporary matrix that the
270  // computation uses. temp_rows is the number of rows in that temporary
271  // matrix; it will normally be equal to [multiplying from greatest to least
272  // stride], (num_t_out * num_images), but it may be less in order to save
273  // memory. The execution code is in charge of looping over the data using
274  // this matrix, in order to ensure that we cover all output rows. If you are
275  // just trying to understand the framework, assume that it's always equal to
276  // num_t_out * num_images.
277 
278  // Note: if all of the steps[*].columns_are_contiguous values are true AND all
279  // of the steps[*].columns.Dim() equal the input-num-cols (=num_filters_in *
280  // height_in), then the temporary matrix is never needed and in that case,
281  // temp_rows and temp_cols will both be zero.
282  int32 temp_rows, temp_cols;
283 
284  // There may be a few steps in the computation (e.g. in a 3x3 convolution
285  // without subsampling, there would be 3 steps), and the output is a summation
286  // over contributions from each step. Each step has a different value
287  // 'input_time_shift' (which is the number of input rows to discard at the
288  // start of the input matrix, and won't be the same as the increment in 't'
289  // if t_step_in in the ConvolutionComputationIo != 1).
291  // input_time_shift >= 0 is the number of initial time-indexes of the input
292  // (i.e. the number of initial rows of the matrix) that we discard for this
293  // step. We may discard some final time-indexes too, if needed so that the
294  // total number of input time-indexes equals the total number of output
295  // time-indexes.
297 
298  // params_start_col >= 0 says the start-column-index of the parameter matrix
299  // where we start a sub-matrix to be used in this step (the num-cols of that
300  // sub-matrix is given by columns.Dim() / height_out).
302 
303  // height_map is the 'upstream' parameter from which 'columns' and
304  // 'backward_columns' are derived; it compactly defines a column mapping
305  // that is used when copying the input to a temporary matrix.
306  // height_map.size() * num_filters_in gives the num-cols in this temporary
307  // matrix. Each element of 'height_map' corresponds to a column range of
308  // 'num_filters_in' columns of the temporary matrix, and it says which
309  // (same-sized) column-range of the input matrix is to be used as the source
310  // for this data. Its elements are in the range -1 <= height_map[i] <
311  // height_in, where -1's are used for blocks that have zero values.
312  // height_map would be the same as 'columns' if num_filters_in == 1.
313  std::vector<int32> height_map;
314 
315  // 'columns' is derived from 'height_map'.
316  // columns.Dim() <= temp_cols is the num-columns of
317  // a sub-matrix of the temporary matrix, that we
318  // populate on this step.
319  //
320  // -1 <= columns[i] < height_in * num_filters_in
321  // gives the dimension of the (reshaped) input to copy
322  // If columns[i] == -1, it means write a zero.
324 
325  // 'backward_columns' is derived from 'columns', it is used in
326  // the backprop. Each element of 'backward_columns' has the
327  // same dim as the num-cols of the input matrix. It's basically
328  // the reverse map of 'columns', but split into multiple parts (and
329  // padded with -1's as necessary) so that we can process elements
330  // of the input which are copied multiple times to the temporary
331  // matrix.
332  std::vector<CuArray<int32> > backward_columns;
333 
334  // 'columns_are_contiguous' is derived from 'columns'; it's true if
335  // 'columns' is a contiguous range of nonnegative integers, like '20, 21,
336  // 22, ... '.
338  // 'first_column' is derived from 'columns'; it equals columns[0]. It is
339  // only of interest if 'columns_are_contiguous' is true (it enables an
340  // optimization).
342  };
343  std::vector<ConvolutionStep> steps;
344 
345 
346  void Write(std::ostream &os, bool binary) const;
347  void Read(std::istream &is, bool binary);
348 
349  // Computes derived variables in 'steps', i.e. 'columns', 'backward_columns',
350  // columns_are_contiguous, and 'first_column'.
351  void ComputeDerived();
352 
353  // check that this computation makes sense; crash if not.
354  void Check() const;
355 };
356 
357 
358 
363  // max_memory_mb determines how many megabytes of memory we are willing to use
364  // for the temporary matrix. If it would exceed this amount, we do the
365  // computation in batches.
367  ConvolutionComputationOptions(): max_memory_mb(200.0) { }
368 };
369 
370 
371 
372 // This struct represents the structure of the input and output of a
373 // convolutional computation (the input and output images; not the model itself,
374 // which is represented by ConvolutionModel). We require that both the input
375 // and output indexes have a regular repeated structure, and if this is not the
376 // case then the input and output indexes will be padded with 'blank' indexes
377  // (indexes having a 't' value of kNoTime) as needed to fit them into regular
378 // grids. In addition 'blank' indexes may be added to reflect zero-padding on
379 // the input.
381  int32 num_images; // 'num_images' is the number of distinct (n,x) values in
382  // the indexes. Normally the x values would all be zero
383  // and the n values would go from 0 to num_images - 1, but
384  // this is not required. We do enforce (via padding) that
385  // each (n,x) pair, i.e. each image, is associated with the
386  // same number of 't' values.
387 
388  // the following represents the set of 't' values on the input and output.
389  // their meaning is obvious, but we should note that if there is just one
390  // output or input index, we will set the step to zero when initially
391  // creating this struct, and it may get set to other values later on, mostly
392  // to avoid creating extra code paths.
393  int32 start_t_in, t_step_in, num_t_in;
394  int32 start_t_out, t_step_out, num_t_out;
395 
396  // reorder_t_in will be 1 in normal cases (no downsampling), but it may have values
397  // greater than 1 (e.g. 2 if we're downsampling by a factor of 2).
398  // This doesn't affect the set of indexes on the input, but it affects how they
399  // are ordered.
400  //
401  // If reorder_t_in == 1 then order the indexes one block for all
402  // indexes with t=t0=start_t_in; then one block for all
403  // t=t1=(start_t_in+t_step_in); then one block for t=t2, t=t3, and so on.
404  //
405  // If reorder_t_in is >1 (for example, 2), then the values for t0 and t1 would
406  // be interspersed in a single block; then the values for t2 and t3 would
407  // be interspersed in the next block; and so on. Within these blocks,
408  // it's the 't' values that have the smaller stride. This ordering allows
409  // a reshaping such that we can imagine that the input and output have the
410  // same 't' increment; it's useful in subsampling convolutions.
412 
413  void Write(std::ostream &os, bool binary) const;
414  void Read(std::istream &is, bool binary);
415 };
416 
424 void CheckModelAndIo(const ConvolutionModel &model,
425  const ConvolutionComputationIo &io,
426  bool allow_extra_input = false);
427 
428 
448  const ConvolutionModel &model,
449  const std::vector<Index> &input_indexes,
450  const std::vector<Index> &output_indexes,
451  const ConvolutionComputationOptions &opts,
452  ConvolutionComputation *computation,
453  std::vector<Index> *input_indexes_modified,
454  std::vector<Index> *output_indexes_modified);
455 
456 
477 void ConvolveForward(
478  const ConvolutionComputation &conv_comp,
479  const CuMatrixBase<BaseFloat> &input,
480  const CuMatrixBase<BaseFloat> &params,
481  CuMatrixBase<BaseFloat> *output);
482 
483 
509  const ConvolutionComputation &conv_comp,
510  const CuMatrixBase<BaseFloat> &params,
511  const CuMatrixBase<BaseFloat> &output_deriv,
512  CuMatrixBase<BaseFloat> *input_deriv);
513 
541  const ConvolutionComputation &conv_comp,
542  const CuMatrixBase<BaseFloat> &input,
543  const CuMatrixBase<BaseFloat> &output_deriv,
544  BaseFloat alpha,
545  CuMatrixBase<BaseFloat> *params_deriv);
546 
547 
555 void GetComputationIo(
556  const std::vector<Index> &input_indexes,
557  const std::vector<Index> &output_indexes,
559 
560 
571  const ConvolutionComputationIo &io,
572  const std::vector<Index> &orig_input_indexes,
573  const std::vector<Index> &orig_output_indexes,
574  std::vector<Index> *input_indexes,
575  std::vector<Index> *output_indexes);
576 
577 
586 void PadComputationInputTime(const ConvolutionModel &model,
588 
589 
602 void PadModelHeight(const ConvolutionModel &model,
603  ConvolutionModel *model_padded);
604 
605 
622  const ConvolutionModel &model,
623  const ConvolutionModel &model_padded,
624  ConvolutionComputation *computation);
625 
644 void AppendInputFrames(const ConvolutionModel &model,
646  ConvolutionModel *model_appended,
647  ConvolutionComputationIo *io_appended);
648 
649 
650 /*
651  This function takes a model and a specification of the computation's
652  IO, and generates the computation. This is stage 4 of the compilation.
653  It assumes that stages 1, 2 and 3 have already been done so that:
654 
655  - Any required padding of the time axis (stage 1) and the height axis
656  (stage 2) have been done (so any desired input values are available).
657  - The t_step_in and t_step_out of the io object have the same value
658  (stage 3).
659 
660  At this point the compilation process is actually quite simple: for each
661  time shift (where the number of time shifts equals num_t_in + 1 - num_t_out
662  of 'io'), we do a computation that copies and maybe duplicates the input
663  columns to a temporary matrix, and then does a matrix multiplication
664  between that temporary matrix and a sub-matrix of the parameter matrix.
665  */
666 void MakeComputation(const ConvolutionModel &model,
668  const ConvolutionComputationOptions &opts,
669  ConvolutionComputation *computation);
670 
671 
672 } // namespace time_height_convolution
673 
674 } // namespace nnet3
675 
676 
677 
678 
679 
680 } // namespace kaldi
681 
682 
683 #endif
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20
void Write(std::ostream &os, bool binary) const
Definition: convolution.cc:225
void ConvolveBackwardParams(const ConvolutionComputation &cc, const CuMatrixBase< BaseFloat > &input, const CuMatrixBase< BaseFloat > &output_deriv, BaseFloat alpha, CuMatrixBase< BaseFloat > *params_deriv)
This does the part of the backward derivative computation of convolution, that computes derivatives w...
Definition: convolution.cc:840
bool Check(bool check_heights_used=true, bool allow_height_padding=true) const
Definition: convolution.cc:130
This comment explains the basic framework used for everything related to time-height convolution...
Definition: convolution.h:125
kaldi::int32 int32
void GetIndexesForComputation(const ConvolutionComputationIo &io, const std::vector< Index > &orig_input_indexes, const std::vector< Index > &orig_output_indexes, std::vector< Index > *input_indexes, std::vector< Index > *output_indexes)
This function computes the reordered and possibly padded indexes corresponding to the computation in ...
void UnPadModelHeight(const ConvolutionComputationOptions &opts, const ConvolutionModel &model, const ConvolutionModel &model_padded, ConvolutionComputation *computation)
This function modifies, if necessary, a computation that has been built for the model 'model_padded'...
void ConvolveBackwardData(const ConvolutionComputation &cc, const CuMatrixBase< BaseFloat > &params, const CuMatrixBase< BaseFloat > &output_deriv, CuMatrixBase< BaseFloat > *input_deriv)
This does the part of the backward derivative computation of convolution, that propagates derivatives...
Definition: convolution.cc:682
void MakeComputation(const ConvolutionModel &model, ConvolutionComputationIo &io, const ConvolutionComputationOptions &opts, ConvolutionComputation *computation)
void AppendInputFrames(const ConvolutionModel &model, ConvolutionComputationIo *io, ConvolutionModel *model_appended, ConvolutionComputationIo *io_appended)
This function takes an input model and I/O specification, and it modifies both of them if necessary t...
void CompileConvolutionComputation(const ConvolutionModel &model, const std::vector< Index > &input_indexes, const std::vector< Index > &output_indexes, const ConvolutionComputationOptions &opts, ConvolutionComputation *computation, std::vector< Index > *input_indexes_modified, std::vector< Index > *output_indexes_modified)
This function does the compilation for a convolution computation; it's a wrapper for the functions be...
void PadModelHeight(const ConvolutionModel &model, ConvolutionModel *model_padded)
This function takes a model that might require zero padding in the height dimension and outputs a mod...
Definition: convolution.cc:918
This struct represents the structure of a convolution computation.
Definition: convolution.h:252
Matrix for CUDA computing.
Definition: matrix-common.h:69
void GetComputationIo(const std::vector< Index > &input_indexes, const std::vector< Index > &output_indexes, ConvolutionComputationIo *io)
This function takes lists of input and output indexes to a computation (e.g.
This struct contains options for compiling the convolutional computation.
Definition: convolution.h:362
void ConvolveForward(const ConvolutionComputation &cc, const CuMatrixBase< BaseFloat > &input, const CuMatrixBase< BaseFloat > &params, CuMatrixBase< BaseFloat > *output)
This does the forward computation of convolution.
Definition: convolution.cc:524
void CheckModelAndIo(const ConvolutionModel &model, const ConvolutionComputationIo &io, bool allow_extra_input)
Check that this model and this I/O request are compatible in terms of required context, etc, and crash if not.
void PadComputationInputTime(const ConvolutionModel &model, ConvolutionComputationIo *io)
This function extends the set of input indexes that the computation has, to account for any required ...