doc/nnet3-egs-augment-image_8cc_source.html

 // nnet3bin/nnet3-egs-augment-image.cc

 // Copyright      2017  Johns Hopkins University (author:  Daniel Povey)
 //                2017  Hossein Hadian
 //                2017  Yiwen Shao

 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //  http://www.apache.org/licenses/LICENSE-2.0
 //
 // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 // MERCHANTABLITY OR NON-INFRINGEMENT.
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.

 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "hmm/transition-model.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-example-utils.h"

 namespace kaldi {
 namespace nnet3 {

 // Policy for reading pixels whose source coordinates fall outside the image
 // when a geometric transform is applied.
 enum FillMode {
   kNearest,  // use the closest pixel on the image border.
   kReflect   // mirror the coordinate back into the image about the border.
 };

 // Command-line configurable options controlling the random image
 // perturbation (flip, shift, rotation) and how out-of-image pixels are filled.
 struct ImageAugmentationConfig {
   int32 num_channels;              // number of color channels per pixel.
   BaseFloat horizontal_flip_prob;  // probability of a horizontal flip, in [0, 1].
   BaseFloat horizontal_shift;      // max shift as a fraction of image width.
   BaseFloat vertical_shift;        // max shift as a fraction of image height.
   BaseFloat rotation_degree;       // max rotation angle in degrees, in [0, 180].
   BaseFloat rotation_prob;         // probability of rotating, in [0, 1].
   std::string fill_mode_string;    // "nearest" or "reflect".

   // Defaults leave the image unchanged: no flip, no shift, no rotation.
   ImageAugmentationConfig():
       num_channels(1),
       horizontal_flip_prob(0.0),
       horizontal_shift(0.0),
       vertical_shift(0.0),
       rotation_degree(0.0),
       rotation_prob(0.0),
       fill_mode_string("nearest") { }


   // Registers all options of this config with the option parser 'po'.
   void Register(ParseOptions *po) {
     po->Register("num-channels", &num_channels, "Number of colors in the image. "
                  "It is important to specify this (helps interpret the image "
                  "correctly).");
     po->Register("horizontal-flip-prob", &horizontal_flip_prob,
                  "Probability of doing horizontal flip");
     po->Register("horizontal-shift", &horizontal_shift,
                  "Maximum allowed horizontal shift as proportion of image "
                  "width.  Padding is determined by --fill-mode.");
     po->Register("vertical-shift", &vertical_shift,
                  "Maximum allowed vertical shift as proportion of image "
                  "height.  Padding is determined by --fill-mode.");
     po->Register("rotation-degree", &rotation_degree,
                  "Maximum allowed degree to rotate the image");
     po->Register("rotation-prob", &rotation_prob,
                  "Probability of doing rotation");
     po->Register("fill-mode", &fill_mode_string, "Mode for dealing with "
                  "points outside the image boundary when applying transformation. "
                  "Choices = {nearest, reflect}");
   }

   // Asserts that every option lies in its valid range.
   void Check() const {
     KALDI_ASSERT(num_channels >= 1);
     KALDI_ASSERT(horizontal_flip_prob >= 0 &&
                  horizontal_flip_prob <= 1);
     KALDI_ASSERT(horizontal_shift >= 0 && horizontal_shift <= 1);
     KALDI_ASSERT(vertical_shift >= 0 && vertical_shift <= 1);
     KALDI_ASSERT(rotation_degree >= 0 && rotation_degree <= 180);
     KALDI_ASSERT(rotation_prob >= 0 && rotation_prob <= 1);
     KALDI_ASSERT(fill_mode_string == "nearest" || fill_mode_string == "reflect");
   }

   // Converts 'fill_mode_string' to the corresponding FillMode value; reports
   // an error via KALDI_ERR if the string is not one of the accepted choices.
   FillMode GetFillMode() const {
     if (fill_mode_string == "reflect")
       return kReflect;
     if (fill_mode_string == "nearest")
       return kNearest;
     KALDI_ERR << "Choices for --fill-mode are 'nearest' or 'reflect', got: "
               << fill_mode_string;
     // KALDI_ERR is expected not to return; this statement only guarantees
     // that no uninitialized value can ever be returned.
     return kNearest;
   }
 };

 void ApplyAffineTransform(MatrixBase<BaseFloat> &transform,
                           int32 num_channels,
                           MatrixBase<BaseFloat> *image,
                           FillMode fill_mode) {
   int32 num_rows = image->NumRows(),
         num_cols = image->NumCols(),
         height = num_cols / num_channels,
         width = num_rows;
   KALDI_ASSERT(num_cols % num_channels == 0);
   Matrix<BaseFloat> original_image(*image);
   for (int32 r = 0; r < width; r++) {
     for (int32 c = 0; c < height; c++) {
       // (r_old, c_old) is the coordinate of the pixel in the original image
       // while (r, c) is the coordinate in the new (transformed) image.
       BaseFloat r_old = transform(0, 0) * r +
                                           transform(0, 1) * c + transform(0, 2);
       BaseFloat c_old = transform(1, 0) * r +
                                           transform(1, 1) * c + transform(1, 2);
       // We are going to do bilinear interpolation between 4 closest points
       // to the point (r_old, c_old) of the original image. We have:
       // r1  <=  r_old  <=  r2
       // c1  <=  c_old  <=  c2
       int32 r1 = static_cast<int32>(floor(r_old));
       int32 c1 = static_cast<int32>(floor(c_old));
       int32 r2 = r1 + 1;
       int32 c2 = c1 + 1;

       // These weights determine how much each of the 4 points contributes
       // to the final interpolated value:
       BaseFloat weight_11 = (r2 - r_old) * (c2 - c_old),
           weight_12 = (r2 - r_old) * (c_old - c1),
           weight_21 = (r_old - r1) * (c2 - c_old),
           weight_22 = (r_old - r1) * (c_old - c1);
       // Handle edge conditions:
       if (fill_mode == kNearest) {
         if (r1 < 0) {
           r1 = 0;
           if (r2 < 0) r2 = 0;
         }
         if (r2 >= width) {
           r2 = width - 1;
           if (r1 >= width) r1 = width - 1;
         }
         if (c1 < 0) {
           c1 = 0;
           if (c2 < 0) c2 = 0;
         }
         if (c2 >= height) {
           c2 = height - 1;
           if (c1 >= height) c1 = height - 1;
         }
       } else {
         KALDI_ASSERT(fill_mode == kReflect);
         if (r1 < 0) {
           r1 = - r1;
           if (r2 < 0) r2 = - r2;
         }
         if (r2 >= width) {
           r2 = 2 * width - 2 - r2;
           if (r1 >= width) r1 = 2 * width - 2 - r1;
         }
         if (c1 < 0) {
           c1 = - c1;
           if (c2 < 0) c2 = -c2;
         }
         if (c2 >= height) {
           c2 = 2 * height - 2 - c2;
           if (c1 >= height) c1 = 2 * height - 2 - c1;
         }
       }
       for (int32 ch = 0; ch < num_channels; ch++) {
         // find the values at the 4 points
         BaseFloat p11 = original_image(r1, num_channels * c1 + ch),
             p12 = original_image(r1, num_channels * c2 + ch),
             p21 = original_image(r2, num_channels * c1 + ch),
             p22 = original_image(r2, num_channels * c2 + ch);
         (*image)(r, num_channels * c + ch) = weight_11 * p11 + weight_12 * p12 +
             weight_21 * p21 + weight_22 * p22;
       }
     }
   }
 }

 // Randomly perturbs 'image' in place according to 'config'.
 //
 // A random shift, an optional horizontal flip and an optional rotation are
 // combined into one 3x3 affine matrix, expressed relative to the center of
 // the image, which is then handed to ApplyAffineTransform().  If the combined
 // matrix is the identity, the image is left untouched.
 //
 //  config  Augmentation options; validated with config.Check().
 //  image   Rows are treated as the image width; columns hold the height and
 //          the channel, so NumCols() must be a multiple of
 //          config.num_channels.
 void PerturbImage(const ImageAugmentationConfig &config,
                   MatrixBase<BaseFloat> *image) {
   config.Check();
   FillMode fill_mode = config.GetFillMode();
   int32 image_width = image->NumRows(),
       num_channels = config.num_channels,
       image_height = image->NumCols() / num_channels;
   if (image->NumCols() % num_channels != 0) {
     KALDI_ERR << "Number of columns in image must be divisible by the number "
         "of channels";
   }
   // We do an affine transform which
   // handles flipping, translation, rotation, magnification, and shear.
   Matrix<BaseFloat> transform_mat(3, 3, kUndefined);
   transform_mat.SetUnit();

   Matrix<BaseFloat> shift_mat(3, 3, kUndefined);
   shift_mat.SetUnit();
   // translation (shift) mat:
   // [ 1   0  x_shift
   //   0   1  y_shift
   //   0   0  1       ]
   // Each shift is drawn uniformly from [-max, max] pixels and rounded to a
   // whole number of pixels.
   BaseFloat horizontal_shift = (2.0 * RandUniform() - 1.0) *
       config.horizontal_shift * image_width;
   BaseFloat vertical_shift = (2.0 * RandUniform() - 1.0) *
       config.vertical_shift * image_height;
   shift_mat(0, 2) = round(horizontal_shift);
   shift_mat(1, 2) = round(vertical_shift);
   // since we will center the image before applying the transform,
   // horizontal flipping is simply achieved by setting [0, 0] to -1:
   if (WithProb(config.horizontal_flip_prob))
     shift_mat(0, 0) = -1.0;

   Matrix<BaseFloat> rotation_mat(3, 3, kUndefined);
   rotation_mat.SetUnit();
   // rotation mat:
   // [ cos(theta)  -sin(theta)  0
   //   sin(theta)  cos(theta)   0
   //   0           0            1 ]
   if (RandUniform() <= config.rotation_prob) {
     // theta is uniform in [-rotation_degree, rotation_degree], in radians.
     BaseFloat theta = (2 * config.rotation_degree * RandUniform() -
                        config.rotation_degree) / 180.0 * M_PI;
     rotation_mat(0, 0) = cos(theta);
     rotation_mat(0, 1) = -sin(theta);
     rotation_mat(1, 0) = sin(theta);
     rotation_mat(1, 1) = cos(theta);
   }

   // Shear is not configurable here, so this matrix stays the identity.
   Matrix<BaseFloat> shear_mat(3, 3, kUndefined);
   shear_mat.SetUnit();
   // shear mat:
   // [ 1    -sin(shear)   0
   //   0     cos(shear)   0
   //   0     0            1 ]

   // Zoom is not configurable here, so this matrix stays the identity.
   Matrix<BaseFloat> zoom_mat(3, 3, kUndefined);
   zoom_mat.SetUnit();
   // zoom mat:
   // [ x_zoom   0   0
   //   0   y_zoom   0
   //   0     0      1 ]

   // transform_mat = rotation_mat * shift_mat * shear_mat * zoom_mat:
   transform_mat.AddMatMat(1.0, shift_mat, kNoTrans,
                           shear_mat, kNoTrans, 0.0);
   transform_mat.AddMatMatMat(1.0, rotation_mat, kNoTrans,
                              transform_mat, kNoTrans,
                              zoom_mat, kNoTrans, 0.0);
   if (transform_mat.IsUnit())  // nothing to do
     return;

   // we should now change the origin of transform to the center of
   // the image (necessary for flipping, zoom, shear, and rotation)
   // we do this by using two translations: one before the main transform
   // and one after.
   Matrix<BaseFloat> set_origin_mat(3, 3, kUndefined);
   set_origin_mat.SetUnit();
   set_origin_mat(0, 2) = image_width / 2.0 - 0.5;
   set_origin_mat(1, 2) = image_height / 2.0 - 0.5;
   Matrix<BaseFloat> reset_origin_mat(3, 3, kUndefined);
   reset_origin_mat.SetUnit();
   reset_origin_mat(0, 2) = -image_width / 2.0 + 0.5;
   reset_origin_mat(1, 2) = -image_height / 2.0 + 0.5;

   // transform_mat = set_origin_mat * transform_mat * reset_origin_mat
   transform_mat.AddMatMatMat(1.0, set_origin_mat, kNoTrans,
                              transform_mat, kNoTrans,
                              reset_origin_mat, kNoTrans, 0.0);
   ApplyAffineTransform(transform_mat, config.num_channels, image, fill_mode);
 }


 void PerturbImageInNnetExample(
     const ImageAugmentationConfig &config,
     NnetExample *eg) {
   int32 io_size = eg->io.size();
   bool found_input = false;
   for (int32 i = 0; i < io_size; i++) {
     NnetIo &io = eg->io[i];
     if (io.name == "input") {
       found_input = true;
       Matrix<BaseFloat> image;
       io.features.GetMatrix(&image);
       // note: 'GetMatrix' may uncompress if it was compressed.
       // We won't recompress, but this won't matter because this
       // program is intended to be used as part of a pipe, we
       // likely won't be dumping the perturbed data to disk.
       PerturbImage(config, &image);

       // modify the 'io' object.
       io.features = image;
     }
   }
   if (!found_input)
     KALDI_ERR << "Nnet example to perturb had no NnetIo object named 'input'";
 }


 }  // namespace nnet3
 }  // namespace kaldi

 int main(int argc, char *argv[]) {
   try {
     using namespace kaldi;
     using namespace kaldi::nnet3;
     typedef kaldi::int32 int32;
     typedef kaldi::int64 int64;

     const char *usage =
         "Copy examples (single frames or fixed-size groups of frames) for neural\n"
         "network training, doing image augmentation inline (copies after possibly\n"
         "modifying of each image, randomly chosen according to configuration\n"
         "parameters).\n"
         "E.g.:\n"
         "  nnet3-egs-augment-image --horizontal-flip-prob=0.5 --horizontal-shift=0.1\\\n"
         "       --vertical-shift=0.1 --srand=103 --num-channels=3 --fill-mode=nearest ark:- ark:-\n"
         "\n"
         "Requires that each eg contain a NnetIo object 'input', with successive\n"
         "'t' values representing different x offsets , and the feature dimension\n"
         "representing the y offset and the channel (color), with the channel\n"
         "varying the fastest.\n"
         "See also: nnet3-copy-egs\n";


     int32 srand_seed = 0;

     ImageAugmentationConfig config;

     ParseOptions po(usage);
     po.Register("srand", &srand_seed, "Seed for the random number generator");

     config.Register(&po);

     po.Read(argc, argv);

     srand(srand_seed);

     if (po.NumArgs() < 2) {
       po.PrintUsage();
       exit(1);
     }


     std::string examples_rspecifier = po.GetArg(1),
         examples_wspecifier = po.GetArg(2);

     SequentialNnetExampleReader example_reader(examples_rspecifier);
     NnetExampleWriter example_writer(examples_wspecifier);


     int64 num_done = 0;
     for (; !example_reader.Done(); example_reader.Next(), num_done++) {
       std::string key = example_reader.Key();
       NnetExample eg(example_reader.Value());
       PerturbImageInNnetExample(config, &eg);
       example_writer.Write(key, eg);
     }
     KALDI_LOG << "Perturbed " << num_done << " neural-network training images.";
     return (num_done == 0 ? 1 : 0);
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
   }
 }
kaldi::nnet3::NnetExample
NnetExample is the input data and corresponding label (or labels) for one or more frames of input...
Definition: nnet-example.h:111

nnet-example.h

kaldi
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
Definition: chain.dox:20

kaldi::nnet3::NnetIo
Definition: nnet-example.h:33

kaldi::nnet3::ImageAugmentationConfig::rotation_prob
BaseFloat rotation_prob
Definition: nnet3-egs-augment-image.cc:39

kaldi::nnet3::ImageAugmentationConfig
Definition: nnet3-egs-augment-image.cc:33

kaldi::kUndefined
Definition: matrix-common.h:39

kaldi::nnet3::ImageAugmentationConfig::GetFillMode
FillMode GetFillMode() const
Definition: nnet3-egs-augment-image.cc:84

kaldi::nnet3::ImageAugmentationConfig::num_channels
int32 num_channels
Definition: nnet3-egs-augment-image.cc:34

kaldi::RandUniform
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
Definition: kaldi-math.h:151

kaldi::GeneralMatrix::GetMatrix
void GetMatrix(Matrix< BaseFloat > *mat) const
Outputs the contents as a matrix.
Definition: sparse-matrix.cc:817

M_PI
#define M_PI
Definition: kaldi-math.h:44

kaldi::MatrixBase::NumCols
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
Definition: kaldi-matrix.h:67

main
int main(int argc, char *argv[])
Definition: nnet3-egs-augment-image.cc:331

kaldi::MatrixBase
Base class which provides matrix operations not involving resizing or allocation. ...
Definition: kaldi-matrix.h:49

kaldi::nnet3::ImageAugmentationConfig::Check
void Check() const
Definition: nnet3-egs-augment-image.cc:73

kaldi::ParseOptions::PrintUsage
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
Definition: parse-options.cc:393

kaldi::SequentialTableReader::Key
std::string Key()
Definition: kaldi-table-inl.h:918

kaldi::nnet3::kReflect
Definition: nnet3-egs-augment-image.cc:31

kaldi::nnet3::ImageAugmentationConfig::rotation_degree
BaseFloat rotation_degree
Definition: nnet3-egs-augment-image.cc:38

kaldi::WithProb
bool WithProb(BaseFloat prob, struct RandomState *state)
Definition: kaldi-math.cc:72

kaldi::TableWriter
A templated class for writing objects to an archive or script file; see The Table concept...
Definition: kaldi-table.h:368

kaldi::int32
kaldi::int32 int32
Definition: online-tcp-source.cc:27

common-utils.h

kaldi::nnet3::NnetIo::features
GeneralMatrix features
The features or labels.
Definition: nnet-example.h:46

kaldi::Matrix< BaseFloat >

kaldi::nnet3::ImageAugmentationConfig::horizontal_flip_prob
BaseFloat horizontal_flip_prob
Definition: nnet3-egs-augment-image.cc:35

kaldi::MatrixBase::SetUnit
void SetUnit()
Sets to zero, except ones along diagonal [for non-square matrices too].
Definition: kaldi-matrix.cc:1348

kaldi::TableWriter::Write
void Write(const std::string &key, const T &value) const
Definition: kaldi-table-inl.h:1511

kaldi::nnet3
Definition: dnn3_code_compilation.dox:22

kaldi::ParseOptions::Register
void Register(const std::string &name, bool *ptr, const std::string &doc)
Definition: parse-options.cc:56

kaldi::nnet3::PerturbImageInNnetExample
void PerturbImageInNnetExample(const ImageAugmentationConfig &config, NnetExample *eg)
This function does image perturbation as directed by 'config'. The example 'eg' is expected to contain...
Definition: nnet3-egs-augment-image.cc:302

kaldi::ParseOptions
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
Definition: parse-options.h:36

float

transition-model.h

kaldi::nnet3::ImageAugmentationConfig::vertical_shift
BaseFloat vertical_shift
Definition: nnet3-egs-augment-image.cc:37

kaldi::SequentialTableReader
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
Definition: kaldi-table.h:287

nnet-example-utils.h

kaldi::MatrixBase::AddMatMat
void AddMatMat(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const MatrixBase< Real > &B, MatrixTransposeType transB, const Real beta)
Definition: kaldi-matrix.cc:171

kaldi::ParseOptions::Read
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
Definition: parse-options.cc:311

kaldi::SequentialTableReader::Done
bool Done()
Definition: kaldi-table-inl.h:948

KALDI_ERR
#define KALDI_ERR
Definition: kaldi-error.h:147

kaldi::kNoTrans
Definition: matrix-common.h:34

kaldi::nnet3::PerturbImage
void PerturbImage(const ImageAugmentationConfig &config, MatrixBase< BaseFloat > *image)
This function randomly modifies (perturbs) the image by applying different geometric transformations ...
Definition: nnet3-egs-augment-image.cc:205

kaldi::MatrixBase::AddMatMatMat
void AddMatMatMat(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const MatrixBase< Real > &B, MatrixTransposeType transB, const MatrixBase< Real > &C, MatrixTransposeType transC, const Real beta)
this <– beta*this + alpha*A*B*C.
Definition: kaldi-matrix.cc:1741

kaldi::ParseOptions::GetArg
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
Definition: parse-options.cc:202

kaldi::SequentialTableReader::Next
void Next()
Definition: kaldi-table-inl.h:942

kaldi::nnet3::ImageAugmentationConfig::Register
void Register(ParseOptions *po)
Definition: nnet3-egs-augment-image.cc:52

rnnlm::i
int i
Definition: mikolov-rnnlm-lib.cc:66

kaldi::ParseOptions::NumArgs
int NumArgs() const
Number of positional parameters (c.f. argc-1).
Definition: parse-options.cc:198

kaldi::nnet3::ApplyAffineTransform
void ApplyAffineTransform(MatrixBase< BaseFloat > &transform, int32 num_channels, MatrixBase< BaseFloat > *image, FillMode fill_mode)
This function applies a geometric transformation 'transform' to the image.
Definition: nnet3-egs-augment-image.cc:110

kaldi::SequentialTableReader::Value
T & Value()
Definition: kaldi-table-inl.h:934

kaldi::nnet3::FillMode
FillMode
Definition: nnet3-egs-augment-image.cc:31

KALDI_ASSERT
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

kaldi::MatrixBase::NumRows
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
Definition: kaldi-matrix.h:64

kaldi::nnet3::NnetIo::name
std::string name
the name of the input in the neural net; in simple setups it will just be "input".
Definition: nnet-example.h:36

kaldi::nnet3::ImageAugmentationConfig::horizontal_shift
BaseFloat horizontal_shift
Definition: nnet3-egs-augment-image.cc:36

kaldi::nnet3::ImageAugmentationConfig::fill_mode_string
std::string fill_mode_string
Definition: nnet3-egs-augment-image.cc:40

kaldi::nnet3::ImageAugmentationConfig::ImageAugmentationConfig
ImageAugmentationConfig()
Definition: nnet3-egs-augment-image.cc:42

kaldi::nnet3::NnetExample::io
std::vector< NnetIo > io
"io" contains the input and output.
Definition: nnet-example.h:116

kaldi::MatrixBase::IsUnit
bool IsUnit(Real cutoff=1.0e-05) const
Returns true if the matrix is all zeros, except for ones on diagonal.
Definition: kaldi-matrix.cc:1890

KALDI_LOG
#define KALDI_LOG
Definition: kaldi-error.h:153

kaldi::nnet3::kNearest
Definition: nnet3-egs-augment-image.cc:31

kaldi-common.h