44 horizontal_flip_prob(0.0),
45 horizontal_shift(0.0),
49 fill_mode_string(
"nearest") { }
53 po->
Register(
"num-channels", &num_channels,
"Number of colors in the image." 54 "It is important to specify this (helps interpret the image " 56 po->
Register(
"horizontal-flip-prob", &horizontal_flip_prob,
57 "Probability of doing horizontal flip");
58 po->
Register(
"horizontal-shift", &horizontal_shift,
59 "Maximum allowed horizontal shift as proportion of image " 60 "width. Padding is with closest pixel.");
61 po->
Register(
"vertical-shift", &vertical_shift,
62 "Maximum allowed vertical shift as proportion of image " 63 "height. Padding is with closest pixel.");
64 po->
Register(
"rotation-degree", &rotation_degree,
65 "Maximum allowed degree to rotate the image");
66 po->
Register(
"rotation-prob", &rotation_prob,
67 "Probability of doing rotation");
68 po->
Register(
"fill-mode", &fill_mode_string,
"Mode for dealing with " 69 "points outside the image boundary when applying transformation. " 70 "Choices = {nearest, reflect}");
76 horizontal_flip_prob <= 1);
77 KALDI_ASSERT(horizontal_shift >= 0 && horizontal_shift <= 1);
78 KALDI_ASSERT(vertical_shift >= 0 && vertical_shift <= 1);
79 KALDI_ASSERT(rotation_degree >=0 && rotation_degree <= 180);
81 KALDI_ASSERT(fill_mode_string ==
"nearest" || fill_mode_string ==
"reflect");
86 if (fill_mode_string ==
"reflect") {
89 if (fill_mode_string !=
"nearest") {
90 KALDI_ERR <<
"Choices for --fill-mode are 'nearest' or 'reflect', got: " 120 for (
int32 r = 0; r < width; r++) {
121 for (
int32 c = 0; c < height; c++) {
125 transform(0, 1) * c + transform(0, 2);
127 transform(1, 1) * c + transform(1, 2);
139 BaseFloat weight_11 = (r2 - r_old) * (c2 - c_old),
140 weight_12 = (r2 - r_old) * (c_old - c1),
141 weight_21 = (r_old - r1) * (c2 - c_old),
142 weight_22 = (r_old - r1) * (c_old - c1);
151 if (r1 >= width) r1 = width - 1;
159 if (c1 >= height) c1 = height - 1;
165 if (r2 < 0) r2 = - r2;
168 r2 = 2 * width - 2 - r2;
169 if (r1 >= width) r1 = 2 * width - 2 - r1;
173 if (c2 < 0) c2 = -c2;
176 c2 = 2 * height - 2 - c2;
177 if (c1 >= height) c1 = 2 * height - 2 - c1;
182 BaseFloat p11 = original_image(r1, num_channels * c1 + ch),
183 p12 = original_image(r1, num_channels * c2 + ch),
184 p21 = original_image(r2, num_channels * c1 + ch),
185 p22 = original_image(r2, num_channels * c2 + ch);
186 (*image)(r, num_channels * c + ch) = weight_11 * p11 + weight_12 * p12 +
187 weight_21 * p21 + weight_22 * p22;
213 KALDI_ERR <<
"Number of columns in image must divide the number " 231 shift_mat(0, 2) = round(horizontal_shift);
232 shift_mat(1, 2) = round(vertical_shift);
236 shift_mat(0, 0) = -1.0;
247 rotation_mat(0, 0) = cos(theta);
248 rotation_mat(0, 1) = -sin(theta);
249 rotation_mat(1, 0) = sin(theta);
250 rotation_mat(1, 1) = cos(theta);
273 if (transform_mat.
IsUnit())
282 set_origin_mat(0, 2) = image_width / 2.0 - 0.5;
283 set_origin_mat(1, 2) = image_height / 2.0 - 0.5;
286 reset_origin_mat(0, 2) = -image_width / 2.0 + 0.5;
287 reset_origin_mat(1, 2) = -image_height / 2.0 + 0.5;
306 bool found_input =
false;
307 for (
int32 i = 0;
i < io_size;
i++) {
309 if (io.
name ==
"input") {
324 KALDI_ERR <<
"Nnet example to perturb had no NnetIo object named 'input'";
331 int main(
int argc,
char *argv[]) {
333 using namespace kaldi;
336 typedef kaldi::int64 int64;
339 "Copy examples (single frames or fixed-size groups of frames) for neural\n" 340 "network training, doing image augmentation inline (copies after possibly\n" 341 "modifying of each image, randomly chosen according to configuration\n" 344 " nnet3-egs-augment-image --horizontal-flip-prob=0.5 --horizontal-shift=0.1\\\n" 345 " --vertical-shift=0.1 --srand=103 --num-channels=3 --fill-mode=nearest ark:- ark:-\n" 347 "Requires that each eg contain a NnetIo object 'input', with successive\n" 348 "'t' values representing different x offsets , and the feature dimension\n" 349 "representing the y offset and the channel (color), with the channel\n" 350 "varying the fastest.\n" 351 "See also: nnet3-copy-egs\n";
354 int32 srand_seed = 0;
359 po.
Register(
"srand", &srand_seed,
"Seed for the random number generator");
373 std::string examples_rspecifier = po.
GetArg(1),
374 examples_wspecifier = po.
GetArg(2);
381 for (; !example_reader.
Done(); example_reader.
Next(), num_done++) {
382 std::string key = example_reader.
Key();
385 example_writer.
Write(key, eg);
387 KALDI_LOG <<
"Perturbed " << num_done <<
" neural-network training images.";
388 return (num_done == 0 ? 1 : 0);
389 }
catch(
const std::exception &e) {
390 std::cerr << e.what() <<
'\n';
NnetExample is the input data and corresponding label (or labels) for one or more frames of input...
This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation feature for...
FillMode GetFillMode() const
float RandUniform(struct RandomState *state=NULL)
Returns a random number strictly between 0 and 1.
void GetMatrix(Matrix< BaseFloat > *mat) const
Outputs the contents as a matrix.
MatrixIndexT NumCols() const
Returns number of columns (or zero for empty matrix).
int main(int argc, char *argv[])
Base class which provides matrix operations not involving resizing or allocation. ...
void PrintUsage(bool print_command_line=false)
Prints the usage documentation [provided in the constructor].
BaseFloat rotation_degree
bool WithProb(BaseFloat prob, struct RandomState *state)
A templated class for writing objects to an archive or script file; see The Table concept...
GeneralMatrix features
The features or labels.
BaseFloat horizontal_flip_prob
void SetUnit()
Sets to zero, except ones along diagonal [for non-square matrices too].
void Write(const std::string &key, const T &value) const
void Register(const std::string &name, bool *ptr, const std::string &doc)
void PerturbImageInNnetExample(const ImageAugmentationConfig &config, NnetExample *eg)
This function does image perturbation as directed by 'config'. The example 'eg' is expected to contain...
The class ParseOptions is for parsing command-line options; see Parsing command-line options for more...
A templated class for reading objects sequentially from an archive or script file; see The Table conc...
void AddMatMat(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const MatrixBase< Real > &B, MatrixTransposeType transB, const Real beta)
int Read(int argc, const char *const *argv)
Parses the command line options and fills the ParseOptions-registered variables.
void PerturbImage(const ImageAugmentationConfig &config, MatrixBase< BaseFloat > *image)
This function randomly modifies (perturbs) the image by applying different geometric transformations ...
void AddMatMatMat(const Real alpha, const MatrixBase< Real > &A, MatrixTransposeType transA, const MatrixBase< Real > &B, MatrixTransposeType transB, const MatrixBase< Real > &C, MatrixTransposeType transC, const Real beta)
this <-- beta*this + alpha*A*B*C.
std::string GetArg(int param) const
Returns one of the positional parameters; 1-based indexing for argc/argv compatibility.
void Register(ParseOptions *po)
int NumArgs() const
Number of positional parameters (c.f. argc-1).
void ApplyAffineTransform(MatrixBase< BaseFloat > &transform, int32 num_channels, MatrixBase< BaseFloat > *image, FillMode fill_mode)
This function applies a geometric transformation 'transform' to the image.
#define KALDI_ASSERT(cond)
MatrixIndexT NumRows() const
Returns number of rows (or zero for empty matrix).
std::string name
the name of the input in the neural net; in simple setups it will just be "input".
BaseFloat horizontal_shift
std::string fill_mode_string
ImageAugmentationConfig()
std::vector< NnetIo > io
"io" contains the input and output.
bool IsUnit(Real cutoff=1.0e-05) const
Returns true if the matrix is all zeros, except for ones on diagonal.