#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "hmm/transition-model.h"
#include "nnet3/nnet-example.h"
#include "nnet3/nnet-example-utils.h"

Include dependency graph for nnet3-egs-augment-image.cc:

Go to the source code of this file.

Classes
struct	ImageAugmentationConfig

Namespaces
	kaldi
	This code computes Goodness of Pronunciation (GOP) and extracts phone-level pronunciation features for mispronunciation detection tasks; the reference:

	kaldi::nnet3

Enumerations
enum	FillMode { kNearest, kReflect }

Functions
void	ApplyAffineTransform (MatrixBase< BaseFloat > &transform, int32 num_channels, MatrixBase< BaseFloat > *image, FillMode fill_mode)
	This function applies a geometric transformation 'transform' to the image. More...

void	PerturbImage (const ImageAugmentationConfig &config, MatrixBase< BaseFloat > *image)
	This function randomly modifies (perturbs) the image by applying different geometric transformations according to the options in 'config'. More...

void	PerturbImageInNnetExample (const ImageAugmentationConfig &config, NnetExample *eg)
	This function does image perturbation as directed by 'config'. The example 'eg' is expected to contain a NnetIo member with the name 'input', representing an image. More...

int	main (int argc, char *argv[])

Function Documentation

◆ main()

int main	(	int	argc,
		char *	argv[]
	)

Definition at line 331 of file nnet3-egs-augment-image.cc.

References SequentialTableReader< Holder >::Done(), ParseOptions::GetArg(), KALDI_LOG, SequentialTableReader< Holder >::Key(), SequentialTableReader< Holder >::Next(), ParseOptions::NumArgs(), kaldi::nnet3::PerturbImageInNnetExample(), ParseOptions::PrintUsage(), ParseOptions::Read(), ImageAugmentationConfig::Register(), ParseOptions::Register(), SequentialTableReader< Holder >::Value(), and TableWriter< Holder >::Write().

                                  {
   try {
     using namespace kaldi;
     using namespace kaldi::nnet3;
     typedef kaldi::int32 int32;
     typedef kaldi::int64 int64;
 
     const char *usage =
         "Copy examples (single frames or fixed-size groups of frames) for neural\n"
         "network training, doing image augmentation inline (copies after possibly\n"
         "modifying of each image, randomly chosen according to configuration\n"
         "parameters).\n"
         "E.g.:\n"
         "  nnet3-egs-augment-image --horizontal-flip-prob=0.5 --horizontal-shift=0.1\\\n"
         "       --vertical-shift=0.1 --srand=103 --num-channels=3 --fill-mode=nearest ark:- ark:-\n"
         "\n"
         "Requires that each eg contain a NnetIo object 'input', with successive\n"
         "'t' values representing different x offsets , and the feature dimension\n"
         "representing the y offset and the channel (color), with the channel\n"
         "varying the fastest.\n"
         "See also: nnet3-copy-egs\n";
 
 
     int32 srand_seed = 0;
 
     ImageAugmentationConfig config;
 
     ParseOptions po(usage);
     po.Register("srand", &srand_seed, "Seed for the random number generator");
 
     config.Register(&po);
 
     po.Read(argc, argv);
 
     srand(srand_seed);
 
     if (po.NumArgs() < 2) {
       po.PrintUsage();
       exit(1);
     }
 
 
     std::string examples_rspecifier = po.GetArg(1),
         examples_wspecifier = po.GetArg(2);
 
     SequentialNnetExampleReader example_reader(examples_rspecifier);
     NnetExampleWriter example_writer(examples_wspecifier);
 
 
     int64 num_done = 0;
     for (; !example_reader.Done(); example_reader.Next(), num_done++) {
       std::string key = example_reader.Key();
       NnetExample eg(example_reader.Value());
       PerturbImageInNnetExample(config, &eg);
       example_writer.Write(key, eg);
     }
     KALDI_LOG << "Perturbed " << num_done << " neural-network training images.";
     return (num_done == 0 ? 1 : 0);
   } catch(const std::exception &e) {
     std::cerr << e.what() << '\n';
     return -1;
   }
 }

Classes

Namespaces

Enumerations

Functions

Function Documentation

◆ main()