diff options
Diffstat (limited to 'tesseract/src/training/unicharset/validator.h')
-rw-r--r-- | tesseract/src/training/unicharset/validator.h | 249 |
1 files changed, 249 insertions, 0 deletions
diff --git a/tesseract/src/training/unicharset/validator.h b/tesseract/src/training/unicharset/validator.h new file mode 100644 index 00000000..53ac9f33 --- /dev/null +++ b/tesseract/src/training/unicharset/validator.h @@ -0,0 +1,249 @@ +/********************************************************************** + * File: validator.h + * Description: Base class for various text validators. Intended mainly for + * scripts that use a virama character. + * Author: Ray Smith + * + * (C) Copyright 2017, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#ifndef TESSERACT_TRAINING_VALIDATOR_H_ +#define TESSERACT_TRAINING_VALIDATOR_H_ + +#include "export.h" + +#include <tesseract/unichar.h> + +#include <memory> +#include <vector> + +namespace tesseract { + +// Different kinds of grapheme normalization - not just for Indic! +// A grapheme is a syllable unit in Indic and can be several unicodes. +// In other scripts, a grapheme is a base character and accent/diacritic +// combination, as not all accented characters have a single composed form. +enum class GraphemeNormMode { + // Validation result is a single string, even if input is multi-word. + kSingleString, + // Standard unicode graphemes are validated and output as grapheme units. + kCombined, + // Graphemes are validated and sub-divided. For virama-using scripts, units + // that correspond to repeatable glyphs are generated. (Mostly single unicodes + // but viramas and joiners are paired with the most sensible neighbor.) + // For non-virama scripts, this means that base/accent pairs are separated, + // ie the output is individual unicodes. + kGlyphSplit, + // The output is always single unicodes, regardless of the script. + kIndividualUnicodes, +}; + +// An enum representing the scripts that use a virama character. It is +// guaranteed that the value of any element, (except kNonVirama) can be cast +// to a unicode (char32) value that represents the start of the unicode range +// of the corresponding script. +enum class ViramaScript : char32 { + kNonVirama = 0, + kDevanagari = 0x900, + kBengali = 0x980, + kGurmukhi = 0xa00, + kGujarati = 0xa80, + kOriya = 0xb00, + kTamil = 0xb80, + kTelugu = 0xc00, + kKannada = 0xc80, + kMalayalam = 0xd00, + kSinhala = 0xd80, + kMyanmar = 0x1000, + kKhmer = 0x1780, + kJavanese = 0xa980, +}; + +// Base class offers a validation API and protected methods to allow subclasses +// to easily build the validated/segmented output. +class TESS_UNICHARSET_TRAINING_API Validator { + public: + // Validates and cleans the src vector of unicodes to the *dest, according to + // g_mode. In the case of kSingleString, a single vector containing the whole + // result is added to *dest. With kCombined, multiple vectors are added to + // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are + // added to *dest with a smaller unit representing a glyph in each. + // In case of validation error, returns false and as much as possible of the + // input, without discarding invalid text. + static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, + bool report_errors, + const std::vector<char32>& src, + std::vector<std::vector<char32>>* dest); + + // Returns true if the unicode ch is a non-printing zero-width mark of no + // significance to OCR training or evaluation. + static bool IsZeroWidthMark(char32 ch) { + return ch == kZeroWidthSpace || ch == kLeftToRightMark || + ch == kRightToLeftMark || ch == kInvalid; + } + virtual ~Validator(); + + // Some specific but universally useful unicodes. + static const char32 kZeroWidthSpace; + static const char32 kZeroWidthNonJoiner; + static const char32 kZeroWidthJoiner; + static const char32 kLeftToRightMark; + static const char32 kRightToLeftMark; + static const char32 kInvalid; + + protected: + // These are more or less the character class identifiers in the ISCII + // standard, section 8. They have been augmented with the Unicode meta + // characters Zero Width Joiner and Zero Width Non Joiner, and the + // Unicode Vedic Marks. + // The best sources of information on Unicode and Indic scripts are: + // http://varamozhi.sourceforge.net/iscii91.pdf + // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf + // http://unicode.org/faq/indic.html + // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx + enum class CharClass { + // NOTE: The values of the enum members are meaningless and arbitrary, ie + // they are not used for sorting, or any other risky application. + // The reason they are what they are is they are a single character + // abbreviation that can be used in a regexp/BNF definition of a grammar, + // IN A COMMENT, and still not relied upon in the code. + kConsonant = 'C', + kVowel = 'V', + kVirama = 'H', // (aka Halant) + kMatra = 'M', // (aka Dependent Vowel) + kMatraPiece = 'P', // unicode provides pieces of Matras. + kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks) + kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C + kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D + kVedicMark = 'v', // Modifiers can come modify any indic syllable. + kNukta = 'N', // Occurs only immediately after consonants. + kRobat = 'R', // Khmer only. + kOther = 'O', // (digits, measures, non-Indic, etc) + // Additional classes used only by ValidateGrapheme. + kWhitespace = ' ', + kCombiner = 'c', // Combiners other than virama. + }; + using IndicPair = std::pair<CharClass, char32>; + + Validator(ViramaScript script, bool report_errors) + : script_(script), + codes_used_(0), + output_used_(0), + report_errors_(report_errors) {} + + // Factory method that understands how to map script to the right subclass. + static std::unique_ptr<Validator> ScriptValidator(ViramaScript script, + bool report_errors); + + // Internal version of the public static ValidateCleanAndSegment. + // Validates and cleans the src vector of unicodes to the *dest, according to + // its type and the given g_mode. + // In case of validation error, returns false and returns as much as possible + // of the input, without discarding invalid text. + bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, + const std::vector<char32>& src, + std::vector<std::vector<char32>>* dest); + // Moves the results from parts_ or output_ to dest according to g_mode. + void MoveResultsToDest(GraphemeNormMode g_mode, + std::vector<std::vector<char32>>* dest); + + // Computes and returns the ViramaScript corresponding to the most frequent + // virama-using script in the input, or kNonVirama if none are present. + static ViramaScript MostFrequentViramaScript( + const std::vector<char32>& utf32); + // Returns true if the given UTF-32 unicode is a "virama" character. + static bool IsVirama(char32 unicode); + // Returns true if the given UTF-32 unicode is a vedic accent. + static bool IsVedicAccent(char32 unicode); + // Returns true if the script is one that uses subscripts for conjuncts. + bool IsSubscriptScript() const; + + // Helper function appends the next element of codes_ only to output_, + // without touching parts_ + // Returns true at the end of codes_. + bool CodeOnlyToOutput() { + output_.push_back(codes_[codes_used_].second); + return ++codes_used_ == codes_.size(); + } + + // Helper function adds a length-element vector to parts_ from the last length + // elements of output_. If there are more than length unused elements in + // output_, adds unicodes as single-element vectors to parts_ to catch + // output_used_ up to output->size() - length before adding the length-element + // vector. + void MultiCodePart(unsigned length) { + while (output_used_ + length < output_.size()) { + parts_.emplace_back( + std::initializer_list<char32>{output_[output_used_++]}); + } + parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]}); + while (++output_used_ < output_.size()) { + parts_.back().push_back(output_[output_used_]); + } + } + + // Helper function appends the next element of codes_ to output_, and then + // calls MultiCodePart to add the appropriate components to parts_. + // Returns true at the end of codes_. + bool UseMultiCode(unsigned length) { + output_.push_back(codes_[codes_used_].second); + MultiCodePart(length); + return ++codes_used_ == codes_.size(); + } + + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + virtual bool ConsumeGraphemeIfValid() = 0; + // Sets codes_ to the class codes for the given unicode text. + void ComputeClassCodes(const std::vector<char32>& text); + // Returns the CharClass corresponding to the given Unicode ch. + virtual CharClass UnicodeToCharClass(char32 ch) const = 0; + // Resets to the initial state. + void Clear(); + + // Number of unicodes in each Indic codepage. + static const int kIndicCodePageSize = 128; + // Lowest unicode value of any Indic script. (Devanagari). + static const char32 kMinIndicUnicode = 0x900; + // Highest unicode value of any consistent (ISCII-based) Indic script. + static const char32 kMaxSinhalaUnicode = 0xdff; + // Highest unicode value of any virama-using script. (Khmer). + static const char32 kMaxViramaScriptUnicode = 0x17ff; + // Some special unicodes. + static const char32 kSinhalaVirama = 0xdca; + static const char32 kMyanmarVirama = 0x1039; + static const char32 kKhmerVirama = 0x17d2; + // Javanese Script - aksarajawa + static const char32 kJavaneseVirama = 0xa9c0; + static const char32 kMaxJavaneseUnicode = 0xa9df; + + // Script we are operating on. + ViramaScript script_; + // Input unicodes with assigned CharClass is the data to be validated. + std::vector<IndicPair> codes_; + // Glyph-like components of the input. + std::vector<std::vector<char32>> parts_; + // Copied validated unicodes from codes_ that are OK to output. + std::vector<char32> output_; + // The number of elements of codes_ that have been processed so far. + unsigned codes_used_; + // The number of elements of output_ that have already been added to parts_. + unsigned output_used_; + // Log error messages for reasons why text is invalid. + bool report_errors_; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATOR_H_ |