diff options
Diffstat (limited to 'tesseract/src/ccstruct/rejctmap.h')
-rw-r--r-- | tesseract/src/ccstruct/rejctmap.h | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/tesseract/src/ccstruct/rejctmap.h b/tesseract/src/ccstruct/rejctmap.h new file mode 100644 index 00000000..5ae37fa6 --- /dev/null +++ b/tesseract/src/ccstruct/rejctmap.h @@ -0,0 +1,269 @@ +/********************************************************************** + * File: rejctmap.h (Formerly rejmap.h) + * Description: REJ and REJMAP class functions. + * Author: Phil Cheatle + * Created: Thu Jun 9 13:46:38 BST 1994 + * + * (C) Copyright 1994, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + +This module may look unnecessarily verbose, but here's the philosophy... + +ALL processing of the reject map is done in this module. There are lots of +separate calls to set reject/accept flags. These have DELIBERATELY been kept +distinct so that this module can decide what to do. + +Basically, there is a flag for each sort of rejection or acceptance. This +provides a history of what has happened to EACH character. + +Determining whether a character is CURRENTLY rejected depends on implicit +understanding of the SEQUENCE of possible calls. The flags are defined and +grouped in the REJ_FLAGS enum. These groupings are used in determining a +characters CURRENT rejection status. Basically, a character is ACCEPTED if + + none of the permanent rej flags are set + AND ( the character has never been rejected + OR an accept flag is set which is LATER than the latest reject flag ) + +IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE +OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! +**********************************************************************/ + +#ifndef REJCTMAP_H +#define REJCTMAP_H + +#include "bits16.h" +#include "errcode.h" +#include "params.h" + +#include <memory> + +namespace tesseract { + +enum REJ_FLAGS { + /* Reject modes which are NEVER overridden */ + R_TESS_FAILURE, // PERM Tess didn't classify + R_SMALL_XHT, // PERM Xht too small + R_EDGE_CHAR, // PERM Too close to edge of image + R_1IL_CONFLICT, // PERM 1Il confusion + R_POSTNN_1IL, // PERM 1Il unrejected by NN + R_REJ_CBLOB, // PERM Odd blob + R_MM_REJECT, // PERM Matrix match rejection (m's) + R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend + + /* Initial reject modes (pre NN_ACCEPT) */ + R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) + R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD + R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD + R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD + + /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */ + R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop + R_DUBIOUS, // TEMP Post NN dodgy chars + R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN + R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest + R_XHT_FIXUP, // TEMP Xht tests unsure + + /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */ + R_BAD_QUALITY, // TEMP Quality metrics bad for WERD + + /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accep*/ + R_DOC_REJ, // TEMP Document rejection + R_BLOCK_REJ, // TEMP Block rejection + R_ROW_REJ, // TEMP Row rejection + R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space + + /* Accept modes which occur between the above rejection groups */ + R_NN_ACCEPT, // NN acceptance + R_HYPHEN_ACCEPT, // Hyphen acceptance + R_MM_ACCEPT, // Matrix match acceptance + R_QUALITY_ACCEPT, // Accept word in good quality doc + R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess failures +}; + +/* REJECT MAP VALUES */ + +#define MAP_ACCEPT '1' +#define MAP_REJECT_PERM '0' +#define MAP_REJECT_TEMP '2' +#define MAP_REJECT_POTENTIAL '3' + +class REJ +{ + BITS16 flags1; + BITS16 flags2; + + void set_flag(REJ_FLAGS rej_flag) { + if (rej_flag < 16) + flags1.set(rej_flag); + else + flags2.set(rej_flag - 16); + } + + bool rej_before_nn_accept(); + bool rej_between_nn_and_mm(); + bool rej_between_mm_and_quality_accept(); + bool rej_between_quality_and_minimal_rej_accept(); + bool rej_before_mm_accept(); + bool rej_before_quality_accept(); + + public: + REJ() = default; + + REJ( //classwise copy + const REJ &source) { + flags1 = source.flags1; + flags2 = source.flags2; + } + + REJ & operator= ( //assign REJ + const REJ & source) { //from this + flags1 = source.flags1; + flags2 = source.flags2; + return *this; + } + + bool flag(REJ_FLAGS rej_flag) { + if (rej_flag < 16) + return flags1[rej_flag]; + else + return flags2[rej_flag - 16]; + } + + char display_char() { + if (perm_rejected ()) + return MAP_REJECT_PERM; + else if (accept_if_good_quality ()) + return MAP_REJECT_POTENTIAL; + else if (rejected ()) + return MAP_REJECT_TEMP; + else + return MAP_ACCEPT; + } + + bool perm_rejected(); //Is char perm reject? + + bool rejected(); //Is char rejected? + + bool accepted() { //Is char accepted? + return !rejected (); + } + + //potential rej? + bool accept_if_good_quality(); + + bool recoverable() { + return (rejected () && !perm_rejected ()); + } + + void setrej_tess_failure(); //Tess generated blank + void setrej_small_xht(); //Small xht char/wd + void setrej_edge_char(); //Close to image edge + void setrej_1Il_conflict(); //Initial reject map + void setrej_postNN_1Il(); //1Il after NN + void setrej_rej_cblob(); //Insert duff blob + void setrej_mm_reject(); //Matrix matcher + //Odd repeated char + void setrej_bad_repetition(); + void setrej_poor_match(); //Failed Rays heuristic + //TEMP reject_word + void setrej_not_tess_accepted(); + //TEMP reject_word + void setrej_contains_blanks(); + void setrej_bad_permuter(); //POTENTIAL reject_word + void setrej_hyphen(); //PostNN dubious hyph or . + void setrej_dubious(); //PostNN dubious limit + void setrej_no_alphanums(); //TEMP reject_word + void setrej_mostly_rej(); //TEMP reject_word + void setrej_xht_fixup(); //xht fixup + void setrej_bad_quality(); //TEMP reject_word + void setrej_doc_rej(); //TEMP reject_word + void setrej_block_rej(); //TEMP reject_word + void setrej_row_rej(); //TEMP reject_word + void setrej_unlv_rej(); //TEMP reject_word + void setrej_nn_accept(); //NN Flipped a char + void setrej_hyphen_accept(); //Good aspect ratio + void setrej_mm_accept(); //Matrix matcher + //Quality flip a char + void setrej_quality_accept(); + //Accept all except blank + void setrej_minimal_rej_accept(); + + void full_print(FILE *fp); +}; + +class REJMAP +{ + std::unique_ptr<REJ[]> ptr; // ptr to the chars + int16_t len; //Number of chars + + public: + REJMAP() : len(0) {} + + REJMAP(const REJMAP &rejmap) { *this = rejmap; } + + REJMAP &operator=(const REJMAP &source); + + // Sets up the ptr array to length, whatever it was before. + void initialise(int16_t length); + + REJ &operator[]( // access function + int16_t index) const // map index + { + ASSERT_HOST(index < len); + return ptr[index]; // no bounds checks + } + + int32_t length() const { //map length + return len; + } + + int16_t accept_count(); //How many accepted? + + int16_t reject_count() { //How many rejects? + return len - accept_count (); + } + + void remove_pos( //Cut out an element + int16_t pos); //element to remove + + void print(FILE *fp); + + void full_print(FILE *fp); + + bool recoverable_rejects(); //Any non perm rejs? + + bool quality_recoverable_rejects(); + //Any potential rejs? + + void rej_word_small_xht(); //Reject whole word + //Reject whole word + void rej_word_tess_failure(); + void rej_word_not_tess_accepted(); + //Reject whole word + //Reject whole word + void rej_word_contains_blanks(); + //Reject whole word + void rej_word_bad_permuter(); + void rej_word_xht_fixup(); //Reject whole word + //Reject whole word + void rej_word_no_alphanums(); + void rej_word_mostly_rej(); //Reject whole word + void rej_word_bad_quality(); //Reject whole word + void rej_word_doc_rej(); //Reject whole word + void rej_word_block_rej(); //Reject whole word + void rej_word_row_rej(); //Reject whole word +}; + +} // namespace tesseract + +#endif |