diff options
author | Thomas Deutschmann <whissi@gentoo.org> | 2021-03-30 10:59:39 +0200 |
---|---|---|
committer | Thomas Deutschmann <whissi@gentoo.org> | 2021-04-01 00:04:14 +0200 |
commit | 5ff1d6955496b3cf9a35042c9ac35db43bc336b1 (patch) | |
tree | 6d470f7eb448f59f53e8df1010aec9dad8ce1f72 /tesseract/src/wordrec/pieces.cpp | |
parent | Import Ghostscript 9.53.1 (diff) | |
download | ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.gz ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.tar.bz2 ghostscript-gpl-patches-5ff1d6955496b3cf9a35042c9ac35db43bc336b1.zip |
Import Ghostscript 9.54ghostscript-9.54
Signed-off-by: Thomas Deutschmann <whissi@gentoo.org>
Diffstat (limited to 'tesseract/src/wordrec/pieces.cpp')
-rw-r--r-- | tesseract/src/wordrec/pieces.cpp | 334 |
1 files changed, 334 insertions, 0 deletions
diff --git a/tesseract/src/wordrec/pieces.cpp b/tesseract/src/wordrec/pieces.cpp new file mode 100644 index 00000000..0d3b5efb --- /dev/null +++ b/tesseract/src/wordrec/pieces.cpp @@ -0,0 +1,334 @@ +/****************************************************************************** + * + * File: pieces.cpp (Formerly pieces.c) + * Description: + * Author: Mark Seaman, OCR Technology + * + * (c) Copyright 1987, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *****************************************************************************/ +/*---------------------------------------------------------------------- + I n c l u d e s +----------------------------------------------------------------------*/ + +#include "blobs.h" +#include "helpers.h" +#include "matrix.h" +#include "ratngs.h" +#include "seam.h" +#include "wordrec.h" + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +using tesseract::ScoredFont; + +/*---------------------------------------------------------------------- + F u n c t i o n s +----------------------------------------------------------------------*/ + +/********************************************************************** + * classify_piece + * + * Create a larger piece from a collection of smaller ones. Classify + * it and return the results. Take the large piece apart to leave + * the collection of small pieces un modified. + **********************************************************************/ +namespace tesseract { +BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams, + int16_t start, + int16_t end, + const char* description, + TWERD *word, + BlamerBundle *blamer_bundle) { + if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end); + BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description, + ScrollView::WHITE, blamer_bundle); + // Set the matrix_cell_ entries in all the BLOB_CHOICES. + BLOB_CHOICE_IT bc_it(choices); + for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { + bc_it.data()->set_matrix_cell(start, end); + } + + if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end); + + return (choices); +} + +template<class BLOB_CHOICE> +int SortByUnicharID(const void *void1, const void *void2) { + const BLOB_CHOICE *p1 = *static_cast<const BLOB_CHOICE *const *>(void1); + const BLOB_CHOICE *p2 = *static_cast<const BLOB_CHOICE *const *>(void2); + + return p1->unichar_id() - p2->unichar_id(); +} + +template<class BLOB_CHOICE> +int SortByRating(const void *void1, const void *void2) { + const BLOB_CHOICE *p1 = *static_cast<const BLOB_CHOICE *const *>(void1); + const BLOB_CHOICE *p2 = *static_cast<const BLOB_CHOICE *const *>(void2); + + if (p1->rating() < p2->rating()) + return 1; + return -1; +} + + +/********************************************************************** + * fill_filtered_fragment_list + * + * Filter the fragment list so that the filtered_choices only contain + * fragments that are in the correct position. choices is the list + * that we are going to filter. fragment_pos is the position in the + * fragment that we are looking for and num_frag_parts is the the + * total number of pieces. The result will be appended to + * filtered_choices. + **********************************************************************/ +void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, + int fragment_pos, + int num_frag_parts, + BLOB_CHOICE_LIST *filtered_choices) { + BLOB_CHOICE_IT filtered_choices_it(filtered_choices); + BLOB_CHOICE_IT choices_it(choices); + + for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); + choices_it.forward()) { + UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); + const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id); + + if (frag != nullptr && frag->get_pos() == fragment_pos && + frag->get_total() == num_frag_parts) { + // Recover the unichar_id of the unichar that this fragment is + // a part of + auto *b = new BLOB_CHOICE(*choices_it.data()); + int original_unichar = unicharset.unichar_to_id(frag->get_unichar()); + b->set_unichar_id(original_unichar); + filtered_choices_it.add_to_end(b); + } + } + + filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>); +} + + +/********************************************************************** + * merge_and_put_fragment_lists + * + * Merge the fragment lists in choice_lists and append it to the + * ratings matrix. + **********************************************************************/ +void Wordrec::merge_and_put_fragment_lists(int16_t row, int16_t column, + int16_t num_frag_parts, + BLOB_CHOICE_LIST *choice_lists, + MATRIX *ratings) { + auto *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts]; + + for (int i = 0; i < num_frag_parts; i++) { + choice_lists_it[i].set_to_list(&choice_lists[i]); + choice_lists_it[i].mark_cycle_pt(); + } + + BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column); + if (merged_choice == nullptr) + merged_choice = new BLOB_CHOICE_LIST; + + bool end_of_list = false; + BLOB_CHOICE_IT merged_choice_it(merged_choice); + while (!end_of_list) { + // Find the maximum unichar_id of the current entry the iterators + // are pointing at + UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id(); + for (int i = 0; i < num_frag_parts; i++) { + UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); + if (max_unichar_id < unichar_id) { + max_unichar_id = unichar_id; + } + } + + // Move the each iterators until it gets to an entry that has a + // value greater than or equal to max_unichar_id + for (int i = 0; i < num_frag_parts; i++) { + UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); + while (!choice_lists_it[i].cycled_list() && + unichar_id < max_unichar_id) { + choice_lists_it[i].forward(); + unichar_id = choice_lists_it[i].data()->unichar_id(); + } + if (choice_lists_it[i].cycled_list()) { + end_of_list = true; + break; + } + } + + if (end_of_list) + break; + + // Checks if the fragments are parts of the same character + UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id(); + bool same_unichar = true; + for (int i = 1; i < num_frag_parts; i++) { + UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); + if (unichar_id != first_unichar_id) { + same_unichar = false; + break; + } + } + + if (same_unichar) { + // Add the merged character to the result + UNICHAR_ID merged_unichar_id = first_unichar_id; + auto merged_fonts = choice_lists_it[0].data()->fonts(); + float merged_min_xheight = choice_lists_it[0].data()->min_xheight(); + float merged_max_xheight = choice_lists_it[0].data()->max_xheight(); + float positive_yshift = 0, negative_yshift = 0; + int merged_script_id = choice_lists_it[0].data()->script_id(); + BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier(); + + float merged_rating = 0, merged_certainty = 0; + for (int i = 0; i < num_frag_parts; i++) { + float rating = choice_lists_it[i].data()->rating(); + float certainty = choice_lists_it[i].data()->certainty(); + + if (i == 0 || certainty < merged_certainty) + merged_certainty = certainty; + merged_rating += rating; + + choice_lists_it[i].forward(); + if (choice_lists_it[i].cycled_list()) + end_of_list = true; + IntersectRange(choice_lists_it[i].data()->min_xheight(), + choice_lists_it[i].data()->max_xheight(), + &merged_min_xheight, &merged_max_xheight); + float yshift = choice_lists_it[i].data()->yshift(); + if (yshift > positive_yshift) positive_yshift = yshift; + if (yshift < negative_yshift) negative_yshift = yshift; + // Use the min font rating over the parts. + // TODO(rays) font lists are unsorted. Need to be faster? + const auto& frag_fonts = choice_lists_it[i].data()->fonts(); + for (auto frag_font : frag_fonts) { + int merged_f = 0; + for (; merged_f < merged_fonts.size() && + merged_fonts[merged_f].fontinfo_id != frag_font.fontinfo_id; + ++merged_f) {} + if (merged_f == merged_fonts.size()) { + merged_fonts.push_back(frag_font); + } else if (merged_fonts[merged_f].score > frag_font.score) { + merged_fonts[merged_f].score = frag_font.score; + } + } + } + + float merged_yshift = positive_yshift != 0 + ? (negative_yshift != 0 ? 0 : positive_yshift) + : negative_yshift; + auto* choice = new BLOB_CHOICE(merged_unichar_id, + merged_rating, + merged_certainty, + merged_script_id, + merged_min_xheight, + merged_max_xheight, + merged_yshift, + classifier); + choice->set_fonts(merged_fonts); + merged_choice_it.add_to_end(choice); + } + } + + if (classify_debug_level) + print_ratings_list("Merged Fragments", merged_choice, + unicharset); + + if (merged_choice->empty()) + delete merged_choice; + else + ratings->put(row, column, merged_choice); + + delete [] choice_lists_it; +} + +/********************************************************************** + * get_fragment_lists + * + * Recursively go through the ratings matrix to find lists of fragments + * to be merged in the function merge_and_put_fragment_lists. + * current_frag is the position of the piece we are looking for. + * current_row is the row in the rating matrix we are currently at. + * start is the row we started initially, so that we can know where + * to append the results to the matrix. num_frag_parts is the total + * number of pieces we are looking for and num_blobs is the size of the + * ratings matrix. + **********************************************************************/ +void Wordrec::get_fragment_lists(int16_t current_frag, int16_t current_row, + int16_t start, int16_t num_frag_parts, + int16_t num_blobs, MATRIX *ratings, + BLOB_CHOICE_LIST *choice_lists) { + if (current_frag == num_frag_parts) { + merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts, + choice_lists, ratings); + return; + } + + for (int16_t x = current_row; x < num_blobs; x++) { + BLOB_CHOICE_LIST *choices = ratings->get(current_row, x); + if (choices == nullptr) + continue; + + fill_filtered_fragment_list(choices, current_frag, num_frag_parts, + &choice_lists[current_frag]); + if (!choice_lists[current_frag].empty()) { + get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts, + num_blobs, ratings, choice_lists); + choice_lists[current_frag].clear(); + } + } +} + + +/********************************************************************** + * merge_fragments + * + * Try to merge fragments in the ratings matrix and put the result in + * the corresponding row and column + **********************************************************************/ +void Wordrec::merge_fragments(MATRIX *ratings, int16_t num_blobs) { + BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks]; + for (int16_t start = 0; start < num_blobs; start++) { + for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks; + frag_parts++) { + get_fragment_lists(0, start, start, frag_parts, num_blobs, + ratings, choice_lists); + } + } + + // Delete fragments from the rating matrix + for (int16_t x = 0; x < num_blobs; x++) { + for (int16_t y = x; y < num_blobs; y++) { + BLOB_CHOICE_LIST *choices = ratings->get(x, y); + if (choices != nullptr) { + BLOB_CHOICE_IT choices_it(choices); + for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); + choices_it.forward()) { + UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); + const CHAR_FRAGMENT *frag = + unicharset.get_fragment(choice_unichar_id); + if (frag != nullptr) + delete choices_it.extract(); + } + } + } + } +} + + +} // namespace tesseract |