Diffstat (limited to 'tesseract/unittest')
82 files changed, 14481 insertions, 0 deletions
diff --git a/tesseract/unittest/README.md b/tesseract/unittest/README.md new file mode 100644 index 00000000..bf4f83fe --- /dev/null +++ b/tesseract/unittest/README.md @@ -0,0 +1,88 @@ +# Unit Testing for Tesseract + + +## Requirements + +### Files and structure +``` + +├── langdata_lstm +│ ├── common.punc +│ ├── common.unicharambigs +│ ├── desired_bigrams.txt +│ ├── eng +│ │ ├── desired_characters +│ │ ├── eng.config +│ │ ├── eng.numbers +│ │ ├── eng.punc +│ │ ├── eng.singles_text +│ │ ├── eng.training_text +│ │ ├── eng.unicharambigs +│ │ ├── eng.wordlist +│ │ └── okfonts.txt +│ ├── extended +│ │ └── extended.config +│ ├── extendedhin +│ │ └── extendedhin.config +│ ├── font_properties +│ ├── forbidden_characters_default +│ ├── hin +│ │ ├── hin.config +│ │ ├── hin.numbers +│ │ ├── hin.punc +│ │ └── hin.wordlist +│ ├── kan +│ │ └── kan.config +│ ├── kor +│ │ └── kor.config +│ ├── osd +│ │ └── osd.unicharset +│ └── radical-stroke.txt +├── tessdata +│ ├── ara.traineddata +│ ├── chi_tra.traineddata +│ ├── eng.traineddata +│ ├── heb.traineddata +│ ├── hin.traineddata +│ ├── jpn.traineddata +│ ├── kmr.traineddata +│ ├── osd.traineddata +│ └── vie.traineddata +├── tessdata_best +│ ├── eng.traineddata +│ ├── fra.traineddata +│ ├── kmr.traineddata +│ └── osd.traineddata +├── tessdata_fast +│ ├── eng.traineddata +│ ├── kmr.traineddata +│ ├── osd.traineddata +│ └── script +│ └── Latin.traineddata +└── tesseract + ├── abseil + ... + ├── test + ├── unittest + └── VERSION +``` + +### Fonts + +* Microsoft fonts: arialbi.ttf, times.ttf, verdana.ttf - [installation guide](https://www.makeuseof.com/tag/how-to-install-microsoft-core-fonts-in-ubuntu-linux/) +* [ae_Arab.ttf](https://www.wfonts.com/download/data/2014/12/03/ae-arab/ae-arab.zip) +* dejavu-fonts: [DejaVuSans-ExtraLight.ttf](https://dejavu-fonts.github.io/Download.html) +* [Lohit-Hindi.ttf](https://raw.githubusercontent.com/pratul/packageofpractices/master/assets/fonts/Lohit-Hindi.ttf) +* [UnBatang.ttf](https://raw.githubusercontent.com/byrongibson/fonts/master/backup/truetype.original/unfonts-core/UnBatang.ttf) + + +## Run tests + +To run the tests, do the following in the tesseract folder: + +``` +autoreconf -fiv +git submodule update --init +export TESSDATA_PREFIX=/prefix/to/path/to/tessdata +make check +```
diff --git a/tesseract/unittest/apiexample_test.cc b/tesseract/unittest/apiexample_test.cc new file mode 100644 index 00000000..5a721fa3 --- /dev/null +++ b/tesseract/unittest/apiexample_test.cc @@ -0,0 +1,119 @@ +/////////////////////////////////////////////////////////////////////// +// File: apiexample_test.cc +// Description: API test for Tesseract using test fixtures and parameters. +// Tests for Devanagari, Latin and Arabic scripts are disabled by default. +// Disabled tests can be run when required by using the +// --gtest_also_run_disabled_tests argument. +// ./unittest/apiexample_test --gtest_also_run_disabled_tests +// +// Author: ShreeDevi Kumar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+/////////////////////////////////////////////////////////////////////// + +// expects clone of tessdata_fast repo in ../../tessdata_fast + +//#include "log.h" +#include <time.h> +#include <fstream> +#include <iostream> +#include <locale> +#include <memory> // std::unique_ptr +#include <string> +#include <tesseract/baseapi.h> +#include "include_gunit.h" +#include "allheaders.h" + +namespace tesseract { + +class QuickTest : public testing::Test { + protected: + virtual void SetUp() { start_time_ = time(nullptr); } + virtual void TearDown() { +#ifndef NDEBUG + // Debug builds can be very slow, so allow 4 min for OCR of a test image. + // apitest_example including disabled tests takes about 18 min on ARMv7. + const time_t MAX_SECONDS_FOR_TEST = 240; +#else + // Release builds typically need less than 10 s for OCR of a test image, + // apitest_example including disabled tests takes about 90 s on ARMv7. + const time_t MAX_SECONDS_FOR_TEST = 55; +#endif + const time_t end_time = time(nullptr); + EXPECT_TRUE(end_time - start_time_ <= MAX_SECONDS_FOR_TEST) + << "The test took too long - " + << ::testing::PrintToString(end_time - start_time_); + } + time_t start_time_; +}; + +void OCRTester(const char* imgname, const char* groundtruth, + const char* tessdatadir, const char* lang) { + // log.info() << tessdatadir << " for language: " << lang << std::endl; + char* outText; + std::locale loc("C"); // You can also use "" for the default system locale + std::ifstream file(groundtruth); + file.imbue(loc); // Use it for file input + std::string gtText((std::istreambuf_iterator<char>(file)), + std::istreambuf_iterator<char>()); + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + outText = api->GetUTF8Text(); + EXPECT_EQ(gtText, outText) + << "Phototest.tif OCR does not match ground truth for " + << ::testing::PrintToString(lang); + api->End(); + delete[] outText; + pixDestroy(&image); +} + +class MatchGroundTruth : public QuickTest, + public ::testing::WithParamInterface<const char*> {}; + +TEST_P(MatchGroundTruth, FastPhototestOCR) { + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", + TESSDATA_DIR "_fast", GetParam()); +} + +TEST_P(MatchGroundTruth, BestPhototestOCR) { + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", + TESSDATA_DIR "_best", GetParam()); +} + +TEST_P(MatchGroundTruth, TessPhototestOCR) { + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", + TESSDATA_DIR, GetParam()); +} + +INSTANTIATE_TEST_SUITE_P(Eng, MatchGroundTruth, ::testing::Values("eng")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth, + ::testing::Values("script/Latin")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth, + ::testing::Values("script/Devanagari")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth, + ::testing::Values("script/Arabic")); + +class EuroText : public QuickTest {}; + +TEST_F(EuroText, FastLatinOCR) { + OCRTester(TESTING_DIR "/eurotext.tif", TESTING_DIR "/eurotext.txt", + TESSDATA_DIR "_fast", "script/Latin"); +} + +// script/Latin for eurotext.tif does not match groundtruth +// for tessdata & tessdata_best. +// so do not test these here. 
+ +} // namespace diff --git a/tesseract/unittest/applybox_test.cc b/tesseract/unittest/applybox_test.cc new file mode 100644 index 00000000..055172d7 --- /dev/null +++ b/tesseract/unittest/applybox_test.cc @@ -0,0 +1,128 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "boxread.h" +#include "rect.h" +#include <tesseract/resultiterator.h> + +#include "include_gunit.h" + +namespace tesseract { + +const char* kTruthTextWords = "To simple burn running of goods lately.\n"; +const char* kTruthTextLine = "Tosimpleburnrunningofgoodslately.\n"; + +// The fixture for testing Tesseract. +class ApplyBoxTest : public testing::Test { + protected: + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); + } + std::string TessdataPath() { return TESSDATA_DIR; } + + ApplyBoxTest() { src_pix_ = nullptr; } + ~ApplyBoxTest() { pixDestroy(&src_pix_); } + + bool SetImage(const char* filename) { + bool found = false; + pixDestroy(&src_pix_); + src_pix_ = pixRead(TestDataNameToPath(filename).c_str()); + if (api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) { + api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK); + api_.SetImage(src_pix_); + api_.SetVariable("tessedit_make_boxes_from_boxes", "1"); + api_.SetInputName(TestDataNameToPath(filename).c_str()); + found = true; + } + return found; + } + + // Runs ApplyBoxes (via setting the appropriate variables and Recognize) + // and checks that the output ocr text matches the truth_str, and that + // the boxes match the given box file well enough. + // If line_mode is true, ApplyBoxes is run in line segmentation mode, + // otherwise the input box file is assumed to have character-level boxes. + void VerifyBoxesAndText(const char* imagefile, const char* truth_str, + const char* target_box_file, bool line_mode) { + if (!SetImage(imagefile)) { + // eng.traineddata not found or other problem during Init. + GTEST_SKIP(); + return; + } + if (line_mode) + api_.SetVariable("tessedit_resegment_from_line_boxes", "1"); + else + api_.SetVariable("tessedit_resegment_from_boxes", "1"); + api_.Recognize(nullptr); + char* ocr_text = api_.GetUTF8Text(); + EXPECT_STREQ(truth_str, ocr_text); + delete[] ocr_text; + // Test the boxes by reading the target box file in parallel with the + // bounding boxes in the ocr output. 
+ std::string box_filename = TestDataNameToPath(target_box_file); + FILE* box_file = OpenBoxFile(box_filename.c_str()); + ASSERT_TRUE(box_file != nullptr); + int height = pixGetHeight(src_pix_); + ResultIterator* it = api_.GetIterator(); + do { + int left, top, right, bottom; + EXPECT_TRUE( + it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom)); + TBOX ocr_box(ICOORD(left, height - bottom), ICOORD(right, height - top)); + int line_number = 0; + TBOX truth_box; + STRING box_text; + EXPECT_TRUE( + ReadNextBox(0, &line_number, box_file, &box_text, &truth_box)); + // Testing for major overlap is a bit weak, but if they all + // major overlap successfully, then it has to be fairly close. + EXPECT_TRUE(ocr_box.major_overlap(truth_box)); + // Also check that the symbol text matches the box text. + char* symbol_text = it->GetUTF8Text(tesseract::RIL_SYMBOL); + EXPECT_STREQ(box_text.c_str(), symbol_text); + delete[] symbol_text; + } while (it->Next(tesseract::RIL_SYMBOL)); + delete it; + } + + Pix* src_pix_; + std::string ocr_text_; + tesseract::TessBaseAPI api_; +}; + +// Tests character-level applyboxes on normal Times New Roman. +TEST_F(ApplyBoxTest, TimesCharLevel) { + VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box", + false); +} + +// Tests character-level applyboxes on italic Times New Roman. +TEST_F(ApplyBoxTest, ItalicCharLevel) { + VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box", + false); +} + +// Tests line-level applyboxes on normal Times New Roman. +TEST_F(ApplyBoxTest, TimesLineLevel) { + VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine, + "trainingtimes.box", true); +} + +// Tests line-level applyboxes on italic Times New Roman. +TEST_F(ApplyBoxTest, ItalLineLevel) { + VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box", + true); +} + +} // namespace diff --git a/tesseract/unittest/baseapi_test.cc b/tesseract/unittest/baseapi_test.cc new file mode 100644 index 00000000..285172e3 --- /dev/null +++ b/tesseract/unittest/baseapi_test.cc @@ -0,0 +1,402 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "include_gunit.h" + +#include "cycletimer.h" // for CycleTimer +#include "log.h" // for LOG +#include "ocrblock.h" // for class BLOCK +#include "pageres.h" + +#include <tesseract/baseapi.h> + +#include "allheaders.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" +#include "gmock/gmock-matchers.h" + +#include <memory> +#include <regex> +#include <string> +#include <vector> + +namespace tesseract { + +using ::testing::ContainsRegex; +using ::testing::HasSubstr; + +static const char* langs[] = {"eng", "vie", "hin", "ara", nullptr}; +static const char* image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif", + "arabic.tif", nullptr}; +static const char* gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", + "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", + "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", + nullptr}; + +class FriendlyTessBaseAPI : public tesseract::TessBaseAPI { + FRIEND_TEST(TesseractTest, LSTMGeometryTest); +}; + +std::string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) { + tess->SetImage(pix); + char* result = tess->GetUTF8Text(); + std::string ocr_result = result; + delete[] result; + absl::StripAsciiWhitespace(&ocr_result); + return ocr_result; +} + +// The fixture for testing Tesseract. +class TesseractTest : public testing::Test { + protected: + static std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); + } + static std::string TessdataPath() { + return TESSDATA_DIR; + } +}; + +// Tests that Tesseract gets exactly the right answer on phototest. +TEST_F(TesseractTest, BasicTesseractTest) { + tesseract::TessBaseAPI api; + std::string truth_text; + std::string ocr_text; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) { + Pix* src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str()); + CHECK(src_pix); + ocr_text = GetCleanedTextResult(&api, src_pix); + CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"), + &truth_text, file::Defaults())); + absl::StripAsciiWhitespace(&truth_text); + EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str()); + pixDestroy(&src_pix); + } else { + // eng.traineddata not found. + GTEST_SKIP(); + } +} + +// Test that api.GetComponentImages() will return a set of images for +// paragraphs even if text recognition was not run. +TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) { + tesseract::TessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) { + api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK); + api.SetVariable("paragraph_debug_level", "3"); +#if 0 // TODO: b622.png is missing + Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + Boxa* para_boxes = + api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr); + EXPECT_TRUE(para_boxes != nullptr); + Boxa* block_boxes = + api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr); + EXPECT_TRUE(block_boxes != nullptr); + // TODO(eger): Get paragraphs out of this page pre-text. + EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes)); + boxaDestroy(&block_boxes); + boxaDestroy(¶_boxes); + pixDestroy(&src_pix); +#endif + } else { + // eng.traineddata not found. + GTEST_SKIP(); + } +} + +// We should get hOCR output and not seg fault, even if the api caller doesn't +// call SetInputName(). 
+TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) { + tesseract::TessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + char* result = api.GetHOCRText(0); + EXPECT_TRUE(result != nullptr); + EXPECT_THAT(result, HasSubstr("Hello")); + EXPECT_THAT(result, HasSubstr("<div class='ocr_page'")); + delete[] result; + pixDestroy(&src_pix); +} + +// hOCR output should contain baseline info for upright textlines. +TEST_F(TesseractTest, HOCRContainsBaseline) { + tesseract::TessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str()); + CHECK(src_pix); + api.SetInputName("HelloGoogle.tif"); + api.SetImage(src_pix); + char* result = api.GetHOCRText(0); + EXPECT_TRUE(result != nullptr); + EXPECT_THAT(result, HasSubstr("Hello")); + EXPECT_TRUE(std::regex_search(result, std::regex{ "<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+" })); + + delete[] result; + pixDestroy(&src_pix); +} + +// Tests that Tesseract gets exactly the right answer on some page numbers. +TEST_F(TesseractTest, AdaptToWordStrTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because TessBaseAPI::AdaptToWordStr is missing. + GTEST_SKIP(); +#else + static const char* kTrainingPages[] = { + "136.tif", "256.tif", "410.tif", "432.tif", "540.tif", + "692.tif", "779.tif", "793.tif", "808.tif", "815.tif", + "12.tif", "12.tif", nullptr}; + static const char* kTrainingText[] = { + "1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0", "6 9 2", "7 7 9", + "7 9 3", "8 0 8", "8 1 5", "1 2", "1 2", nullptr}; + static const char* kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr}; + static const char* kTestText[] = {"324", "433", "12", nullptr}; + tesseract::TessBaseAPI api; + std::string truth_text; + std::string ocr_text; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + api.SetVariable("matcher_sufficient_examples_for_prototyping", "1"); + api.SetVariable("classify_class_pruner_threshold", "220"); + // Train on the training text. + for (int i = 0; kTrainingPages[i] != nullptr; ++i) { + std::string image_file = TestDataNameToPath(kTrainingPages[i]); + Pix* src_pix = pixRead(image_file.c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + EXPECT_TRUE( + api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i])) + << "Failed to adapt to text \"" << kTrainingText[i] << "\" on image " + << image_file; + pixDestroy(&src_pix); + } + // Test the test text. + api.SetVariable("tess_bn_matching", "1"); + api.SetPageSegMode(tesseract::PSM_SINGLE_WORD); + for (int i = 0; kTestPages[i] != nullptr; ++i) { + Pix* src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str()); + CHECK(src_pix); + ocr_text = GetCleanedTextResult(&api, src_pix); + absl::StripAsciiWhitespace(&truth_text); + EXPECT_STREQ(kTestText[i], ocr_text.c_str()); + pixDestroy(&src_pix); + } +#endif +} + +// Tests that LSTM gets exactly the right answer on phototest. 
+TEST_F(TesseractTest, BasicLSTMTest) { + tesseract::TessBaseAPI api; + std::string truth_text; + std::string ocr_text; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + Pix* src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str()); + CHECK(src_pix); + ocr_text = GetCleanedTextResult(&api, src_pix); + CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"), + &truth_text, file::Defaults())); + absl::StripAsciiWhitespace(&truth_text); + EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str()); + pixDestroy(&src_pix); +} + +// Test that LSTM's character bounding boxes are properly converted to +// Tesseract structures. Note that we can't guarantee that LSTM's +// character boxes fall completely within Tesseract's word box because +// the baseline denormalization/normalization transforms may introduce +// errors due to float/int conversions (e.g., see OUTLINE::move() in +// ccstruct/poutline.h) Instead, we do a loose check. +TEST_F(TesseractTest, LSTMGeometryTest) { + Pix* src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str()); + FriendlyTessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + api.SetImage(src_pix); + ASSERT_EQ(api.Recognize(nullptr), 0); + + const PAGE_RES* page_res = api.GetPageRes(); + PAGE_RES_IT page_res_it(const_cast<PAGE_RES*>(page_res)); + page_res_it.restart_page(); + BLOCK* block = page_res_it.block()->block; + CHECK(block); + + // extract word and character boxes for each word + for (page_res_it.restart_page(); page_res_it.word() != nullptr; + page_res_it.forward()) { + WERD_RES* word = page_res_it.word(); + CHECK(word); + CHECK(word->best_choice); + CHECK_GT(word->best_choice->length(), 0); + CHECK(word->word); + CHECK(word->box_word); + // tesseract's word box + TBOX tess_blob_box; + tess_blob_box = word->word->bounding_box(); + tess_blob_box.rotate(block->re_rotation()); + // verify that each of LSTM's character boxes lies close to within + // tesseract's word box + for (int i = 0; i < word->box_word->length(); ++i) { + TBOX lstm_blob_box = word->box_word->BlobBox(i); + // LSTM character box should not spill out of tesseract word box + // by more than a few pixels in any direction + EXPECT_LT(tess_blob_box.left() - lstm_blob_box.left(), 5); + EXPECT_LT(lstm_blob_box.right() - tess_blob_box.right(), 5); + EXPECT_LT(tess_blob_box.bottom() - lstm_blob_box.bottom(), 5); + EXPECT_LT(lstm_blob_box.top() - tess_blob_box.top(), 5); + } + } + pixDestroy(&src_pix); +} + +TEST_F(TesseractTest, InitConfigOnlyTest) { + // Languages for testing initialization. + const char* langs[] = {"eng", "chi_tra", "jpn", "vie"}; + std::unique_ptr<tesseract::TessBaseAPI> api; + CycleTimer timer; + for (size_t i = 0; i < ARRAYSIZE(langs); ++i) { + api.reset(new tesseract::TessBaseAPI); + timer.Restart(); + EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], + tesseract::OEM_TESSERACT_ONLY)); + timer.Stop(); + LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() + << "ms in regular init"; + } + // Init variables to set for config-only initialization. 
+ std::vector<std::string> vars_vec, vars_values; + vars_vec.push_back("tessedit_init_config_only"); + vars_values.push_back("1"); + LOG(INFO) << "Switching to config only initialization:"; + for (size_t i = 0; i < ARRAYSIZE(langs); ++i) { + api.reset(new tesseract::TessBaseAPI); + timer.Restart(); + EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], + tesseract::OEM_TESSERACT_ONLY, nullptr, 0, &vars_vec, + &vars_values, false)); + timer.Stop(); + LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() + << "ms in config-only init"; + } +} + +// Tests if two instances of Tesseract/LSTM can co-exist in the same thread. +// NOTE: This is not an exhaustive test and current support for multiple +// instances in Tesseract is fragile. This test is intended largely as a means +// of detecting and guarding against the existing support being possibly broken +// by future CLs. TessBaseAPI instances are initialized using the default +// OEM_DEFAULT mode. +TEST(TesseractInstanceTest, TestMultipleTessInstances) { + int num_langs = 0; + while (langs[num_langs] != nullptr) ++num_langs; + + const std::string kTessdataPath = TESSDATA_DIR; + + // Preload images and verify that OCR is correct on them individually. + std::vector<Pix*> pix(num_langs); + for (int i = 0; i < num_langs; ++i) { + SCOPED_TRACE(absl::StrCat("Single instance test with lang = ", langs[i])); + std::string path = file::JoinPath(TESTING_DIR, image_files[i]); + pix[i] = pixRead(path.c_str()); + QCHECK(pix[i] != nullptr) << "Could not read " << path; + + tesseract::TessBaseAPI tess; + EXPECT_EQ(0, tess.Init(kTessdataPath.c_str(), langs[i])); + std::string ocr_result = GetCleanedTextResult(&tess, pix[i]); + EXPECT_STREQ(gt_text[i], ocr_result.c_str()); + } + + // Process the images in all pairwise combinations of associated languages. + std::string ocr_result[2]; + for (int i = 0; i < num_langs; ++i) { + for (int j = i + 1; j < num_langs; ++j) { + tesseract::TessBaseAPI tess1, tess2; + tess1.Init(kTessdataPath.c_str(), langs[i]); + tess2.Init(kTessdataPath.c_str(), langs[j]); + + ocr_result[0] = GetCleanedTextResult(&tess1, pix[i]); + ocr_result[1] = GetCleanedTextResult(&tess2, pix[j]); + + EXPECT_FALSE(strcmp(gt_text[i], ocr_result[0].c_str()) || + strcmp(gt_text[j], ocr_result[1].c_str())) + << "OCR failed on language pair " << langs[i] << "-" << langs[j]; + } + } + + for (int i = 0; i < num_langs; ++i) pixDestroy(&pix[i]); +} + +// Tests whether Tesseract parameters are correctly set for the two instances. +TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) { + std::string illegal_name = "an_illegal_name"; + std::string langs[2] = {"eng", "hin"}; + std::string int_param_name = "tessedit_pageseg_mode"; + int int_param[2] = {1, 2}; + std::string int_param_str[2] = {"1", "2"}; + std::string bool_param_name = "tessedit_ambigs_training"; + bool bool_param[2] = {false, true}; + std::string bool_param_str[2] = {"F", "T"}; + std::string str_param_name = "tessedit_char_blacklist"; + std::string str_param[2] = {"abc", "def"}; + std::string double_param_name = "segment_penalty_dict_frequent_word"; + std::string double_param_str[2] = {"0.01", "2"}; + double double_param[2] = {0.01, 2}; + + const std::string kTessdataPath = TESSDATA_DIR; + + tesseract::TessBaseAPI tess1, tess2; + for (int i = 0; i < 2; ++i) { + tesseract::TessBaseAPI* api = (i == 0) ? 
&tess1 : &tess2; + api->Init(kTessdataPath.c_str(), langs[i].c_str()); + api->SetVariable(illegal_name.c_str(), "none"); + api->SetVariable(int_param_name.c_str(), int_param_str[i].c_str()); + api->SetVariable(bool_param_name.c_str(), bool_param_str[i].c_str()); + api->SetVariable(str_param_name.c_str(), str_param[i].c_str()); + api->SetVariable(double_param_name.c_str(), double_param_str[i].c_str()); + } + for (int i = 0; i < 2; ++i) { + tesseract::TessBaseAPI* api = (i == 0) ? &tess1 : &tess2; + EXPECT_FALSE(api->GetStringVariable(illegal_name.c_str())); + int intvar; + EXPECT_TRUE(api->GetIntVariable(int_param_name.c_str(), &intvar)); + EXPECT_EQ(int_param[i], intvar); + bool boolvar; + EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar)); + EXPECT_EQ(bool_param[i], boolvar); + EXPECT_STREQ(str_param[i].c_str(), + api->GetStringVariable(str_param_name.c_str())); + double doublevar; + EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar)); + EXPECT_EQ(double_param[i], doublevar); + } +} + +} // namespace diff --git a/tesseract/unittest/baseapi_thread_test.cc b/tesseract/unittest/baseapi_thread_test.cc new file mode 100644 index 00000000..3608a748 --- /dev/null +++ b/tesseract/unittest/baseapi_thread_test.cc @@ -0,0 +1,229 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Unit test to run Tesseract instances in parallel threads and verify +// the OCR result. + +// Note that success of running this test as-is does NOT verify +// thread-safety. For that, you need to run this binary under TSAN using the +// associated baseapi_thread_test_with_tsan.sh script. +// +// The tests are partitioned by instance to allow running Tesseract/Cube/both +// and by stage to run initialization/recognition/both. See flag descriptions +// for details. + +#include <functional> +#include <memory> +#include <string> +#ifdef INCLUDE_TENSORFLOW +#include <tensorflow/core/lib/core/threadpool.h> +#endif +#include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace +#include "allheaders.h" +#include "include_gunit.h" +#include <tesseract/baseapi.h> +#include "commandlineflags.h" +#include "log.h" + +// Run with Tesseract instances. +BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances"); +// Run with Cube instances. +// Note that with TSAN, Cube typically takes much longer to test. Ignoring +// std::string operations using the associated tess_tsan.ignore file when +// testing Cube significantly reduces testing time. +BOOL_PARAM_FLAG(test_cube, true, "Test Cube instances"); + +// When used with TSAN, having more repetitions can help in finding hidden +// thread-safety violations at the expense of increased testing time. +INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run."); + +INT_PARAM_FLAG(max_concurrent_instances, 0, + "Maximum number of instances to run in parallel at any given " + "instant. 
The number of concurrent instances cannot exceed " + "reps * number_of_langs_tested, which is also the default value."); + +namespace tesseract { + +static const char* kTessLangs[] = {"eng", "vie", nullptr}; +static const char* kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr}; +static const char* kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", + nullptr}; + +static const char* kCubeLangs[] = {"hin", "ara", nullptr}; +static const char* kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr}; +static const char* kCubeTruthText[] = { + "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", + "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr}; + +class BaseapiThreadTest : public ::testing::Test { + protected: + static void SetUpTestCase() { + CHECK(FLAGS_test_tesseract || FLAGS_test_cube) + << "Need to test at least one of Tesseract/Cube!"; + // Form a list of langs/gt_text/image_files we will work with. + std::vector<std::string> image_files; + if (FLAGS_test_tesseract) { + int i = 0; + while (kTessLangs[i] && kTessTruthText[i] && kTessImages[i]) { + langs_.push_back(kTessLangs[i]); + gt_text_.push_back(kTessTruthText[i]); + image_files.push_back(kTessImages[i]); + ++i; + } + LOG(INFO) << "Testing Tesseract on " << i << " languages."; + } + if (FLAGS_test_cube) { + int i = 0; + while (kCubeLangs[i] && kCubeTruthText[i] && kCubeImages[i]) { + langs_.push_back(kCubeLangs[i]); + gt_text_.push_back(kCubeTruthText[i]); + image_files.push_back(kCubeImages[i]); + ++i; + } + LOG(INFO) << "Testing Cube on " << i << " languages."; + } + num_langs_ = langs_.size(); + + // Pre-load the images into an array. We will be making multiple copies of + // an image here if FLAGS_reps > 1 and that is intentional. In this test, we + // wish to not make any assumptions about the thread-safety of Pix objects, + // and so entirely disallow concurrent access of a Pix instance. + const int n = num_langs_ * FLAGS_reps; + for (int i = 0; i < n; ++i) { + std::string path = TESTING_DIR "/" + image_files[i % num_langs_]; + Pix* new_pix = pixRead(path.c_str()); + QCHECK(new_pix != nullptr) << "Could not read " << path; + pix_.push_back(new_pix); + } + +#ifdef INCLUDE_TENSORFLOW + pool_size_ = (FLAGS_max_concurrent_instances < 1) + ? num_langs_ * FLAGS_reps + : FLAGS_max_concurrent_instances; +#endif + } + + static void TearDownTestCase() { + for (auto& pix : pix_) { + pixDestroy(&pix); + } + } + +#ifdef INCLUDE_TENSORFLOW + void ResetPool() { + pool_.reset(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_)); + } + + void WaitForPoolWorkers() { pool_.reset(nullptr); } + + std::unique_ptr<tensorflow::thread::ThreadPool> pool_; + static int pool_size_; +#endif + static std::vector<Pix*> pix_; + static std::vector<std::string> langs_; + static std::vector<std::string> gt_text_; + static int num_langs_; +}; + +// static member variable declarations. 
+#ifdef INCLUDE_TENSORFLOW +int BaseapiThreadTest::pool_size_; +#endif +std::vector<Pix*> BaseapiThreadTest::pix_; +std::vector<std::string> BaseapiThreadTest::langs_; +std::vector<std::string> BaseapiThreadTest::gt_text_; +int BaseapiThreadTest::num_langs_; + +static void InitTessInstance(TessBaseAPI* tess, const std::string& lang) { + CHECK(tess != nullptr); + EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str())); +} + +static void GetCleanedText(TessBaseAPI* tess, Pix* pix, std::string* ocr_text) { + tess->SetImage(pix); + char* result = tess->GetUTF8Text(); + *ocr_text = result; + delete[] result; + absl::StripAsciiWhitespace(ocr_text); +} + +static void VerifyTextResult(TessBaseAPI* tess, Pix* pix, const std::string& lang, + const std::string& expected_text) { + TessBaseAPI* tess_local = nullptr; + if (tess) { + tess_local = tess; + } else { + tess_local = new TessBaseAPI; + InitTessInstance(tess_local, lang); + } + std::string ocr_text; + GetCleanedText(tess_local, pix, &ocr_text); + EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str()); + if (tess_local != tess) delete tess_local; +} + +// Check that Tesseract/Cube produce the correct results in single-threaded +// operation. If not, it is pointless to run the real multi-threaded tests. +TEST_F(BaseapiThreadTest, TestBasicSanity) { + for (int i = 0; i < num_langs_; ++i) { + TessBaseAPI tess; + InitTessInstance(&tess, langs_[i]); + std::string ocr_text; + GetCleanedText(&tess, pix_[i], &ocr_text); + CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) + << "Failed with lang = " << langs_[i]; + } +} + +// Test concurrent instance initialization. +TEST_F(BaseapiThreadTest, TestInit) { +#ifdef INCLUDE_TENSORFLOW + const int n = num_langs_ * FLAGS_reps; + ResetPool(); + std::vector<TessBaseAPI> tess(n); + for (int i = 0; i < n; ++i) { + pool_->Schedule(std::bind(InitTessInstance, &tess[i], langs_[i % num_langs_])); + } + WaitForPoolWorkers(); +#endif +} + +// Test concurrent recognition. +TEST_F(BaseapiThreadTest, TestRecognition) { +#ifdef INCLUDE_TENSORFLOW + const int n = num_langs_ * FLAGS_reps; + std::vector<TessBaseAPI> tess(n); + // Initialize api instances in a single thread. + for (int i = 0; i < n; ++i) { + InitTessInstance(&tess[i], langs_[i % num_langs_]); + } + + ResetPool(); + for (int i = 0; i < n; ++i) { + pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], + langs_[i % num_langs_], gt_text_[i % num_langs_])); + } + WaitForPoolWorkers(); +#endif +} + +TEST_F(BaseapiThreadTest, TestAll) { +#ifdef INCLUDE_TENSORFLOW + const int n = num_langs_ * FLAGS_reps; + ResetPool(); + for (int i = 0; i < n; ++i) { + pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], + langs_[i % num_langs_], gt_text_[i % num_langs_])); + } + WaitForPoolWorkers(); +#endif +} +} // namespace diff --git a/tesseract/unittest/bitvector_test.cc b/tesseract/unittest/bitvector_test.cc new file mode 100644 index 00000000..9be718a0 --- /dev/null +++ b/tesseract/unittest/bitvector_test.cc @@ -0,0 +1,166 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cmath> +#include <cstdio> +#include <string> + +#include "bitvector.h" + +#include "include_gunit.h" + +const int kPrimeLimit = 1000; + +namespace tesseract { + +class BitVectorTest : public testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + public: + std::string OutputNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes. + void ComputePrimes(BitVector* map) { + map->Init(kPrimeLimit + 1); + TestAll(*map, false); + map->SetBit(2); + // Set all the odds to true. + for (int i = 3; i <= kPrimeLimit; i += 2) map->SetValue(i, true); + int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit)); + for (int f = 3; f <= factor_limit; f += 2) { + if (map->At(f)) { + for (int m = 2; m * f <= kPrimeLimit; ++m) map->ResetBit(f * m); + } + } + } + + void TestPrimes(const BitVector& map) { + // Now all primes in the vector are true, and all others false. + // According to Wikipedia, there are 168 primes under 1000, the last + // of which is 997. + int total_primes = 0; + for (int i = 0; i <= kPrimeLimit; ++i) { + if (map[i]) ++total_primes; + } + EXPECT_EQ(168, total_primes); + EXPECT_TRUE(map[997]); + EXPECT_FALSE(map[998]); + EXPECT_FALSE(map[999]); + } + // Test that all bits in the vector have the given value. + void TestAll(const BitVector& map, bool value) { + for (int i = 0; i < map.size(); ++i) { + EXPECT_EQ(value, map[i]); + } + } + + // Sets up a BitVector with bit patterns for byte values in + // [start_byte, end_byte) positioned every spacing bytes (for spacing >= 1) + // with spacing-1 zero bytes in between the pattern bytes. + void SetBitPattern(int start_byte, int end_byte, int spacing, BitVector* bv) { + bv->Init((end_byte - start_byte) * 8 * spacing); + for (int byte_value = start_byte; byte_value < end_byte; ++byte_value) { + for (int bit = 0; bit < 8; ++bit) { + if (byte_value & (1 << bit)) + bv->SetBit((byte_value - start_byte) * 8 * spacing + bit); + } + } + } + + // Expects that every return from NextSetBit is really set and that all others + // are really not set. Checks the return from NumSetBits also. + void ExpectCorrectBits(const BitVector& bv) { + int bit_index = -1; + int prev_bit_index = -1; + int num_bits_tested = 0; + while ((bit_index = bv.NextSetBit(bit_index)) >= 0) { + EXPECT_LT(bit_index, bv.size()); + // All bits in between must be 0. + for (int i = prev_bit_index + 1; i < bit_index; ++i) { + EXPECT_EQ(0, bv[i]) << "i = " << i << " prev = " << prev_bit_index; + } + // This bit must be 1. + EXPECT_EQ(1, bv[bit_index]) << "Bit index = " << bit_index; + ++num_bits_tested; + prev_bit_index = bit_index; + } + // Check the bits between the last and the end. + for (int i = prev_bit_index + 1; i < bv.size(); ++i) { + EXPECT_EQ(0, bv[i]); + } + EXPECT_EQ(num_bits_tested, bv.NumSetBits()); + } +}; + +// Tests the sieve of Eratosthenes as a way of testing set/reset and I/O. +TEST_F(BitVectorTest, Primes) { + BitVector map; + ComputePrimes(&map); + TestPrimes(map); + // It still works if we use the copy constructor. + BitVector map2(map); + TestPrimes(map2); + // Or if we assign it. + BitVector map3; + map3 = map; + TestPrimes(map3); + // Test file i/o too. 
+ std::string filename = OutputNameToPath("primesbitvector"); + FILE* fp = fopen(filename.c_str(), "wb"); + ASSERT_TRUE(fp != nullptr); + EXPECT_TRUE(map.Serialize(fp)); + fclose(fp); + fp = fopen(filename.c_str(), "rb"); + ASSERT_TRUE(fp != nullptr); + BitVector read_map; + EXPECT_TRUE(read_map.DeSerialize(false, fp)); + fclose(fp); + TestPrimes(read_map); +} + +// Tests the many-to-one setup feature. +TEST_F(BitVectorTest, SetAll) { + // Test the default constructor and set/resetall. + BitVector map(42); + TestAll(map, false); + map.SetAllTrue(); + TestAll(map, true); + map.SetAllFalse(); + TestAll(map, false); +} + +// Tests the values in the tables offset_table_, next_table_, hamming_table_ +// by setting all possible byte patterns and verifying that the NextSetBit and +// NumSetBits functions return the correct values. +TEST_F(BitVectorTest, TestNextSetBit) { + BitVector bv; + for (int spacing = 1; spacing <= 5; ++spacing) { + SetBitPattern(0, 256, spacing, &bv); + ExpectCorrectBits(bv); + } +} + +// Tests the values in hamming_table_ more thoroughly by setting single byte +// patterns for each byte individually. +TEST_F(BitVectorTest, TestNumSetBits) { + BitVector bv; + for (int byte = 0; byte < 256; ++byte) { + SetBitPattern(byte, byte + 1, 1, &bv); + ExpectCorrectBits(bv); + } +} + +} // namespace. diff --git a/tesseract/unittest/capiexample_c_test.c b/tesseract/unittest/capiexample_c_test.c new file mode 100644 index 00000000..5917f0c4 --- /dev/null +++ b/tesseract/unittest/capiexample_c_test.c @@ -0,0 +1,21 @@ +/////////////////////////////////////////////////////////////////////// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Verifies that C is able to include capi header. +#include <tesseract/capi.h> + +// Verifies that the libtesseract library has C API symbols. +int main() +{ + printf("%s\n", TessVersion()); + return 0; +} diff --git a/tesseract/unittest/capiexample_test.cc b/tesseract/unittest/capiexample_test.cc new file mode 100644 index 00000000..3c843056 --- /dev/null +++ b/tesseract/unittest/capiexample_test.cc @@ -0,0 +1,19 @@ +/////////////////////////////////////////////////////////////////////// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Verifies that C++ is able to include capi header. +#include <tesseract/capi.h> + +#include <gtest/gtest.h> + +// Verifies that the libtesseract library has C API symbols. 
+TEST(C, VersionTest) { TessVersion(); } diff --git a/tesseract/unittest/cleanapi_test.cc b/tesseract/unittest/cleanapi_test.cc new file mode 100644 index 00000000..4d284af0 --- /dev/null +++ b/tesseract/unittest/cleanapi_test.cc @@ -0,0 +1,28 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <tesseract/baseapi.h> + +// Dummy enum in the global namespace that checks for collision with awkward +// names. +// If this test fails to compile, clean up the includes in tesseract/baseapi.h! +// They are not supposed to drag in definitions of any of the tesseract +// types included in this enum! +enum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD }; + +#include "gtest/gtest.h" + +namespace tesseract { + +// Verifies that the global namespace is clean. +TEST(CleanNamespaceTess, DummyTest) { tesseract::TessBaseAPI api; } + +} // namespace. diff --git a/tesseract/unittest/colpartition_test.cc b/tesseract/unittest/colpartition_test.cc new file mode 100644 index 00000000..caebe605 --- /dev/null +++ b/tesseract/unittest/colpartition_test.cc @@ -0,0 +1,76 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "colpartition.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TestableColPartition : public ColPartition { + public: + void SetColumnRange(int first, int last) { + set_first_column(first); + set_last_column(last); + } +}; + +class ColPartitionTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + void TearDown() {} +}; + +TEST_F(ColPartitionTest, IsInSameColumnAsReflexive) { + TestableColPartition a, b; + a.SetColumnRange(1, 2); + b.SetColumnRange(3, 3); + + EXPECT_TRUE(a.IsInSameColumnAs(a)); + EXPECT_TRUE(b.IsInSameColumnAs(b)); +} + +TEST_F(ColPartitionTest, IsInSameColumnAsBorders) { + TestableColPartition a, b, c, d; + a.SetColumnRange(0, 1); + b.SetColumnRange(1, 2); + c.SetColumnRange(2, 3); + d.SetColumnRange(4, 5); + + EXPECT_TRUE(a.IsInSameColumnAs(b)); + EXPECT_TRUE(b.IsInSameColumnAs(a)); + EXPECT_FALSE(c.IsInSameColumnAs(d)); + EXPECT_FALSE(d.IsInSameColumnAs(c)); + EXPECT_FALSE(a.IsInSameColumnAs(d)); +} + +TEST_F(ColPartitionTest, IsInSameColumnAsSuperset) { + TestableColPartition a, b; + a.SetColumnRange(4, 7); + b.SetColumnRange(2, 8); + + EXPECT_TRUE(a.IsInSameColumnAs(b)); + EXPECT_TRUE(b.IsInSameColumnAs(a)); +} + +TEST_F(ColPartitionTest, IsInSameColumnAsPartialOverlap) { + TestableColPartition a, b; + a.SetColumnRange(3, 8); + b.SetColumnRange(6, 10); + + EXPECT_TRUE(a.IsInSameColumnAs(b)); + EXPECT_TRUE(b.IsInSameColumnAs(a)); +} + +} // namespace diff --git a/tesseract/unittest/commandlineflags_test.cc b/tesseract/unittest/commandlineflags_test.cc new file mode 100644 index 00000000..7b16fbdd --- /dev/null +++ b/tesseract/unittest/commandlineflags_test.cc @@ -0,0 +1,158 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "commandlineflags.h" + +#include "include_gunit.h" + +// Flags used for testing parser. +INT_PARAM_FLAG(foo_int, 0, "Integer flag for testing"); +INT_PARAM_FLAG(bar_int, 0, "Integer flag for testing"); +DOUBLE_PARAM_FLAG(foo_double, 0.1, "Double flag for testing"); +DOUBLE_PARAM_FLAG(bar_double, 0.2, "Double flag for testing"); +STRING_PARAM_FLAG(foo_string, "foo", "String flag for testing"); +STRING_PARAM_FLAG(bar_string, "bar", "String flag for testing"); +BOOL_PARAM_FLAG(foo_bool, false, "Bool flag for testing"); +BOOL_PARAM_FLAG(bar_bool, false, "Bool flag for testing"); +// A flag whose name is a single character, tested for backward +// compatibility. This should be selected to not conflict with existing flags +// in commontraining.cpp. +STRING_PARAM_FLAG(q, "", "Single character name"); + +namespace tesseract { + +class CommandlineflagsTest : public ::testing::Test { + protected: + void TestParser(int argc, const char** const_argv) { + TestParser("", argc, const_argv); + } + void TestParser(const char* usage, int argc, const char** const_argv) { + // Make a copy of the pointer since it can be altered by the function. 
+ char** argv = const_cast<char**>(const_argv); + tesseract::ParseCommandLineFlags(usage, &argc, &argv, true); + } +}; + +TEST_F(CommandlineflagsTest, RemoveFlags) { + const char* const_argv[] = {"Progname", "--foo_int", "3", "file1.h", + "file2.h"}; + int argc = ARRAYSIZE(const_argv); + char** argv = const_cast<char**>(const_argv); + tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); + + // argv should be rearranged to look like { "Progname", "file1.h", "file2.h" } + EXPECT_EQ(3, argc); + EXPECT_STREQ("Progname", argv[0]); + EXPECT_STREQ("file1.h", argv[1]); + EXPECT_STREQ("file2.h", argv[2]); +} + +#if 0 // TODO: this test needs an update (it currently fails). +TEST_F(CommandlineflagsTest, PrintUsageAndExit) { + const char* argv[] = { "Progname", "--help" }; + EXPECT_EXIT(TestParser("Progname [flags]", ARRAYSIZE(argv), argv), + ::testing::ExitedWithCode(0), + "USAGE: Progname \\[flags\\]"); +} +#endif + +TEST_F(CommandlineflagsTest, ExitsWithErrorOnInvalidFlag) { + const char* argv[] = {"", "--test_nonexistent_flag"}; + EXPECT_EXIT(TestParser(ARRAYSIZE(argv), argv), ::testing::ExitedWithCode(1), + "ERROR: Non-existent flag"); +} + +TEST_F(CommandlineflagsTest, ParseIntegerFlags) { + const char* argv[] = {"", "--foo_int=3", "--bar_int", "-4"}; + TestParser(ARRAYSIZE(argv), argv); + EXPECT_EQ(3, FLAGS_foo_int); + EXPECT_EQ(-4, FLAGS_bar_int); + + const char* arg_no_value[] = {"", "--bar_int"}; + EXPECT_EXIT(TestParser(ARRAYSIZE(arg_no_value), arg_no_value), + ::testing::ExitedWithCode(1), "ERROR"); + + const char* arg_invalid_value[] = {"", "--bar_int", "--foo_int=3"}; + EXPECT_EXIT(TestParser(ARRAYSIZE(arg_invalid_value), arg_invalid_value), + ::testing::ExitedWithCode(1), "ERROR"); + + const char* arg_bad_format[] = {"", "--bar_int="}; + EXPECT_EXIT(TestParser(ARRAYSIZE(arg_bad_format), arg_bad_format), + ::testing::ExitedWithCode(1), "ERROR"); +} + +TEST_F(CommandlineflagsTest, ParseDoubleFlags) { + const char* argv[] = {"", "--foo_double=3.14", "--bar_double", "1.2"}; + TestParser(ARRAYSIZE(argv), argv); + + EXPECT_EQ(3.14, FLAGS_foo_double); + EXPECT_EQ(1.2, FLAGS_bar_double); + + const char* arg_no_value[] = {"", "--bar_double"}; + EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), + "ERROR"); + + const char* arg_bad_format[] = {"", "--bar_double="}; + EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1), + "ERROR"); +} + +TEST_F(CommandlineflagsTest, ParseStringFlags) { + const char* argv[] = {"", "--foo_string=abc", "--bar_string", "def"}; + TestParser(ARRAYSIZE(argv), argv); + + EXPECT_STREQ("abc", FLAGS_foo_string.c_str()); + EXPECT_STREQ("def", FLAGS_bar_string.c_str()); + + const char* arg_no_value[] = {"", "--bar_string"}; + EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), + "ERROR"); + + FLAGS_bar_string.set_value("bar"); + const char* arg_empty_string[] = {"", "--bar_string="}; + TestParser(2, arg_empty_string); + EXPECT_STREQ("", FLAGS_bar_string.c_str()); +} + +TEST_F(CommandlineflagsTest, ParseBoolFlags) { + const char* argv[] = {"", "--foo_bool=true", "--bar_bool=1"}; + FLAGS_foo_bool.set_value(false); + FLAGS_bar_bool.set_value(false); + TestParser(ARRAYSIZE(argv), argv); + // Verify changed value + EXPECT_TRUE(FLAGS_foo_bool); + EXPECT_TRUE(FLAGS_bar_bool); + + const char* inv_argv[] = {"", "--foo_bool=false", "--bar_bool=0"}; + FLAGS_foo_bool.set_value(true); + FLAGS_bar_bool.set_value(true); + TestParser(3, inv_argv); + // Verify changed value + EXPECT_FALSE(FLAGS_foo_bool); + 
EXPECT_FALSE(FLAGS_bar_bool); + + const char* arg_implied_true[] = {"", "--bar_bool"}; + FLAGS_bar_bool.set_value(false); + TestParser(2, arg_implied_true); + EXPECT_TRUE(FLAGS_bar_bool); + + const char* arg_missing_val[] = {"", "--bar_bool="}; + EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1), + "ERROR"); +} + +TEST_F(CommandlineflagsTest, ParseOldFlags) { + EXPECT_STREQ("", FLAGS_q.c_str()); + const char* argv[] = {"", "-q", "text"}; + TestParser(ARRAYSIZE(argv), argv); + EXPECT_STREQ("text", FLAGS_q.c_str()); +} +} // namespace diff --git a/tesseract/unittest/cycletimer.h b/tesseract/unittest/cycletimer.h new file mode 100644 index 00000000..e1a13719 --- /dev/null +++ b/tesseract/unittest/cycletimer.h @@ -0,0 +1,61 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// Portability include to match the Google test environment. + +#ifndef TESSERACT_UNITTEST_CYCLETIMER_H +#define TESSERACT_UNITTEST_CYCLETIMER_H + +#include "absl/time/clock.h" // for GetCurrentTimeNanos + +// See https://github.com/google/or-tools/blob/master/ortools/base/timer.h +class CycleTimer { +public: + CycleTimer() { + Reset(); + } + + void Reset() { + running_ = false; + sum_ = 0; + start_ = 0; + } + + // When Start() is called multiple times, only the most recent is used. + void Start() { + running_ = true; + start_ = absl::GetCurrentTimeNanos(); + } + + void Restart() { + sum_ = 0; + Start(); + } + + void Stop() { + if (running_) { + sum_ += absl::GetCurrentTimeNanos() - start_; + running_ = false; + } + } + int64_t GetInMs() const { return GetNanos() / 1000000; } + + protected: + int64_t GetNanos() const { + return running_ ? absl::GetCurrentTimeNanos() - start_ + sum_ : sum_; + } + + private: + bool running_; + int64_t start_; + int64_t sum_; +}; + +#endif // TESSERACT_UNITTEST_CYCLETIMER_H diff --git a/tesseract/unittest/dawg_test.cc b/tesseract/unittest/dawg_test.cc new file mode 100644 index 00000000..4a40b050 --- /dev/null +++ b/tesseract/unittest/dawg_test.cc @@ -0,0 +1,115 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "include_gunit.h" + +#include "ratngs.h" +#include "unicharset.h" +#include "trie.h" + +#include <cstdlib> // for system +#include <fstream> // for ifstream +#include <set> +#include <string> +#include <vector> +#include <sys/stat.h> + +#ifndef SW_TESTING +#define wordlist2dawg_prog "wordlist2dawg" +#define dawg2wordlist_prog "dawg2wordlist" +#endif + +namespace tesseract { + +// Test some basic functionality dealing with Dawgs (compressed dictionaries, +// aka Directed Acyclic Word Graphs). +class DawgTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + void LoadWordlist(const std::string& filename, std::set<std::string>* words) const { + std::ifstream file(filename); + if (file.is_open()) { + std::string line; + while (getline(file, line)) { + // Remove trailing line terminators from line. + while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) { + line.resize(line.size() - 1); + } + // Add line to set. + words->insert(line.c_str()); + } + file.close(); + } + } + std::string TessBinaryPath(const std::string& name) const { + return file::JoinPath(TESSBIN_DIR, name); + } + std::string OutputNameToPath(const std::string& name) const { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + int RunCommand(const std::string& program, const std::string& arg1, + const std::string& arg2, const std::string& arg3) const { + std::string cmdline = + TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3; + return system(cmdline.c_str()); + } + // Test that we are able to convert a wordlist file (one "word" per line) to + // a dawg (a compressed format) and then extract the original wordlist back + // out using the tools "wordlist2dawg" and "dawg2wordlist." + void TestDawgRoundTrip(const std::string& unicharset_filename, + const std::string& wordlist_filename) const { + std::set<std::string> orig_words, roundtrip_words; + std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename); + std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename); + std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg"); + std::string output_wordlist = OutputNameToPath(wordlist_filename); + LoadWordlist(orig_wordlist, &orig_words); + EXPECT_EQ( + RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0); + EXPECT_EQ( + RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), + 0); + LoadWordlist(output_wordlist, &roundtrip_words); + EXPECT_EQ(orig_words, roundtrip_words); + } +}; + +TEST_F(DawgTest, TestDawgConversion) { + TestDawgRoundTrip("eng.unicharset", "eng.wordlist.clean.freq"); +} + +TEST_F(DawgTest, TestMatching) { + UNICHARSET unicharset; + unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str()); + tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, + unicharset.size(), 0); + WERD_CHOICE space_apos(" '", unicharset); + trie.add_word_to_dawg(space_apos); + + WERD_CHOICE space(" ", unicharset); + + // partial match ok - then good! + EXPECT_TRUE(trie.prefix_in_dawg(space, false)); + // require complete match - not present. 
+ EXPECT_FALSE(trie.word_in_dawg(space)); + EXPECT_FALSE(trie.prefix_in_dawg(space, true)); + + // partial or complete match ok for full word: + EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false)); + EXPECT_TRUE(trie.word_in_dawg(space_apos)); + EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true)); +} + +} // namespace diff --git a/tesseract/unittest/denorm_test.cc b/tesseract/unittest/denorm_test.cc new file mode 100644 index 00000000..28328b15 --- /dev/null +++ b/tesseract/unittest/denorm_test.cc @@ -0,0 +1,99 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "blobs.h" +#include "normalis.h" + +#include "include_gunit.h" + +namespace tesseract { + +class DENORMTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + void TearDown() {} + + void ExpectCorrectTransform(const DENORM& denorm, const TPOINT& src, + const TPOINT& result, bool local) { + TPOINT normed; + if (local) + denorm.LocalNormTransform(src, &normed); + else + denorm.NormTransform(nullptr, src, &normed); + EXPECT_EQ(result.x, normed.x); + EXPECT_EQ(result.y, normed.y); + // Now undo + TPOINT denormed; + if (local) + denorm.LocalDenormTransform(normed, &denormed); + else + denorm.DenormTransform(nullptr, normed, &denormed); + EXPECT_EQ(src.x, denormed.x); + EXPECT_EQ(src.y, denormed.y); + } +}; + +// Tests a simple baseline-style normalization. +TEST_F(DENORMTest, NoRotations) { + DENORM denorm; + denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, + 0.0f, static_cast<float>(kBlnBaselineOffset)); + TPOINT pt1(1100, 2000); + TPOINT result1(200, kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt1, result1, true); + ExpectCorrectTransform(denorm, pt1, result1, false); + TPOINT pt2(900, 2100); + TPOINT result2(-200, 300 + kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt2, result2, true); + ExpectCorrectTransform(denorm, pt2, result2, false); +} + +// Tests a simple baseline-style normalization with a rotation. +TEST_F(DENORMTest, WithRotations) { + DENORM denorm; + FCOORD rotation90(0.0f, 1.0f); + denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f, + 3.0f, 0.0f, static_cast<float>(kBlnBaselineOffset)); + + TPOINT pt1(1100, 2000); + TPOINT result1(0, 200 + kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt1, result1, true); + ExpectCorrectTransform(denorm, pt1, result1, false); + TPOINT pt2(900, 2100); + TPOINT result2(-300, kBlnBaselineOffset - 200); + ExpectCorrectTransform(denorm, pt2, result2, true); + ExpectCorrectTransform(denorm, pt2, result2, false); +} + +// Tests a simple baseline-style normalization with a second rotation & scale. 
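+// A rough walk-through of the expected arithmetic, assuming kBlnBaselineOffset == 64
+// (its value in blobs.h): denorm maps pt1 = (1050, 2000) to
+// ((1050 - 1000) * 2, (2000 - 2000) * 3 + 64) = (100, 64) = result1. denorm2 then
+// translates by (-128, -128) and scales by (0.5, 0.25), giving (-14, -16), and the
+// 90 degree rotation maps (x, y) to (-y, x), so the final point is (16, -14),
+// i.e. (kBlnBaselineOffset / 4, -14) = result2.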
+TEST_F(DENORMTest, Multiple) { + DENORM denorm; + denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, + 0.0f, static_cast<float>(kBlnBaselineOffset)); + + DENORM denorm2; + FCOORD rotation90(0.0f, 1.0f); + denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f, + 0.25f, 0.0f, 0.0f); + TPOINT pt1(1050, 2000); + TPOINT result1(100, kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt1, result1, true); + ExpectCorrectTransform(denorm, pt1, result1, false); + TPOINT result2(kBlnBaselineOffset / 4, -14); + ExpectCorrectTransform(denorm2, result1, result2, true); + ExpectCorrectTransform(denorm2, pt1, result2, false); +} + +} // namespace. diff --git a/tesseract/unittest/doubleptr.h b/tesseract/unittest/doubleptr.h new file mode 100644 index 00000000..38628b5f --- /dev/null +++ b/tesseract/unittest/doubleptr.h @@ -0,0 +1,93 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +/////////////////////////////////////////////////////////////////////// +// File: doubleptr.h +// Description: Double-ended pointer that keeps pointing correctly even +// when reallocated or copied. +// Author: Ray Smith +// Created: Wed Mar 14 12:22:57 PDT 2012 +// +// (C) Copyright 2012, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCUTIL_DOUBLEPTR_H_ +#define TESSERACT_CCUTIL_DOUBLEPTR_H_ + +#include "errcode.h" + +namespace tesseract { + +// A smart pointer class that implements a double-ended pointer. Each end +// points to the other end. The copy constructor and operator= have MOVE +// semantics, meaning that the relationship with the other end moves to the +// destination of the copy, leaving the source unattached. +// For this reason both the copy constructor and the operator= take a non-const +// reference argument, and the const reference versions cannot be used. +// DoublePtr is useful to incorporate into structures that are part of a +// collection such as GenericVector or STL containers, where reallocs can +// relocate the members. DoublePtr is also useful in a GenericHeap, where it +// can correctly maintain the pointer to an element of the heap despite it +// getting moved around on the heap. +class DoublePtr { + public: + DoublePtr() : other_end_(nullptr) {} + // Copy constructor steals the partner off src and is therefore a non + // const reference arg. + // Copying a const DoublePtr generates a compiler error. + DoublePtr(const DoublePtr& src) { + other_end_ = src.other_end_; + if (other_end_ != nullptr) { + other_end_->other_end_ = this; + ((DoublePtr&)src).other_end_ = nullptr; + } + } + // Operator= steals the partner off src, and therefore needs src to be a non- + // const reference. + // Assigning from a const DoublePtr generates a compiler error. 
+ void operator=(const DoublePtr& src) { + Disconnect(); + other_end_ = src.other_end_; + if (other_end_ != nullptr) { + other_end_->other_end_ = this; + ((DoublePtr&)src).other_end_ = nullptr; + } + } + + // Connects this and other, discarding any existing connections. + void Connect(DoublePtr* other) { + other->Disconnect(); + Disconnect(); + other->other_end_ = this; + other_end_ = other; + } + // Disconnects this and other, making OtherEnd() return nullptr for both. + void Disconnect() { + if (other_end_ != nullptr) { + other_end_->other_end_ = nullptr; + other_end_ = nullptr; + } + } + // Returns the pointer to the other end of the double pointer. + DoublePtr* OtherEnd() const { + return other_end_; + } + + private: + // Pointer to the other end of the link. It is always true that either + // other_end_ == nullptr or other_end_->other_end_ == this. + DoublePtr* other_end_; +}; + +} // namespace tesseract. + +#endif // THIRD_PARTY_TESSERACT_CCUTIL_DOUBLEPTR_H_ diff --git a/tesseract/unittest/equationdetect_test.cc b/tesseract/unittest/equationdetect_test.cc new file mode 100644 index 00000000..eb52231e --- /dev/null +++ b/tesseract/unittest/equationdetect_test.cc @@ -0,0 +1,549 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" + +#include "colpartitiongrid.h" +#include "equationdetect.h" +#include "tesseractclass.h" + +#include "allheaders.h" + +#include <memory> +#include <string> +#include <utility> + +#define ENABLE_IdentifySpecialText_TEST 0 +#if ENABLE_IdentifySpecialText_TEST +#define EQU_TRAINEDDATA_NAME "equ" +#else +#define EQU_TRAINEDDATA_NAME "equINTENTIONALLY_MISSING_FILE" +#endif + +namespace tesseract { + +class TestableEquationDetect : public EquationDetect { + public: + TestableEquationDetect(const char* tessdata, Tesseract* lang_tesseract) + : EquationDetect(tessdata, EQU_TRAINEDDATA_NAME) { + SetLangTesseract(lang_tesseract); + } + + // Insert a certain math and digit blobs into part. + void AddMathDigitBlobs(const int math_blobs, const int digit_blobs, + const int total_blobs, ColPartition* part) { + CHECK(part != nullptr); + CHECK_LE(math_blobs + digit_blobs, total_blobs); + int count = 0; + for (int i = 0; i < math_blobs; i++, count++) { + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_special_text_type(BSTT_MATH); + part->AddBox(blob); + } + for (int i = 0; i < digit_blobs; i++, count++) { + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_special_text_type(BSTT_DIGIT); + part->AddBox(blob); + } + for (int i = count; i < total_blobs; i++) { + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_special_text_type(BSTT_NONE); + part->AddBox(blob); + } + } + + // Set up pix_binary for lang_tesseract_. 
+ void SetPixBinary(Pix* pix) { + CHECK_EQ(1, pixGetDepth(pix)); + *(lang_tesseract_->mutable_pix_binary()) = pix; + } + + void RunIdentifySpecialText(BLOBNBOX* blob, const int height_th) { + IdentifySpecialText(blob, height_th); + } + + BlobSpecialTextType RunEstimateTypeForUnichar(const char* val) { + const UNICHARSET& unicharset = lang_tesseract_->unicharset; + return EstimateTypeForUnichar(unicharset, unicharset.unichar_to_id(val)); + } + + EquationDetect::IndentType RunIsIndented(ColPartitionGrid* part_grid, + ColPartition* part) { + this->part_grid_ = part_grid; + return IsIndented(part); + } + + bool RunIsNearSmallNeighbor(const TBOX& seed_box, const TBOX& part_box) { + return IsNearSmallNeighbor(seed_box, part_box); + } + + bool RunCheckSeedBlobsCount(ColPartition* part) { + return CheckSeedBlobsCount(part); + } + + float RunComputeForegroundDensity(const TBOX& tbox) { + return ComputeForegroundDensity(tbox); + } + + int RunCountAlignment(const GenericVector<int>& sorted_vec, const int val) { + return CountAlignment(sorted_vec, val); + } + + void RunSplitCPHorLite(ColPartition* part, + GenericVector<TBOX>* splitted_boxes) { + SplitCPHorLite(part, splitted_boxes); + } + + void RunSplitCPHor(ColPartition* part, + GenericVector<ColPartition*>* parts_splitted) { + SplitCPHor(part, parts_splitted); + } + + void TestComputeCPsSuperBBox(const TBOX& box, ColPartitionGrid* part_grid) { + CHECK(part_grid != nullptr); + part_grid_ = part_grid; + ComputeCPsSuperBBox(); + EXPECT_TRUE(*cps_super_bbox_ == box); + } +}; + +class EquationFinderTest : public testing::Test { + protected: + std::unique_ptr<TestableEquationDetect> equation_det_; + std::unique_ptr<Tesseract> tesseract_; + + // The directory for testdata; + std::string testdata_dir_; + + void SetUp() { + std::locale::global(std::locale("")); + tesseract_.reset(new Tesseract()); + tesseract_->init_tesseract(TESSDATA_DIR, "eng", OEM_TESSERACT_ONLY); + tesseract_->set_source_resolution(300); + equation_det_.reset( + new TestableEquationDetect(TESSDATA_DIR, tesseract_.get())); + equation_det_->SetResolution(300); + + testdata_dir_ = TESTDATA_DIR; + } + + void TearDown() { + tesseract_.reset(nullptr); + equation_det_.reset(nullptr); + } + + // Add a BLOCK covering the whole page. + void AddPageBlock(Pix* pix, BLOCK_LIST* blocks) { + CHECK(pix != nullptr); + CHECK(blocks != nullptr); + BLOCK_IT block_it(blocks); + BLOCK* block = + new BLOCK("", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix)); + block_it.add_to_end(block); + } + + // Create col partitions, add into part_grid, and put them into all_parts. + void CreateColParts(const int rows, const int cols, + ColPartitionGrid* part_grid, + std::vector<ColPartition*>* all_parts) { + const int kWidth = 10, kHeight = 10; + ClearParts(all_parts); + for (int y = 0; y < rows; ++y) { + for (int x = 0; x < cols; ++x) { + int left = x * kWidth * 2, bottom = y * kHeight * 2; + TBOX box(left, bottom, left + kWidth, bottom + kHeight); + ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, + BRT_TEXT, BTFT_NONE); + part_grid->InsertBBox(true, true, part); + all_parts->push_back(part); + } + } + } + + void ClearParts(std::vector<ColPartition*>* all_parts) { + for (size_t i = 0; i < all_parts->size(); ++i) { + (*all_parts)[i]->DeleteBoxes(); + delete ((*all_parts)[i]); + } + } + + // Create a BLOBNBOX object with bounding box tbox, and add it into part. 
+ void AddBlobIntoPart(const TBOX& tbox, ColPartition* part) { + CHECK(part != nullptr); + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_bounding_box(tbox); + part->AddBox(blob); + } +}; + +TEST_F(EquationFinderTest, IdentifySpecialText) { +#if !ENABLE_IdentifySpecialText_TEST + GTEST_SKIP(); +#else // TODO: missing equ_gt1.tif + // Load Image. + std::string imagefile = file::JoinPath(testdata_dir_, "equ_gt1.tif"); + Pix* pix_binary = pixRead(imagefile.c_str()); + CHECK(pix_binary != nullptr && pixGetDepth(pix_binary) == 1); + + // Get components. + BLOCK_LIST blocks; + TO_BLOCK_LIST to_blocks; + AddPageBlock(pix_binary, &blocks); + Textord* textord = tesseract_->mutable_textord(); + textord->find_components(pix_binary, &blocks, &to_blocks); + + // Identify special texts from to_blocks. + TO_BLOCK_IT to_block_it(&to_blocks); + std::map<int, int> stt_count; + for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); + to_block_it.forward()) { + TO_BLOCK* to_block = to_block_it.data(); + BLOBNBOX_IT blob_it(&(to_block->blobs)); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + BLOBNBOX* blob = blob_it.data(); + // blob->set_special_text_type(BSTT_NONE); + equation_det_->RunIdentifySpecialText(blob, 0); + tensorflow::gtl::InsertIfNotPresent(&stt_count, blob->special_text_type(), 0); + stt_count[blob->special_text_type()]++; + } + } + + // Verify the number, but allow a range of +/- kCountRange before squealing. + const int kCountRange = 3; + EXPECT_GE(39 + kCountRange, stt_count[BSTT_NONE]); + EXPECT_LE(39 - kCountRange, stt_count[BSTT_NONE]); + + // if you count all the subscripts etc, there are ~45 italic chars. + EXPECT_GE(45 + kCountRange, stt_count[BSTT_ITALIC]); + EXPECT_LE(45 - kCountRange, stt_count[BSTT_ITALIC]); + EXPECT_GE(41 + kCountRange, stt_count[BSTT_DIGIT]); + EXPECT_LE(41 - kCountRange, stt_count[BSTT_DIGIT]); + EXPECT_GE(50 + kCountRange, stt_count[BSTT_MATH]); + EXPECT_LE(50 - kCountRange, stt_count[BSTT_MATH]); + EXPECT_GE(10 + kCountRange, stt_count[BSTT_UNCLEAR]); + EXPECT_LE(10 - kCountRange, stt_count[BSTT_UNCLEAR]); + + // Release memory. + pixDestroy(&pix_binary); +#endif +} + +TEST_F(EquationFinderTest, EstimateTypeForUnichar) { + // Test abc characters. + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("a")); + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("c")); + + // Test punctuation characters. + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("'")); + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar(",")); + + // Test digits. + EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("1")); + EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("4")); + EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("|")); + + // Test math symbols. 
+ EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar("(")); + EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar("+")); +} + +TEST_F(EquationFinderTest, IsIndented) { + ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000)); + + // Create five ColPartitions: + // part 1: ************ + // part 2: ********* + // part 3: ******* + // part 4: ***** + // + // part 5: ******** + TBOX box1(0, 950, 999, 999); + ColPartition* part1 = + ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part1); + TBOX box2(300, 920, 900, 940); + ColPartition* part2 = + ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part2); + TBOX box3(0, 900, 600, 910); + ColPartition* part3 = + ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part3); + TBOX box4(300, 890, 600, 899); + ColPartition* part4 = + ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part4); + TBOX box5(300, 500, 900, 510); + ColPartition* part5 = + ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part5); + + // Test + // part1 should be no indent. + EXPECT_EQ(EquationDetect::NO_INDENT, + equation_det_->RunIsIndented(&part_grid, part1)); + // part2 should be left indent in terms of part1. + EXPECT_EQ(EquationDetect::LEFT_INDENT, + equation_det_->RunIsIndented(&part_grid, part2)); + // part3 should be right indent. + EXPECT_EQ(EquationDetect::RIGHT_INDENT, + equation_det_->RunIsIndented(&part_grid, part3)); + // part4 should be both indented. + EXPECT_EQ(EquationDetect::BOTH_INDENT, + equation_det_->RunIsIndented(&part_grid, part4)); + // part5 should be no indent because it is too far from part1. + EXPECT_EQ(EquationDetect::NO_INDENT, + equation_det_->RunIsIndented(&part_grid, part5)); + + // Release memory. + part1->DeleteBoxes(); + delete (part1); + part2->DeleteBoxes(); + delete (part2); + part3->DeleteBoxes(); + delete (part3); + part4->DeleteBoxes(); + delete (part4); + part5->DeleteBoxes(); + delete (part5); +} + +TEST_F(EquationFinderTest, IsNearSmallNeighbor) { + // Create four tboxes: + // part 1, part 2 + // ***** ***** + // part 3: ***** + // + // part 4: ***************** + TBOX box1(0, 950, 499, 999); + TBOX box2(500, 950, 999, 998); + TBOX box3(0, 900, 499, 949); + TBOX box4(0, 550, 499, 590); + + // Test + // box2 should be box1's near neighbor but not vice versa. + EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box2)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box1)); + // box1 and box3 should be near neighbors of each other. + EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box3)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box3)); + // box2 and box3 should not be near neighbors of each other. + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box3)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box2)); + + // box4 should not be the near neighbor of any one. 
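+ // (box4 lies roughly 300 pixels below the other boxes, many times their ~50 pixel
+ // height, so it should fall well outside the distance tolerance used by
+ // IsNearSmallNeighbor.)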
+ EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box1, box4)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box4)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box4)); +} + +TEST_F(EquationFinderTest, CheckSeedBlobsCount) { + TBOX box(0, 950, 999, 999); + ColPartition* part1 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition* part2 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition* part3 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition* part4 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + + // Part 1: 8 math, 0 digit, 20 total. + equation_det_->AddMathDigitBlobs(8, 0, 20, part1); + EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part1)); + + // Part 2: 1 math, 8 digit, 20 total. + equation_det_->AddMathDigitBlobs(1, 8, 20, part2); + EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part2)); + + // Part 3: 3 math, 8 digit, 8 total. + equation_det_->AddMathDigitBlobs(3, 8, 20, part3); + EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part3)); + + // Part 4: 8 math, 0 digit, 8 total. + equation_det_->AddMathDigitBlobs(0, 0, 8, part4); + EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part4)); + + // Release memory. + part1->DeleteBoxes(); + delete (part1); + part2->DeleteBoxes(); + delete (part2); + part3->DeleteBoxes(); + delete (part3); + part4->DeleteBoxes(); + delete (part4); +} + +TEST_F(EquationFinderTest, ComputeForegroundDensity) { + // Create the pix with top half foreground, bottom half background. + int width = 1024, height = 768; + Pix* pix = pixCreate(width, height, 1); + pixRasterop(pix, 0, 0, width, height / 2, PIX_SET, nullptr, 0, 0); + TBOX box1(100, 0, 140, 140), box2(100, height / 2 - 20, 140, height / 2 + 20), + box3(100, height - 40, 140, height); + equation_det_->SetPixBinary(pix); + + // Verify + EXPECT_NEAR(0.0, equation_det_->RunComputeForegroundDensity(box1), 0.0001f); + EXPECT_NEAR(0.5, equation_det_->RunComputeForegroundDensity(box2), 0.0001f); + EXPECT_NEAR(1.0, equation_det_->RunComputeForegroundDensity(box3), 0.0001f); +} + +TEST_F(EquationFinderTest, CountAlignment) { + GenericVector<int> vec; + vec.push_back(1); + vec.push_back(1); + vec.push_back(1); + vec.push_back(100); + vec.push_back(200); + vec.push_back(200); + + // Test the right point. + EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 1)); + EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 100)); + EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 200)); + + // Test the near neighbors. + EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 3)); + EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 99)); + EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 202)); + + // Test the far neighbors. 
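+ // (150, 50 and 250 are each at least 50 away from every entry in vec, far beyond
+ // the small alignment tolerance exercised above, so no entries should be counted.)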
+ EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 150)); + EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 50)); + EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 250)); +} + +TEST_F(EquationFinderTest, ComputeCPsSuperBBox) { + Pix* pix = pixCreate(1001, 1001, 1); + equation_det_->SetPixBinary(pix); + ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000)); + + TBOX box1(0, 0, 999, 99); + ColPartition* part1 = + ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box2(0, 100, 499, 199); + ColPartition* part2 = + ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box3(500, 100, 999, 199); + ColPartition* part3 = + ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box4(0, 200, 999, 299); + ColPartition* part4 = + ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box5(0, 900, 999, 999); + ColPartition* part5 = + ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + + // Add part1->part3 into part_grid and test. + part_grid.InsertBBox(true, true, part1); + part_grid.InsertBBox(true, true, part2); + part_grid.InsertBBox(true, true, part3); + TBOX super_box(0, 0, 999, 199); + equation_det_->TestComputeCPsSuperBBox(super_box, &part_grid); + + // Add part4 and test. + part_grid.InsertBBox(true, true, part4); + TBOX super_box2(0, 0, 999, 299); + equation_det_->TestComputeCPsSuperBBox(super_box2, &part_grid); + + // Add part5 and test. + part_grid.InsertBBox(true, true, part5); + TBOX super_box3(0, 0, 999, 999); + equation_det_->TestComputeCPsSuperBBox(super_box3, &part_grid); + + // Release memory. + part1->DeleteBoxes(); + delete (part1); + part2->DeleteBoxes(); + delete (part2); + part3->DeleteBoxes(); + delete (part3); + part4->DeleteBoxes(); + delete (part4); + part5->DeleteBoxes(); + delete (part5); +} + +TEST_F(EquationFinderTest, SplitCPHorLite) { + TBOX box(0, 0, 999, 99); + ColPartition* part = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part->DeleteBoxes(); + part->set_median_width(10); + GenericVector<TBOX> splitted_boxes; + + // Test an empty part. + equation_det_->RunSplitCPHorLite(part, &splitted_boxes); + EXPECT_TRUE(splitted_boxes.empty()); + + // Test with one blob. + AddBlobIntoPart(TBOX(0, 0, 10, 50), part); + equation_det_->RunSplitCPHorLite(part, &splitted_boxes); + EXPECT_EQ(1, splitted_boxes.size()); + EXPECT_TRUE(TBOX(0, 0, 10, 50) == splitted_boxes[0]); + + // Add more blob and test. + AddBlobIntoPart(TBOX(11, 0, 20, 60), part); + AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. + AddBlobIntoPart(TBOX(100, 0, 110, 15), part); + AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. + AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. + equation_det_->RunSplitCPHorLite(part, &splitted_boxes); + // Verify. + EXPECT_EQ(3, splitted_boxes.size()); + EXPECT_TRUE(TBOX(0, 0, 30, 60) == splitted_boxes[0]); + EXPECT_TRUE(TBOX(100, 0, 140, 45) == splitted_boxes[1]); + EXPECT_TRUE(TBOX(500, 0, 540, 35) == splitted_boxes[2]); + + part->DeleteBoxes(); + delete (part); +} + +TEST_F(EquationFinderTest, SplitCPHor) { + TBOX box(0, 0, 999, 99); + ColPartition* part = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part->DeleteBoxes(); + part->set_median_width(10); + GenericVector<ColPartition*> parts_splitted; + + // Test an empty part. 
+ equation_det_->RunSplitCPHor(part, &parts_splitted); + EXPECT_TRUE(parts_splitted.empty()); + // Test with one blob. + AddBlobIntoPart(TBOX(0, 0, 10, 50), part); + + equation_det_->RunSplitCPHor(part, &parts_splitted); + EXPECT_EQ(1, parts_splitted.size()); + EXPECT_TRUE(TBOX(0, 0, 10, 50) == parts_splitted[0]->bounding_box()); + + // Add more blob and test. + AddBlobIntoPart(TBOX(11, 0, 20, 60), part); + AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. + AddBlobIntoPart(TBOX(100, 0, 110, 15), part); + AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. + AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. + equation_det_->RunSplitCPHor(part, &parts_splitted); + + // Verify. + EXPECT_EQ(3, parts_splitted.size()); + EXPECT_TRUE(TBOX(0, 0, 30, 60) == parts_splitted[0]->bounding_box()); + EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box()); + EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box()); + + parts_splitted.delete_data_pointers(); + part->DeleteBoxes(); + delete (part); +} + +} // namespace tesseract diff --git a/tesseract/unittest/fileio_test.cc b/tesseract/unittest/fileio_test.cc new file mode 100644 index 00000000..00488918 --- /dev/null +++ b/tesseract/unittest/fileio_test.cc @@ -0,0 +1,66 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include <stdio.h> +#include <memory> + +#include "absl/strings/str_split.h" + +#include "fileio.h" +#include "include_gunit.h" + +namespace tesseract { + +TEST(FileTest, JoinPath) { + EXPECT_EQ("/abc/def", File::JoinPath("/abc", "def")); + EXPECT_EQ("/abc/def", File::JoinPath("/abc/", "def")); + EXPECT_EQ("def", File::JoinPath("", "def")); +} + +TEST(OutputBufferTest, WriteString) { + const int kMaxBufSize = 128; + char buffer[kMaxBufSize]; + for (int i = 0; i < kMaxBufSize; ++i) buffer[i] = '\0'; + FILE* fp = tmpfile(); + CHECK(fp != nullptr); + + std::unique_ptr<OutputBuffer> output(new OutputBuffer(fp)); + output->WriteString("Hello "); + output->WriteString("world!"); + + rewind(fp); + auto s = "Hello world!"; + fread(buffer, strlen(s), 1, fp); + EXPECT_STREQ(s, buffer); +} + +TEST(InputBufferTest, Read) { + const int kMaxBufSize = 128; + char buffer[kMaxBufSize]; + auto s = "Hello\n world!"; + strncpy(buffer, s, kMaxBufSize); + EXPECT_STREQ(s, buffer); + FILE* fp = tmpfile(); + CHECK(fp != nullptr); + fwrite(buffer, strlen(s), 1, fp); + rewind(fp); + + std::string str; + std::unique_ptr<InputBuffer> input(new InputBuffer(fp)); + EXPECT_TRUE(input->Read(&str)); + std::vector<std::string> lines = absl::StrSplit(str, '\n', absl::SkipEmpty()); + EXPECT_EQ(2, lines.size()); + EXPECT_EQ("Hello", lines[0]); + EXPECT_EQ(" world!", lines[1]); +} + +} // namespace diff --git a/tesseract/unittest/fuzzers/fuzzer-api.cpp b/tesseract/unittest/fuzzers/fuzzer-api.cpp new file mode 100644 index 00000000..a1e4e7c4 --- /dev/null +++ b/tesseract/unittest/fuzzers/fuzzer-api.cpp @@ -0,0 +1,101 @@ +#include <tesseract/baseapi.h> +#include <allheaders.h> + +#include <libgen.h> // for dirname +#include <cstdio> // for printf +#include <cstdlib> // for std::getenv, std::setenv +#include <string> // for std::string + +#ifndef TESSERACT_FUZZER_WIDTH +#define TESSERACT_FUZZER_WIDTH 100 +#endif + +#ifndef TESSERACT_FUZZER_HEIGHT +#define TESSERACT_FUZZER_HEIGHT 100 +#endif + +class BitReader { + private: + uint8_t const* data; + size_t size; + size_t shift; + + public: + BitReader(const uint8_t* data, size_t size) + : data(data), size(size), shift(0) {} + + int Read(void) { + if (size == 0) { + return 0; + } + + const int ret = ((*data) >> shift) & 1; + + shift++; + if (shift >= 8) { + shift = 0; + data++; + size--; + } + + return ret; + } +}; + +static tesseract::TessBaseAPI* api = nullptr; + +extern "C" int LLVMFuzzerInitialize(int* /*pArgc*/, char*** pArgv) { + if (std::getenv("TESSDATA_PREFIX") == nullptr) { + std::string binary_path = *pArgv[0]; + const std::string filepath = dirname(&binary_path[0]); + + const std::string tessdata_path = filepath + "/" + "tessdata"; + if (setenv("TESSDATA_PREFIX", tessdata_path.c_str(), 1) != 0) { + printf("Setenv failed\n"); + std::abort(); + } + } + + api = new tesseract::TessBaseAPI(); + if (api->Init(nullptr, "eng") != 0) { + printf("Cannot initialize API\n"); + abort(); + } + + /* Silence output */ + api->SetVariable("debug_file", "/dev/null"); + + return 0; +} + +static PIX* createPix(BitReader& BR, const size_t width, const size_t height) { + Pix* pix = pixCreate(width, height, 1); + + if (pix == nullptr) { + printf("pix creation failed\n"); + abort(); + } + + for (size_t i = 0; i < width; i++) { + for (size_t j = 0; j < height; j++) { + pixSetPixel(pix, i, j, BR.Read()); + } + } + + return pix; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + BitReader BR(data, size); + + auto pix = createPix(BR, TESSERACT_FUZZER_WIDTH, 
TESSERACT_FUZZER_HEIGHT); + + api->SetImage(pix); + + char* outText = api->GetUTF8Text(); + + pixDestroy(&pix); + delete[] outText; + + return 0; +} diff --git a/tesseract/unittest/fuzzers/oss-fuzz-build.sh b/tesseract/unittest/fuzzers/oss-fuzz-build.sh new file mode 100755 index 00000000..d10f2d80 --- /dev/null +++ b/tesseract/unittest/fuzzers/oss-fuzz-build.sh @@ -0,0 +1,59 @@ +#!/bin/bash -eu +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +cd $SRC/leptonica +./autogen.sh +./configure --disable-shared +make SUBDIRS=src install -j$(nproc) +ldconfig + +cd $SRC/tesseract +./autogen.sh +CXXFLAGS="$CXXFLAGS -D_GLIBCXX_DEBUG" ./configure --disable-graphics --disable-shared +make -j$(nproc) + +cp -R $SRC/tessdata $OUT + +$CXX $CXXFLAGS \ + -I $SRC/tesseract/include \ + -I/usr/local/include/leptonica \ + $SRC/tesseract/unittest/fuzzers/fuzzer-api.cpp -o $OUT/fuzzer-api \ + $SRC/tesseract/.libs/libtesseract.a \ + /usr/local/lib/liblept.a \ + /usr/lib/x86_64-linux-gnu/libtiff.a \ + /usr/lib/x86_64-linux-gnu/libpng.a \ + /usr/lib/x86_64-linux-gnu/libjpeg.a \ + /usr/lib/x86_64-linux-gnu/libjbig.a \ + /usr/lib/x86_64-linux-gnu/liblzma.a \ + -lz \ + $LIB_FUZZING_ENGINE + +$CXX $CXXFLAGS \ + -DTESSERACT_FUZZER_WIDTH=512 \ + -DTESSERACT_FUZZER_HEIGHT=256 \ + -I $SRC/tesseract/include \ + -I/usr/local/include/leptonica \ + $SRC/tesseract/unittest/fuzzers/fuzzer-api.cpp -o $OUT/fuzzer-api-512x256 \ + $SRC/tesseract/.libs/libtesseract.a \ + /usr/local/lib/liblept.a \ + /usr/lib/x86_64-linux-gnu/libtiff.a \ + /usr/lib/x86_64-linux-gnu/libpng.a \ + /usr/lib/x86_64-linux-gnu/libjpeg.a \ + /usr/lib/x86_64-linux-gnu/libjbig.a \ + /usr/lib/x86_64-linux-gnu/liblzma.a \ + -lz \ + $LIB_FUZZING_ENGINE diff --git a/tesseract/unittest/heap_test.cc b/tesseract/unittest/heap_test.cc new file mode 100644 index 00000000..c2754181 --- /dev/null +++ b/tesseract/unittest/heap_test.cc @@ -0,0 +1,202 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include_gunit.h" + +#include "doubleptr.h" +#include "genericheap.h" +#include "genericvector.h" +#include "kdpair.h" + +#include <string> +#include <utility> + +namespace tesseract { + +int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0}; + +// The fixture for testing GenericHeap and DoublePtr. 
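+// (IntKDPair and KDVector come from kdpair.h: a key/data pair of ints and a
+// GenericVector of such pairs. GenericHeap over this pair type behaves as a
+// min-heap keyed on key(), which is what the ordering checks below rely on.)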
+class HeapTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + virtual ~HeapTest(); + // Pushes the test data onto both the heap and the KDVector. + void PushTestData(GenericHeap<IntKDPair>* heap, KDVector* v) { + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + IntKDPair pair(test_data[i], i); + heap->Push(&pair); + v->push_back(pair); + } + } + // Verifies that the data in the heap matches the vector (after sorting) by + // popping everything off the heap. + void VerifyHeapVectorMatch(GenericHeap<IntKDPair>* heap, KDVector* v) { + EXPECT_FALSE(heap->empty()); + EXPECT_EQ(heap->size(), v->size()); + // Sort the vector and check that the keys come out of the heap in the same + // order as v. + // Also check that the indices match, except for 9, which is duplicated. + v->sort(); + // Check that we have increasing order. + EXPECT_LT((*v)[0].key(), v->back().key()); + for (int i = 0; i < v->size(); ++i) { + EXPECT_EQ((*v)[i].key(), heap->PeekTop().key()); + // Indices don't necessarily match for equal keys, so don't test them. + if (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) { + while (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) { + heap->Pop(nullptr); + ++i; + EXPECT_FALSE(heap->empty()); + EXPECT_EQ((*v)[i].key(), heap->PeekTop().key()); + } + } else { + // The indices must also match if the key is unique. + EXPECT_EQ((*v)[i].data(), heap->PeekTop().data()); + } + EXPECT_FALSE(heap->empty()); + EXPECT_TRUE(heap->Pop(nullptr)); + } + EXPECT_TRUE(heap->empty()); + } +}; + +// Destructor. +// It is defined here, so the compiler can create a single vtable +// instead of a weak vtable (fixes compiler warning). +HeapTest::~HeapTest() = default; + +// Tests that a sort using a GenericHeap matches the result of a sort using +// a KDVector. +TEST_F(HeapTest, SortTest) { + GenericHeap<IntKDPair> heap; + EXPECT_TRUE(heap.empty()); + KDVector v; + EXPECT_EQ(heap.size(), v.size()); + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + VerifyHeapVectorMatch(&heap, &v); +} + +// Tests that pushing some stuff, popping some stuff, and then pushing more +// stuff results in output that matches the sort using a KDVector. +// a KDVector. +TEST_F(HeapTest, MixedTest) { + GenericHeap<IntKDPair> heap; + KDVector v; + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + // Sort the vector and remove the first 5 values from both heap and v. + v.sort(); + for (int i = 0; i < 5; ++i) { + heap.Pop(nullptr); + v.remove(0); + } + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + // Heap and vector should still match! + VerifyHeapVectorMatch(&heap, &v); +} + +// Tests that PopWorst still leaves the heap in a state such that it still +// matches a sorted KDVector. +TEST_F(HeapTest, PopWorstTest) { + GenericHeap<IntKDPair> heap; + KDVector v; + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + // Get the worst element off the heap. + IntKDPair pair; + heap.PopWorst(&pair); + EXPECT_EQ(pair.key(), 65536); + EXPECT_EQ(pair.data(), 6); + // Sort and remove the worst element from the vector. + v.sort(); + v.truncate(v.size() - 1); + // After that they should still match! + VerifyHeapVectorMatch(&heap, &v); +} + +// Tests that Reshuffle works and the heap still matches a KDVector with the +// same value changed. Doubles up as a test of DoublePtr. 
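+// The pattern being exercised: each heap entry carries a DoublePtr connected to a
+// matching entry in an external vector, so the vector side can reach its heap entry
+// via OtherEnd(), rewrite the key stored there, and then call Reshuffle() to restore
+// the heap invariant.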
+TEST_F(HeapTest, RevalueTest) { + // Here the data element of the pair is a DoublePtr, which links the entries + // in the vector and heap, and we test a MAX heap. + typedef KDPairDec<int, DoublePtr> PtrPair; + GenericHeap<PtrPair> heap; + GenericVector<PtrPair> v; + // Push the test data onto both the heap and the vector. + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + PtrPair h_pair; + h_pair.key() = test_data[i]; + PtrPair v_pair; + v_pair.key() = test_data[i]; + h_pair.data().Connect(&v_pair.data()); + heap.Push(&h_pair); + v.push_back(v_pair); + } + // Test changes both ways. Index 0 is 8, so change it to -1. + v[0].key() = -1; + // v[0].data.OtherEnd() is a pointer to the data element in the appropriate + // heap entry, wherever it may be. We can change its value via that pointer. + // Without Reshuffle, that would be a terribly bad thing to do, as it violates + // the heap invariant, making the heap corrupt. + PtrPair* pair_ptr = reinterpret_cast<PtrPair*>(v[0].data().OtherEnd()); + pair_ptr->key() = v[0].key(); + heap.Reshuffle(pair_ptr); + // Index 1 is 1. Change to 32767. + v[1].key() = 32767; + pair_ptr = reinterpret_cast<PtrPair*>(v[1].data().OtherEnd()); + pair_ptr->key() = v[1].key(); + heap.Reshuffle(pair_ptr); + // After the changes, popping the heap should still match the sorted order + // of the vector. + v.sort(); + EXPECT_GT(v[0].key(), v.back().key()); + for (int i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i].key(), heap.PeekTop().key()); + EXPECT_FALSE(heap.empty()); + heap.Pop(nullptr); + } + EXPECT_TRUE(heap.empty()); +} + +#if 0 +// Helper checks that the compiler rejects use of a copy constructor with +// a const argument and the default copy constructor is properly hidden by +// the non-const version. +static void ConstRefTest(const DoublePtr& ptr1) { + DoublePtr ptr2(ptr1); // Compiler error here. + EXPECT_EQ(&ptr2, ptr2.OtherEnd()->OtherEnd()); + EXPECT_TRUE(ptr1.OtherEnd() == nullptr); +} +#endif + +// Tests that DoublePtr works as expected. +TEST_F(HeapTest, DoublePtrTest) { + DoublePtr ptr1; + DoublePtr ptr2; + ptr1.Connect(&ptr2); + // Check that the correct copy constructor is used. + DoublePtr ptr3(ptr1); + EXPECT_EQ(&ptr3, ptr3.OtherEnd()->OtherEnd()); + EXPECT_TRUE(ptr1.OtherEnd() == nullptr); + // Check that the correct operator= is used. + ptr1 = ptr3; + EXPECT_EQ(&ptr1, ptr1.OtherEnd()->OtherEnd()); + EXPECT_TRUE(ptr3.OtherEnd() == nullptr); +} + +} // namespace tesseract diff --git a/tesseract/unittest/imagedata_test.cc b/tesseract/unittest/imagedata_test.cc new file mode 100644 index 00000000..31bd2f24 --- /dev/null +++ b/tesseract/unittest/imagedata_test.cc @@ -0,0 +1,131 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> +#include <vector> + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" + +#include "imagedata.h" +#include "include_gunit.h" +#include "log.h" + +namespace tesseract { + +// Tests the caching mechanism of DocumentData/ImageData. 
+ +class ImagedataTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + ImagedataTest() {} + + // Creates a fake DocumentData, writes it to a file, and returns the filename. + std::string MakeFakeDoc(int num_pages, unsigned doc_id, + std::vector<std::string>* page_texts) { + // The size of the fake images that we will use. + const int kImageSize = 1048576; + // Not using a real image here - just an array of zeros! We are just testing + // that the truth text matches. + std::vector<char> fake_image(kImageSize, 0); + DocumentData write_doc("My document"); + for (int p = 0; p < num_pages; ++p) { + // Make some fake text that is different for each page and save it. + page_texts->push_back( + absl::StrFormat("Page %d of %d in doc %u", p, num_pages, doc_id)); + // Make an imagedata and put it in the document. + ImageData* imagedata = + ImageData::Build("noname", p, "eng", fake_image.data(), + fake_image.size(), (*page_texts)[p].c_str(), nullptr); + EXPECT_EQ(kImageSize, imagedata->MemoryUsed()); + write_doc.AddPageToDocument(imagedata); + } + // Write it to a file. + std::string filename = file::JoinPath( + FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf")); + EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr)); + return filename; + } +}; + +TEST_F(ImagedataTest, CachesProperly) { + // This test verifies that Imagedata can be stored in a DocumentData and a + // collection of them is cached correctly given limited memory. + // Number of pages to put in the fake document. + const int kNumPages = 12; + // Allowances to read the document. Big enough for 1, 3, 0, all pages. + const int kMemoryAllowances[] = {2000000, 4000000, 1000000, 100000000, 0}; + // Order in which to read the pages, with some sequential and some seeks. + const int kPageReadOrder[] = {0, 1, 2, 3, 8, 4, 5, 6, 7, 11, 10, 9, -1}; + + std::vector<std::string> page_texts; + std::string filename = MakeFakeDoc(kNumPages, 0, &page_texts); + // Now try getting it back with different memory allowances and check that + // the pages can still be read. + for (int m = 0; kMemoryAllowances[m] > 0; ++m) { + DocumentData read_doc("My document"); + EXPECT_TRUE( + read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr)); + LOG(ERROR) << "Allowance = " << kMemoryAllowances[m]; + // Read the pages in a specific order. + for (int p = 0; kPageReadOrder[p] >= 0; ++p) { + int page = kPageReadOrder[p]; + const ImageData* imagedata = read_doc.GetPage(page); + EXPECT_NE(nullptr, imagedata); + //EXPECT_NE(reinterpret_cast<ImageData*>(nullptr), imagedata); + // Check that this is the right page. + EXPECT_STREQ(page_texts[page].c_str(), + imagedata->transcription().c_str()); + } + } +} + +TEST_F(ImagedataTest, CachesMultiDocs) { + // This test verifies that DocumentCache works to store multiple DocumentData + // and the two caching strategies read images in the right order. + // Number of pages in each document. + const std::vector<int> kNumPages = {6, 5, 7}; + std::vector<std::vector<std::string>> page_texts; + std::vector<STRING> filenames; + for (size_t d = 0; d < kNumPages.size(); ++d) { + page_texts.emplace_back(std::vector<std::string>()); + std::string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back()); + filenames.push_back(STRING(filename.c_str())); + } + // Now try getting them back with different cache strategies and check that + // the pages come out in the right order. 
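+ // With CS_ROUND_ROBIN, serial page p should come from document p % num_docs,
+ // cycling through each document's pages in turn; with CS_SEQUENTIAL, the first
+ // kNumPages[0] serial pages come from document 0, the next kNumPages[0] from
+ // document 1, and so on. The index arithmetic below mirrors that mapping.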
+ DocumentCache robin_cache(8000000); + robin_cache.LoadDocuments(filenames, tesseract::CS_ROUND_ROBIN, nullptr); + DocumentCache serial_cache(8000000); + serial_cache.LoadDocuments(filenames, tesseract::CS_SEQUENTIAL, nullptr); + for (int p = 0; p <= 21; ++p) { + LOG(INFO) << "Page " << p; + const ImageData* robin_data = robin_cache.GetPageBySerial(p); + const ImageData* serial_data = serial_cache.GetPageBySerial(p); + CHECK(robin_data != nullptr); + CHECK(serial_data != nullptr); + int robin_doc = p % kNumPages.size(); + int robin_page = p / kNumPages.size() % kNumPages[robin_doc]; + // Check that this is the right page. + EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(), + robin_data->transcription().c_str()); + int serial_doc = p / kNumPages[0] % kNumPages.size(); + int serial_page = p % kNumPages[0] % kNumPages[serial_doc]; + EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(), + serial_data->transcription().c_str()); + } +} + +} // namespace. diff --git a/tesseract/unittest/include_gunit.h b/tesseract/unittest/include_gunit.h new file mode 100644 index 00000000..568326cb --- /dev/null +++ b/tesseract/unittest/include_gunit.h @@ -0,0 +1,76 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// Portability include to match the Google test environment. + +#ifndef TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ +#define TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ + +#include "errcode.h" // for ASSERT_HOST +#include "fileio.h" // for tesseract::File +#include "log.h" // for LOG +#include "gtest/gtest.h" + +const char* FLAGS_test_tmpdir = "./tmp"; + +class file : public tesseract::File { +public: + + static void MakeTmpdir() { +#if defined(_WIN32) + _mkdir(FLAGS_test_tmpdir); +#else + mkdir(FLAGS_test_tmpdir, S_IRWXU | S_IRWXG); +#endif + } + +// Create a file and write a string to it. + static bool WriteStringToFile(const std::string& contents, const std::string& filename) { + File::WriteStringToFileOrDie(contents, filename); + return true; + } + + static bool GetContents(const std::string& filename, std::string* out, int) { + return File::ReadFileToString(filename, out); + } + + static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) { + return WriteStringToFile(contents, name); + } + + static int Defaults() { + return 0; + } + + static std::string JoinPath(const std::string& s1, const std::string& s2) { + return tesseract::File::JoinPath(s1, s2); + } + + static std::string JoinPath(const std::string& s1, const std::string& s2, + const std::string& s3) { + return JoinPath(JoinPath(s1, s2), s3); + } +}; + +#define ARRAYSIZE(arr) (sizeof(arr) / sizeof(arr[0])) + +// /usr/include/tensorflow/core/platform/default/logging.h defines the CHECK* macros. 
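+// The fallbacks below are only compiled when no such header has already supplied
+// CHECK; each one routes a "Check failed" message to LOG(FATAL) when its condition
+// does not hold, e.g. CHECK(fp != nullptr) as used in several of these tests.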
+#if !defined(CHECK) +#define CHECK(condition) \ + if (!(condition)) \ + LOG(FATAL) << "Check failed: " #condition " " +#define CHECK_EQ(test, value) CHECK((test) == (value)) +#define CHECK_GT(test, value) CHECK((test) > (value)) +#define CHECK_LT(test, value) CHECK((test) < (value)) +#define CHECK_LE(test, value) CHECK((test) <= (value)) +#define CHECK_OK(test) CHECK(test) +#endif + +#endif // TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ diff --git a/tesseract/unittest/indexmapbidi_test.cc b/tesseract/unittest/indexmapbidi_test.cc new file mode 100644 index 00000000..bdd3c895 --- /dev/null +++ b/tesseract/unittest/indexmapbidi_test.cc @@ -0,0 +1,117 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cmath> +#include <cstdio> +#include <string> + +#include "indexmapbidi.h" + +#include "include_gunit.h" + +const int kPrimeLimit = 1000; + +namespace tesseract { + +class IndexMapBiDiTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + public: + std::string OutputNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes. + void ComputePrimes(IndexMapBiDi* map) { + map->Init(kPrimeLimit + 1, false); + map->SetMap(2, true); + // Set all the odds to true. + for (int i = 3; i <= kPrimeLimit; i += 2) map->SetMap(i, true); + int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit)); + for (int f = 3; f <= factor_limit; f += 2) { + if (map->SparseToCompact(f) >= 0) { + for (int m = 2; m * f <= kPrimeLimit; ++m) map->SetMap(f * m, false); + } + } + map->Setup(); + } + + void TestPrimes(const IndexMap& map) { + // Now all primes are mapped in the sparse map to their index. + // According to Wikipedia, the 168th prime is 997, and it has compact + // index 167 because we are indexing from 0. + EXPECT_EQ(167, map.SparseToCompact(997)); + EXPECT_EQ(997, map.CompactToSparse(167)); + // 995, 996, 998, 999 are not prime. + EXPECT_EQ(-1, map.SparseToCompact(995)); + EXPECT_EQ(-1, map.SparseToCompact(996)); + EXPECT_EQ(-1, map.SparseToCompact(998)); + EXPECT_EQ(-1, map.SparseToCompact(999)); + // The 167th prime is 991. + EXPECT_EQ(991, map.CompactToSparse(166)); + // There are 168 primes in 0..1000. + EXPECT_EQ(168, map.CompactSize()); + EXPECT_EQ(kPrimeLimit + 1, map.SparseSize()); + } +}; + +// Tests the sieve of Eratosthenes as a way of testing setup. +TEST_F(IndexMapBiDiTest, Primes) { + IndexMapBiDi map; + ComputePrimes(&map); + TestPrimes(map); + // It still works if we assign it to another. + IndexMapBiDi map2; + map2.CopyFrom(map); + TestPrimes(map2); + // Or if we assign it to a base class. + IndexMap base_map; + base_map.CopyFrom(map); + TestPrimes(base_map); + // Test file i/o too. 
+ std::string filename = OutputNameToPath("primesmap"); + FILE* fp = fopen(filename.c_str(), "wb"); + CHECK(fp != nullptr); + EXPECT_TRUE(map.Serialize(fp)); + fclose(fp); + fp = fopen(filename.c_str(), "rb"); + CHECK(fp != nullptr); + IndexMapBiDi read_map; + EXPECT_TRUE(read_map.DeSerialize(false, fp)); + fclose(fp); + TestPrimes(read_map); +} + +// Tests the many-to-one setup feature. +TEST_F(IndexMapBiDiTest, ManyToOne) { + // Test the example in the comment on CompleteMerges. + IndexMapBiDi map; + map.Init(13, false); + map.SetMap(2, true); + map.SetMap(4, true); + map.SetMap(7, true); + map.SetMap(9, true); + map.SetMap(11, true); + map.Setup(); + map.Merge(map.SparseToCompact(2), map.SparseToCompact(9)); + map.Merge(map.SparseToCompact(4), map.SparseToCompact(11)); + map.CompleteMerges(); + EXPECT_EQ(3, map.CompactSize()); + EXPECT_EQ(13, map.SparseSize()); + EXPECT_EQ(1, map.SparseToCompact(4)); + EXPECT_EQ(4, map.CompactToSparse(1)); + EXPECT_EQ(1, map.SparseToCompact(11)); +} + +} // namespace. diff --git a/tesseract/unittest/intfeaturemap_test.cc b/tesseract/unittest/intfeaturemap_test.cc new file mode 100644 index 00000000..e95aa0c3 --- /dev/null +++ b/tesseract/unittest/intfeaturemap_test.cc @@ -0,0 +1,129 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "intfeaturemap.h" +#include "intfeaturespace.h" + +#include "include_gunit.h" + +// Random re-quantization to test that they don't have to be easy. +// WARNING! Change these and change the expected_misses calculation below. +const int kXBuckets = 16; +const int kYBuckets = 24; +const int kThetaBuckets = 13; + +namespace tesseract { + +class IntFeatureMapTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + // Expects that the given vector has contiguous integer values in the + // range [start, end). + void ExpectContiguous(const GenericVector<int>& v, int start, int end) { + for (int i = start; i < end; ++i) { + EXPECT_EQ(i, v[i - start]); + } + } +}; + +// Tests the IntFeatureMap and implicitly the IntFeatureSpace underneath. +TEST_F(IntFeatureMapTest, Exhaustive) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because IntFeatureSpace is missing. + GTEST_SKIP(); +#else + IntFeatureSpace space; + space.Init(kXBuckets, kYBuckets, kThetaBuckets); + IntFeatureMap map; + map.Init(space); + int total_size = kIntFeatureExtent * kIntFeatureExtent * kIntFeatureExtent; + std::unique_ptr<INT_FEATURE_STRUCT[]> features( + new INT_FEATURE_STRUCT[total_size]); + // Fill the features with every value. 
+ for (int y = 0; y < kIntFeatureExtent; ++y) { + for (int x = 0; x < kIntFeatureExtent; ++x) { + for (int theta = 0; theta < kIntFeatureExtent; ++theta) { + int f_index = (y * kIntFeatureExtent + x) * kIntFeatureExtent + theta; + features[f_index].X = x; + features[f_index].Y = y; + features[f_index].Theta = theta; + } + } + } + GenericVector<int> index_features; + map.IndexAndSortFeatures(features.get(), total_size, &index_features); + EXPECT_EQ(total_size, index_features.size()); + int total_buckets = kXBuckets * kYBuckets * kThetaBuckets; + GenericVector<int> map_features; + int misses = map.MapIndexedFeatures(index_features, &map_features); + EXPECT_EQ(0, misses); + EXPECT_EQ(total_buckets, map_features.size()); + ExpectContiguous(map_features, 0, total_buckets); + EXPECT_EQ(total_buckets, map.compact_size()); + EXPECT_EQ(total_buckets, map.sparse_size()); + + // Every offset should be within dx, dy, dtheta of the start point. + int dx = kIntFeatureExtent / kXBuckets + 1; + int dy = kIntFeatureExtent / kYBuckets + 1; + int dtheta = kIntFeatureExtent / kThetaBuckets + 1; + int bad_offsets = 0; + for (int index = 0; index < total_buckets; ++index) { + for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps; + ++dir) { + int offset_index = map.OffsetFeature(index, dir); + if (dir == 0) { + EXPECT_EQ(index, offset_index); + } else if (offset_index >= 0) { + INT_FEATURE_STRUCT f = map.InverseIndexFeature(index); + INT_FEATURE_STRUCT f2 = map.InverseIndexFeature(offset_index); + EXPECT_TRUE(f.X != f2.X || f.Y != f2.Y || f.Theta != f2.Theta); + EXPECT_LE(abs(f.X - f2.X), dx); + EXPECT_LE(abs(f.Y - f2.Y), dy); + int theta_delta = abs(f.Theta - f2.Theta); + if (theta_delta > kIntFeatureExtent / 2) + theta_delta = kIntFeatureExtent - theta_delta; + EXPECT_LE(theta_delta, dtheta); + } else { + ++bad_offsets; + INT_FEATURE_STRUCT f = map.InverseIndexFeature(index); + } + } + } + EXPECT_LE(bad_offsets, (kXBuckets + kYBuckets) * kThetaBuckets); + + // To test the mapping further, delete the 1st and last map feature, and + // test again. + map.DeleteMapFeature(0); + map.DeleteMapFeature(total_buckets - 1); + map.FinalizeMapping(nullptr); + map.IndexAndSortFeatures(features.get(), total_size, &index_features); + // Has no effect on index features. + EXPECT_EQ(total_size, index_features.size()); + misses = map.MapIndexedFeatures(index_features, &map_features); + int expected_misses = (kIntFeatureExtent / kXBuckets) * + (kIntFeatureExtent / kYBuckets) * + (kIntFeatureExtent / kThetaBuckets + 1); + expected_misses += (kIntFeatureExtent / kXBuckets) * + (kIntFeatureExtent / kYBuckets + 1) * + (kIntFeatureExtent / kThetaBuckets); + EXPECT_EQ(expected_misses, misses); + EXPECT_EQ(total_buckets - 2, map_features.size()); + ExpectContiguous(map_features, 0, total_buckets - 2); + EXPECT_EQ(total_buckets - 2, map.compact_size()); + EXPECT_EQ(total_buckets, map.sparse_size()); +#endif +} + +} // namespace. diff --git a/tesseract/unittest/intsimdmatrix_test.cc b/tesseract/unittest/intsimdmatrix_test.cc new file mode 100644 index 00000000..cdfbaa2c --- /dev/null +++ b/tesseract/unittest/intsimdmatrix_test.cc @@ -0,0 +1,135 @@ +/////////////////////////////////////////////////////////////////////// +// File: intsimdmatrix_test.cc +// Author: rays@google.com (Ray Smith) +// +// Copyright 2017 Google Inc. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "intsimdmatrix.h" +#include <memory> +#include <vector> +#include <gtest/gtest.h> +#include <gtest/internal/gtest-port.h> +#include "include_gunit.h" +#include "matrix.h" +#include "simddetect.h" +#include "tprintf.h" + +namespace tesseract { + +class IntSimdMatrixTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + // Makes a random weights matrix of the given size. + GENERIC_2D_ARRAY<int8_t> InitRandom(int no, int ni) { + GENERIC_2D_ARRAY<int8_t> a(no, ni, 0); + for (int i = 0; i < no; ++i) { + for (int j = 0; j < ni; ++j) { + a(i, j) = static_cast<int8_t>(random_.SignedRand(INT8_MAX)); + } + } + return a; + } + // Makes a random input vector of the given size, with rounding up. + std::vector<int8_t> RandomVector(int size, const IntSimdMatrix& matrix) { + int rounded_size = matrix.RoundInputs(size); + std::vector<int8_t> v(rounded_size, 0); + for (int i = 0; i < size; ++i) { + v[i] = static_cast<int8_t>(random_.SignedRand(INT8_MAX)); + } + return v; + } + // Makes a random scales vector of the given size. + std::vector<double> RandomScales(int size) { + std::vector<double> v(size); + for (int i = 0; i < size; ++i) { + v[i] = (1.0 + random_.SignedRand(1.0)) / INT8_MAX; + } + return v; + } + // Tests a range of sizes and compares the results against the generic version. + void ExpectEqualResults(const IntSimdMatrix& matrix) { + double total = 0.0; + for (int num_out = 1; num_out < 130; ++num_out) { + for (int num_in = 1; num_in < 130; ++num_in) { + GENERIC_2D_ARRAY<int8_t> w = InitRandom(num_out, num_in + 1); + std::vector<int8_t> u = RandomVector(num_in, matrix); + std::vector<double> scales = RandomScales(num_out); + int ro = num_out; + if (IntSimdMatrix::intSimdMatrix) + ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro); + std::vector<double> base_result(ro); + base_result.resize(num_out); + IntSimdMatrix::MatrixDotVector(w, scales, u.data(), base_result.data()); + std::vector<double> test_result(ro); + test_result.resize(num_out); + std::vector<int8_t> shaped_wi; + int32_t rounded_num_out; + matrix.Init(w, shaped_wi, rounded_num_out); + scales.reserve(rounded_num_out); + if (matrix.matrixDotVectorFunction) { + matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0], + &scales[0], &u[0], &test_result[0]); + } else { + IntSimdMatrix::MatrixDotVector(w, scales, u.data(), test_result.data()); + } + for (int i = 0; i < num_out; ++i) { + EXPECT_FLOAT_EQ(base_result[i], test_result[i]) << "i=" << i; + total += base_result[i]; + } + } + } + // Compare sum of all results with expected value. + EXPECT_FLOAT_EQ(total, 337849.39354684710); + } + + TRand random_; +}; + +// Test the C++ implementation without SIMD. +TEST_F(IntSimdMatrixTest, C) { + static const IntSimdMatrix matrix = {nullptr, 1, 1, 1, 1}; + ExpectEqualResults(matrix); +} + +// Tests that the SSE implementation gets the same result as the vanilla. 
+TEST_F(IntSimdMatrixTest, SSE) { +#if defined(HAVE_SSE4_1) + if (!SIMDDetect::IsSSEAvailable()) { + GTEST_LOG_(INFO) << "No SSE found! Not tested!"; + GTEST_SKIP(); + } + ExpectEqualResults(IntSimdMatrix::intSimdMatrixSSE); +#else + GTEST_LOG_(INFO) << "SSE unsupported! Not tested!"; + GTEST_SKIP(); +#endif +} + +// Tests that the AVX2 implementation gets the same result as the vanilla. +TEST_F(IntSimdMatrixTest, AVX2) { +#if defined(HAVE_AVX2) + if (!SIMDDetect::IsAVX2Available()) { + GTEST_LOG_(INFO) << "No AVX2 found! Not tested!"; + GTEST_SKIP(); + } + ExpectEqualResults(IntSimdMatrix::intSimdMatrixAVX2); +#else + GTEST_LOG_(INFO) << "AVX2 unsupported! Not tested!"; + GTEST_SKIP(); +#endif +} + +} // namespace tesseract diff --git a/tesseract/unittest/lang_model_test.cc b/tesseract/unittest/lang_model_test.cc new file mode 100644 index 00000000..b059c18c --- /dev/null +++ b/tesseract/unittest/lang_model_test.cc @@ -0,0 +1,217 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> // for std::string + +#include "absl/strings/str_cat.h" + +#include "gmock/gmock.h" // for testing::ElementsAreArray + +#include "include_gunit.h" +#include "lang_model_helpers.h" +#include "log.h" // for LOG +#include "lstmtrainer.h" +#include "unicharset_training_utils.h" + +namespace tesseract { + +std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); +} + +// This is an integration test that verifies that CombineLangModel works to +// the extent that an LSTMTrainer can be initialized with the result, and it +// can encode strings. More importantly, the test verifies that adding an extra +// character to the unicharset does not change the encoding of strings. +TEST(LangModelTest, AddACharacter) { + constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&"; + constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹"; + // Setup the arguments. + std::string script_dir = LANGDATA_DIR; + std::string eng_dir = file::JoinPath(script_dir, "eng"); + std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset"); + UNICHARSET unicharset; + EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str())); + std::string version_str = "TestVersion"; + file::MakeTmpdir(); + std::string output_dir = FLAGS_test_tmpdir; + LOG(INFO) << "Output dir=" << output_dir << "\n"; + std::string lang1 = "eng"; + bool pass_through_recoder = false; + std::vector<STRING> words, puncs, numbers; + // If these reads fail, we get a warning message and an empty list of words. + ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr) + .split('\n', &words); + EXPECT_GT(words.size(), 0); + ReadFile(file::JoinPath(eng_dir, "eng.punc"), nullptr).split('\n', &puncs); + EXPECT_GT(puncs.size(), 0); + ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr) + .split('\n', &numbers); + EXPECT_GT(numbers.size(), 0); + bool lang_is_rtl = false; + // Generate the traineddata file. 
+ EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang1, pass_through_recoder, words, puncs,
+ numbers, lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata1 =
+ file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
+ LSTMTrainer trainer1;
+ trainer1.InitCharSet(traineddata1);
+ std::vector<int> labels1;
+ EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
+ STRING test1_decoded = trainer1.DecodeLabels(labels1);
+ std::string test1_str(&test1_decoded[0], test1_decoded.length());
+ LOG(INFO) << "Labels1=" << test1_str << "\n";
+
+ // Add a new character to the unicharset and try again.
+ int size_before = unicharset.size();
+ unicharset.unichar_insert("₹");
+ SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
+ &unicharset);
+ EXPECT_EQ(size_before + 1, unicharset.size());
+ // Generate the traineddata file.
+ std::string lang2 = "extended";
+ EXPECT_EQ(EXIT_SUCCESS,
+ CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang2, pass_through_recoder, words, puncs, numbers,
+ lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata2 =
+ file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
+ LSTMTrainer trainer2;
+ trainer2.InitCharSet(traineddata2);
+ std::vector<int> labels2;
+ EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
+ STRING test2_decoded = trainer2.DecodeLabels(labels2);
+ std::string test2_str(&test2_decoded[0], test2_decoded.length());
+ LOG(INFO) << "Labels2=" << test2_str << "\n";
+ // Encode kTestStringRupees.
+ std::vector<int> labels3;
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
+ STRING test3_decoded = trainer2.DecodeLabels(labels3);
+ std::string test3_str(&test3_decoded[0], test3_decoded.length());
+ LOG(INFO) << "labels3=" << test3_str << "\n";
+ // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
+ // Since TensorFlow's CTC implementation insists on having the null be the
+ // last label, and we want to be compatible, null has to be renumbered when
+ // we add a class.
+ int null1 = trainer1.null_char();
+ int null2 = trainer2.null_char();
+ EXPECT_EQ(null1 + 1, null2);
+ std::vector<int> labels1_v(labels1.size());
+ for (int i = 0; i < labels1.size(); ++i) {
+ if (labels1[i] == null1)
+ labels1_v[i] = null2;
+ else
+ labels1_v[i] = labels1[i];
+ }
+ EXPECT_THAT(labels1_v,
+ testing::ElementsAreArray(&labels2[0], labels2.size()));
+ // To make sure we are not cheating somehow, we can now encode the Rupee
+ // symbol, which we could not do before.
+ EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
+}
+
+// Same as above test, for hin instead of eng.
+TEST(LangModelTest, AddACharacterHindi) {
+ constexpr char kTestString[] = "हिन्दी में एक लाइन लिखें";
+ constexpr char kTestStringRupees[] = "हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००";
+ // Setup the arguments.
+ std::string script_dir = LANGDATA_DIR;
+ std::string hin_dir = file::JoinPath(script_dir, "hin");
+ std::string unicharset_path = TestDataNameToPath("hin_beam.unicharset");
+ UNICHARSET unicharset;
+ EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
+ std::string version_str = "TestVersion";
+ file::MakeTmpdir();
+ std::string output_dir = FLAGS_test_tmpdir;
+ LOG(INFO) << "Output dir=" << output_dir << "\n";
+ std::string lang1 = "hin";
+ bool pass_through_recoder = false;
+ std::vector<STRING> words, puncs, numbers;
+ // If these reads fail, we get a warning message and an empty list of words.
+ ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr)
+ .split('\n', &words);
+ EXPECT_GT(words.size(), 0);
+ ReadFile(file::JoinPath(hin_dir, "hin.punc"), nullptr).split('\n', &puncs);
+ EXPECT_GT(puncs.size(), 0);
+ ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr)
+ .split('\n', &numbers);
+ EXPECT_GT(numbers.size(), 0);
+ bool lang_is_rtl = false;
+ // Generate the traineddata file.
+ EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang1, pass_through_recoder, words, puncs,
+ numbers, lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata1 =
+ file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
+ LSTMTrainer trainer1;
+ trainer1.InitCharSet(traineddata1);
+ std::vector<int> labels1;
+ EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
+ STRING test1_decoded = trainer1.DecodeLabels(labels1);
+ std::string test1_str(&test1_decoded[0], test1_decoded.length());
+ LOG(INFO) << "Labels1=" << test1_str << "\n";
+
+ // Add a new character to the unicharset and try again.
+ int size_before = unicharset.size();
+ unicharset.unichar_insert("₹");
+ SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
+ &unicharset);
+ EXPECT_EQ(size_before + 1, unicharset.size());
+ // Generate the traineddata file.
+ std::string lang2 = "extendedhin";
+ EXPECT_EQ(EXIT_SUCCESS,
+ CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang2, pass_through_recoder, words, puncs, numbers,
+ lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata2 =
+ file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
+ LSTMTrainer trainer2;
+ trainer2.InitCharSet(traineddata2);
+ std::vector<int> labels2;
+ EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
+ STRING test2_decoded = trainer2.DecodeLabels(labels2);
+ std::string test2_str(&test2_decoded[0], test2_decoded.length());
+ LOG(INFO) << "Labels2=" << test2_str << "\n";
+ // Encode kTestStringRupees.
+ std::vector<int> labels3;
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
+ STRING test3_decoded = trainer2.DecodeLabels(labels3);
+ std::string test3_str(&test3_decoded[0], test3_decoded.length());
+ LOG(INFO) << "labels3=" << test3_str << "\n";
+ // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
+ // Since TensorFlow's CTC implementation insists on having the null be the
+ // last label, and we want to be compatible, null has to be renumbered when
+ // we add a class.
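+ // (Concretely: exactly one character was inserted, so the checks below
+ // expect null2 == null1 + 1, and every occurrence of null1 in labels1 is
+ // rewritten to null2.)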
+ int null1 = trainer1.null_char();
+ int null2 = trainer2.null_char();
+ EXPECT_EQ(null1 + 1, null2);
+ std::vector<int> labels1_v(labels1.size());
+ for (int i = 0; i < labels1.size(); ++i) {
+ if (labels1[i] == null1)
+ labels1_v[i] = null2;
+ else
+ labels1_v[i] = labels1[i];
+ }
+ EXPECT_THAT(labels1_v,
+ testing::ElementsAreArray(&labels2[0], labels2.size()));
+ // To make sure we are not cheating somehow, we can now encode the Rupee
+ // symbol, which we could not do before.
+ EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
+}
+
+} // namespace tesseract diff --git a/tesseract/unittest/layout_test.cc b/tesseract/unittest/layout_test.cc new file mode 100644 index 00000000..8a20c908 --- /dev/null +++ b/tesseract/unittest/layout_test.cc @@ -0,0 +1,234 @@ +// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+
+#include "include_gunit.h"
+
+#include "allheaders.h"
+#include <tesseract/baseapi.h>
+#include "coutln.h"
+#include "log.h" // for LOG
+#include "mutableiterator.h"
+#include "ocrblock.h" // for class BLOCK
+#include "pageres.h"
+#include "polyblk.h"
+#include <tesseract/resultiterator.h>
+#include "stepblob.h"
+
+namespace tesseract {
+
+/** String name for each block type. Keep in sync with PolyBlockType. */
+static const char* kPolyBlockNames[] = {
+ "Unknown",
+ "Flowing Text",
+ "Heading Text",
+ "Pullout Text",
+ "Equation",
+ "Inline Equation",
+ "Table",
+ "Vertical Text",
+ "Caption Text",
+ "Flowing Image",
+ "Heading Image",
+ "Pullout Image",
+ "Horizontal Line",
+ "Vertical Line",
+ "Noise",
+ "" // End marker for testing that sizes match.
+};
+
+const char* kStrings8087_054[] = {
+ "dat", "Dalmatian", "", "DAMAGED DURING", "margarine,", nullptr};
+const PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT,
+ PT_PULLOUT_IMAGE, PT_CAPTION_TEXT,
+ PT_FLOWING_TEXT};
+
+// The fixture for testing Tesseract.
+class LayoutTest : public testing::Test {
+ protected:
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR, "/" + name);
+ }
+ std::string TessdataPath() {
+ return file::JoinPath(TESSDATA_DIR, "");
+ }
+
+ LayoutTest() { src_pix_ = nullptr; }
+ ~LayoutTest() { pixDestroy(&src_pix_); }
+
+ void SetImage(const char* filename, const char* lang) {
+ pixDestroy(&src_pix_);
+ src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
+ api_.Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY);
+ api_.SetPageSegMode(tesseract::PSM_AUTO);
+ api_.SetImage(src_pix_);
+ }
+
+ // Tests reading order and block finding (very roughly) by iterating
+ // over the blocks, expecting that they contain the strings in order,
+ // allowing for other blocks in between.
+ // An empty string should match an image block, and a nullptr string
+ // indicates the end of the array.
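+ // For example, kStrings8087_054/kBlocks8087_054 above pair "dat" with
+ // PT_HEADING_TEXT and the empty string with PT_PULLOUT_IMAGE, so that image
+ // block is matched on type alone.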
+ void VerifyBlockTextOrder(const char* strings[], const PolyBlockType* blocks, + ResultIterator* it) { + it->Begin(); + int string_index = 0; + int block_index = 0; + do { + char* block_text = it->GetUTF8Text(tesseract::RIL_BLOCK); + if (block_text != nullptr && it->BlockType() == blocks[string_index] && + strstr(block_text, strings[string_index]) != nullptr) { + LOG(INFO) << "Found string " << strings[string_index] + << " in block " << block_index + << " of type " << kPolyBlockNames[blocks[string_index]] << "\n"; + // Found this one. + ++string_index; + } else if (it->BlockType() == blocks[string_index] && + block_text == nullptr && strings[string_index][0] == '\0') { + LOG(INFO) << "Found block of type " << kPolyBlockNames[blocks[string_index]] + << " at block " << block_index << "\n"; + // Found this one. + ++string_index; + } else { + LOG(INFO) << "No match found in block with text:\n" << block_text; + } + delete[] block_text; + ++block_index; + if (strings[string_index] == nullptr) break; + } while (it->Next(tesseract::RIL_BLOCK)); + EXPECT_TRUE(strings[string_index] == nullptr); + } + + // Tests that approximate order of the biggest text blocks is correct. + // Correctness is tested by the following simple rules: + // If a block overlaps its predecessor in x, then it must be below it. + // otherwise, if the block is not below its predecessor, then it must + // be to the left of it if right_to_left is true, or to the right otherwise. + void VerifyRoughBlockOrder(bool right_to_left, ResultIterator* it) { + int prev_left = 0; + int prev_right = 0; + int prev_bottom = 0; + it->Begin(); + do { + int left, top, right, bottom; + if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) && + PTIsTextType(it->BlockType()) && right - left > 800 && + bottom - top > 200) { + if (prev_right > prev_left) { + if (std::min(right, prev_right) > std::max(left, prev_left)) { + EXPECT_GE(top, prev_bottom) << "Overlapping block should be below"; + } else if (top < prev_bottom) { + if (right_to_left) { + EXPECT_GE(prev_left, right) << "Block should be to the left"; + } else { + EXPECT_GE(left, prev_right) << "Block should be to the right"; + } + } + } + prev_left = left; + prev_right = right; + prev_bottom = bottom; + } + } while (it->Next(tesseract::RIL_BLOCK)); + } + + // Tests that every blob assigned to the biggest text blocks is contained + // fully within its block by testing that the block polygon winds around + // the center of the bounding boxes of the outlines in the blob. + void VerifyTotalContainment(int winding_target, MutableIterator* it) { + it->Begin(); + do { + int left, top, right, bottom; + if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) && + PTIsTextType(it->BlockType()) && right - left > 800 && + bottom - top > 200) { + const PAGE_RES_IT* pr_it = it->PageResIt(); + POLY_BLOCK* pb = pr_it->block()->block->pdblk.poly_block(); + CHECK(pb != nullptr); + FCOORD skew = pr_it->block()->block->skew(); + EXPECT_GT(skew.x(), 0.0f); + EXPECT_GT(skew.y(), 0.0f); + // Iterate the words in the block. + MutableIterator word_it = *it; + do { + const PAGE_RES_IT* w_it = word_it.PageResIt(); + // Iterate the blobs in the word. + C_BLOB_IT b_it(w_it->word()->word->cblob_list()); + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + C_BLOB* blob = b_it.data(); + // Iterate the outlines in the blob. 
+ C_OUTLINE_IT ol_it(blob->out_list());
+ for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
+ C_OUTLINE* ol = ol_it.data();
+ TBOX box = ol->bounding_box();
+ ICOORD middle((box.left() + box.right()) / 2,
+ (box.top() + box.bottom()) / 2);
+ EXPECT_EQ(winding_target, pb->winding_number(middle));
+ }
+ }
+ } while (word_it.Next(tesseract::RIL_WORD) &&
+ !word_it.IsAtBeginningOf(tesseract::RIL_BLOCK));
+ }
+ } while (it->Next(tesseract::RIL_BLOCK));
+ }
+
+ Pix* src_pix_;
+ std::string ocr_text_;
+ tesseract::TessBaseAPI api_;
+};
+
+// Tests that array sizes match their intended size.
+TEST_F(LayoutTest, ArraySizeTest) {
+ int size = 0;
+ for (size = 0; kPolyBlockNames[size][0] != '\0'; ++size)
+ ;
+ EXPECT_EQ(size, PT_COUNT);
+}
+
+// Tests that Tesseract gets the important blocks and in the right order
+// on a UNLV page numbered 8087_054.3B.tif. (Dubrovnik)
+TEST_F(LayoutTest, UNLV8087_054) {
+ SetImage("8087_054.3B.tif", "eng");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ // Check iterator position.
+ tesseract::ResultIterator* it = api_.GetIterator();
+ VerifyBlockTextOrder(kStrings8087_054, kBlocks8087_054, it);
+ delete it;
+}
+
+// Tests that Tesseract gets the important blocks and in the right order
+// on GOOGLE:13510798882202548:74:84.sj-79.tif (Hebrew image)
+// TODO: replace hebrew.png by Google image referred above
+TEST_F(LayoutTest, HebrewOrderingAndSkew) {
+ SetImage("hebrew.png", "eng");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ tesseract::MutableIterator* it = api_.GetMutableIterator();
+ // In eng mode, block order should not be RTL.
+ VerifyRoughBlockOrder(false, it);
+ VerifyTotalContainment(1, it);
+ delete it;
+ // Now try again using Hebrew.
+ SetImage("hebrew.png", "heb");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ it = api_.GetMutableIterator();
+ // In heb mode, block order should be RTL.
+ VerifyRoughBlockOrder(true, it);
+ // And blobs should still be fully contained.
+ VerifyTotalContainment(-1, it);
+ delete it;
+}
+
+} // namespace diff --git a/tesseract/unittest/ligature_table_test.cc b/tesseract/unittest/ligature_table_test.cc new file mode 100644 index 00000000..0047f857 --- /dev/null +++ b/tesseract/unittest/ligature_table_test.cc @@ -0,0 +1,111 @@ +// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "commandlineflags.h"
+#include "fileio.h"
+#include "include_gunit.h"
+#include "ligature_table.h"
+#include "pango_font_info.h"
+
+namespace tesseract {
+
+const char kEngNonLigatureText[] = "fidelity effigy ſteep";
+// Same as above text, but with "fi" in the first word and "ffi" in the second
+// word replaced with their respective ligatures.
+const char kEngLigatureText[] = "ﬁdelity eﬃgy ſteep";
+// Same as kEngNonLigatureText but with "fi" in both words replaced with their
+// ligature. The test Verdana font does not support the "ffi" or "ſt" ligatures.
+const char kRenderableEngLigatureText[] = "ﬁdelity efﬁgy ſteep";
+
+static PangoFontMap* font_map;
+
+class LigatureTableTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ lig_table_ = LigatureTable::Get();
+ if (!font_map) {
+ font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
+ }
+ pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
+ }
+
+ static void SetUpTestCase() {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+
+ FLAGS_fonts_dir = TESTING_DIR;
+ FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
+ file::MakeTmpdir();
+ PangoFontInfo::SoftInitFontConfig(); // init early
+ }
+ LigatureTable* lig_table_;
+};
+
+TEST_F(LigatureTableTest, DoesFillLigatureTables) {
+ EXPECT_GT(lig_table_->norm_to_lig_table().size(), 0);
+ EXPECT_GT(lig_table_->lig_to_norm_table().size(), 0);
+}
+
+TEST_F(LigatureTableTest, DoesAddLigatures) {
+ EXPECT_STREQ(kEngLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesAddLigaturesWithSupportedFont) {
+ PangoFontInfo font;
+ EXPECT_TRUE(font.ParseFontDescriptionName("Verdana"));
+printf("1:%s\n", kRenderableEngLigatureText);
+printf("2:%s\n", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+ EXPECT_STREQ(kRenderableEngLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesNotAddLigaturesWithUnsupportedFont) {
+ PangoFontInfo font;
+ EXPECT_TRUE(font.ParseFontDescriptionName("Lohit Hindi"));
+ EXPECT_STREQ(kEngNonLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesRemoveLigatures) {
+ EXPECT_STREQ(kEngNonLigatureText,
+ lig_table_->RemoveLigatures(kEngLigatureText).c_str());
+}
+
+TEST_F(LigatureTableTest, TestCustomLigatures) {
+ const char* kTestCases[] = {
+ "act", "a\uE003", "publiſh", "publi\uE006", "ſince",
+ "\uE007nce", "aſleep", "a\uE008eep", "neceſſary", "nece\uE009ary",
+ };
+ for (size_t i = 0; i < ARRAYSIZE(kTestCases); i += 2) {
+ EXPECT_STREQ(kTestCases[i + 1],
+ lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
+ EXPECT_STREQ(kTestCases[i],
+ lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str());
+ EXPECT_STREQ(kTestCases[i],
+ lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
+ }
+}
+
+TEST_F(LigatureTableTest, TestRemovesCustomLigatures) {
+ const char* kTestCases[] = {
+ "fiction",
+ "fi\uE003ion",
+ "fiction",
+ };
+ for (size_t i = 0; i < ARRAYSIZE(kTestCases); i += 3) {
+ EXPECT_STREQ(kTestCases[i + 1],
+ lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
+ EXPECT_STREQ(kTestCases[i + 2],
+ lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
+ }
+}
+} // namespace diff --git a/tesseract/unittest/linlsq_test.cc b/tesseract/unittest/linlsq_test.cc new file mode 100644 index 00000000..2ca0ea9e --- /dev/null +++ b/tesseract/unittest/linlsq_test.cc @@ -0,0 +1,118 @@ +// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "linlsq.h" + +#include "include_gunit.h" + +namespace tesseract { + +class LLSQTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + void TearDown() {} + + void ExpectCorrectLine(const LLSQ& llsq, double m, double c, double rms, + double pearson, double tolerance) { + EXPECT_NEAR(m, llsq.m(), tolerance); + EXPECT_NEAR(c, llsq.c(llsq.m()), tolerance); + EXPECT_NEAR(rms, llsq.rms(llsq.m(), llsq.c(llsq.m())), tolerance); + EXPECT_NEAR(pearson, llsq.pearson(), tolerance); + } + FCOORD PtsMean(const std::vector<FCOORD>& pts) { + FCOORD total(0, 0); + for (const auto& p : pts) { + total += p; + } + return (pts.size() > 0) ? total / pts.size() : total; + } + void VerifyRmsOrth(const std::vector<FCOORD>& pts, const FCOORD& orth) { + LLSQ llsq; + FCOORD xavg = PtsMean(pts); + FCOORD nvec = !orth; + nvec.normalise(); + double expected_answer = 0; + for (const auto& p : pts) { + llsq.add(p.x(), p.y()); + double dot = nvec % (p - xavg); + expected_answer += dot * dot; + } + expected_answer /= pts.size(); + expected_answer = sqrt(expected_answer); + EXPECT_NEAR(expected_answer, llsq.rms_orth(orth), 0.0001); + } + void ExpectCorrectVector(const LLSQ& llsq, FCOORD correct_mean_pt, + FCOORD correct_vector, float tolerance) { + FCOORD mean_pt = llsq.mean_point(); + FCOORD vector = llsq.vector_fit(); + EXPECT_NEAR(correct_mean_pt.x(), mean_pt.x(), tolerance); + EXPECT_NEAR(correct_mean_pt.y(), mean_pt.y(), tolerance); + EXPECT_NEAR(correct_vector.x(), vector.x(), tolerance); + EXPECT_NEAR(correct_vector.y(), vector.y(), tolerance); + } +}; + +// Tests a simple baseline-style normalization. +TEST_F(LLSQTest, BasicLines) { + LLSQ llsq; + llsq.add(1.0, 1.0); + llsq.add(2.0, 2.0); + ExpectCorrectLine(llsq, 1.0, 0.0, 0.0, 1.0, 1e-6); + float half_root_2 = sqrt(2.0) / 2.0f; + ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f), + FCOORD(half_root_2, half_root_2), 1e-6); + llsq.remove(2.0, 2.0); + llsq.add(1.0, 2.0); + llsq.add(10.0, 1.0); + llsq.add(-8.0, 1.0); + // The point at 1,2 pulls the result away from what would otherwise be a + // perfect fit to a horizontal line by 0.25 unit, with rms error of 0.433. + ExpectCorrectLine(llsq, 0.0, 1.25, 0.433, 0.0, 1e-2); + ExpectCorrectVector(llsq, FCOORD(1.0f, 1.25f), FCOORD(1.0f, 0.0f), 1e-3); + llsq.add(1.0, 2.0, 10.0); + // With a heavy weight, the point at 1,2 pulls the line nearer. + ExpectCorrectLine(llsq, 0.0, 1.786, 0.41, 0.0, 1e-2); + ExpectCorrectVector(llsq, FCOORD(1.0f, 1.786f), FCOORD(1.0f, 0.0f), 1e-3); +} + +// Tests a simple baseline-style normalization with a rotation. 
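+// (Roughly: the points added below are symmetric about x = 1, so the fitted
+// mean point should stay near (1, 0) and the fitted direction near the
+// vertical unit vector (0, 1), which is what ExpectCorrectVector checks.)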
+TEST_F(LLSQTest, Vectors) { + LLSQ llsq; + llsq.add(1.0, 1.0); + llsq.add(1.0, -1.0); + ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-6); + llsq.add(0.9, -2.0); + llsq.add(1.1, -3.0); + llsq.add(0.9, 2.0); + llsq.add(1.10001, 3.0); + ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-3); +} + +// Verify that rms_orth() actually calculates: +// sqrt( sum (!nvec * (x_i - x_avg))^2 / n) +TEST_F(LLSQTest, RmsOrthWorksAsIntended) { + std::vector<FCOORD> pts; + pts.push_back(FCOORD(0.56, 0.95)); + pts.push_back(FCOORD(0.09, 0.09)); + pts.push_back(FCOORD(0.13, 0.77)); + pts.push_back(FCOORD(0.16, 0.83)); + pts.push_back(FCOORD(0.45, 0.79)); + VerifyRmsOrth(pts, FCOORD(1, 0)); + VerifyRmsOrth(pts, FCOORD(1, 1)); + VerifyRmsOrth(pts, FCOORD(1, 2)); + VerifyRmsOrth(pts, FCOORD(2, 1)); +} + +} // namespace. diff --git a/tesseract/unittest/list_test.cc b/tesseract/unittest/list_test.cc new file mode 100644 index 00000000..e6a2bf1d --- /dev/null +++ b/tesseract/unittest/list_test.cc @@ -0,0 +1,68 @@ +// (C) Copyright 2020, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#if 0 // TODO: add tests for CLIST +#include "clst.h" +#endif +#include "elst.h" +#if 0 // TODO: add tests for ELIST2 +#include "elst2.h" +#endif + +namespace tesseract { + +class ListTest : public ::testing::Test { + protected: + void SetUp() override { + static std::locale system_locale(""); + std::locale::global(system_locale); + } +}; + +class Elst : public ELIST_LINK { + public: + Elst(unsigned n) : value(n) { + } + unsigned value; +}; + +ELISTIZEH(Elst) +ELISTIZE(Elst) + +TEST_F(ListTest, TestELIST) { + Elst_LIST list; + auto it = ELIST_ITERATOR(&list); + for (unsigned i = 0; i < 10; i++) { + auto* elst = new Elst(i); + //EXPECT_TRUE(elst->empty()); + //EXPECT_EQ(elst->length(), 0); + it.add_to_end(elst); + } + it.move_to_first(); + unsigned n = 0; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + auto* elst = reinterpret_cast<Elst*>(it.data()); + EXPECT_EQ(elst->value, n); + n++; + } + it.forward(); + n++; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + auto* elst = reinterpret_cast<Elst*>(it.extract()); + EXPECT_EQ(elst->value, n % 10); + n++; + delete elst; + } + // TODO: add more tests for ELIST +} + +} // namespace tesseract. diff --git a/tesseract/unittest/loadlang_test.cc b/tesseract/unittest/loadlang_test.cc new file mode 100644 index 00000000..ba7a9f6d --- /dev/null +++ b/tesseract/unittest/loadlang_test.cc @@ -0,0 +1,251 @@ +/////////////////////////////////////////////////////////////////////// +// File: loadlang_test.cc +// Description: Test loading of All languages and Scripts for Tesseract. +// Tests for All languages and scripts are Disabled by default. +// Force the disabled test to run if required by using the +// --gtest_also_run_disabled_tests argument. 
Author: Shree Devi Kumar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include <memory> // std::unique_ptr +#include <time.h> +#include <tesseract/baseapi.h> +#include "include_gunit.h" + +namespace tesseract { + +class QuickTest : public testing::Test { + protected: + virtual void SetUp() { start_time_ = time(nullptr); } + virtual void TearDown() { + const time_t end_time = time(nullptr); + EXPECT_TRUE(end_time - start_time_ <= 25) + << "The test took too long - " + << ::testing::PrintToString(end_time - start_time_); + } + time_t start_time_; +}; + +void LangLoader(const char* lang, const char* tessdatadir) { + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract for $lang."; + api->End(); +} + +// For all languages + +class LoadLanguage : public QuickTest, + public ::testing::WithParamInterface<const char*> {}; + +TEST_P(LoadLanguage, afr) { LangLoader("afr", GetParam()); } +TEST_P(LoadLanguage, amh) { LangLoader("amh", GetParam()); } +TEST_P(LoadLanguage, ara) { LangLoader("ara", GetParam()); } +TEST_P(LoadLanguage, asm) { LangLoader("asm", GetParam()); } +TEST_P(LoadLanguage, aze) { LangLoader("aze", GetParam()); } +TEST_P(LoadLanguage, aze_cyrl) { LangLoader("aze_cyrl", GetParam()); } +TEST_P(LoadLanguage, bel) { LangLoader("bel", GetParam()); } +TEST_P(LoadLanguage, ben) { LangLoader("ben", GetParam()); } +TEST_P(LoadLanguage, bod) { LangLoader("bod", GetParam()); } +TEST_P(LoadLanguage, bos) { LangLoader("bos", GetParam()); } +TEST_P(LoadLanguage, bre) { LangLoader("bre", GetParam()); } +TEST_P(LoadLanguage, bul) { LangLoader("bul", GetParam()); } +TEST_P(LoadLanguage, cat) { LangLoader("cat", GetParam()); } +TEST_P(LoadLanguage, ceb) { LangLoader("ceb", GetParam()); } +TEST_P(LoadLanguage, ces) { LangLoader("ces", GetParam()); } +TEST_P(LoadLanguage, chi_sim) { LangLoader("chi_sim", GetParam()); } +TEST_P(LoadLanguage, chi_sim_vert) { LangLoader("chi_sim_vert", GetParam()); } +TEST_P(LoadLanguage, chi_tra) { LangLoader("chi_tra", GetParam()); } +TEST_P(LoadLanguage, chi_tra_vert) { LangLoader("chi_tra_vert", GetParam()); } +TEST_P(LoadLanguage, chr) { LangLoader("chr", GetParam()); } +TEST_P(LoadLanguage, cos) { LangLoader("cos", GetParam()); } +TEST_P(LoadLanguage, cym) { LangLoader("cym", GetParam()); } +TEST_P(LoadLanguage, dan) { LangLoader("dan", GetParam()); } +TEST_P(LoadLanguage, deu) { LangLoader("deu", GetParam()); } +TEST_P(LoadLanguage, div) { LangLoader("div", GetParam()); } +TEST_P(LoadLanguage, dzo) { LangLoader("dzo", GetParam()); } +TEST_P(LoadLanguage, ell) { LangLoader("ell", GetParam()); } +TEST_P(LoadLanguage, eng) { LangLoader("eng", GetParam()); } +TEST_P(LoadLanguage, enm) { LangLoader("enm", GetParam()); } +TEST_P(LoadLanguage, epo) { LangLoader("epo", GetParam()); } +TEST_P(LoadLanguage, est) { LangLoader("est", GetParam()); } +TEST_P(LoadLanguage, eus) { LangLoader("eus", 
GetParam()); } +TEST_P(LoadLanguage, fao) { LangLoader("fao", GetParam()); } +TEST_P(LoadLanguage, fas) { LangLoader("fas", GetParam()); } +TEST_P(LoadLanguage, fil) { LangLoader("fil", GetParam()); } +TEST_P(LoadLanguage, fin) { LangLoader("fin", GetParam()); } +TEST_P(LoadLanguage, fra) { LangLoader("fra", GetParam()); } +TEST_P(LoadLanguage, frk) { LangLoader("frk", GetParam()); } +TEST_P(LoadLanguage, frm) { LangLoader("frm", GetParam()); } +TEST_P(LoadLanguage, fry) { LangLoader("fry", GetParam()); } +TEST_P(LoadLanguage, gla) { LangLoader("gla", GetParam()); } +TEST_P(LoadLanguage, gle) { LangLoader("gle", GetParam()); } +TEST_P(LoadLanguage, glg) { LangLoader("glg", GetParam()); } +TEST_P(LoadLanguage, grc) { LangLoader("grc", GetParam()); } +TEST_P(LoadLanguage, guj) { LangLoader("guj", GetParam()); } +TEST_P(LoadLanguage, hat) { LangLoader("hat", GetParam()); } +TEST_P(LoadLanguage, heb) { LangLoader("heb", GetParam()); } +TEST_P(LoadLanguage, hin) { LangLoader("hin", GetParam()); } +TEST_P(LoadLanguage, hrv) { LangLoader("hrv", GetParam()); } +TEST_P(LoadLanguage, hun) { LangLoader("hun", GetParam()); } +TEST_P(LoadLanguage, hye) { LangLoader("hye", GetParam()); } +TEST_P(LoadLanguage, iku) { LangLoader("iku", GetParam()); } +TEST_P(LoadLanguage, ind) { LangLoader("ind", GetParam()); } +TEST_P(LoadLanguage, isl) { LangLoader("isl", GetParam()); } +TEST_P(LoadLanguage, ita) { LangLoader("ita", GetParam()); } +TEST_P(LoadLanguage, ita_old) { LangLoader("ita_old", GetParam()); } +TEST_P(LoadLanguage, jav) { LangLoader("jav", GetParam()); } +TEST_P(LoadLanguage, jpn) { LangLoader("jpn", GetParam()); } +TEST_P(LoadLanguage, jpn_vert) { LangLoader("jpn_vert", GetParam()); } +TEST_P(LoadLanguage, kan) { LangLoader("kan", GetParam()); } +TEST_P(LoadLanguage, kat) { LangLoader("kat", GetParam()); } +TEST_P(LoadLanguage, kat_old) { LangLoader("kat_old", GetParam()); } +TEST_P(LoadLanguage, kaz) { LangLoader("kaz", GetParam()); } +TEST_P(LoadLanguage, khm) { LangLoader("khm", GetParam()); } +TEST_P(LoadLanguage, kir) { LangLoader("kir", GetParam()); } +// TEST_P(LoadLanguage, kmr) {LangLoader("kmr" , GetParam());} +TEST_P(LoadLanguage, kor) { LangLoader("kor", GetParam()); } +TEST_P(LoadLanguage, kor_vert) { LangLoader("kor_vert", GetParam()); } +TEST_P(LoadLanguage, lao) { LangLoader("lao", GetParam()); } +TEST_P(LoadLanguage, lat) { LangLoader("lat", GetParam()); } +TEST_P(LoadLanguage, lav) { LangLoader("lav", GetParam()); } +TEST_P(LoadLanguage, lit) { LangLoader("lit", GetParam()); } +TEST_P(LoadLanguage, ltz) { LangLoader("ltz", GetParam()); } +TEST_P(LoadLanguage, mal) { LangLoader("mal", GetParam()); } +TEST_P(LoadLanguage, mar) { LangLoader("mar", GetParam()); } +TEST_P(LoadLanguage, mkd) { LangLoader("mkd", GetParam()); } +TEST_P(LoadLanguage, mlt) { LangLoader("mlt", GetParam()); } +TEST_P(LoadLanguage, mon) { LangLoader("mon", GetParam()); } +TEST_P(LoadLanguage, mri) { LangLoader("mri", GetParam()); } +TEST_P(LoadLanguage, msa) { LangLoader("msa", GetParam()); } +TEST_P(LoadLanguage, mya) { LangLoader("mya", GetParam()); } +TEST_P(LoadLanguage, nep) { LangLoader("nep", GetParam()); } +TEST_P(LoadLanguage, nld) { LangLoader("nld", GetParam()); } +TEST_P(LoadLanguage, nor) { LangLoader("nor", GetParam()); } +TEST_P(LoadLanguage, oci) { LangLoader("oci", GetParam()); } +TEST_P(LoadLanguage, ori) { LangLoader("ori", GetParam()); } +TEST_P(LoadLanguage, osd) { LangLoader("osd", GetParam()); } +TEST_P(LoadLanguage, pan) { LangLoader("pan", GetParam()); } +TEST_P(LoadLanguage, pol) { 
LangLoader("pol", GetParam()); } +TEST_P(LoadLanguage, por) { LangLoader("por", GetParam()); } +TEST_P(LoadLanguage, pus) { LangLoader("pus", GetParam()); } +TEST_P(LoadLanguage, que) { LangLoader("que", GetParam()); } +TEST_P(LoadLanguage, ron) { LangLoader("ron", GetParam()); } +TEST_P(LoadLanguage, rus) { LangLoader("rus", GetParam()); } +TEST_P(LoadLanguage, san) { LangLoader("san", GetParam()); } +TEST_P(LoadLanguage, sin) { LangLoader("sin", GetParam()); } +TEST_P(LoadLanguage, slk) { LangLoader("slk", GetParam()); } +TEST_P(LoadLanguage, slv) { LangLoader("slv", GetParam()); } +TEST_P(LoadLanguage, snd) { LangLoader("snd", GetParam()); } +TEST_P(LoadLanguage, spa) { LangLoader("spa", GetParam()); } +TEST_P(LoadLanguage, spa_old) { LangLoader("spa_old", GetParam()); } +TEST_P(LoadLanguage, sqi) { LangLoader("sqi", GetParam()); } +TEST_P(LoadLanguage, srp) { LangLoader("srp", GetParam()); } +TEST_P(LoadLanguage, srp_latn) { LangLoader("srp_latn", GetParam()); } +TEST_P(LoadLanguage, sun) { LangLoader("sun", GetParam()); } +TEST_P(LoadLanguage, swa) { LangLoader("swa", GetParam()); } +TEST_P(LoadLanguage, swe) { LangLoader("swe", GetParam()); } +TEST_P(LoadLanguage, syr) { LangLoader("syr", GetParam()); } +TEST_P(LoadLanguage, tam) { LangLoader("tam", GetParam()); } +TEST_P(LoadLanguage, tat) { LangLoader("tat", GetParam()); } +TEST_P(LoadLanguage, tel) { LangLoader("tel", GetParam()); } +TEST_P(LoadLanguage, tgk) { LangLoader("tgk", GetParam()); } +TEST_P(LoadLanguage, tha) { LangLoader("tha", GetParam()); } +TEST_P(LoadLanguage, tir) { LangLoader("tir", GetParam()); } +TEST_P(LoadLanguage, ton) { LangLoader("ton", GetParam()); } +TEST_P(LoadLanguage, tur) { LangLoader("tur", GetParam()); } +TEST_P(LoadLanguage, uig) { LangLoader("uig", GetParam()); } +TEST_P(LoadLanguage, ukr) { LangLoader("ukr", GetParam()); } +TEST_P(LoadLanguage, urd) { LangLoader("urd", GetParam()); } +TEST_P(LoadLanguage, uzb) { LangLoader("uzb", GetParam()); } +TEST_P(LoadLanguage, uzb_cyrl) { LangLoader("uzb_cyrl", GetParam()); } +TEST_P(LoadLanguage, vie) { LangLoader("vie", GetParam()); } +TEST_P(LoadLanguage, yid) { LangLoader("yid", GetParam()); } +TEST_P(LoadLanguage, yor) { LangLoader("yor", GetParam()); } + +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadLanguage, + ::testing::Values(TESSDATA_DIR "_fast")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadLanguage, + ::testing::Values(TESSDATA_DIR "_best")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage, + ::testing::Values(TESSDATA_DIR)); + +// For all scripts + +class LoadScript : public QuickTest, + public ::testing::WithParamInterface<const char*> {}; + +TEST_P(LoadScript, Arabic) { LangLoader("script/Arabic", GetParam()); } +TEST_P(LoadScript, Armenian) { LangLoader("script/Armenian", GetParam()); } +TEST_P(LoadScript, Bengali) { LangLoader("script/Bengali", GetParam()); } +TEST_P(LoadScript, Canadian_Aboriginal) { + LangLoader("script/Canadian_Aboriginal", GetParam()); +} +TEST_P(LoadScript, Cherokee) { LangLoader("script/Cherokee", GetParam()); } +TEST_P(LoadScript, Cyrillic) { LangLoader("script/Cyrillic", GetParam()); } +TEST_P(LoadScript, Devanagari) { LangLoader("script/Devanagari", GetParam()); } +TEST_P(LoadScript, Ethiopic) { LangLoader("script/Ethiopic", GetParam()); } +TEST_P(LoadScript, Fraktur) { LangLoader("script/Fraktur", GetParam()); } +TEST_P(LoadScript, Georgian) { LangLoader("script/Georgian", GetParam()); } +TEST_P(LoadScript, Greek) { LangLoader("script/Greek", GetParam()); } +TEST_P(LoadScript, 
Gujarati) { LangLoader("script/Gujarati", GetParam()); } +TEST_P(LoadScript, Gurmukhi) { LangLoader("script/Gurmukhi", GetParam()); } +TEST_P(LoadScript, HanS) { LangLoader("script/HanS", GetParam()); } +TEST_P(LoadScript, HanS_vert) { LangLoader("script/HanS_vert", GetParam()); } +TEST_P(LoadScript, HanT) { LangLoader("script/HanT", GetParam()); } +TEST_P(LoadScript, HanT_vert) { LangLoader("script/HanT_vert", GetParam()); } +TEST_P(LoadScript, Hangul) { LangLoader("script/Hangul", GetParam()); } +TEST_P(LoadScript, Hangul_vert) { + LangLoader("script/Hangul_vert", GetParam()); +} +TEST_P(LoadScript, Hebrew) { LangLoader("script/Hebrew", GetParam()); } +TEST_P(LoadScript, Japanese) { LangLoader("script/Japanese", GetParam()); } +TEST_P(LoadScript, Japanese_vert) { + LangLoader("script/Japanese_vert", GetParam()); +} +TEST_P(LoadScript, Kannada) { LangLoader("script/Kannada", GetParam()); } +TEST_P(LoadScript, Khmer) { LangLoader("script/Khmer", GetParam()); } +TEST_P(LoadScript, Lao) { LangLoader("script/Lao", GetParam()); } +TEST_P(LoadScript, Latin) { LangLoader("script/Latin", GetParam()); } +TEST_P(LoadScript, Malayalam) { LangLoader("script/Malayalam", GetParam()); } +TEST_P(LoadScript, Myanmar) { LangLoader("script/Myanmar", GetParam()); } +TEST_P(LoadScript, Oriya) { LangLoader("script/Oriya", GetParam()); } +TEST_P(LoadScript, Sinhala) { LangLoader("script/Sinhala", GetParam()); } +TEST_P(LoadScript, Syriac) { LangLoader("script/Syriac", GetParam()); } +TEST_P(LoadScript, Tamil) { LangLoader("script/Tamil", GetParam()); } +TEST_P(LoadScript, Telugu) { LangLoader("script/Telugu", GetParam()); } +TEST_P(LoadScript, Thaana) { LangLoader("script/Thaana", GetParam()); } +TEST_P(LoadScript, Thai) { LangLoader("script/Thai", GetParam()); } +TEST_P(LoadScript, Tibetan) { LangLoader("script/Tibetan", GetParam()); } +TEST_P(LoadScript, Vietnamese) { LangLoader("script/Vietnamese", GetParam()); } + +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadScript, + ::testing::Values(TESSDATA_DIR "_fast")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadScript, + ::testing::Values(TESSDATA_DIR "_best")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript, + ::testing::Values(TESSDATA_DIR)); + +class LoadLang : public QuickTest {}; + +// Test Load of English here, as the parameterized tests are disabled by +// default. +TEST_F(LoadLang, engFast) { LangLoader("eng", TESSDATA_DIR "_fast"); } +TEST_F(LoadLang, engBest) { LangLoader("eng", TESSDATA_DIR "_best"); } +TEST_F(LoadLang, engBestInt) { LangLoader("eng", TESSDATA_DIR); } + +// Use class LoadLang for languages which are NOT there in all three repos +TEST_F(LoadLang, kmrFast) { LangLoader("kmr", TESSDATA_DIR "_fast"); } +TEST_F(LoadLang, kmrBest) { LangLoader("kmr", TESSDATA_DIR "_best"); } +// TEST_F(LoadLang, kmrBestInt) {LangLoader("kmr" , TESSDATA_DIR);} + +} // namespace diff --git a/tesseract/unittest/log.h b/tesseract/unittest/log.h new file mode 100644 index 00000000..0b21f3ee --- /dev/null +++ b/tesseract/unittest/log.h @@ -0,0 +1,67 @@ +/////////////////////////////////////////////////////////////////////// +// File: log.h +// Description: Include for custom log message for unittest for tesseract. +// based on +// https://stackoverflow.com/questions/16491675/how-to-send-custom-message-in-google-c-testing-framework +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_UNITTEST_LOG_H_ +#define TESSERACT_UNITTEST_LOG_H_ + +// This is a minimal implementation of the TensorFlow logging API +// which is sufficient for the Tesseract unit tests. + +// See tensorflow/core/platform/default/logging.h for the original code. + +#include <iostream> + +enum LogLevel { + INFO, WARNING, ERROR, FATAL +}; + +// Avoid conflict with logging.h from TensorFlow. +#undef LOG + +static inline std::ostream& LOG(enum LogLevel level) +{ + switch (level) { + case INFO: + std::cout << "[INFO] "; + break; + case WARNING: + std::cout << "[WARN] "; + break; + case ERROR: + std::cout << "[ERROR] "; + break; + case FATAL: + std::cout << "[FATAL] "; + break; + } + return std::cout; +} + +// Avoid conflict with logging.h from TensorFlow. +#undef QCHECK + +// https://github.com/google/ion/blob/master/ion/base/logging.h +static inline std::ostream& QCHECK(bool condition) +{ + if (condition) { + static std::ostream null_stream(nullptr); + return null_stream; + } + return std::cout; +} + +#endif // TESSERACT_UNITTEST_LOG_H_ diff --git a/tesseract/unittest/lstm_recode_test.cc b/tesseract/unittest/lstm_recode_test.cc new file mode 100644 index 00000000..5365bf4b --- /dev/null +++ b/tesseract/unittest/lstm_recode_test.cc @@ -0,0 +1,45 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lstm_test.h" + +namespace tesseract { + +// Tests that training with unicharset recoding learns faster than without, +// for Korean. This test is split in two, so it can be run sharded. + +TEST_F(LSTMTrainerTest, RecodeTestKorBase) { + // A basic single-layer, bi-di 1d LSTM on Korean. + SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-full", "kor/kor.unicharset", + "kor.Arial_Unicode_MS.exp0.lstmf", false, true, 5e-4, false, "kor"); + double kor_full_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(kor_full_err, 88); +// EXPECT_GT(kor_full_err, 85); + LOG(INFO) << "********** Expected < 88 ************\n" ; +} + +TEST_F(LSTMTrainerTest, RecodeTestKor) { + // A basic single-layer, bi-di 1d LSTM on Korean. + SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-recode", "kor/kor.unicharset", + "kor.Arial_Unicode_MS.exp0.lstmf", true, true, 5e-4, false, "kor"); + double kor_recode_err = TrainIterations(kTrainerIterations); + EXPECT_LT(kor_recode_err, 60); + LOG(INFO) << "********** Expected < 60 ************\n" ; +} + +// Tests that the given string encodes and decodes back to the same +// with both recode on and off for Korean. 
+ +TEST_F(LSTMTrainerTest, EncodeDecodeBothTestKor) { + TestEncodeDecodeBoth("kor", "한국어 위키백과에 오신 것을 환영합니다!"); +} + +} // namespace tesseract. diff --git a/tesseract/unittest/lstm_squashed_test.cc b/tesseract/unittest/lstm_squashed_test.cc new file mode 100644 index 00000000..1dd08746 --- /dev/null +++ b/tesseract/unittest/lstm_squashed_test.cc @@ -0,0 +1,31 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lstm_test.h" + +namespace tesseract { + +// Tests that a Squashed network learns correctly. +// Almost as fast as the 2d-lstm. +TEST_F(LSTMTrainerTest, TestSquashed) { + // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom, and + // a small convolution/maxpool below that. + // Match training conditions to those typically used with this spec: + // recoding on, adam on. + SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]", + "SQU-2-layer-lstm", /*recode*/ true, /*adam*/ true); + double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2); + EXPECT_LT(lstm_2d_err, 80); + LOG(INFO) << "********** < 80 ************\n" ; + TestIntMode(kTrainerIterations); +} + +} // namespace tesseract. diff --git a/tesseract/unittest/lstm_test.cc b/tesseract/unittest/lstm_test.cc new file mode 100644 index 00000000..930384a6 --- /dev/null +++ b/tesseract/unittest/lstm_test.cc @@ -0,0 +1,221 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Generating the training data: +// If the format of the lstmf (ImageData) file changes, the training data will +// have to be regenerated as follows: +// +// Use --xsize 800 for text2image to be similar to original training data. +// +// src/training/tesstrain.sh --fonts_dir /usr/share/fonts --lang eng \ +// --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \ +// --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \ +// --fontlist "Arial" --maxpages 10 +// + +#include "lstm_test.h" + +namespace tesseract { + +// Tests that some simple networks can learn Arial and meet accuracy targets. +TEST_F(LSTMTrainerTest, BasicTest) { + // A Convolver sliding window classifier without LSTM. 
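+ // Reading the VGSL-style spec below (descriptive only, not exhaustive):
+ // Ct5,5,16 is a 5x5 convolution with 16 outputs and tanh, Mp4,4 a 4x4
+ // maxpool, and O1c1 a 1-d CTC output layer; later specs use Lfx/Lbx/Lfys
+ // for forward, bidirectional and y-summarizing LSTMs.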
+ SetupTrainer( + "[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 " + "Ct1,1,64O1c1]", + "no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false, + 2e-4, false, "eng"); + double non_lstm_err = TrainIterations(kTrainerIterations * 4); + EXPECT_LT(non_lstm_err, 98); + LOG(INFO) << "********** Expected < 98 ************\n" ; + + // A basic single-layer, single direction LSTM. + SetupTrainerEng("[1,1,0,32 Lfx100 O1c1]", "1D-lstm", false, false); + double lstm_uni_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_uni_err, 86); + LOG(INFO) << "********** Expected < 86 ************\n" ; + // Beats the convolver. (Although it does have a lot more weights, it still + // iterates faster.) + EXPECT_LT(lstm_uni_err, non_lstm_err); +} + +// Color learns almost as fast as normalized grey/2D. +TEST_F(LSTMTrainerTest, ColorTest) { + // A basic single-layer, single direction LSTM. + SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2D-color-lstm", true, true); + double lstm_uni_err = TrainIterations(kTrainerIterations); + EXPECT_LT(lstm_uni_err, 85); +// EXPECT_GT(lstm_uni_err, 66); + LOG(INFO) << "********** Expected < 85 ************\n" ; +} + +TEST_F(LSTMTrainerTest, BidiTest) { + // A basic single-layer, bi-di 1d LSTM. + SetupTrainerEng("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", false, false); + double lstm_bi_err = TrainIterations(kTrainerIterations); + EXPECT_LT(lstm_bi_err, 75); + LOG(INFO) << "********** Expected < 75 ************\n" ; + // Int mode training is dead, so convert the trained network to int and check + // that its error rate is close to the float version. + TestIntMode(kTrainerIterations); +} + +// Tests that a 2d-2-layer network learns correctly. +// It takes a lot of iterations to get there. +TEST_F(LSTMTrainerTest, Test2D) { + // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2 ); + EXPECT_LT(lstm_2d_err, 98); +// EXPECT_GT(lstm_2d_err, 90); + LOG(INFO) << "********** Expected < 98 ************\n" ; + // Int mode training is dead, so convert the trained network to int and check + // that its error rate is close to the float version. + TestIntMode(kTrainerIterations); +} + +// Tests that a 2d-2-layer network with Adam does *a lot* better than +// without it. +TEST_F(LSTMTrainerTest, TestAdam) { + // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, true); + double lstm_2d_err = TrainIterations(kTrainerIterations); + EXPECT_LT(lstm_2d_err, 70); + LOG(INFO) << "********** Expected < 70 ************\n" ; + TestIntMode(kTrainerIterations); +} + +// Trivial test of training speed on a fairly complex network. +TEST_F(LSTMTrainerTest, SpeedTest) { + SetupTrainerEng( + "[1,30,0,1 Ct5,5,16 Mp2,2 L2xy24 Ct1,1,48 Mp5,1 Ct1,1,32 S3,1 Lbx64 " + "O1c1]", + "2-D-2-layer-lstm", false, true); + TrainIterations(kTrainerIterations); + LOG(INFO) << "********** *** ************\n" ; +} + +// Tests that two identical networks trained the same get the same results. +// Also tests that the same happens with a serialize/deserialize in the middle. 
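+// (Sketch of the flow below: train network A and snapshot it with
+// SaveTrainingDump, train an identical network B and expect identical error
+// metrics, then reload the snapshot with ReadTrainingDump and expect the
+// continued training to match as well.)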
+TEST_F(LSTMTrainerTest, DeterminismTest) { + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + double lstm_2d_err_a = TrainIterations(kTrainerIterations); + double act_error_a = trainer_->ActivationError(); + double char_error_a = trainer_->CharError(); + std::vector<char> trainer_a_data; + EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), + &trainer_a_data)); + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + double lstm_2d_err_b = TrainIterations(kTrainerIterations); + double act_error_b = trainer_->ActivationError(); + double char_error_b = trainer_->CharError(); + EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b); + EXPECT_FLOAT_EQ(act_error_a, act_error_b); + EXPECT_FLOAT_EQ(char_error_a, char_error_b); + // Now train some more iterations. + lstm_2d_err_b = TrainIterations(kTrainerIterations / 3); + act_error_b = trainer_->ActivationError(); + char_error_b = trainer_->CharError(); + // Unpack into a new trainer and train that some more too. + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, trainer_.get())); + lstm_2d_err_a = TrainIterations(kTrainerIterations / 3); + act_error_a = trainer_->ActivationError(); + char_error_a = trainer_->CharError(); + EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b); + EXPECT_FLOAT_EQ(act_error_a, act_error_b); + EXPECT_FLOAT_EQ(char_error_a, char_error_b); + LOG(INFO) << "********** *** ************\n" ; +} + +// The baseline network against which to test the built-in softmax. +TEST_F(LSTMTrainerTest, SoftmaxBaselineTest) { + // A basic single-layer, single direction LSTM. + SetupTrainerEng("[1,1,0,32 Lfx96 O1c1]", "1D-lstm", false, true); + double lstm_uni_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_uni_err, 60); +// EXPECT_GT(lstm_uni_err, 48); + LOG(INFO) << "********** Expected < 60 ************\n" ; + // Check that it works in int mode too. + TestIntMode(kTrainerIterations); + // If we run TestIntMode again, it tests that int_mode networks can + // serialize and deserialize correctly. + double delta = TestIntMode(kTrainerIterations); + // The two tests (both of int mode this time) should be almost identical. + LOG(INFO) << "Delta in Int mode error rates = " << delta << "\n"; + EXPECT_LT(delta, 0.01); +} + +// Tests that the built-in softmax does better than the external one, +// which has an error rate slightly less than 55%, as tested by +// SoftmaxBaselineTest. +TEST_F(LSTMTrainerTest, SoftmaxTest) { + // LSTM with a built-in softmax can beat the external softmax. + SetupTrainerEng("[1,1,0,32 LS96]", "Lstm-+-softmax", false, true); + double lstm_sm_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_sm_err, 49.0); + LOG(INFO) << "********** Expected < 49 ************\n" ; + // Check that it works in int mode too. + TestIntMode(kTrainerIterations); +} + +// Tests that the built-in encoded softmax does better than the external one. +// It takes a lot of iterations to get there. +TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) { + // LSTM with a built-in encoded softmax can beat the external softmax. + SetupTrainerEng("[1,1,0,32 LE96]", "Lstm-+-softmax", false, true); + double lstm_sm_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_sm_err, 62.0); + LOG(INFO) << "********** Expected < 62 ************\n" ; + // Check that it works in int mode too. 
+ TestIntMode(kTrainerIterations); +} + +// Tests that layer access methods work correctly. +TEST_F(LSTMTrainerTest, TestLayerAccess) { + // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom. + SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm", + false, false); + // Number of layers. + const int kNumLayers = 8; + // Expected layer names. + const char* kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2", + ":3:0", ":4:0", ":4:1:0", ":5"}; + const char* kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL", + "Maxpool", "Lfys32", "Lbx128LTR", + "Lbx128", "Output"}; + // Expected number of weights. + const int kNumWeights[kNumLayers] = {0, + 0, + 16 * (25 + 1), + 0, + 32 * (4 * (32 + 16 + 1)), + 128 * (4 * (128 + 32 + 1)), + 128 * (4 * (128 + 32 + 1)), + 112 * (2 * 128 + 1)}; + + auto layers = trainer_->EnumerateLayers(); + EXPECT_EQ(kNumLayers, layers.size()); + for (int i = 0; i < kNumLayers && i < layers.size(); ++i) { + EXPECT_STREQ(kLayerIds[i], layers[i].c_str()); + EXPECT_STREQ(kLayerNames[i], + trainer_->GetLayer(layers[i])->name().c_str()); + EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights()); + } +} + +} // namespace tesseract. diff --git a/tesseract/unittest/lstm_test.h b/tesseract/unittest/lstm_test.h new file mode 100644 index 00000000..4f3d9572 --- /dev/null +++ b/tesseract/unittest/lstm_test.h @@ -0,0 +1,189 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TESSERACT_UNITTEST_LSTM_TEST_H_ +#define TESSERACT_UNITTEST_LSTM_TEST_H_ + +#include <memory> +#include <string> +#include <utility> + +#include "include_gunit.h" + +#include "absl/strings/str_cat.h" +#include "tprintf.h" +#include "helpers.h" + +#include "functions.h" +#include "lang_model_helpers.h" +#include "log.h" // for LOG +#include "lstmtrainer.h" +#include "unicharset.h" + +namespace tesseract { + +#if DEBUG_DETAIL == 0 +// Number of iterations to run all the trainers. +const int kTrainerIterations = 600; +// Number of iterations between accuracy checks. +const int kBatchIterations = 100; +#else +// Number of iterations to run all the trainers. +const int kTrainerIterations = 2; +// Number of iterations between accuracy checks. +const int kBatchIterations = 1; +#endif + +// The fixture for testing LSTMTrainer. 
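+// It provides helpers to build a trainer from a VGSL spec (SetupTrainer /
+// SetupTrainerEng), to train and report the best char error (TrainIterations),
+// to measure error without updating weights (TestIterations), to compare float
+// and int inference (TestIntMode), and to round-trip a string through the
+// label encoder (TestEncodeDecode / TestEncodeDecodeBoth).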
+class LSTMTrainerTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + LSTMTrainerTest() {} + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTDATA_DIR, + "" + name); + } + std::string TessDataNameToPath(const std::string& name) { + return file::JoinPath(TESSDATA_DIR, + "" + name); + } + std::string TestingNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, + "" + name); + } + + void SetupTrainerEng(const std::string& network_spec, const std::string& model_name, + bool recode, bool adam) { + SetupTrainer(network_spec, model_name, "eng/eng.unicharset", + "eng.Arial.exp0.lstmf", recode, adam, 5e-4, false, "eng"); + } + void SetupTrainer(const std::string& network_spec, const std::string& model_name, + const std::string& unicharset_file, const std::string& lstmf_file, + bool recode, bool adam, double learning_rate, + bool layer_specific, const std::string& kLang) { +// constexpr char kLang[] = "eng"; // Exact value doesn't matter. + std::string unicharset_name = TestDataNameToPath(unicharset_file); + UNICHARSET unicharset; + ASSERT_TRUE(unicharset.load_from_file(unicharset_name.c_str(), false)); + std::string script_dir = file::JoinPath( + LANGDATA_DIR, ""); + std::vector<STRING> words; + EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, "", FLAGS_test_tmpdir, + kLang, !recode, words, words, words, false, + nullptr, nullptr)); + std::string model_path = file::JoinPath(FLAGS_test_tmpdir, model_name); + std::string checkpoint_path = model_path + "_checkpoint"; + trainer_.reset(new LSTMTrainer(model_path.c_str(), checkpoint_path.c_str(), + 0, 0)); + trainer_->InitCharSet(file::JoinPath(FLAGS_test_tmpdir, kLang, + absl::StrCat(kLang, ".traineddata"))); + int net_mode = adam ? NF_ADAM : 0; + // Adam needs a higher learning rate, due to not multiplying the effective + // rate by 1/(1-momentum). + if (adam) learning_rate *= 20.0; + if (layer_specific) net_mode |= NF_LAYER_SPECIFIC_LR; + EXPECT_TRUE(trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1, + learning_rate, 0.9, 0.999)); + std::vector<STRING> filenames; + filenames.push_back(STRING(TestDataNameToPath(lstmf_file).c_str())); + EXPECT_TRUE(trainer_->LoadAllTrainingData(filenames, CS_SEQUENTIAL, false)); + LOG(INFO) << "Setup network:" << model_name << "\n" ; + } + // Trains for a given number of iterations and returns the char error rate. + double TrainIterations(int max_iterations) { + int iteration = trainer_->training_iteration(); + int iteration_limit = iteration + max_iterations; + double best_error = 100.0; + do { + STRING log_str; + int target_iteration = iteration + kBatchIterations; + // Train a few. + double mean_error = 0.0; + while (iteration < target_iteration && iteration < iteration_limit) { + trainer_->TrainOnLine(trainer_.get(), false); + iteration = trainer_->training_iteration(); + mean_error += trainer_->LastSingleError(ET_CHAR_ERROR); + } + trainer_->MaintainCheckpoints(nullptr, &log_str); + iteration = trainer_->training_iteration(); + mean_error *= 100.0 / kBatchIterations; + if (mean_error < best_error) best_error = mean_error; + } while (iteration < iteration_limit); + LOG(INFO) << "Trainer error rate = " << best_error << "\n"; + return best_error; + } + // Tests for a given number of iterations and returns the char error rate. 
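+  // Unlike TrainIterations, this only runs the forward pass via
+  // PrepareForBackward to accumulate ET_CHAR_ERROR; no weights are updated.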
+ double TestIterations(int max_iterations) { + CHECK_GT(max_iterations, 0); + int iteration = trainer_->sample_iteration(); + double mean_error = 0.0; + int error_count = 0; + while (error_count < max_iterations) { + const ImageData& trainingdata = + *trainer_->mutable_training_data()->GetPageBySerial(iteration); + NetworkIO fwd_outputs, targets; + if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) != + UNENCODABLE) { + mean_error += trainer_->NewSingleError(ET_CHAR_ERROR); + ++error_count; + } + trainer_->SetIteration(++iteration); + } + mean_error *= 100.0 / max_iterations; + LOG(INFO) << "Tester error rate = " << mean_error << "\n" ; + return mean_error; + } + // Tests that the current trainer_ can be converted to int mode and still gets + // within 1% of the error rate. Returns the increase in error from float to + // int. + double TestIntMode(int test_iterations) { + std::vector<char> trainer_data; + EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), + &trainer_data)); + // Get the error on the next few iterations in float mode. + double float_err = TestIterations(test_iterations); + // Restore the dump, convert to int and test error on that. + EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_data, trainer_.get())); + trainer_->ConvertToInt(); + double int_err = TestIterations(test_iterations); + EXPECT_LT(int_err, float_err + 1.0); + return int_err - float_err; + } + // Sets up a trainer with the given language and given recode+ctc condition. + // It then verifies that the given str encodes and decodes back to the same + // string. + void TestEncodeDecode(const std::string& lang, const std::string& str, bool recode) { + std::string unicharset_name = lang + "/" + lang + ".unicharset"; + std::string lstmf_name = lang + ".Arial_Unicode_MS.exp0.lstmf"; + SetupTrainer("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", unicharset_name, + lstmf_name, recode, true, 5e-4, true, lang); + std::vector<int> labels; + EXPECT_TRUE(trainer_->EncodeString(str.c_str(), &labels)); + STRING decoded = trainer_->DecodeLabels(labels); + std::string decoded_str(&decoded[0], decoded.length()); + EXPECT_EQ(str, decoded_str); + } + // Calls TestEncodeDeode with both recode on and off. + void TestEncodeDecodeBoth(const std::string& lang, const std::string& str) { + TestEncodeDecode(lang, str, false); + TestEncodeDecode(lang, str, true); + } + + std::unique_ptr<LSTMTrainer> trainer_; +}; + +} // namespace tesseract. + +#endif // THIRD_PARTY_TESSERACT_UNITTEST_LSTM_TEST_H_ diff --git a/tesseract/unittest/lstmtrainer_test.cc b/tesseract/unittest/lstmtrainer_test.cc new file mode 100644 index 00000000..967d1fe5 --- /dev/null +++ b/tesseract/unittest/lstmtrainer_test.cc @@ -0,0 +1,106 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "lstm_test.h" + +namespace tesseract { + +TEST_F(LSTMTrainerTest, EncodesEng) { + TestEncodeDecodeBoth("eng", + "The quick brown 'fox' jumps over: the lazy dog!"); +} + +TEST_F(LSTMTrainerTest, EncodesKan) { + TestEncodeDecodeBoth("kan", "ಫ್ರಬ್ರವರಿ ತತ್ವಾಂಶಗಳೆಂದರೆ ಮತ್ತು ಜೊತೆಗೆ ಕ್ರಮವನ್ನು"); +} + +TEST_F(LSTMTrainerTest, EncodesKor) { + TestEncodeDecodeBoth("kor", + "이는 것으로 다시 넣을 수는 있지만 선택의 의미는"); +} + +TEST_F(LSTMTrainerTest, MapCoder) { + LSTMTrainer fra_trainer; + fra_trainer.InitCharSet(TestDataNameToPath("fra/fra.traineddata")); + LSTMTrainer deu_trainer; + deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata")); + // A string that uses characters common to French and German. + std::string kTestStr = "The quick brown 'fox' jumps over: the lazy dog!"; + std::vector<int> deu_labels; + EXPECT_TRUE(deu_trainer.EncodeString(kTestStr.c_str(), &deu_labels)); + // The french trainer cannot decode them correctly. + STRING badly_decoded = fra_trainer.DecodeLabels(deu_labels); + std::string bad_str(&badly_decoded[0], badly_decoded.length()); + LOG(INFO) << "bad_str fra=" << bad_str << "\n"; + EXPECT_NE(kTestStr, bad_str); + // Encode the string as fra. + std::vector<int> fra_labels; + EXPECT_TRUE(fra_trainer.EncodeString(kTestStr.c_str(), &fra_labels)); + // Use the mapper to compute what the labels are as deu. + std::vector<int> mapping = fra_trainer.MapRecoder(deu_trainer.GetUnicharset(), + deu_trainer.GetRecoder()); + std::vector<int> mapped_fra_labels(fra_labels.size(), -1); + for (int i = 0; i < fra_labels.size(); ++i) { + mapped_fra_labels[i] = mapping[fra_labels[i]]; + EXPECT_NE(-1, mapped_fra_labels[i]) << "i=" << i << ", ch=" << kTestStr[i]; + EXPECT_EQ(mapped_fra_labels[i], deu_labels[i]) + << "i=" << i << ", ch=" << kTestStr[i] + << " has deu label=" << deu_labels[i] << ", but mapped to " + << mapped_fra_labels[i]; + } + // The german trainer can now decode them correctly. + STRING decoded = deu_trainer.DecodeLabels(mapped_fra_labels); + std::string ok_str(&decoded[0], decoded.length()); + LOG(INFO) << "ok_str deu=" << ok_str << "\n"; + EXPECT_EQ(kTestStr, ok_str); +} + +// Tests that the actual fra model can be converted to the deu character set +// and still read an eng image with 100% accuracy. +TEST_F(LSTMTrainerTest, ConvertModel) { + // Setup a trainer with a deu charset. + LSTMTrainer deu_trainer; + deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata")); + // Load the fra traineddata, strip out the model, and save to a tmp file. + TessdataManager mgr; + std::string fra_data = + file::JoinPath(TESSDATA_DIR "_best", "fra.traineddata"); + CHECK(mgr.Init(fra_data.c_str())); + LOG(INFO) << "Load " << fra_data << "\n"; + file::MakeTmpdir(); + std::string model_path = file::JoinPath(FLAGS_test_tmpdir, "fra.lstm"); + CHECK(mgr.ExtractToFile(model_path.c_str())); + LOG(INFO) << "Extract " << model_path << "\n"; + // Load the fra model into the deu_trainer, and save the converted model. + CHECK(deu_trainer.TryLoadingCheckpoint(model_path.c_str(), fra_data.c_str())); + LOG(INFO) << "Checkpoint load for " << model_path << " and " << fra_data << "\n"; + std::string deu_data = file::JoinPath(FLAGS_test_tmpdir, "deu.traineddata"); + CHECK(deu_trainer.SaveTraineddata(deu_data.c_str())); + LOG(INFO) << "Save " << deu_data << "\n"; + // Now run the saved model on phototest. (See BasicTesseractTest in + // baseapi_test.cc). 
+ TessBaseAPI api; + api.Init(FLAGS_test_tmpdir, "deu", tesseract::OEM_LSTM_ONLY); + Pix* src_pix = pixRead(TestingNameToPath("phototest.tif").c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + std::unique_ptr<char[]> result(api.GetUTF8Text()); + std::string truth_text; + CHECK_OK(file::GetContents(TestingNameToPath("phototest.gold.txt"), + &truth_text, file::Defaults())); + + EXPECT_STREQ(truth_text.c_str(), result.get()); + pixDestroy(&src_pix); +} + +} // namespace tesseract diff --git a/tesseract/unittest/mastertrainer_test.cc b/tesseract/unittest/mastertrainer_test.cc new file mode 100644 index 00000000..0f93e221 --- /dev/null +++ b/tesseract/unittest/mastertrainer_test.cc @@ -0,0 +1,298 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Although this is a trivial-looking test, it exercises a lot of code: +// SampleIterator has to correctly iterate over the correct characters, or +// it will fail. +// The canonical and cloud features computed by TrainingSampleSet need to +// be correct, along with the distance caches, organizing samples by font +// and class, indexing of features, distance calculations. +// IntFeatureDist has to work, or the canonical samples won't work. +// Mastertrainer has ability to read tr files and set itself up tested. +// Finally the serialize/deserialize test ensures that MasterTrainer, +// TrainingSampleSet, TrainingSample can all serialize/deserialize correctly +// enough to reproduce the same results. + +#include "include_gunit.h" + +#include "log.h" // for LOG +#include "unicharset.h" +#include "errorcounter.h" +#include "mastertrainer.h" +#include "shapeclassifier.h" +#include "shapetable.h" +#include "trainingsample.h" +#include "commontraining.h" + +#include "absl/strings/numbers.h" // for safe_strto32 +#include "absl/strings/str_split.h" // for absl::StrSplit + +#include <string> +#include <utility> +#include <vector> + +using namespace tesseract; + +// Specs of the MockClassifier. +static const int kNumTopNErrs = 10; +static const int kNumTop2Errs = kNumTopNErrs + 20; +static const int kNumTop1Errs = kNumTop2Errs + 30; +static const int kNumTopTopErrs = kNumTop1Errs + 25; +static const int kNumNonReject = 1000; +static const int kNumCorrect = kNumNonReject - kNumTop1Errs; +// The total number of answers is given by the number of non-rejects plus +// all the multiple answers. +static const int kNumAnswers = kNumNonReject + 2 * (kNumTop2Errs - kNumTopNErrs) + + (kNumTop1Errs - kNumTop2Errs) + + (kNumTopTopErrs - kNumTop1Errs); + +#ifndef DISABLED_LEGACY_ENGINE +static bool safe_strto32(const std::string& str, int* pResult) +{ + long n = strtol(str.c_str(), nullptr, 0); + *pResult = n; + return true; +} +#endif + +// Mock ShapeClassifier that cheats by looking at the correct answer, and +// creates a specific pattern of errors that can be tested. 
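+// The kNum* constants above are cumulative cut-offs on the running sample
+// count: samples 1-10 are top-n errors, 11-30 top-2 errors, 31-60 top-1
+// errors, 61-85 near-miss top errors, the remainder of the first 1000 are
+// answered correctly (one of them with a deliberately wrong font), and every
+// sample after the first 1000 is rejected.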
+class MockClassifier : public ShapeClassifier { + public: + explicit MockClassifier(ShapeTable* shape_table) + : shape_table_(shape_table), num_done_(0), done_bad_font_(false) { + // Add a false font answer to the shape table. We pick a random unichar_id, + // add a new shape for it with a false font. Font must actually exist in + // the font table, but not match anything in the first 1000 samples. + false_unichar_id_ = 67; + false_shape_ = shape_table_->AddShape(false_unichar_id_, 25); + } + virtual ~MockClassifier() {} + + // Classifies the given [training] sample, writing to results. + // If debug is non-zero, then various degrees of classifier dependent debug + // information is provided. + // If keep_this (a shape index) is >= 0, then the results should always + // contain keep_this, and (if possible) anything of intermediate confidence. + // The return value is the number of classes saved in results. + int ClassifySample(const TrainingSample& sample, Pix* page_pix, + int debug, UNICHAR_ID keep_this, + std::vector<ShapeRating>* results) override { + results->clear(); + // Everything except the first kNumNonReject is a reject. + if (++num_done_ > kNumNonReject) return 0; + + int class_id = sample.class_id(); + int font_id = sample.font_id(); + int shape_id = shape_table_->FindShape(class_id, font_id); + // Get ids of some wrong answers. + int wrong_id1 = shape_id > 10 ? shape_id - 1 : shape_id + 1; + int wrong_id2 = shape_id > 10 ? shape_id - 2 : shape_id + 2; + if (num_done_ <= kNumTopNErrs) { + // The first kNumTopNErrs are top-n errors. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + } else if (num_done_ <= kNumTop2Errs) { + // The next kNumTop2Errs - kNumTopNErrs are top-2 errors. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + results->push_back(ShapeRating(wrong_id2, 0.875f)); + results->push_back(ShapeRating(shape_id, 0.75f)); + } else if (num_done_ <= kNumTop1Errs) { + // The next kNumTop1Errs - kNumTop2Errs are top-1 errors. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + results->push_back(ShapeRating(shape_id, 0.8f)); + } else if (num_done_ <= kNumTopTopErrs) { + // The next kNumTopTopErrs - kNumTop1Errs are cases where the actual top + // is not correct, but do not count as a top-1 error because the rating + // is close enough to the top answer. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + results->push_back(ShapeRating(shape_id, 0.99f)); + } else if (!done_bad_font_ && class_id == false_unichar_id_) { + // There is a single character with a bad font. + results->push_back(ShapeRating(false_shape_, 1.0f)); + done_bad_font_ = true; + } else { + // Everything else is correct. + results->push_back(ShapeRating(shape_id, 1.0f)); + } + return results->size(); + } + // Provides access to the ShapeTable that this classifier works with. + const ShapeTable* GetShapeTable() const override { return shape_table_; } + + private: + // Borrowed pointer to the ShapeTable. + ShapeTable* shape_table_; + // Unichar_id of a random character that occurs after the first 60 samples. + int false_unichar_id_; + // Shape index of prepared false answer for false_unichar_id. + int false_shape_; + // The number of classifications we have processed. + int num_done_; + // True after the false font has been emitted. + bool done_bad_font_; +}; + +const double kMin1lDistance = 0.25; + +// The fixture for testing Tesseract. 
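+// LoadMasterTrainer builds a MasterTrainer and ShapeTable from the
+// eng.Arial.exp0.tr training file plus the eng unicharset, xheights and font
+// properties; VerifyIl1 then checks shape distances between I, l and 1.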
+class MasterTrainerTest : public testing::Test { +#ifndef DISABLED_LEGACY_ENGINE + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); + } + std::string TmpNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + + MasterTrainerTest() { + shape_table_ = nullptr; + master_trainer_ = nullptr; + } + ~MasterTrainerTest() { + delete shape_table_; + } + + // Initializes the master_trainer_ and shape_table_. + // if load_from_tmp, then reloads a master trainer that was saved by a + // previous call in which it was false. + void LoadMasterTrainer() { + FLAGS_output_trainer = TmpNameToPath("tmp_trainer").c_str(); + FLAGS_F = file::JoinPath(LANGDATA_DIR, "font_properties").c_str(); + FLAGS_X = TestDataNameToPath("eng.xheights").c_str(); + FLAGS_U = TestDataNameToPath("eng.unicharset").c_str(); + std::string tr_file_name(TestDataNameToPath("eng.Arial.exp0.tr")); + const char* argv[] = {tr_file_name.c_str()}; + int argc = 1; + STRING file_prefix; + delete shape_table_; + shape_table_ = nullptr; + master_trainer_ = + LoadTrainingData(argc, argv, false, &shape_table_, &file_prefix); + EXPECT_TRUE(master_trainer_ != nullptr); + EXPECT_TRUE(shape_table_ != nullptr); + } + + // EXPECTs that the distance between I and l in Arial is 0 and that the + // distance to 1 is significantly not 0. + void VerifyIl1() { + // Find the font id for Arial. + int font_id = master_trainer_->GetFontInfoId("Arial"); + EXPECT_GE(font_id, 0); + // Track down the characters we are interested in. + int unichar_I = master_trainer_->unicharset().unichar_to_id("I"); + EXPECT_GT(unichar_I, 0); + int unichar_l = master_trainer_->unicharset().unichar_to_id("l"); + EXPECT_GT(unichar_l, 0); + int unichar_1 = master_trainer_->unicharset().unichar_to_id("1"); + EXPECT_GT(unichar_1, 0); + // Now get the shape ids. + int shape_I = shape_table_->FindShape(unichar_I, font_id); + EXPECT_GE(shape_I, 0); + int shape_l = shape_table_->FindShape(unichar_l, font_id); + EXPECT_GE(shape_l, 0); + int shape_1 = shape_table_->FindShape(unichar_1, font_id); + EXPECT_GE(shape_1, 0); + + float dist_I_l = + master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l); + // No tolerance here. We expect that I and l should match exactly. + EXPECT_EQ(0.0f, dist_I_l); + float dist_l_I = + master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I); + // BOTH ways. + EXPECT_EQ(0.0f, dist_l_I); + + // l/1 on the other hand should be distinct. + float dist_l_1 = + master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1); + EXPECT_GT(dist_l_1, kMin1lDistance); + float dist_1_l = + master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l); + EXPECT_GT(dist_1_l, kMin1lDistance); + + // So should I/1. + float dist_I_1 = + master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1); + EXPECT_GT(dist_I_1, kMin1lDistance); + float dist_1_I = + master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I); + EXPECT_GT(dist_1_I, kMin1lDistance); + } + + // Objects declared here can be used by all tests in the test case for Foo. + ShapeTable* shape_table_; + std::unique_ptr<MasterTrainer> master_trainer_; +#endif +}; + +// Tests that the MasterTrainer correctly loads its data and reaches the correct +// conclusion over the distance between Arial I l and 1. 
+TEST_F(MasterTrainerTest, Il1Test) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because LoadTrainingData is missing. + GTEST_SKIP(); +#else + // Initialize the master_trainer_ and load the Arial tr file. + LoadMasterTrainer(); + VerifyIl1(); +#endif +} + +// Tests the ErrorCounter using a MockClassifier to check that it counts +// error categories correctly. +TEST_F(MasterTrainerTest, ErrorCounterTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because LoadTrainingData is missing. + GTEST_SKIP(); +#else + // Initialize the master_trainer_ from the saved tmp file. + LoadMasterTrainer(); + // Add the space character to the shape_table_ if not already present to + // count junk. + if (shape_table_->FindShape(0, -1) < 0) shape_table_->AddShape(0, 0); + // Make a mock classifier. + auto shape_classifier = std::make_unique<MockClassifier>(shape_table_); + // Get the accuracy report. + STRING accuracy_report; + master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0, + false, shape_classifier.get(), + &accuracy_report); + LOG(INFO) << accuracy_report.c_str(); + std::string result_string = accuracy_report.c_str(); + std::vector<std::string> results = + absl::StrSplit(result_string, '\t', absl::SkipEmpty()); + EXPECT_EQ(tesseract::CT_SIZE + 1, results.size()); + int result_values[tesseract::CT_SIZE]; + for (int i = 0; i < tesseract::CT_SIZE; ++i) { + EXPECT_TRUE(safe_strto32(results[i + 1], &result_values[i])); + } + // These tests are more-or-less immune to additions to the number of + // categories or changes in the training data. + int num_samples = master_trainer_->GetSamples()->num_raw_samples(); + EXPECT_EQ(kNumCorrect, result_values[tesseract::CT_UNICHAR_TOP_OK]); + EXPECT_EQ(1, result_values[tesseract::CT_FONT_ATTR_ERR]); + EXPECT_EQ(kNumTopTopErrs, result_values[tesseract::CT_UNICHAR_TOPTOP_ERR]); + EXPECT_EQ(kNumTop1Errs, result_values[tesseract::CT_UNICHAR_TOP1_ERR]); + EXPECT_EQ(kNumTop2Errs, result_values[tesseract::CT_UNICHAR_TOP2_ERR]); + EXPECT_EQ(kNumTopNErrs, result_values[tesseract::CT_UNICHAR_TOPN_ERR]); + // Each of the TOPTOP errs also counts as a multi-unichar. + EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs, + result_values[tesseract::CT_OK_MULTI_UNICHAR]); + EXPECT_EQ(num_samples - kNumNonReject, result_values[tesseract::CT_REJECT]); + EXPECT_EQ(kNumAnswers, result_values[tesseract::CT_NUM_RESULTS]); +#endif +} diff --git a/tesseract/unittest/matrix_test.cc b/tesseract/unittest/matrix_test.cc new file mode 100644 index 00000000..c900308d --- /dev/null +++ b/tesseract/unittest/matrix_test.cc @@ -0,0 +1,137 @@ +/////////////////////////////////////////////////////////////////////// +// File: matrix_test.cc +// Author: rays@google.com (Ray Smith) +// +// Copyright 2016 Google Inc. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#include "matrix.h" +#include "include_gunit.h" + +namespace tesseract { + +class MatrixTest : public ::testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + + // Fills src_ with data so it can pretend to be a tensor thus: + // dims_=[5, 4, 3, 2] + // array_=[0, 1, 2, ....119] + // tensor=[[[[0, 1][2, 3][4, 5]] + // [[6, 7][8, 9][10, 11]] + // [[12, 13][14, 15][16, 17]] + // [[18, 19][20, 21][22, 23]]] + // [[[24, 25]... + MatrixTest() { + src_.Resize(1, kInputSize_, 0); + for (int i = 0; i < kInputSize_; ++i) { + src_.put(0, i, i); + } + for (int i = 0; i < kNumDims_; ++i) dims_[i] = 5 - i; + } + // Number of dimensions in src_. + static const int kNumDims_ = 4; + // Number of elements in src_. + static const int kInputSize_ = 120; + // Size of each dimension in src_; + int dims_[kNumDims_]; + // Input array filled with [0,kInputSize). + GENERIC_2D_ARRAY<int> src_; +}; + +// Tests that the RotatingTranspose function does the right thing for various +// transformations. +// dims=[5, 4, 3, 2]->[5, 2, 4, 3] +TEST_F(MatrixTest, RotatingTranspose_3_1) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 3, 1, &m); + m.ResizeNoInit(kInputSize_ / 3, 3); + // Verify that the result is: + // output tensor=[[[[0, 2, 4][6, 8, 10][12, 14, 16][18, 20, 22]] + // [[1, 3, 5][7, 9, 11][13, 15, 17][19, 21, 23]]] + // [[[24, 26, 28]... + EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(2, m(0, 1)); + EXPECT_EQ(4, m(0, 2)); + EXPECT_EQ(6, m(1, 0)); + EXPECT_EQ(1, m(4, 0)); + EXPECT_EQ(24, m(8, 0)); + EXPECT_EQ(26, m(8, 1)); + EXPECT_EQ(25, m(12, 0)); +} + +// dims=[5, 4, 3, 2]->[3, 5, 4, 2] +TEST_F(MatrixTest, RotatingTranspose_2_0) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 2, 0, &m); + m.ResizeNoInit(kInputSize_ / 2, 2); + // Verify that the result is: + // output tensor=[[[[0, 1][6, 7][12, 13][18, 19]] + // [[24, 25][30, 31][36, 37][42, 43]] + // [[48, 49][54, 55][60, 61][66, 67]] + // [[72, 73][78, 79][84, 85][90, 91]] + // [[96, 97][102, 103][108, 109][114, 115]]] + // [[[2,3]... + EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(1, m(0, 1)); + EXPECT_EQ(6, m(1, 0)); + EXPECT_EQ(7, m(1, 1)); + EXPECT_EQ(24, m(4, 0)); + EXPECT_EQ(25, m(4, 1)); + EXPECT_EQ(30, m(5, 0)); + EXPECT_EQ(2, m(20, 0)); +} + +// dims=[5, 4, 3, 2]->[5, 3, 2, 4] +TEST_F(MatrixTest, RotatingTranspose_1_3) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 1, 3, &m); + m.ResizeNoInit(kInputSize_ / 4, 4); + // Verify that the result is: + // output tensor=[[[[0, 6, 12, 18][1, 7, 13, 19]] + // [[2, 8, 14, 20][3, 9, 15, 21]] + // [[4, 10, 16, 22][5, 11, 17, 23]]] + // [[[24, 30, 36, 42]... + EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(6, m(0, 1)); + EXPECT_EQ(1, m(1, 0)); + EXPECT_EQ(2, m(2, 0)); + EXPECT_EQ(3, m(3, 0)); + EXPECT_EQ(4, m(4, 0)); + EXPECT_EQ(5, m(5, 0)); + EXPECT_EQ(24, m(6, 0)); + EXPECT_EQ(30, m(6, 1)); +} + +// dims=[5, 4, 3, 2]->[4, 3, 5, 2] +TEST_F(MatrixTest, RotatingTranspose_0_2) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 0, 2, &m); + m.ResizeNoInit(kInputSize_ / 2, 2); + // Verify that the result is: + // output tensor=[[[[0, 1][24, 25][48, 49][72, 73][96, 97]] + // [[2, 3][26, 27][50, 51][74, 75][98, 99]] + // [[4, 5][28, 29][52, 53][76, 77][100, 101]]] + // [[[6, 7]... 
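+  // m is viewed as (kInputSize_ / 2) rows of 2, so the row index walks the
+  // flattened [4, 3, 5] outer dimensions of the transposed tensor and the
+  // column index picks within the innermost pairs shown in the comment above.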
+ EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(1, m(0, 1)); + EXPECT_EQ(24, m(1, 0)); + EXPECT_EQ(25, m(1, 1)); + EXPECT_EQ(96, m(4, 0)); + EXPECT_EQ(97, m(4, 1)); + EXPECT_EQ(2, m(5, 0)); + EXPECT_EQ(6, m(15, 0)); +} + +} // namespace diff --git a/tesseract/unittest/networkio_test.cc b/tesseract/unittest/networkio_test.cc new file mode 100644 index 00000000..3c25f14f --- /dev/null +++ b/tesseract/unittest/networkio_test.cc @@ -0,0 +1,217 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "networkio.h" +#include "stridemap.h" +#ifdef INCLUDE_TENSORFLOW +#include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D +#endif + +namespace tesseract { + +class NetworkioTest : public ::testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + +#ifdef INCLUDE_TENSORFLOW + // Sets up an Array2d object of the given size, initialized to increasing + // values starting with start. + std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) { + std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize)); + int value = start; + for (int y = 0; y < ysize; ++y) { + for (int x = 0; x < xsize; ++x) { + (*a)(y, x) = value++; + } + } + return a; + } + // Sets up a NetworkIO with a batch of 2 "images" of known values. + void SetupNetworkIO(NetworkIO* nio) { + std::vector<std::unique_ptr<xla::Array2D<int>>> arrays; + arrays.push_back(SetupArray(3, 4, 0)); + arrays.push_back(SetupArray(4, 5, 12)); + std::vector<std::pair<int, int>> h_w_sizes; + for (size_t i = 0; i < arrays.size(); ++i) { + h_w_sizes.emplace_back(arrays[i].get()->height(), + arrays[i].get()->width()); + } + StrideMap stride_map; + stride_map.SetStride(h_w_sizes); + nio->ResizeToMap(true, stride_map, 2); + // Iterate over the map, setting nio's contents from the arrays. + StrideMap::Index index(stride_map); + do { + int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT), + index.index(FD_WIDTH)); + nio->SetPixel(index.t(), 0, 128 + value, 0.0f, 128.0f); + nio->SetPixel(index.t(), 1, 128 - value, 0.0f, 128.0f); + } while (index.Increment()); + } +#endif +}; + +// Tests that the initialization via SetPixel works and the resize correctly +// fills with zero where image sizes don't match. +TEST_F(NetworkioTest, InitWithZeroFill) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + nio.Resize2d(true, 32, 2); + int width = nio.Width(); + for (int t = 0; t < width; ++t) { + nio.SetPixel(t, 0, 0, 0.0f, 128.0f); + nio.SetPixel(t, 1, 0, 0.0f, 128.0f); + } + // The initialization will wipe out all previously set values. + SetupNetworkIO(&nio); + nio.ZeroInvalidElements(); + StrideMap::Index index(nio.stride_map()); + int next_t = 0; + int pos = 0; + do { + int t = index.t(); + // The indexed values just increase monotonically. + int value = nio.i(t)[0]; + EXPECT_EQ(value, pos); + value = nio.i(t)[1]; + EXPECT_EQ(value, -pos); + // When we skip t values, the data is always 0. 
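+    // (The two images are 3x4 and 4x5; padding to the common stride leaves
+    // 32 valid slots out of 40, matching the checks after the loop.)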
+ while (next_t < t) { + EXPECT_EQ(nio.i(next_t)[0], 0); + EXPECT_EQ(nio.i(next_t)[1], 0); + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +// Tests that CopyWithYReversal works. +TEST_F(NetworkioTest, CopyWithYReversal) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + SetupNetworkIO(&nio); + NetworkIO copy; + copy.CopyWithYReversal(nio); + StrideMap::Index index(copy.stride_map()); + int next_t = 0; + int pos = 0; + std::vector<int> expected_values = { + 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 27, 28, 29, 30, + 31, 22, 23, 24, 25, 26, 17, 18, 19, 20, 21, 12, 13, 14, 15, 16}; + do { + int t = index.t(); + // The indexed values match the expected values. + int value = copy.i(t)[0]; + EXPECT_EQ(value, expected_values[pos]); + value = copy.i(t)[1]; + EXPECT_EQ(value, -expected_values[pos]); + // When we skip t values, the data is always 0. + while (next_t < t) { + EXPECT_EQ(copy.i(next_t)[0], 0) << "Failure t = " << next_t; + EXPECT_EQ(copy.i(next_t)[1], 0) << "Failure t = " << next_t; + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +// Tests that CopyWithXReversal works. +TEST_F(NetworkioTest, CopyWithXReversal) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + SetupNetworkIO(&nio); + NetworkIO copy; + copy.CopyWithXReversal(nio); + StrideMap::Index index(copy.stride_map()); + int next_t = 0; + int pos = 0; + std::vector<int> expected_values = { + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 16, 15, 14, 13, + 12, 21, 20, 19, 18, 17, 26, 25, 24, 23, 22, 31, 30, 29, 28, 27}; + do { + int t = index.t(); + // The indexed values match the expected values. + int value = copy.i(t)[0]; + EXPECT_EQ(value, expected_values[pos]); + value = copy.i(t)[1]; + EXPECT_EQ(value, -expected_values[pos]); + // When we skip t values, the data is always 0. + while (next_t < t) { + EXPECT_EQ(copy.i(next_t)[0], 0) << "Failure t = " << next_t; + EXPECT_EQ(copy.i(next_t)[1], 0) << "Failure t = " << next_t; + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +// Tests that CopyWithXYTranspose works. +TEST_F(NetworkioTest, CopyWithXYTranspose) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + SetupNetworkIO(&nio); + NetworkIO copy; + copy.CopyWithXYTranspose(nio); + StrideMap::Index index(copy.stride_map()); + int next_t = 0; + int pos = 0; + std::vector<int> expected_values = { + 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 17, 22, 27, + 13, 18, 23, 28, 14, 19, 24, 29, 15, 20, 25, 30, 16, 21, 26, 31}; + do { + int t = index.t(); + // The indexed values match the expected values. + int value = copy.i(t)[0]; + EXPECT_EQ(value, expected_values[pos]); + value = copy.i(t)[1]; + EXPECT_EQ(value, -expected_values[pos]); + // When we skip t values, the data is always 0. 
+ while (next_t < t) { + EXPECT_EQ(copy.i(next_t)[0], 0); + EXPECT_EQ(copy.i(next_t)[1], 0); + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +} // namespace diff --git a/tesseract/unittest/normstrngs_test.cc b/tesseract/unittest/normstrngs_test.cc new file mode 100644 index 00000000..301bbd68 --- /dev/null +++ b/tesseract/unittest/normstrngs_test.cc @@ -0,0 +1,422 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/str_format.h" // for absl::StrFormat +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" +#include <tesseract/unichar.h> +#ifdef INCLUDE_TENSORFLOW +#include "util/utf8/unilib.h" // for UniLib +#endif + +#include "include_gunit.h" + +namespace tesseract { + +#if defined(MISSING_CODE) +static std::string EncodeAsUTF8(const char32 ch32) { + UNICHAR uni_ch(ch32); + return std::string(uni_ch.utf8(), uni_ch.utf8_len()); +} +#endif + +TEST(NormstrngsTest, BasicText) { + const char* kBasicText = "AbCd Ef"; + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kBasicText, + &result)); + EXPECT_STREQ(kBasicText, result.c_str()); +} + +TEST(NormstrngsTest, LigatureText) { + const char* kTwoByteLigText = "ij"; // U+0133 (ij) -> ij + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kTwoByteLigText, + &result)); + EXPECT_STREQ("ij", result.c_str()); + + const char* kThreeByteLigText = "finds"; // U+FB01 (fi) -> fi + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kThreeByteLigText, + &result)); + EXPECT_STREQ("finds", result.c_str()); +} + +TEST(NormstrngsTest, OcrSpecificNormalization) { + const char* kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+027 (') + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kSingleQuoteText, + &result)); + EXPECT_STREQ("'Hi", result.c_str()); + + const char* kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+022 (") + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kDoubleQuoteText, + &result)); + EXPECT_STREQ("\"Hi", result.c_str()); + + const char* kEmDash = "Hi—"; // U+2014 (—) -> U+02D (-) + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kEmDash, &result)); + EXPECT_STREQ("Hi-", result.c_str()); + // Without the ocr normalization, these changes are not made. 
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kSingleQuoteText, + &result)); + EXPECT_STREQ(kSingleQuoteText, result.c_str()); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kDoubleQuoteText, + &result)); + EXPECT_STREQ(kDoubleQuoteText, result.c_str()); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kEmDash, &result)); + EXPECT_STREQ(kEmDash, result.c_str()); +} + +// Sample text used in tests. +const char kEngText[] = "the quick brown fox jumps over the lazy dog"; +const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; +const char kKorText[] = "이는 것으로"; +// Hindi words containing illegal vowel sequences. +const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात", + "कहीअे", "पत्रिाका", "छह्णाीस"}; +// Thai illegal sequences. +const char* kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"}; + +TEST(NormstrngsTest, DetectsCorrectText) { + std::string chars; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kEngText, &chars)); + EXPECT_STREQ(kEngText, chars.c_str()); + + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kHinText, &chars)) + << "Incorrect text: '" << kHinText << "'"; + EXPECT_STREQ(kHinText, chars.c_str()); + + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kKorText, &chars)); + EXPECT_STREQ(kKorText, chars.c_str()); +} + +TEST(NormstrngsTest, DetectsIncorrectText) { + for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) { + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, + kBadlyFormedHinWords[i], nullptr)) + << kBadlyFormedHinWords[i]; + } + for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) { + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, + kBadlyFormedThaiWords[i], nullptr)) + << kBadlyFormedThaiWords[i]; + } +} + +TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) { + std::string nonindic = "Here's some latin text."; + std::string dest; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, nonindic.c_str(), + &dest)) + << PrintString32WithUnicodes(nonindic); + EXPECT_EQ(dest, nonindic); +} + +TEST(NormstrngsTest, NoLonelyJoiners) { + std::string str = "x\u200d\u0d06\u0d34\u0d02"; + std::vector<std::string> glyphs; + // Returns true, but the joiner is gone. + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("x")); + EXPECT_EQ(glyphs[1], std::string("\u0d06")); + EXPECT_EQ(glyphs[2], std::string("\u0d34\u0d02")); +} + +TEST(NormstrngsTest, NoLonelyJoinersPlus) { + std::string str = "\u0d2a\u200d+\u0d2a\u0d4b"; + std::vector<std::string> glyphs; + // Returns true, but the joiner is gone. 
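+  // U+0D2A is MALAYALAM LETTER PA and U+0D4B the vowel sign OO; with the '+'
+  // in between, the ZWJ (U+200D) after the first PA has nothing to join to
+  // and is dropped, leaving the three glyphs checked below.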
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("\u0d2a")); + EXPECT_EQ(glyphs[1], std::string("+")); + EXPECT_EQ(glyphs[2], std::string("\u0d2a\u0d4b")); +} + +TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) { + std::string str = "\u200d+\u200c\u200d"; + // Returns true, but the joiners are gone. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, std::string("+")); + str = "\u200d\u200c\u200d"; + // Without the plus, the string is invalid. + std::string result; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)) + << PrintString32WithUnicodes(result); +} + +TEST(NormstrngsTest, JoinersStayInArabic) { + std::string str = "\u0628\u200c\u0628\u200d\u0628"; + // Returns true, string untouched. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 5, 5, 2, str); +} + +TEST(NormstrngsTest, DigitOK) { + std::string str = "\u0cea"; // Digit 4. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); +} + +TEST(NormstrngsTest, DandaOK) { + std::string str = "\u0964"; // Single danda. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); + str = "\u0965"; // Double danda. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); +} + +TEST(NormstrngsTest, AllScriptsRegtest) { + // Tests some valid text in a large number of scripts, some of which were + // found to be rejected by an earlier version. + const std::vector<std::pair<std::string, std::string>> kScriptText( + {{"Arabic", + " فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن" + "توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة " + "مجموعه هیچ اثری در فنون هنر و ادب و ترجمه، تقدیم پیشگاه ارجمند " + "سازنده تاریخ نگاه میکرد و به اصطلاح انسان و فطرت انسانی را زیربنای"}, + {"Armenian", + "անտիկ աշխարհի փիլիսոփաների կենսագրությունը, թե′ նրանց ուս-" + "պատրաստւում է դալ (բուլամա): Կովկասում կաթից նաև պատ-" + "Հոգաբարձութեան յղել այդ անձին յիմարութիւնը հաստա-" + "գծերը եւ միջագծերը կը համրուին վարէն վեր:"}, + {"Bengali", + "এসে দাঁড়ায় দাও খানি উঁচিয়ে নিয়ে । ঝরনার স্বচ্ছ জলে প্রতিবিম্বিত " + "পাঠিয়ে, গোবিন্দ স্মরণ করে, নির্ভয়ে রওনা হয়েছিল। তাতে সে " + "সুলতার। মনে পড়ে বিয়ের সময় বাবা এদের বাড়ি থেকে ঘুরে " + "কিন্তু তারপর মাতৃহৃদয় কেমন করে আছে? কী"}, + {"Cyrillic", + "достей, є ще нагороди й почесті, є хай і сумнівна, але слава, " + "вып., 96б). Параўн. найсвятший у 1 знач., насвятейший у 1 знач., " + "»Правді«, — гітлерівські окупанти винищували нижчі раси, після дру- " + "І знов майдан зачорнів од народу. Всередині чоло-"}, + {"Devanagari", + "डा॰ नै हात्तीमाथि चढेर त्यो भएनेर आइपुगे। राजालाई देखी " + "बाबतीत लिहिणे ही एक मोठीच जबाबदारी आहे. काकासाहेबांच्या कार्याचा " + "प्रबंध, आधोगिक प्रबंध तथा बैंकिंग एवम वाणिज्य आदि विषयों में " + "चित्रकृती दिल्या. शंभराहून अधिक देश आज आपापले चित्रपट निर्माण करीत"}, + {"Greek", + "Μέσα ένα τετράδιο είχα στριμώξει το πρώτο " + "νον αξίως τού ευαγγελίου τού χριστού πολιτεύεσθε, ίνα " + "οὐδεμία ὑπ' αὐτοῦ μνεία γίνεται τῶν οἰκείων χωρίων. " + "είτα την φάσιν αυτήν ην ούτος εποιήσατο κατά του Μίκω-"}, + {"Gujarati", + "ઉપહારગૃહે ને નાટ્યસ્થળે આ એ જ તેલ કડકડતું " + "શકી. ભાવવધારો અટકાવી નથી શકી અને બેકારીને " + "ત્યાં વાંકુથી પાછે આવ્યો, ચોરીનો માલ સોંપવા ! " + "કહી. 
એણે રેશમના કપડામાં વીંટી રાખેલ કુંવરીની છબી"}, + {"Gurmukhi", + "ਯਾਦ ਰਹੇ ਕਿ ‘ਨਫਰਤ ’ ਦਾ ਵਿਸ਼ਾ ਕ੍ਰਾਤੀ ਨਹੀ ਹੈ ਅਤੇ ਕਵੀ ਦੀ ਇਹ " + "ਮਹਾਂ ਨੰਦਾ ਕੋਲ ਇਕ ਚੀਜ਼ ਸੀ ਉਹ ਸੀ ਸਚ, ਕੋਰਾ ਸਚ, ਬੇਧਤ੍ਰਕ ਕਹਿੳ " + "ਭੂਰਾ ਸਾਨੂੰ ਥੜਾ ਚੰਗਾ ਲਗਦਾ ਸੀ । ਉਸ ਦਾ ਇਕ ਪੈਰ ਜਨਮ ਤੋ " + "ਨੂੰ ਇਹ ਅਧਿਕਾਰ ਦਿੱਤਾ ਕਿ ਉਹ ਸਿੱਖ ਵਿਰੋਧ ਦਾ ਸੰਗਠਨ ਕਰੇ ਅਤੇ 3 ਸਤੰਬਰ,"}, + {"Hangul", + "로 들어갔다. 이대통령은 아이젠하워 대통령의 뒷모습을 보면서 " + "그것뿐인 줄 아요? 노름도 했다 캅니다. 빌어묵을 놈이 그러 " + "의 가장 과학적 태도이며, 우리 역사를 가장 정확하게 학습할 수 있는 " + "마르크스 레" + "각하는 그는 그들의 식사보장을 위해 때때로 집에"}, + {"HanS", + "大凡世界上的先生可 分 三 种: 第一种只会教书, 只会拿一 " + "书像是探宝一样,在茶叶店里我买过西湖龙井﹑黄山毛峰﹑福建的铁观音﹑大红" + " " + "持 “左” 倾冒险主义的干部,便扣上 “富农 " + "笑说:“我听说了,王总工程师也跟我说过了,只是工作忙,谁"}, + {"HanT", + "叁、 銀行資產管理的群組分析模式 " + "民國六十三年,申請就讀台灣大學歷史研究所,並從事著述," + "質言之﹐在社會結構中﹐性質﹑特徵﹑地位相類似的一羣人﹐由於 " + "董橋,一九四二年生,福建晉江人,國立成功大學外"}, + {"Hebrew", + " אֵ-לִי, אֵ-לִי, כֵּיַצד מְטַפְּסִים בְּקִירוֹת שֶׁל זְכוּכִי" + " הראשון חוצה אותי שוב. אני בסיבוב הרביעי, הוא בטח מתחיל את" + " ווערטער געהאט, אבער דער עיקר איז ניט דאָס וואָרט, נאָר" + " על גחלת היהדות המקורית בעירך, נתת צביון ואופי מיוחד"}, + {"Japanese", + "は異民族とみなされていた。楚の荘王(前613〜前 " + "を詳細に吟味する。実際の治療活動の領域は便宜上、(1) 障害者 " + "困難性は多角企業の場合原則として部門別に判断されている.). " + "☆ご希望の団体には見本をお送りします"}, + {"Kannada", + "ಕೂಡ ಯುದ್ಧ ಮಾಡಿ ಜಯಪಡೆ. ನಂತರ ನಗರದೊಳಕ್ಕೆ ನಡೆ ಇದನ್ನು " + "ಅಸಹ್ಯದೃಶ್ಯ ಯಾರಿಗಾದರೂ ನಾಚಿಕೆತರುವಂತಹದಾಗಿದೆ. ಆರೋಗ್ಯ ದೃಷ್ಟಿ " + "ಯಾಗಲಿ, ಮೋಹನನಾಗಲಿ ಇಂಥ ಬಿಸಿಲಿನಲ್ಲಿ ಎಂದೂ ಬಹಳ ಹೊತ್ತು " + "\"ಇದೆ...ಖಂಡಿತಾ ಇದೆ\" ಅಂದ ಮನಸ್ಸಿನಲ್ಲಿಯೇ ವಂದಿಸುತ್ತಾ,"}, + {"Khmer", + "សិតសក់និងផ្លាស់សម្លៀកបំពាក់ពេលយប់ចេញ។ " + "និយាយអំពីនគរនេះ ប្រាប់ដល់លោកទាំងមូលឲ្យដឹងច្បាស់លាស់អំពី " + "កន្លះកាថាសម្រាប់ទន្ទេញឲ្យងាយចាំ បោះពុម្ពនៅក្នុងទ្រង់ទ្រាយបច្ចុប្បន្ន " + "ឯកសារនេះបានផ្សព្វផ្សាយនៅក្នុងសន្និសីទ"}, + {"Lao", + "ເອີຍ ! ຟັງສຽງຟ້າມັນຮ້ອງຮ່ວນ ມັນດັງໄກໆ ເອີຍ " + "ໄດລຽງດູລາວມາດວບຄວາມລາບາກຫລາຍ; " + "ບາງໄດ້ ເຈົ້າລອງສູ້ບໍ່ໄດ້ຈຶ່ງຫນີລົງມາວຽງຈັນ. " + "ລົບອອກຈາກ 3 ເຫລືອ 1, ຂ້ອຍຂຽນ 1 (1)"}, + {"Latin", + "režisoru, palīdzēja to manu domīgo, kluso Dzejas metru ielikt " + "Ešte nedávno sa chcel mladý Novomeský „liečiť” " + "tiivisia kysymyksiä, mistä seuraa, että spekula- | don luonteesta " + "Grabiel Sanchez, yang bertani selama 120 tahun meninggal"}, + {"Malayalam", + "അമൂർത്തചിത്രമായിരിക്കും. ഛേ! ആ വീട്ടിലേക്ക് അവളൊന്നിച്ച് പോകേണ്ടതാ " + "മൃഗങ്ങൾക്ക് എന്തെക്കിലും പറ്റിയാൽ മാത്രം ഞാനതു " + "വെലക്ക് വേണമെങ്കിൽ തരാം. എന്തോ തരും? പറ. " + "എല്ലാം കഴിഞ്ഞ് സീനിയറിന്റെ അടുത്തു ചെന്ന് കാൽതൊട്ട"}, + {"Tamil", + "பொருத்தமாகப் பாடினாள் நம் ஔவைப் பாட்டி. காவிரி " + "உள்ளடக்கி நிற்பது விநோத வார்த்தையின் அஃறிணை " + "சூரிய கிரஹண சமயத்தில் குருக்ஷேத்திரம் செல்வது " + "காலங்களில் வெளியே போகும்பொழுது, 'ஸார்', 'ஸார்',"}, + {"Telugu", + "1892లో ఆమె 10వ సంవత్సరంలో గుంటూరు తాలూకా వేములాపాడు " + "ఫండ్స్ చట్టము'నందు చేయబడెను. తరువాత క్రీ. శ. " + "సంచారము చేయును. మీరు ఇప్పుడే కాళకాలయమునకు " + "ఎంతటి సరళమైన భాషలో వ్రాశాడో విశదమవుతుంది. పైగా ఆనాటి భాష"}, + {"Thai", + "อ้อ! กับนัง....แม่ยอดพระกลิ่น นั่นเอง ! หรับก็ย่อมจะรู้โดยชัดเจนว่า " + "ถ้าตราบใดยังมีเรือปืนอยู่ใกล้ ๆ แล้ว ตราบนั้น " + "พระดำรินี้ ที่มีคตีทำกรวยหมากและธูปเทียน " + "อันยานมีเรือเปนต้นฃ้ามยาก ฯ เพราะว่าแม่น้ำนั่นมีน้ำใสยิ่ง แม้เพียง"}, + {"Vietnamese", + "vợ đến tai mụ hung thần Xăng-tô- mê-a. Mụ vô cùng " + "chiếc xe con gấu chạy qua nhà. 
Nhưng thỉnh thoảng " + "hòa hoãn với người Pháp để cho họ được dựng một ngôi nhà thờ nhỏ bằng " + "Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}}); + + for (const auto& p : kScriptText) { + std::string normalized; + EXPECT_TRUE(tesseract::NormalizeUTF8String( + tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, + tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized)) + << "Script=" << p.first << " text=" << p.second; + } +} + +TEST(NormstrngsTest, IsWhitespace) { + // U+0020 is whitespace + EXPECT_TRUE(IsWhitespace(' ')); + EXPECT_TRUE(IsWhitespace('\t')); + EXPECT_TRUE(IsWhitespace('\r')); + EXPECT_TRUE(IsWhitespace('\n')); + // U+2000 through U+200A + for (char32 ch = 0x2000; ch <= 0x200A; ++ch) { + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + EXPECT_TRUE(IsWhitespace(ch)); + } + // U+3000 is whitespace + EXPECT_TRUE(IsWhitespace(0x3000)); + // ZWNBSP is not considered a space. + EXPECT_FALSE(IsWhitespace(0xFEFF)); +} + +TEST(NormstrngsTest, SpanUTF8Whitespace) { + EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\n")); + EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\nabc")); + EXPECT_EQ(0, SpanUTF8Whitespace("abc \t\r\nabc")); + EXPECT_EQ(0, SpanUTF8Whitespace("")); +} + +TEST(NormstrngsTest, SpanUTF8NotWhitespace) { + const char kHinText[] = "पिताने विवाह"; + const char kKorText[] = "이는 것으로 다시 넣을"; + const char kMixedText[] = "والفكر 123 والصراع abc"; + + EXPECT_EQ(0, SpanUTF8NotWhitespace("")); + EXPECT_EQ(0, SpanUTF8NotWhitespace(" abc")); + EXPECT_EQ(0, SpanUTF8NotWhitespace("\rabc")); + EXPECT_EQ(0, SpanUTF8NotWhitespace("\tabc")); + EXPECT_EQ(0, SpanUTF8NotWhitespace("\nabc")); + EXPECT_EQ(3, SpanUTF8NotWhitespace("abc def")); + EXPECT_EQ(18, SpanUTF8NotWhitespace(kHinText)); + EXPECT_EQ(6, SpanUTF8NotWhitespace(kKorText)); + EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText)); +} + +// Test that the method clones the util/utf8/unilib definition of +// interchange validity. +TEST(NormstrngsTest, IsInterchangeValid) { +#ifdef INCLUDE_TENSORFLOW + const int32_t kMinUnicodeValue = 33; + const int32_t kMaxUnicodeValue = 0x10FFFF; + for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch)); + } +#else + GTEST_SKIP(); +#endif +} + +// Test that the method clones the util/utf8/unilib definition of +// 7-bit ASCII interchange validity. +TEST(NormstrngsTest, IsInterchangeValid7BitAscii) { +#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW) + const int32_t kMinUnicodeValue = 33; + const int32_t kMaxUnicodeValue = 0x10FFFF; + for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + std::string str = EncodeAsUTF8(ch); + EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str), + IsInterchangeValid7BitAscii(ch)); + } +#else + // Skipped because of missing UniLib::IsInterchangeValid7BitAscii. + GTEST_SKIP(); +#endif +} + +// Test that the method clones the util/utf8/unilib definition of +// fullwidth-halfwidth . +TEST(NormstrngsTest, FullwidthToHalfwidth) { + // U+FF21 -> U+0041 (Latin capital letter A) + EXPECT_EQ('A', FullwidthToHalfwidth(0xFF21)); + // U+FF05 -> U+0025 (percent sign) + EXPECT_EQ('%', FullwidthToHalfwidth(0xFF05)); + // U+FFE6 -> U+20A9 (won sign) + EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6)); + +#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW) + // Skipped because of missing UniLib::FullwidthToHalfwidth. 
+ const int32_t kMinUnicodeValue = 33; + const int32_t kMaxUnicodeValue = 0x10FFFF; + for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { + if (!IsValidCodepoint(ch)) continue; + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + std::string str = EncodeAsUTF8(ch); + const std::string expected_half_str = + UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true); + EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch))); + } +#endif +} + +} // namespace tesseract diff --git a/tesseract/unittest/normstrngs_test.h b/tesseract/unittest/normstrngs_test.h new file mode 100644 index 00000000..3b459348 --- /dev/null +++ b/tesseract/unittest/normstrngs_test.h @@ -0,0 +1,84 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ +#define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ + +#include <sstream> // for std::stringstream +#include <string> +#include <vector> +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include <tesseract/unichar.h> + +namespace tesseract { + +inline std::string CodepointList(const std::vector<char32>& str32) { + std::stringstream result; + int total_chars = str32.size(); + result << std::hex; + for (int i = 0; i < total_chars; ++i) { + result << "[" << str32[i] << "]"; + } + return result.str(); +} + +inline std::string PrintString32WithUnicodes(const std::string& str) { + std::vector<char32> str32 = UNICHAR::UTF8ToUTF32(str.c_str()); + return absl::StrCat("\"", str, "\" ", CodepointList(str32)); +} + +inline std::string PrintStringVectorWithUnicodes(const std::vector<std::string>& glyphs) { + std::string result; + for (const auto& s : glyphs) { + result += "Glyph:"; + result += PrintString32WithUnicodes(s) + "\n"; + } + return result; +} + +inline void ExpectGraphemeModeResults(const std::string& str, UnicodeNormMode u_mode, + int unicode_count, int glyph_count, + int grapheme_count, + const std::string& target_str) { + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), unicode_count) + << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), glyph_count) + << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), grapheme_count) + << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, + GraphemeNormMode::kSingleString, + true, str.c_str(), 
&glyphs)); + EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, glyphs[0]); + std::string result; + EXPECT_TRUE(NormalizeUTF8String( + u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result)); + EXPECT_EQ(target_str, result); +} + +} // namespace tesseract + +#endif // TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ diff --git a/tesseract/unittest/nthitem_test.cc b/tesseract/unittest/nthitem_test.cc new file mode 100644 index 00000000..4d08ffae --- /dev/null +++ b/tesseract/unittest/nthitem_test.cc @@ -0,0 +1,120 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kdpair.h" + +#include "include_gunit.h" + +namespace tesseract { + +int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0, -32767, 6, 7}; + +// The fixture for testing GenericHeap and DoublePtr. +class NthItemTest : public testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + + public: + virtual ~NthItemTest(); + // Pushes the test data onto the KDVector. + void PushTestData(KDVector* v) { + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + IntKDPair pair(test_data[i], i); + v->push_back(pair); + } + } +}; + +// Destructor. +// It is defined here, so the compiler can create a single vtable +// instead of a weak vtable (fixes compiler warning). +NthItemTest::~NthItemTest() = default; + +// Tests basic results. +TEST_F(NthItemTest, GeneralTest) { + KDVector v; + // Push the test data onto the KDVector. + PushTestData(&v); + // Get the min item. + int index = v.choose_nth_item(0); + // The result is -32767. + EXPECT_EQ(-32767, v[index].key()); + // Get the max item. + index = v.choose_nth_item(v.size() - 1); + // The result is 65536. + EXPECT_EQ(65536, v[index].key()); + // Invalid items are silently truncated to valid. + // Get the min item. + index = v.choose_nth_item(-1); + // The result is -32767. + EXPECT_EQ(-32767, v[index].key()); + // Get the max item. + index = v.choose_nth_item(v.size()); + // The result is 65536. + EXPECT_EQ(65536, v[index].key()); +} + +// Tests results on boring data with lots of duplication. +TEST_F(NthItemTest, BoringTest) { + KDVector v; + // Push the test data onto the KDVector. + int test_data[] = {8, 8, 8, 8, 8, 7, 7, 7, 7}; + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + IntKDPair pair(test_data[i], i); + v.push_back(pair); + } + // The 3rd item is 7 but the 4th is 8.. + int index = v.choose_nth_item(3); + // The result is 7. + EXPECT_EQ(7, v[index].key()); + index = v.choose_nth_item(4); + // The result is 8. + EXPECT_EQ(8, v[index].key()); + // Get the min item. + index = v.choose_nth_item(0); + // The result is 7. + EXPECT_EQ(7, v[index].key()); + // Get the max item. + index = v.choose_nth_item(v.size() - 1); + // The result is 8. + EXPECT_EQ(8, v[index].key()); +} + +// Tests that a unique median in an odd-size array is found correctly. +TEST_F(NthItemTest, UniqueTest) { + KDVector v; + // Push the test data onto the KDVector. 
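+  // test_data holds 13 keys, so choose_nth_item(size() / 2) below selects the
+  // true median, i.e. the 7th smallest key.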
+ PushTestData(&v); + // Get the median item. + int index = v.choose_nth_item(v.size() / 2); + // The result is 6, it started out at index 11. + EXPECT_EQ(6, v[index].key()); + EXPECT_EQ(11, v[index].data()); +} + +// Tests that an equal median is found correctly. +TEST_F(NthItemTest, EqualTest) { + KDVector v; + // Push the test data onto the KDVector. + PushTestData(&v); + // Add an extra 8. This makes the median 7. + IntKDPair pair(8, 13); + v.push_back(pair); + // Get the median item. + int index = v.choose_nth_item(v.size() / 2); + // The result is 7, it started out at index 4 or 12. + EXPECT_EQ(7, v[index].key()); + EXPECT_TRUE(v[index].data() == 4 || v[index].data() == 12); +} + +} // namespace tesseract diff --git a/tesseract/unittest/osd_test.cc b/tesseract/unittest/osd_test.cc new file mode 100644 index 00000000..5100a6f9 --- /dev/null +++ b/tesseract/unittest/osd_test.cc @@ -0,0 +1,133 @@ +/////////////////////////////////////////////////////////////////////// +// File: osd_test.cc +// Description: OSD Tests for Tesseract. +// Author: ShreeDevi Kumar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// based on https://gist.github.com/amitdo/7c7a522004dd79b398340c9595b377e1 + +// expects clones of tessdata, tessdata_fast and tessdata_best repos + +//#include "log.h" +#include <iostream> +#include <memory> // std::unique_ptr +#include <string> +#include <tesseract/baseapi.h> +#include "include_gunit.h" +#include "allheaders.h" + +namespace tesseract { + +class TestClass : public testing::Test { + protected: +}; + +#ifndef DISABLED_LEGACY_ENGINE +static void OSDTester(int expected_deg, const char* imgname, const char* tessdatadir) { + // log.info() << tessdatadir << " for image: " << imgname << std::endl; + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, "osd")) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + int orient_deg; + float orient_conf; + const char* script_name; + float script_conf; + bool detected = api->DetectOrientationScript(&orient_deg, &orient_conf, + &script_name, &script_conf); + ASSERT_FALSE(!detected) << "Failed to detect OSD."; + printf( + "************ Orientation in degrees: %d, Orientation confidence: %.2f\n" + " Script: %s, Script confidence: %.2f\n", + orient_deg, orient_conf, script_name, script_conf); + EXPECT_EQ(expected_deg, orient_deg); + api->End(); + pixDestroy(&image); +} +#endif + +class OSDTest : public TestClass, + public ::testing::WithParamInterface< + std::tuple<int, const char*, const char*>> {}; + +TEST_P(OSDTest, MatchOrientationDegrees) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because TessBaseAPI::DetectOrientationScript is missing. 
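+  // Orientation/script detection is provided by the legacy engine, so there
+  // is nothing meaningful to run in this configuration.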
+ GTEST_SKIP(); +#else + OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam())); +#endif +} + +INSTANTIATE_TEST_SUITE_P( + TessdataEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR))); + +INSTANTIATE_TEST_SUITE_P( + TessdataBestEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR "_best"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastRotated90, OSDTest, + ::testing::Combine(::testing::Values(90), + ::testing::Values(TESTING_DIR + "/phototest-rotated-R.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastRotated180, OSDTest, + ::testing::Combine(::testing::Values(180), + ::testing::Values(TESTING_DIR + "/phototest-rotated-180.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastRotated270, OSDTest, + ::testing::Combine(::testing::Values(270), + ::testing::Values(TESTING_DIR + "/phototest-rotated-L.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastDevaRotated270, OSDTest, + ::testing::Combine(::testing::Values(270), + ::testing::Values(TESTING_DIR + "/devatest-rotated-270.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastDeva, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/devatest.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +} // namespace diff --git a/tesseract/unittest/pagesegmode_test.cc b/tesseract/unittest/pagesegmode_test.cc new file mode 100644 index 00000000..60dcf8da --- /dev/null +++ b/tesseract/unittest/pagesegmode_test.cc @@ -0,0 +1,114 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(_WIN32) +#include <io.h> // for _access +#else +#include <unistd.h> // for access +#endif +#include <string> +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "helpers.h" +#include "log.h" +#include "include_gunit.h" + +namespace tesseract { + +// Replacement for std::filesystem::exists (C++-17) +static bool file_exists(const char* filename) { +#if defined(_WIN32) + return _access(filename, 0) == 0; +#else + return access(filename, 0) == 0; +#endif +} + +// The fixture for testing Tesseract. 
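+// It loads a test image, OCRs a sub-rectangle of it under a chosen page
+// segmentation mode, and compares the recognized text against an expected
+// string.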
+class PageSegModeTest : public testing::Test { + protected: + PageSegModeTest() = default; + ~PageSegModeTest() { + pixDestroy(&src_pix_); + } + + void SetUp() override { + static std::locale system_locale(""); + std::locale::global(system_locale); + } + + void SetImage(const char* filename) { + pixDestroy(&src_pix_); + src_pix_ = pixRead(filename); + api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY); + api_.SetImage(src_pix_); + } + + // Tests that the given rectangle produces exactly the given text in the + // given segmentation mode (after chopping off the last 2 newlines.) + void VerifyRectText(tesseract::PageSegMode mode, const char* str, + int left, int top, int width, int height) { + api_.SetPageSegMode(mode); + api_.SetRectangle(left, top, width, height); + char* result = api_.GetUTF8Text(); + chomp_string(result); + chomp_string(result); + EXPECT_STREQ(str, result); + delete[] result; + } + + // Tests that the given rectangle does NOT produce the given text in the + // given segmentation mode. + void NotRectText(tesseract::PageSegMode mode, const char* str, + int left, int top, int width, int height) { + api_.SetPageSegMode(mode); + api_.SetRectangle(left, top, width, height); + char* result = api_.GetUTF8Text(); + EXPECT_STRNE(str, result); + delete[] result; + } + + Pix* src_pix_ = nullptr; + std::string ocr_text_; + tesseract::TessBaseAPI api_; +}; + +// Tests the single-word segmentation mode, and that it performs correctly +// and differently to line and block mode. +TEST_F(PageSegModeTest, WordTest) { + std::string filename = file::JoinPath(TESTING_DIR, "segmodeimg.tif"); + if (!file_exists(filename.c_str())) { + LOG(INFO) << "Skip test because of missing " << filename << '\n'; + GTEST_SKIP(); + } else { + SetImage(filename.c_str()); + // Test various rectangles around the inverse page number. + VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1419, 264, 69, 34); + VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1411, 252, 78, 62); + VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1396, 218, 114, 102); + // Test a random pair of words as a line + VerifyRectText(tesseract::PSM_SINGLE_LINE, + "What should", 237, 393, 256, 36); + // Test a random pair of words as a word + VerifyRectText(tesseract::PSM_SINGLE_WORD, + "Whatshould", 237, 393, 256, 36); + // Test single block mode. + VerifyRectText(tesseract::PSM_SINGLE_BLOCK, + "both the\nfrom the", 237, 450, 172, 94); + // But doesn't work in line or word mode. + NotRectText(tesseract::PSM_SINGLE_LINE, + "both the\nfrom the", 237, 450, 172, 94); + NotRectText(tesseract::PSM_SINGLE_WORD, + "both the\nfrom the", 237, 450, 172, 94); + } +} + +} // namespace diff --git a/tesseract/unittest/pango_font_info_test.cc b/tesseract/unittest/pango_font_info_test.cc new file mode 100644 index 00000000..5d1c7af7 --- /dev/null +++ b/tesseract/unittest/pango_font_info_test.cc @@ -0,0 +1,334 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
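+//
+// The tests below cover PangoFontInfo (font description parsing, spacing
+// metrics, rendering coverage) and FontUtils (font availability, listing and
+// selection), using the fonts installed under the testing directory.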
+ +#include <cstdio> +#include <string> +#include <pango/pango.h> +#include "include_gunit.h" +#include "commandlineflags.h" +#include "fileio.h" +#include "pango_font_info.h" +#include "absl/strings/str_cat.h" // for absl::StrCat +#include "gmock/gmock-matchers.h" // for EXPECT_THAT +#ifdef INCLUDE_TENSORFLOW +#include "util/utf8/unicodetext.h" // for UnicodeText +#endif + +namespace tesseract { + +// Fonts in testdata directory +const char* kExpectedFontNames[] = { + "Arab", + "Arial Bold Italic", + "DejaVu Sans Ultra-Light", + "Lohit Hindi", +#if PANGO_VERSION <= 12005 + "Times New Roman", +#else + "Times New Roman,", // Pango v1.36.2 requires a trailing ',' +#endif + "UnBatang", + "Verdana" +}; + +// Sample text used in tests. +const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع"; +const char kEngText[] = "the quick brown fox jumps over the lazy dog"; +const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; +const char kKorText[] = "이는 것으로"; +// Hindi words containing illegal vowel sequences. +const char* kBadlyFormedHinWords[] = { +#if PANGO_VERSION <= 12005 + "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस", +#endif + // Pango v1.36.2 will render the above words even though they are invalid. + "प्रंात", nullptr +}; + +static PangoFontMap* font_map; + +class PangoFontInfoTest : public ::testing::Test { + protected: + void SetUp() override { + if (!font_map) { + font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); + } + pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map)); + } + + // Creates a fake fonts.conf file that points to the testdata fonts for + // fontconfig to initialize with. + static void SetUpTestCase() { + static std::locale system_locale(""); + std::locale::global(system_locale); + + FLAGS_fonts_dir = TESTING_DIR; + FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; + file::MakeTmpdir(); + PangoFontInfo::SoftInitFontConfig(); // init early + } + + PangoFontInfo font_info_; +}; + +TEST_F(PangoFontInfoTest, TestNonDefaultConstructor) { + PangoFontInfo font("Arial Bold Italic 12"); + EXPECT_EQ(12, font.font_size()); + EXPECT_EQ("Arial", font.family_name()); +} + +TEST_F(PangoFontInfoTest, DoesParseFontDescriptionName) { + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Bold Italic 12")); + EXPECT_EQ(12, font_info_.font_size()); + EXPECT_EQ("Arial", font_info_.family_name()); + + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Verdana 10")); + EXPECT_EQ(10, font_info_.font_size()); + EXPECT_EQ("Verdana", font_info_.family_name()); + + EXPECT_TRUE(font_info_.ParseFontDescriptionName("DejaVu Sans Ultra-Light")); + EXPECT_EQ("DejaVu Sans", font_info_.family_name()); +} + +TEST_F(PangoFontInfoTest, DoesParseMissingFonts) { + // Font family one of whose faces exists but this one doesn't. + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12")); + EXPECT_EQ(12, font_info_.font_size()); + EXPECT_EQ("Arial", font_info_.family_name()); + + // Font family that doesn't exist in testdata. It will still parse the + // description name. But without the file, it will not be able to populate + // some font family details, like is_monospace(). 
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("Georgia 10")); + EXPECT_EQ(10, font_info_.font_size()); + EXPECT_EQ("Georgia", font_info_.family_name()); +} + +TEST_F(PangoFontInfoTest, DoesGetSpacingProperties) { + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12")); + int x_bearing, x_advance; + EXPECT_TRUE(font_info_.GetSpacingProperties("A", &x_bearing, &x_advance)); + EXPECT_GT(x_advance, 0); + EXPECT_TRUE(font_info_.GetSpacingProperties("a", &x_bearing, &x_advance)); + EXPECT_GT(x_advance, 0); +} + +TEST_F(PangoFontInfoTest, CanRenderString) { + font_info_.ParseFontDescriptionName("Verdana 12"); + EXPECT_TRUE(font_info_.CanRenderString(kEngText, strlen(kEngText))); + + font_info_.ParseFontDescriptionName("UnBatang 12"); + EXPECT_TRUE(font_info_.CanRenderString(kKorText, strlen(kKorText))); + + font_info_.ParseFontDescriptionName("Lohit Hindi 12"); + EXPECT_TRUE(font_info_.CanRenderString(kHinText, strlen(kHinText))); +} + +TEST_F(PangoFontInfoTest, CanRenderLigature) { + font_info_.ParseFontDescriptionName("Arab 12"); + const char kArabicLigature[] = "لا"; + EXPECT_TRUE( + font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature))); + + printf("Next word\n"); + EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText))); +} + +TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) { + font_info_.ParseFontDescriptionName("Verdana 12"); + EXPECT_FALSE(font_info_.CanRenderString(kKorText, strlen(kKorText))); +} + +TEST_F(PangoFontInfoTest, CannotRenderInvalidString) { + font_info_.ParseFontDescriptionName("Lohit Hindi 12"); + for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) { + EXPECT_FALSE(font_info_.CanRenderString(kBadlyFormedHinWords[i], + strlen(kBadlyFormedHinWords[i]))) + << "Can render " << kBadlyFormedHinWords[i]; + } +} + +TEST_F(PangoFontInfoTest, CanDropUncoveredChars) { + font_info_.ParseFontDescriptionName("Verdana 12"); + // Verdana cannot render the "ff" ligature + std::string word = "office"; + EXPECT_EQ(1, font_info_.DropUncoveredChars(&word)); + EXPECT_EQ("oice", word); + + // Don't drop non-letter characters like word joiners. + const char* kJoiners[] = { + "\u2060", // U+2060 (WJ) + "\u200C", // U+200C (ZWJ) + "\u200D" // U+200D (ZWNJ) + }; + for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) { + word = kJoiners[i]; + EXPECT_EQ(0, font_info_.DropUncoveredChars(&word)); + EXPECT_STREQ(kJoiners[i], word.c_str()); + } +} + +// ------------------------ FontUtils ------------------------------------ + +class FontUtilsTest : public ::testing::Test { + protected: + void SetUp() override { + file::MakeTmpdir(); + } + // Creates a fake fonts.conf file that points to the testdata fonts for + // fontconfig to initialize with. 
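+  // Setting FLAGS_fonts_dir and FLAGS_fontconfig_tmpdir below is what points
+  // the generated fontconfig configuration at the test fonts.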
+ static void SetUpTestCase() { + FLAGS_fonts_dir = TESTING_DIR; + FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; + if (!font_map) { + font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); + } + pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map)); + } + +#ifdef INCLUDE_TENSORFLOW + void CountUnicodeChars(const char* utf8_text, + std::unordered_map<char32, int64_t>* ch_map) { + ch_map->clear(); + UnicodeText ut; + ut.PointToUTF8(utf8_text, strlen(utf8_text)); + for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) { +#if 0 + if (UnicodeProps::IsWhitespace(*it)) continue; +#else + if (std::isspace(*it)) continue; +#endif + ++(*ch_map)[*it]; + } + } +#endif +}; + +TEST_F(FontUtilsTest, DoesFindAvailableFonts) { + EXPECT_TRUE(FontUtils::IsAvailableFont("Arial Bold Italic")); + EXPECT_TRUE(FontUtils::IsAvailableFont("Verdana")); + EXPECT_TRUE(FontUtils::IsAvailableFont("DejaVu Sans Ultra-Light")); + + // Test that we can support font name convention for Pango v1.30.2 even when + // we are running an older version. + EXPECT_TRUE(FontUtils::IsAvailableFont("Times New Roman,")); +} + +TEST_F(FontUtilsTest, DoesDetectMissingFonts) { + // Only bold italic face is available. + EXPECT_FALSE(FontUtils::IsAvailableFont("Arial")); + // Don't have a ttf for the Courier family. + EXPECT_FALSE(FontUtils::IsAvailableFont("Courier")); + // Pango "synthesizes" the italic font from the available Verdana Regular and + // includes it in its list, but it is not really loadable. + EXPECT_FALSE(FontUtils::IsAvailableFont("Verdana Italic")); + // We have "Dejavu Sans Ultra-Light" but not its medium weight counterpart. + EXPECT_FALSE(FontUtils::IsAvailableFont("DejaVu Sans")); +} + +TEST_F(FontUtilsTest, DoesListAvailableFonts) { + const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts(); + EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames)); + for (auto& font : fonts) { + PangoFontInfo font_info; + EXPECT_TRUE(font_info.ParseFontDescriptionName(font)); + } +} + +#ifdef INCLUDE_TENSORFLOW +TEST_F(FontUtilsTest, DoesFindBestFonts) { + std::string fonts_list; + std::unordered_map<char32, int64_t> ch_map; + CountUnicodeChars(kEngText, &ch_map); + EXPECT_EQ(26, ch_map.size()); // 26 letters + std::vector<std::pair<const char*, std::vector<bool> > > font_flags; + std::string best_list = FontUtils::BestFonts(ch_map, &font_flags); + EXPECT_TRUE(best_list.size()); + // All fonts except Lohit Hindi should render English text. + EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size()); + + CountUnicodeChars(kKorText, &ch_map); + best_list = FontUtils::BestFonts(ch_map, &font_flags); + EXPECT_TRUE(best_list.size()); + // Only UnBatang font family is able to render korean. 
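+  // BestFonts() should therefore return exactly one usable font entry,
+  // and that entry should name UnBatang.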
+ EXPECT_EQ(1, font_flags.size()); + EXPECT_STREQ("UnBatang", font_flags[0].first); +} +#endif + +TEST_F(FontUtilsTest, DoesSelectFont) { + const char* kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr}; + const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr}; + for (int i = 0; kLangText[i] != nullptr; ++i) { + SCOPED_TRACE(kLangNames[i]); + std::vector<std::string> graphemes; + std::string selected_font; + EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), + &selected_font, &graphemes)); + EXPECT_TRUE(selected_font.size()); + EXPECT_TRUE(graphemes.size()); + } +} + +TEST_F(FontUtilsTest, DoesFailToSelectFont) { + const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع"; + std::vector<std::string> graphemes; + std::string selected_font; + EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), + &selected_font, &graphemes)); +} + +#if 0 +// Needs fix. FontUtils::GetAllRenderableCharacters was removed +// because of deprecated pango_coverage_max. +TEST_F(FontUtilsTest, GetAllRenderableCharacters) { + const int32_t kHindiChar = 0x0905; + const int32_t kArabicChar = 0x0623; + const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator + const int32_t kOghamChar = 0x1680; // Ogham space mark + std::vector<bool> unicode_mask; + FontUtils::GetAllRenderableCharacters(&unicode_mask); + EXPECT_TRUE(unicode_mask['A']); + EXPECT_TRUE(unicode_mask['1']); + EXPECT_TRUE(unicode_mask[kHindiChar]); + EXPECT_TRUE(unicode_mask[kArabicChar]); + EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian. +#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham + EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham. +#endif + unicode_mask.clear(); + + std::vector<std::string> selected_fonts; + selected_fonts.push_back("Lohit Hindi"); + FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask); + EXPECT_TRUE(unicode_mask['1']); + EXPECT_TRUE(unicode_mask[kHindiChar]); + EXPECT_FALSE(unicode_mask['A']); // Lohit doesn't render English, + EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic, + EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian, + EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham. + unicode_mask.clear(); + + // Check that none of the included fonts cover the Mongolian or Ogham space + // characters. + for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) { + SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f])); + FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask); +#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham + EXPECT_FALSE(unicode_mask[kOghamChar]); +#endif + EXPECT_FALSE(unicode_mask[kMongolianChar]); + unicode_mask.clear(); + } +} +#endif + +} // namespace diff --git a/tesseract/unittest/paragraphs_test.cc b/tesseract/unittest/paragraphs_test.cc new file mode 100644 index 00000000..16134cac --- /dev/null +++ b/tesseract/unittest/paragraphs_test.cc @@ -0,0 +1,705 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> // for std::string + +#include "absl/strings/str_cat.h" // for absl::StrCat +#include "absl/strings/str_join.h" // for absl::StrJoin +#include "absl/strings/str_split.h" // for absl::StrSplit + +#include "include_gunit.h" // for TEST +#include "log.h" // for LOG + +#include "genericvector.h" +// ccmain +#include "paragraphs.h" +#include "paragraphs_internal.h" +// ccstruct +#include "ocrpara.h" + +namespace tesseract { + +// Functions for making monospace ASCII trial text for the paragraph detector. +const ParagraphJustification kLeft = JUSTIFICATION_LEFT; +const ParagraphJustification kCenter = JUSTIFICATION_CENTER; +const ParagraphJustification kRight = JUSTIFICATION_RIGHT; +const ParagraphJustification kUnknown = JUSTIFICATION_UNKNOWN; + +enum TextModelInputType { + PCONT = 0, // Continuation line of a paragraph (default). + PSTART = 1, // First line of a paragraph. + PNONE = 2, // Not a paragraph line. +}; + +struct TextAndModel { + const char* ascii; + TextModelInputType model_type; + + // fields corresponding to PARA (see ccstruct/ocrpara.h) + ParagraphModel model; + bool is_very_first_or_continuation; + bool is_list_item; +}; + +// Imagine that the given text is typewriter ASCII with each character ten +// pixels wide and twenty pixels high and return an appropriate row_info. +void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) { + const int kCharWidth = 10; + const int kLineSpace = 30; + info->text = text; + info->has_leaders = + strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr; + info->has_drop_cap = false; + info->pix_ldistance = info->pix_rdistance = 0; + info->average_interword_space = kCharWidth; + info->pix_xheight = kCharWidth; + info->lword_text = info->rword_text = ""; + info->ltr = true; + + std::vector<std::string> words = absl::StrSplit(text, ' ', absl::SkipEmpty()); + info->num_words = words.size(); + if (info->num_words < 1) return; + + info->lword_text = words[0].c_str(); + info->rword_text = words[words.size() - 1].c_str(); + int lspace = 0; + while (lspace < info->text.size() && text[lspace] == ' ') { + lspace++; + } + int rspace = 0; + while (rspace < info->text.size() && + text[info->text.size() - rspace - 1] == ' ') { + rspace++; + } + + int top = -kLineSpace * row_number; + int bottom = top - kLineSpace; + int row_right = kCharWidth * info->text.size(); + int lword_width = kCharWidth * info->lword_text.size(); + int rword_width = kCharWidth * info->rword_text.size(); + info->pix_ldistance = lspace * kCharWidth; + info->pix_rdistance = rspace * kCharWidth; + info->lword_box = + TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top); + info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom, + row_right - info->pix_rdistance, top); + LeftWordAttributes( + nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item, + &info->lword_likely_starts_idea, &info->lword_likely_ends_idea); + RightWordAttributes( + nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item, + &info->rword_likely_starts_idea, &info->rword_likely_ends_idea); +} + +void MakeAsciiRowInfos(const TextAndModel* row_infos, int n, + std::vector<RowInfo>* output) { + output->clear(); + RowInfo info; + for (int i = 0; i < n; i++) { + AsciiToRowInfo(row_infos[i].ascii, i, &info); + output->push_back(info); + } +} + +// Given n rows of reference ground truth, evaluate whether the n 
rows +// of PARA * pointers yield the same paragraph breakpoints. +void EvaluateParagraphDetection(const TextAndModel* correct, int n, + const GenericVector<PARA*>& detector_output) { + int incorrect_breaks = 0; + int missed_breaks = 0; + int poorly_matched_models = 0; + int bad_crowns = 0; + int bad_list_items = 0; + ASSERT_EQ(detector_output.size(), n); + for (int i = 1; i < n; i++) { + bool has_break = correct[i].model_type != PCONT; + bool detected_break = (detector_output[i - 1] != detector_output[i]); + if (has_break && !detected_break) missed_breaks++; + if (detected_break && !has_break) incorrect_breaks++; + if (has_break) { + if (correct[i].model_type == PNONE) { + if (detector_output[i]->model != nullptr) { + poorly_matched_models++; + } + } else { + if (correct[i].model.justification() != kUnknown && + (detector_output[i]->model == nullptr || + !correct[i].model.Comparable(*detector_output[i]->model))) { + poorly_matched_models++; + } + } + if (correct[i].is_very_first_or_continuation ^ + detector_output[i]->is_very_first_or_continuation) { + bad_crowns++; + } + if (correct[i].is_list_item ^ detector_output[i]->is_list_item) { + bad_list_items++; + } + } + } + EXPECT_EQ(incorrect_breaks, 0); + EXPECT_EQ(missed_breaks, 0); + EXPECT_EQ(poorly_matched_models, 0); + EXPECT_EQ(bad_list_items, 0); + EXPECT_EQ(bad_crowns, 0); + if (incorrect_breaks || missed_breaks || poorly_matched_models || + bad_list_items || bad_crowns) { + std::vector<std::string> dbg_lines; + dbg_lines.push_back("# =========================="); + dbg_lines.push_back("# Correct paragraph breaks:"); + dbg_lines.push_back("# =========================="); + for (int i = 0; i < n; i++) { + if (correct[i].model_type != PCONT) { + dbg_lines.push_back(absl::StrCat( + correct[i].ascii, " # ", correct[i].model.ToString().c_str(), + correct[i].is_very_first_or_continuation ? " crown" : "", + correct[i].is_list_item ? " li" : "")); + } else { + dbg_lines.push_back(correct[i].ascii); + } + } + dbg_lines.push_back(""); + dbg_lines.push_back("# =========================="); + dbg_lines.push_back("# Paragraph detector output:"); + dbg_lines.push_back("# =========================="); + for (int i = 0; i < n; i++) { + std::string annotation; + if (i == 0 || (detector_output[i - 1] != detector_output[i])) { + if (detector_output[i] && detector_output[i]->model) { + annotation += absl::StrCat( + " # ", detector_output[i]->model->ToString().c_str(), + detector_output[i]->is_very_first_or_continuation ? " crown" : "", + detector_output[i]->is_list_item ? 
" li" : ""); + } else { + annotation = " # Unmodeled paragraph."; + } + } + dbg_lines.push_back(absl::StrCat(correct[i].ascii, annotation)); + } + LOG(INFO) << "Discrepency!\n" << absl::StrJoin(dbg_lines, "\n"); + } +} + +void TestParagraphDetection(const TextAndModel* correct, int num_rows) { + std::vector<RowInfo> row_infos; + GenericVector<PARA*> row_owners; + PARA_LIST paragraphs; + std::vector<ParagraphModel*> models; + + MakeAsciiRowInfos(correct, num_rows, &row_infos); + int debug_level(3); + tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, ¶graphs, + &models); + EvaluateParagraphDetection(correct, num_rows, row_owners); + for (auto* model : models) { + delete model; + } +} + +TEST(ParagraphsTest, ListItemsIdentified) { + EXPECT_TRUE(tesseract::AsciiLikelyListItem("iii")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("A.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("B.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("C.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("1.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("2.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("3.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("1")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("2")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("3")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("[[1]]")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-1.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-2")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("(A)(i)")); + + EXPECT_FALSE(tesseract::AsciiLikelyListItem("The")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("first")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("house")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("Oregonian.")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("on.")); +} + +typedef ParagraphModel PModel; + +const TextAndModel kTwoSimpleParagraphs[] = { + {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top", PCONT, PModel(), false, false}, + {"of the page and takes 3 lines. ", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestSimpleParagraphDetection) { + TestParagraphDetection(kTwoSimpleParagraphs, + ABSL_ARRAYSIZE(kTwoSimpleParagraphs)); +} + +const TextAndModel kFewCluesWithCrown[] = { + {"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0), + true, false}, + {"of the page and takes two lines.", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is a continuation from", PCONT, PModel(), false, false}, + {"a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. 
", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestFewCluesWithCrown) { + TestParagraphDetection(kFewCluesWithCrown, + ABSL_ARRAYSIZE(kFewCluesWithCrown)); +} + +const TextAndModel kCrownedParagraph[] = { + {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), + true, false}, + {"often not indented as the rest ", PCONT, PModel(), false, false}, + {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, + {"less it should be counted as the", PCONT, PModel(), false, false}, + {"same type of paragraph. ", PCONT, PModel(), false, false}, + {" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"graphs are both indented two ", PCONT, PModel(), false, false}, + {"spaces. ", PCONT, PModel(), false, false}, + {" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestCrownParagraphDetection) { + TestParagraphDetection(kCrownedParagraph, ABSL_ARRAYSIZE(kCrownedParagraph)); +} + +const TextAndModel kFlushLeftParagraphs[] = { + {"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, + {"flush left paragraphs (those", PCONT, PModel(), false, false}, + {"with no body indent) are not", PCONT, PModel(), false, false}, + {"actually crowns. ", PCONT, PModel(), false, false}, + {"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, + {"also flush left aligned. Usual-", PCONT, PModel(), false, false}, + {"ly, these paragraphs are set", PCONT, PModel(), false, false}, + {"apart vertically by some white-", PCONT, PModel(), false, false}, + {"space, but you can also detect", PCONT, PModel(), false, false}, + {"them by observing the big empty", PCONT, PModel(), false, false}, + {"space at the ends of the para-", PCONT, PModel(), false, false}, + {"graphs. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsText, TestRealFlushLeftParagraphs) { + TestParagraphDetection(kFlushLeftParagraphs, + ABSL_ARRAYSIZE(kFlushLeftParagraphs)); +} + +const TextAndModel kSingleFullPageContinuation[] = { + {"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false}, + {"continuation. It flows from", PCONT, PModel(), false, false}, + {"line to line, using the full", PCONT, PModel(), false, false}, + {"column width with no clear", PCONT, PModel(), false, false}, + {"paragraph break, because it", PCONT, PModel(), false, false}, + {"actually doesn't have one. It", PCONT, PModel(), false, false}, + {"is the middle of one monster", PCONT, PModel(), false, false}, + {"paragraph continued from the", PCONT, PModel(), false, false}, + {"previous page and continuing", PCONT, PModel(), false, false}, + {"onto the next page. There-", PCONT, PModel(), false, false}, + {"fore, it ends up getting", PCONT, PModel(), false, false}, + {"marked as a crown and then", PCONT, PModel(), false, false}, + {"getting re-marked as any ex-", PCONT, PModel(), false, false}, + {"isting model. 
Not great, but", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestSingleFullPageContinuation) { + const TextAndModel* correct = kSingleFullPageContinuation; + int num_rows = ABSL_ARRAYSIZE(kSingleFullPageContinuation); + std::vector<RowInfo> row_infos; + GenericVector<PARA*> row_owners; + PARA_LIST paragraphs; + std::vector<ParagraphModel*> models; + models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10)); + MakeAsciiRowInfos(correct, num_rows, &row_infos); + tesseract::DetectParagraphs(3, &row_infos, &row_owners, ¶graphs, &models); + EvaluateParagraphDetection(correct, num_rows, row_owners); + for (auto* model : models) { + delete model; + } +} + +const TextAndModel kRightAligned[] = { + {"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, + {" uncommon in Left-to-Right", PCONT, PModel(), false, false}, + {" languages, but they do", PCONT, PModel(), false, false}, + {" exist.", PCONT, PModel(), false, false}, + {" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, + {" horribly tiny paragraphs in", PCONT, PModel(), false, false}, + {" tables on which we have no", PCONT, PModel(), false, false}, + {" chance anyways.", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestRightAlignedParagraph) { + TestParagraphDetection(kRightAligned, ABSL_ARRAYSIZE(kRightAligned)); +} + +const TextAndModel kTinyParagraphs[] = { + {" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"obvious paragraph text, you might", PCONT, PModel(), false, false}, + {"find short exchanges of dialogue ", PCONT, PModel(), false, false}, + {"between characters. ", PCONT, PModel(), false, false}, + {" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"mark a new paragraph whenever one", PCONT, PModel(), false, false}, + {"of the statistics (left, right or", PCONT, PModel(), false, false}, + {"center) changes from one text-", PCONT, PModel(), false, false}, + {"line to the next. Such an", PCONT, PModel(), false, false}, + {"approach would misclassify the", PCONT, PModel(), false, false}, + {"tiny paragraphs above as a single", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestTinyParagraphs) { + TestParagraphDetection(kTinyParagraphs, ABSL_ARRAYSIZE(kTinyParagraphs)); +} + +const TextAndModel kComplexPage1[] = { + {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false}, + {" Centered Title ", PCONT, PModel(), false, false}, + {" Paragraph Detection ", PCONT, PModel(), false, false}, + {" OCR TEAM ", PCONT, PModel(), false, false}, + {" 10 November 2010 ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top", PCONT, PModel(), false, false}, + {"of the page and takes 3 lines. 
", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, + {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), + true, false}, + {" looks like the prior text ", PCONT, PModel(), false, false}, + {" but it is indented more ", PCONT, PModel(), false, false}, + {" and is fully justified. ", PCONT, PModel(), false, false}, + {" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"centered text, block quotes, ", PCONT, PModel(), false, false}, + {"normal paragraphs, and lists ", PCONT, PModel(), false, false}, + {"like what follows? ", PCONT, PModel(), false, false}, + {"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" looking for lines where the ", PCONT, PModel(), false, false}, + {" first word of the next line ", PCONT, PModel(), false, false}, + {" would fit on the previous ", PCONT, PModel(), false, false}, + {" line. ", PCONT, PModel(), false, false}, + {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" Python and try it out. ", PCONT, PModel(), false, false}, + {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" mistakes. ", PCONT, PModel(), false, false}, + {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"you can try to identify source ", PCONT, PModel(), false, false}, + {"code. Ouch! ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestComplexPage1) { + TestParagraphDetection(kComplexPage1, ABSL_ARRAYSIZE(kComplexPage1)); +} + +// The same as above, but wider. +const TextAndModel kComplexPage2[] = { + {" Awesome ", PSTART, + PModel(kCenter, 0, 0, 0, 0), false, false}, + {" Centered Title ", PCONT, PModel(), false, false}, + {" Paragraph Detection ", PCONT, PModel(), false, false}, + {" OCR TEAM ", PCONT, PModel(), false, false}, + {" 10 November 2010 ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top of", PCONT, PModel(), false, false}, + {"the page and takes 3 lines. ", PCONT, PModel(), false, false}, + {" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is in- ", PCONT, PModel(), false, false}, + {"dented just like this second para- ", PCONT, PModel(), false, false}, + {"graph. ", PCONT, PModel(), false, false}, + {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), + true, false}, + {" looks like the prior text ", PCONT, PModel(), false, false}, + {" but it is indented more ", PCONT, PModel(), false, false}, + {" and is fully justified. 
", PCONT, PModel(), false, false}, + {" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"ed text, block quotes, normal para-", PCONT, PModel(), false, false}, + {"graphs, and lists like what follow?", PCONT, PModel(), false, false}, + {"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!! + {"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" looking for lines where the ", PCONT, PModel(), false, false}, + {" first word of the next line ", PCONT, PModel(), false, false}, + {" would fit on the previous line. ", PCONT, PModel(), false, false}, + {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" Python and try it out. ", PCONT, PModel(), false, false}, + {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" mistakes. ", PCONT, PModel(), false, false}, + {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"you can try to identify source ", PCONT, PModel(), false, false}, + {"code. Ouch! ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestComplexPage2) { + TestParagraphDetection(kComplexPage2, ABSL_ARRAYSIZE(kComplexPage2)); +} + +const TextAndModel kSubtleCrown[] = { + {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), + true, false}, + {"often not indented as the rest ", PCONT, PModel(), false, false}, + {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, + {"less it should be counted as the", PCONT, PModel(), false, false}, + {"same type of paragraph. ", PCONT, PModel(), false, false}, + {" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"should suffice. ", PCONT, PModel(), false, false}, + {" 1235 ", PNONE, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestSubtleCrown) { + TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown) - 1); +} + +TEST(ParagraphsTest, TestStrayLineInBlock) { + TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown)); +} + +const TextAndModel kUnlvRep3AO[] = { + {" Defined contribution plans cover employees in Australia, New", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, false}, + {"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), false, false}, + {"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, false}, + {"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false, false}, + {"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, false}, + {"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, false}, + {"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false, false}, + {" In addition to providing pension benefits, the Company pro- ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, false}, + {"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, false}, + {"Statement No. 106 effective January 1, 1992. 
Previously, the ", PCONT, PModel(), false, false}, + {"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, false}, + {"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, false}, + {"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, false}, + {"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, false}, + {" The U.S. plan covering the parent company is the largest plan.", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, false}, + {"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, false}, + {"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, false}, + {"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), false, false}, + {"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, false}, + {"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, false}, + {"credited service. The Company has the ability to change these ", PCONT, PModel(), false, false}, + {"benefits at any time. ", PCONT, PModel(), false, false}, + {" Effective October 1993, the Company amended its health ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false, false}, + {"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, false}, + {"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, false}, + {"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, false}, + {"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, false}, + {"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, false}, + {"for 1994 by approximately $83. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestUnlvInsurance) { + TestParagraphDetection(kUnlvRep3AO, ABSL_ARRAYSIZE(kUnlvRep3AO)); +} + +// The basic outcome we want for something with a bunch of leader dots is that +// we group each logical entry as a separate item. Without knowledge of +// leaders, we would most likely mark the text below as a simple right aligned +// paragraph or two. +// This example comes from Volume 9886293, Page 5 +const TextAndModel kTableOfContents[] = { + {"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in the West . . 
.5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, +}; + +TEST(ParagraphsTest, TestSplitsOutLeaderLines) { + TestParagraphDetection(kTableOfContents, ABSL_ARRAYSIZE(kTableOfContents)); +} + +const TextAndModel kTextWithSourceCode[] = { + {" A typical page of a programming book may contain", PSTART, + PModel(kLeft, 0, 20, 0, 0), false, false}, + {"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false}, + {"being described in prose. Such examples should be", PCONT, PModel(), false, false}, + {"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false}, + {"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false}, + {"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false}, + {"source code would lead to a bad reading experience", PCONT, PModel(), false, false}, + {"when the text is re-flowed. ", PCONT, PModel(), false, false}, + {" Let's show this by describing the function fact-", PSTART, + PModel(kLeft, 0, 20, 0, 0), false, false}, + {"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false}, + {"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false}, + {"that the typical C implementation will only work ", PCONT, PModel(), false, false}, + {"for values less than about 12: ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" # Naive implementation in C ", PCONT, PModel(), false, false}, + {" int factorial(int n) { ", PCONT, PModel(), false, false}, + {" if (n < 2) ", PCONT, PModel(), false, false}, + {" return 1; ", PCONT, PModel(), false, false}, + {" return n * factorial(n - 1); ", PCONT, PModel(), false, false}, + {" } ", PCONT, PModel(), false, false}, + {" ", PCONT, PModel(), false, false}, + {" The C programming language does not have built- ", PSTART, + PModel(kLeft, 0, 20, 0, 0), false, false}, + {"in support for detecting integer overflow, so this", PCONT, PModel(), false, false}, + {"naive implementation simply returns random values ", PCONT, PModel(), false, false}, + {"if even a moderate sized n is provided. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, NotDistractedBySourceCode) { + TestParagraphDetection(kTextWithSourceCode, + ABSL_ARRAYSIZE(kTextWithSourceCode)); +} + +const TextAndModel kOldManAndSea[] = { + {"royal palm which are called guano and in it there was a bed, a", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), false, false}, + {"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), false, false}, + {"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), false, false}, + {"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), false, false}, + {"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), false, false}, + {"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), false, false}, + {"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), false, false}, + {"shirt. ", PCONT, PModel(), false, false}, + {" \"What do you have to eat?\" the boy asked. 
", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"A pot of yellow rice with fish. Do you want some?\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"No. I will eat at home. Do you want me to make the fire?\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"No. I will make it later on. Or I may eat the rice cold.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"May I take the cast net?\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"Of course.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" There was no cast net and the boy remembered when they had", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"sold it. But they went through this fiction every day. There was no", PCONT, PModel(), false, false}, + {"pot of yellow rice and fish and the boy knew this too. " + " ", PCONT, PModel(), false, false}, + {" \"Eighty-five is a lucky number,\" the old man said. \"How", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"would you like to see me bring one in that dressed out over a " + "thou-", PCONT, PModel(), false, false}, + {"sand pounds? " + " ", PCONT, PModel(), false, false}, + {" \"I'll get the cast net and go for sardines. Will you sit in the " + "sun", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"in the doorway?\" " + " ", PCONT, PModel(), false, false}, + {" \"Yes. I have yesterday's paper and I will read the baseball.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" The boy did not know whether yesterday's paper was a fiction", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"too. But the old man brought it out from under the bed. ", PCONT, PModel(), false, false}, + {" \"Pedrico gave it to me at the bodega,\" he explained. " + " ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"I'll be back when I have the sardines. I'll keep yours and mine", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"together on ice and we can share them in the morning. When I", PCONT, PModel(), false, false}, + {"come back you can tell me about the baseball.\" ", PCONT, PModel(), false, false}, + {" \"The Yankees cannot lose.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"But I fear the Indians of Cleveland.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"Have faith in the Yankees my son. 
Think of the great Di-", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"Maggio.\" ", PCONT, PModel(), false, false}, + {" \"I fear both the Tigers of Detroit and the Indians of Cleve-", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"land.\" ", PCONT, PModel(), false, false} +}; + +TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) { + TestParagraphDetection(kOldManAndSea, ABSL_ARRAYSIZE(kOldManAndSea)); +} + +const TextAndModel kNewZealandIndex[] = { + {"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 138 ", PCONT, PModel(), false, false}, + {"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 145 ", PCONT, PModel(), false, false}, + {"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 85 ", PCONT, PModel(), false, false}, + {"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false} +}; + +TEST(ParagraphsTest, IndexPageTest) { + TestParagraphDetection(kNewZealandIndex, ABSL_ARRAYSIZE(kNewZealandIndex)); +} + +// TODO(eger): Add some right-to-left examples, and fix the algorithm as needed. + +} // namespace diff --git a/tesseract/unittest/params_model_test.cc b/tesseract/unittest/params_model_test.cc new file mode 100644 index 00000000..8627ab8e --- /dev/null +++ b/tesseract/unittest/params_model_test.cc @@ -0,0 +1,75 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> // std::string +#include <vector> + +#include "include_gunit.h" +#include "params_model.h" +#include "serialis.h" // TFile +#include "tprintf.h" // tprintf + +namespace tesseract { + +// Test some basic I/O of params model files (automated learning of language +// model weights). +#ifndef DISABLED_LEGACY_ENGINE +static bool LoadFromFile(tesseract::ParamsModel& model, const char* lang, const char* full_path) { + tesseract::TFile fp; + if (!fp.Open(full_path, nullptr)) { + tprintf("Error opening file %s\n", full_path); + return false; + } + return model.LoadFromFp(lang, &fp); +} +#endif + +class ParamsModelTest : public testing::Test { +#ifndef DISABLED_LEGACY_ENGINE + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + + std::string TestDataNameToPath(const std::string& name) const { + return file::JoinPath(TESTDATA_DIR, name); + } + std::string OutputNameToPath(const std::string& name) const { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + // Test that we are able to load a params model, save it, reload it, + // and verify that the re-serialized version is the same as the original. + void TestParamsModelRoundTrip(const std::string& params_model_filename) const { + tesseract::ParamsModel orig_model; + tesseract::ParamsModel duplicate_model; + file::MakeTmpdir(); + std::string orig_file = TestDataNameToPath(params_model_filename); + std::string out_file = OutputNameToPath(params_model_filename); + + EXPECT_TRUE(LoadFromFile(orig_model, "eng", orig_file.c_str())); + EXPECT_TRUE(orig_model.SaveToFile(out_file.c_str())); + + EXPECT_TRUE(LoadFromFile(duplicate_model, "eng", out_file.c_str())); + EXPECT_TRUE(orig_model.Equivalent(duplicate_model)); + } +#endif +}; + +TEST_F(ParamsModelTest, TestEngParamsModelIO) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because ParamsModel::LoadFromFp is missing. + GTEST_SKIP(); +#else + TestParamsModelRoundTrip("eng.params_model"); +#endif +} + +} // namespace diff --git a/tesseract/unittest/progress_test.cc b/tesseract/unittest/progress_test.cc new file mode 100644 index 00000000..dbe30269 --- /dev/null +++ b/tesseract/unittest/progress_test.cc @@ -0,0 +1,165 @@ +/////////////////////////////////////////////////////////////////////// +// File: progress_test.cc +// Description: Progress reporting API Test for Tesseract. +// Author: Jaroslaw Kubik +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +// expects clone of tessdata_fast repo in ../../tessdata_fast + +#include "include_gunit.h" + +#include <tesseract/baseapi.h> +#include <tesseract/ocrclass.h> + +#include "allheaders.h" +#include "gmock/gmock.h" + +#include <fstream> +#include <iostream> +#include <locale> +#include <memory> // std::unique_ptr +#include <string> + +#include <time.h> + +namespace tesseract { + +class QuickTest : public testing::Test { + protected: + virtual void SetUp() { start_time_ = time(nullptr); } + virtual void TearDown() { + const time_t end_time = time(nullptr); + EXPECT_TRUE(end_time - start_time_ <= 25) + << "The test took too long - " + << ::testing::PrintToString(end_time - start_time_); + } + time_t start_time_; +}; + +class ClassicMockProgressSink { + public: + MOCK_METHOD1(classicProgress, bool(int)); + MOCK_METHOD1(cancel, bool(int)); + + ETEXT_DESC monitor; + + ClassicMockProgressSink() { + monitor.progress_callback = [](int progress, int, int, int, int) -> bool { + return instance->classicProgress(progress); + }; + monitor.cancel = [](void* ths, int words) -> bool { + return ((ClassicMockProgressSink*)ths)->cancel(words); + }; + monitor.cancel_this = this; + instance = this; + } + + static ClassicMockProgressSink* instance; +}; + +ClassicMockProgressSink* ClassicMockProgressSink::instance = nullptr; + +class NewMockProgressSink : public ClassicMockProgressSink { + public: + MOCK_METHOD1(progress, bool(int)); + + NewMockProgressSink() { + monitor.progress_callback2 = [](ETEXT_DESC* ths, int, int, int, + int) -> bool { + return ((NewMockProgressSink*)ths->cancel_this)->progress(ths->progress); + }; + } +}; + +void ClassicProgressTester(const char* imgname, const char* tessdatadir, + const char* lang) { + using ::testing::_; + using ::testing::AllOf; + using ::testing::AtLeast; + using ::testing::DoAll; + using ::testing::Gt; + using ::testing::Le; + using ::testing::Return; + using ::testing::SaveArg; + + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + + ClassicMockProgressSink progressSink; + + int currentProgress = -1; + EXPECT_CALL(progressSink, + classicProgress(AllOf(Gt<int&>(currentProgress), Le(100)))) + .Times(AtLeast(5)) + .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false))); + EXPECT_CALL(progressSink, cancel(_)) + .Times(AtLeast(5)) + .WillRepeatedly(Return(false)); + + EXPECT_EQ(api->Recognize(&progressSink.monitor), false); + EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%"; + + api->End(); + pixDestroy(&image); +} + +void NewProgressTester(const char* imgname, const char* tessdatadir, + const char* lang) { + using ::testing::_; + using ::testing::AllOf; + using ::testing::AtLeast; + using ::testing::DoAll; + using ::testing::Gt; + using ::testing::Le; + using ::testing::Return; + using ::testing::SaveArg; + + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + + NewMockProgressSink progressSink; + + int currentProgress = -1; + EXPECT_CALL(progressSink, classicProgress(_)).Times(0); + EXPECT_CALL(progressSink, 
progress(AllOf(Gt<int&>(currentProgress), Le(100)))) + .Times(AtLeast(5)) + .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false))); + EXPECT_CALL(progressSink, cancel(_)) + .Times(AtLeast(5)) + .WillRepeatedly(Return(false)); + + EXPECT_EQ(api->Recognize(&progressSink.monitor), false); + EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%"; + + api->End(); + pixDestroy(&image); +} + +TEST(QuickTest, ClassicProgressReporting) { + ClassicProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", + "eng"); +} + +TEST(QuickTest, NewProgressReporting) { + NewProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", "eng"); +} + +} // namespace diff --git a/tesseract/unittest/qrsequence_test.cc b/tesseract/unittest/qrsequence_test.cc new file mode 100644 index 00000000..783228d8 --- /dev/null +++ b/tesseract/unittest/qrsequence_test.cc @@ -0,0 +1,69 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include <algorithm> +#include <vector> + +#include "cycletimer.h" +#include "include_gunit.h" +#include "log.h" +#include "qrsequence.h" + +namespace tesseract { + +class TestableQRSequenceGenerator : public QRSequenceGenerator { + public: + explicit TestableQRSequenceGenerator(const int& N) : QRSequenceGenerator(N) {} + // Overriding scope for testing + using QRSequenceGenerator::GetBinaryReversedInteger; +}; + +// Verifies binary inversion for a small range. +TEST(QRSequenceGenerator, GetBinaryReversedInteger) { + const int kRangeSize = 8; + TestableQRSequenceGenerator generator(kRangeSize); + int reversed_vals[kRangeSize] = {0, 4, 2, 6, 1, 5, 3, 7}; + for (int i = 0; i < kRangeSize; ++i) + EXPECT_EQ(reversed_vals[i], generator.GetBinaryReversedInteger(i)); +} + +// Trivial test fixture for a parameterized test. +class QRSequenceGeneratorTest : public ::testing::TestWithParam<int> { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } +}; + +TEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) { + const int kRangeSize = GetParam(); + TestableQRSequenceGenerator generator(kRangeSize); + std::vector<int> vals(kRangeSize); + CycleTimer timer; + timer.Restart(); + for (int i = 0; i < kRangeSize; ++i) vals[i] = generator.GetVal(); + LOG(INFO) << kRangeSize << "-length sequence took " << timer.GetInMs() << "ms"; + // Sort the numbers to verify that we've covered the range without repetition. + std::sort(vals.begin(), vals.end()); + for (int i = 0; i < kRangeSize; ++i) { + EXPECT_EQ(i, vals[i]); + if (i != vals[i]) { + LOG(INFO) << "Aborting remaining comparisons"; + break; + } + } +} + +// Run a parameterized test using the following range sizes. 
+INSTANTIATE_TEST_SUITE_P(RangeTest, QRSequenceGeneratorTest, + ::testing::Values(2, 7, 8, 9, 16, 1e2, 1e4, 1e6)); +} // namespace diff --git a/tesseract/unittest/recodebeam_test.cc b/tesseract/unittest/recodebeam_test.cc new file mode 100644 index 00000000..6e9bc4e3 --- /dev/null +++ b/tesseract/unittest/recodebeam_test.cc @@ -0,0 +1,483 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include_gunit.h" +#include "log.h" // for LOG + +#include "genericvector.h" +#include "recodebeam.h" +#include "matrix.h" +#include "pageres.h" +#include "ratngs.h" +#include "unicharcompress.h" +#include "normstrngs.h" +#include "unicharset_training_utils.h" + +#include "helpers.h" + +#include "absl/strings/str_format.h" // for absl::StrFormat + +namespace tesseract { + +// Number of characters to test beam search with. +const int kNumChars = 100; +// Amount of extra random data to pad with after. +const int kPadding = 64; +// Dictionary test data. +// The top choice is: "Gef s wordsright.". +// The desired phrase is "Gets words right.". +// There is a competing dictionary phrase: "Get swords right.". +// ... due to the following errors from the network: +// f stronger than t in "Get". +// weak space between Gef and s and between s and words. +// weak space between words and right. +const char* kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d", + "s", "", "r", "i", "g", "h", "t", ".", nullptr}; +const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, + 0.89, 0.99, 0.99, 0.99, 0.99, 0.95, + 0.99, 0.90, 0.90, 0.90, 0.95, 0.75}; +const char* kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h", + "S", " ", "t", "I", "9", "b", "f", ",", nullptr}; +const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, + 0.10, 0.01, 0.01, 0.01, 0.01, 0.05, + 0.01, 0.09, 0.09, 0.09, 0.05, 0.25}; + +const char* kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr}; +const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98}; +const char* kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr}; +const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01}; + +const char* kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr}; +const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97}; +const char* kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr}; +const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01}; + +class RecodeBeamTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + RecodeBeamTest() : lstm_dict_(&ccutil_) {} + ~RecodeBeamTest() { lstm_dict_.End(); } + + // Loads and compresses the given unicharset. 
+ void LoadUnicharset(const std::string& unicharset_name) { + std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, + "radical-stroke.txt"); + std::string unicharset_file = + file::JoinPath(TESTDATA_DIR, unicharset_name); + std::string radical_data; + CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, + file::Defaults())); + CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str())); + unichar_null_char_ = ccutil_.unicharset.has_special_codes() + ? UNICHAR_BROKEN + : ccutil_.unicharset.size(); + STRING radical_str(radical_data.c_str()); + EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, + &radical_str)); + RecodedCharID code; + recoder_.EncodeUnichar(unichar_null_char_, &code); + encoded_null_char_ = code(0); + // Space should encode as itself. + recoder_.EncodeUnichar(UNICHAR_SPACE, &code); + EXPECT_EQ(UNICHAR_SPACE, code(0)); + std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt"); + STRING encoding = recoder_.GetEncodingAsString(ccutil_.unicharset); + std::string encoding_str(&encoding[0], encoding.size()); + CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults())); + LOG(INFO) << "Wrote encoding to:" << output_name << "\n"; + } + // Loads the dictionary. + void LoadDict(const std::string& lang) { + std::string traineddata_name = lang + ".traineddata"; + std::string traineddata_file = + file::JoinPath(TESTDATA_DIR, traineddata_name); + lstm_dict_.SetupForLoad(nullptr); + tesseract::TessdataManager mgr; + mgr.Init(traineddata_file.c_str()); + lstm_dict_.LoadLSTM(lang.c_str(), &mgr); + lstm_dict_.FinishLoad(); + } + + // Expects the appropriate results from the compressed_ ccutil_.unicharset. + void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output, + const GenericVector<int>& transcription) { + // Get the utf8 string of the transcription. + std::string truth_utf8; + for (int i = 0; i < transcription.size(); ++i) { + truth_utf8 += ccutil_.unicharset.id_to_unichar(transcription[i]); + } + PointerVector<WERD_RES> words; + ExpectCorrect(output, truth_utf8, nullptr, &words); + } + void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output, + const std::string& truth_utf8, Dict* dict, + PointerVector<WERD_RES>* words) { + RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict); + beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr); + // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug: + // beam_search.DebugBeams(ccutil_.unicharset); + std::vector<int> labels, xcoords; + beam_search.ExtractBestPathAsLabels(&labels, &xcoords); + LOG(INFO) << "Labels size = " << labels.size() << " coords " + << xcoords.size() << "\n"; + // Now decode using recoder_. + std::string decoded; + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + RecodedCharID code; + int index = start; + int uni_id = INVALID_UNICHAR_ID; + do { + code.Set(code.length(), labels[index++]); + uni_id = recoder_.DecodeUnichar(code); + } while (index < labels.size() && + code.length() < RecodedCharID::kMaxCodeLen && + (uni_id == INVALID_UNICHAR_ID || + !recoder_.IsValidFirstCode(labels[index]))); + EXPECT_NE(INVALID_UNICHAR_ID, uni_id) + << "index=" << index << "/" << labels.size(); + // To the extent of truth_utf8, we expect decoded to match, but if + // transcription is shorter, that is OK too, as we may just be testing + // that we get a valid sequence when padded with random data. 
+ if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) + decoded += ccutil_.unicharset.id_to_unichar(uni_id); + end = index; + } + EXPECT_EQ(truth_utf8, decoded); + + // Check that ExtractBestPathAsUnicharIds does the same thing. + std::vector<int> unichar_ids; + std::vector<float> certainties, ratings; + beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, + &unichar_ids, &certainties, + &ratings, &xcoords); + std::string u_decoded; + float total_rating = 0.0f; + for (int u = 0; u < unichar_ids.size(); ++u) { + // To the extent of truth_utf8, we expect decoded to match, but if + // transcription is shorter, that is OK too, as we may just be testing + // that we get a valid sequence when padded with random data. + if (u_decoded.size() < truth_utf8.size()) { + const char* str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]); + total_rating += ratings[u]; + LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u, + unichar_ids[u], str, certainties[u], + ratings[u], total_rating, xcoords[u]) << "\n"; + if (str[0] == ' ') total_rating = 0.0f; + u_decoded += str; + } + } + EXPECT_EQ(truth_utf8, u_decoded); + + // Check that ExtractBestPathAsWords does the same thing. + TBOX line_box(0, 0, 100, 10); + for (int i = 0; i < 2; ++i) { + beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, + &ccutil_.unicharset, words); + std::string w_decoded; + for (int w = 0; w < words->size(); ++w) { + const WERD_RES* word = (*words)[w]; + if (w_decoded.size() < truth_utf8.size()) { + if (!w_decoded.empty() && word->word->space()) w_decoded += " "; + w_decoded += word->best_choice->unichar_string().c_str(); + } + LOG(INFO) << absl::StrFormat("Word:%d = %s, c=%g, r=%g, perm=%d", w, + word->best_choice->unichar_string().c_str(), + word->best_choice->certainty(), + word->best_choice->rating(), + word->best_choice->permuter()) << "\n"; + } + std::string w_trunc(w_decoded.data(), truth_utf8.size()); + if (truth_utf8 != w_trunc) { + tesseract::NormalizeUTF8String( + tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize, + tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded); + w_trunc.assign(w_decoded.data(), truth_utf8.size()); + } + EXPECT_EQ(truth_utf8, w_trunc); + } + } + // Generates easy encoding of the given unichar_ids, and pads with at least + // padding of random data. + GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs( + const GenericVector<int>& unichar_ids, int padding) { + int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen; + int num_codes = recoder_.code_range(); + GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f); + // Fill with random data. + TRand random; + for (int t = 0; t < width; ++t) { + for (int i = 0; i < num_codes; ++i) + outputs(t, i) = random.UnsignedRand(0.25); + } + int t = 0; + for (int i = 0; i < unichar_ids.size(); ++i) { + RecodedCharID code; + int len = recoder_.EncodeUnichar(unichar_ids[i], &code); + EXPECT_NE(0, len); + for (int j = 0; j < len; ++j) { + // Make the desired answer a clear winner. + if (j > 0 && code(j) == code(j - 1)) { + // We will collapse adjacent equal codes so put a null in between. + outputs(t++, encoded_null_char_) = 1.0f; + } + outputs(t++, code(j)) = 1.0f; + } + // Put a 0 as a null char in between. + outputs(t++, encoded_null_char_) = 1.0f; + } + // Normalize the probs. 
+ for (int t = 0; t < width; ++t) { + double sum = 0.0; + for (int i = 0; i < num_codes; ++i) sum += outputs(t, i); + for (int i = 0; i < num_codes; ++i) outputs(t, i) /= sum; + } + + return outputs; + } + // Encodes a utf8 string (character) as unichar_id, then recodes, and sets + // the score for the appropriate sequence of codes, returning the ending t. + int EncodeUTF8(const char* utf8_str, float score, int start_t, TRand* random, + GENERIC_2D_ARRAY<float>* outputs) { + int t = start_t; + std::vector<int> unichar_ids; + EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, + nullptr, nullptr)); + if (unichar_ids.empty() || utf8_str[0] == '\0') { + unichar_ids.clear(); + unichar_ids.push_back(unichar_null_char_); + } + int num_ids = unichar_ids.size(); + for (int u = 0; u < num_ids; ++u) { + RecodedCharID code; + int len = recoder_.EncodeUnichar(unichar_ids[u], &code); + EXPECT_NE(0, len); + for (int i = 0; i < len; ++i) { + // Apply the desired score. + (*outputs)(t++, code(i)) = score; + if (random != nullptr && + t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { + int dups = static_cast<int>(random->UnsignedRand(3.0)); + for (int d = 0; d < dups; ++d) { + // Duplicate the desired score. + (*outputs)(t++, code(i)) = score; + } + } + } + if (random != nullptr && + t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { + int dups = static_cast<int>(random->UnsignedRand(3.0)); + for (int d = 0; d < dups; ++d) { + // Add a random number of nulls as well. + (*outputs)(t++, encoded_null_char_) = score; + } + } + } + return t; + } + // Generates an encoding of the given 4 arrays as synthetic network scores. + // uses scores1 for chars1 and scores2 for chars2, and everything else gets + // the leftovers shared out equally. Note that empty string encodes as the + // null_char_. + GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char* chars1[], + const float scores1[], + const char* chars2[], + const float scores2[], + TRand* random) { + int width = 0; + while (chars1[width] != nullptr) ++width; + int padding = width * RecodedCharID::kMaxCodeLen; + int num_codes = recoder_.code_range(); + GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f); + int t = 0; + for (int i = 0; i < width; ++i) { + // In case there is overlap in the codes between 1st and 2nd choice, it + // is better to encode the 2nd choice first. + int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs); + int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs); + // Advance t to the max end, setting everything else to the leftovers. + int max_t = std::max(end_t1, end_t2); + while (t < max_t) { + double total_score = 0.0; + for (int j = 0; j < num_codes; ++j) total_score += outputs(t, j); + double null_remainder = (1.0 - total_score) / 2.0; + double remainder = null_remainder / (num_codes - 2); + if (outputs(t, encoded_null_char_) < null_remainder) { + outputs(t, encoded_null_char_) += null_remainder; + } else { + remainder += remainder; + } + for (int j = 0; j < num_codes; ++j) { + if (outputs(t, j) == 0.0f) outputs(t, j) = remainder; + } + ++t; + } + } + // Fill the rest with null chars. 
+ while (t < width + padding) { + outputs(t++, encoded_null_char_) = 1.0f; + } + return outputs; + } + UnicharCompress recoder_; + int unichar_null_char_ = 0; + int encoded_null_char_ = 0; + CCUtil ccutil_; + Dict lstm_dict_; +}; + +TEST_F(RecodeBeamTest, DoesChinese) { + LOG(INFO) << "Testing chi_tra" << "\n"; + LoadUnicharset("chi_tra.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); + LOG(INFO) << "Testing chi_sim" << "\n"; + LoadUnicharset("chi_sim.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + transcription.clear(); + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + outputs = GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesJapanese) { + LOG(INFO) << "Testing jpn" << "\n"; + LoadUnicharset("jpn.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesKorean) { + LOG(INFO) << "Testing kor" << "\n"; + LoadUnicharset("kor.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesKannada) { + LOG(INFO) << "Testing kan" << "\n"; + LoadUnicharset("kan.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesMarathi) { + LOG(INFO) << "Testing mar" << "\n"; + LoadUnicharset("mar.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesEnglish) { + LOG(INFO) << "Testing eng" << "\n"; + LoadUnicharset("eng.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. 
+ GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DISABLED_EngDictionary) { + LOG(INFO) << "Testing eng dictionary" << "\n"; + LoadUnicharset("eng_beam.unicharset"); + GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs( + kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr); + std::string default_str; + for (int i = 0; kGWRTops[i] != nullptr; ++i) default_str += kGWRTops[i]; + PointerVector<WERD_RES> words; + ExpectCorrect(outputs, default_str, nullptr, &words); + // Now try again with the dictionary. + LoadDict("eng_beam"); + ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words); +} + +TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) { + LOG(INFO) << "Testing zh_hans dictionary" << "\n"; + LoadUnicharset("zh_hans.unicharset"); + GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs( + kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr); + PointerVector<WERD_RES> words; + ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words); + // Each is an individual word, with permuter = top choice. + EXPECT_EQ(7, words.size()); + for (int w = 0; w < words.size(); ++w) { + EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter()); + } + // Now try again with the dictionary. + LoadDict("zh_hans"); + ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words); + // Number of words expected. + const int kNumWords = 5; + // Content of the words. + const char* kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"}; + // Permuters of the words. + const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, + TOP_CHOICE_PERM, TOP_CHOICE_PERM, + SYSTEM_DAWG_PERM}; + EXPECT_EQ(kNumWords, words.size()); + for (int w = 0; w < kNumWords && w < words.size(); ++w) { + EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str()); + EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter()); + } +} + +// Tests that a recoder built with decomposed unicode allows true ctc +// arbitrary duplicates and inserted nulls inside the multicode sequence. +TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) { + LOG(INFO) << "Testing duplicates in multi-code sequences" << "\n"; + LoadUnicharset("vie.d.unicharset"); + tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset); + TRand random; + GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs( + kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random); + PointerVector<WERD_RES> words; + std::string truth_str; + tesseract::NormalizeUTF8String( + tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, + tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str); + ExpectCorrect(outputs, truth_str, nullptr, &words); +} + +} // namespace diff --git a/tesseract/unittest/rect_test.cc b/tesseract/unittest/rect_test.cc new file mode 100644 index 00000000..5d9d439f --- /dev/null +++ b/tesseract/unittest/rect_test.cc @@ -0,0 +1,176 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "rect.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TBOXTest : public testing::Test { + public: + void SetUp() { + std::locale::global(std::locale("")); + } + + void TearDown() {} +}; + +TEST_F(TBOXTest, OverlapInside) { + TBOX a(10, 10, 20, 20); + TBOX b(11, 11, 12, 12); + + EXPECT_TRUE(a.overlap(b)); + EXPECT_TRUE(b.overlap(a)); + EXPECT_DOUBLE_EQ(0.01, a.overlap_fraction(b)); + EXPECT_DOUBLE_EQ(1.0, b.overlap_fraction(a)); +} + +TEST_F(TBOXTest, OverlapBoolCorners) { + TBOX mid(10, 10, 30, 30); + TBOX bottom_left(5, 5, 15, 15); + TBOX top_left(5, 25, 15, 35); + // other corners covered by symmetry + + EXPECT_TRUE(mid.overlap(bottom_left)); + EXPECT_TRUE(bottom_left.overlap(mid)); + EXPECT_TRUE(mid.overlap(top_left)); + EXPECT_TRUE(top_left.overlap(mid)); +} + +TEST_F(TBOXTest, OverlapFractionCorners) { + TBOX mid(10, 10, 30, 30); + TBOX bottom_left(5, 5, 15, 15); + TBOX top_left(5, 25, 15, 35); + // other corners covered by symmetry + + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), + mid.overlap_fraction(bottom_left)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), + bottom_left.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(top_left)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), top_left.overlap_fraction(mid)); +} + +TEST_F(TBOXTest, OverlapBoolSides) { + TBOX mid(10, 10, 30, 30); + TBOX left(5, 15, 15, 25); + TBOX bottom(15, 5, 25, 15); + // other sides covered by symmetry + + EXPECT_TRUE(mid.overlap(left)); + EXPECT_TRUE(left.overlap(mid)); + EXPECT_TRUE(mid.overlap(bottom)); + EXPECT_TRUE(bottom.overlap(mid)); +} + +TEST_F(TBOXTest, OverlapFractionSides) { + TBOX mid(10, 10, 30, 30); + TBOX left(5, 15, 15, 25); + TBOX bottom(15, 5, 25, 15); + // other sides covered by symmetry + + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(left)); + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), left.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(bottom)); + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), bottom.overlap_fraction(mid)); +} + +TEST_F(TBOXTest, OverlapBoolSpan) { + TBOX mid(10, 10, 30, 30); + TBOX vertical(15, 5, 25, 35); + TBOX horizontal(5, 15, 35, 25); + // other sides covered by symmetry in other test cases + + EXPECT_TRUE(mid.overlap(vertical)); + EXPECT_TRUE(vertical.overlap(mid)); + EXPECT_TRUE(mid.overlap(horizontal)); + EXPECT_TRUE(horizontal.overlap(mid)); +} + +TEST_F(TBOXTest, OverlapFractionSpan) { + TBOX mid(10, 10, 30, 30); + TBOX vertical(15, 5, 25, 35); + TBOX horizontal(5, 15, 35, 25); + // other sides covered by symmetry in other test cases + + EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0), + mid.overlap_fraction(vertical)); + EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0), + vertical.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0), + mid.overlap_fraction(horizontal)); + EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0), + horizontal.overlap_fraction(mid)); +} + +// TODO(nbeato): pretty much all cases +TEST_F(TBOXTest, OverlapOutsideTests) { + TBOX mid(10, 10, 30, 30); + TBOX left(0, 
15, 5, 25); + + EXPECT_FALSE(mid.overlap(left)); + EXPECT_FALSE(left.overlap(mid)); + EXPECT_DOUBLE_EQ(0.0, mid.overlap_fraction(left)); + EXPECT_DOUBLE_EQ(0.0, left.overlap_fraction(mid)); +} + +TEST_F(TBOXTest, OverlapXFraction) { + TBOX a(10, 10, 20, 20); + TBOX b(12, 100, 26, 200); + TBOX c(0, 0, 100, 100); + TBOX d(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(8.0 / 10.0, a.x_overlap_fraction(b)); + EXPECT_DOUBLE_EQ(8.0 / 14.0, b.x_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(1.0, a.x_overlap_fraction(c)); + EXPECT_DOUBLE_EQ(10.0 / 100.0, c.x_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(0.0, a.x_overlap_fraction(d)); + EXPECT_DOUBLE_EQ(0.0, d.x_overlap_fraction(a)); +} + +TEST_F(TBOXTest, OverlapYFraction) { + TBOX a(10, 10, 20, 20); + TBOX b(100, 12, 200, 26); + TBOX c(0, 0, 100, 100); + TBOX d(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(8.0 / 10.0, a.y_overlap_fraction(b)); + EXPECT_DOUBLE_EQ(8.0 / 14.0, b.y_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(1.0, a.y_overlap_fraction(c)); + EXPECT_DOUBLE_EQ(10.0 / 100.0, c.y_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(0.0, a.y_overlap_fraction(d)); + EXPECT_DOUBLE_EQ(0.0, d.y_overlap_fraction(a)); +} + +TEST_F(TBOXTest, OverlapXFractionZeroSize) { + TBOX zero(10, 10, 10, 10); + TBOX big(0, 0, 100, 100); + TBOX small(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(1.0, zero.x_overlap_fraction(big)); + EXPECT_DOUBLE_EQ(0.0, big.x_overlap_fraction(zero)); + EXPECT_DOUBLE_EQ(0.0, zero.x_overlap_fraction(small)); + EXPECT_DOUBLE_EQ(0.0, small.x_overlap_fraction(zero)); +} + +TEST_F(TBOXTest, OverlapYFractionZeroSize) { + TBOX zero(10, 10, 10, 10); + TBOX big(0, 0, 100, 100); + TBOX small(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(1.0, zero.y_overlap_fraction(big)); + EXPECT_DOUBLE_EQ(0.0, big.y_overlap_fraction(zero)); + EXPECT_DOUBLE_EQ(0.0, zero.y_overlap_fraction(small)); + EXPECT_DOUBLE_EQ(0.0, small.y_overlap_fraction(zero)); +} + +} // namespace diff --git a/tesseract/unittest/resultiterator_test.cc b/tesseract/unittest/resultiterator_test.cc new file mode 100644 index 00000000..50e18949 --- /dev/null +++ b/tesseract/unittest/resultiterator_test.cc @@ -0,0 +1,612 @@ + +#include <tesseract/resultiterator.h> +#include <string> +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "genericvector.h" +#include "scrollview.h" + +#include "include_gunit.h" +#include "log.h" // for LOG +#include "absl/strings/str_format.h" // for absl::StrFormat + +namespace tesseract { + +// DEFINE_string(tess_config, "", "config file for tesseract"); +// DEFINE_bool(visual_test, false, "Runs a visual test using scrollview"); + +// Helper functions for converting to STL vectors +template <typename T> +void ToVector(const GenericVector<T>& from, std::vector<T>* to) { + to->clear(); + for (int i = 0; i < from.size(); i++) to->push_back(from[i]); +} + +template <typename T> +void ToVector(const std::vector<T>& from, std::vector<T>* to) { + to->clear(); + for (int i = 0; i < from.size(); i++) to->push_back(from[i]); +} + +// The fixture for testing Tesseract. 
+class ResultIteratorTest : public testing::Test { + protected: + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR , name); + } + std::string TessdataPath() { + return file::JoinPath(TESSDATA_DIR, ""); + } + std::string OutputNameToPath(const std::string& name) { + file::MakeTmpdir(); + return file::JoinPath(FLAGS_test_tmpdir, name); + } + + ResultIteratorTest() { src_pix_ = nullptr; } + ~ResultIteratorTest() {} + + void SetImage(const char* filename) { + src_pix_ = pixRead(TestDataNameToPath(filename).c_str()); + api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY); +// if (!FLAGS_tess_config.empty()) +// api_.ReadConfigFile(FLAGS_tess_config.c_str()); + api_.SetPageSegMode(tesseract::PSM_AUTO); + api_.SetImage(src_pix_); + pixDestroy(&src_pix_); + src_pix_ = api_.GetInputImage(); + } + + // Rebuilds the image using the binary images at the given level, and + // EXPECTs that the number of pixels in the xor of the rebuilt image with + // the original is at most max_diff. + void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator* it) { + it->Begin(); + int width = pixGetWidth(src_pix_); + int height = pixGetHeight(src_pix_); + int depth = pixGetDepth(src_pix_); + Pix* pix = pixCreate(width, height, depth); + EXPECT_TRUE(depth == 1 || depth == 8); + if (depth == 8) pixSetAll(pix); + do { + int left, top, right, bottom; + PageIteratorLevel im_level = level; + // If the return is false, it is a non-text block so get the block image. + if (!it->BoundingBox(level, &left, &top, &right, &bottom)) { + im_level = tesseract::RIL_BLOCK; + EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom)); + } + LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right + << ", B:" << bottom << "]" << "\n"; + Pix* block_pix; + if (depth == 1) { + block_pix = it->GetBinaryImage(im_level); + pixRasterop(pix, left, top, right - left, bottom - top, + PIX_SRC ^ PIX_DST, block_pix, 0, 0); + } else { + block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top); + pixRasterop(pix, left, top, pixGetWidth(block_pix), + pixGetHeight(block_pix), PIX_SRC & PIX_DST, block_pix, 0, + 0); + } + CHECK(block_pix != nullptr); + pixDestroy(&block_pix); + } while (it->Next(level)); +// if (base::GetFlag(FLAGS_v) >= 1) +// pixWrite(OutputNameToPath("rebuilt.png").c_str(), pix, IFF_PNG); + pixRasterop(pix, 0, 0, width, height, PIX_SRC ^ PIX_DST, src_pix_, 0, 0); + if (depth == 8) { + Pix* binary_pix = pixThresholdToBinary(pix, 128); + pixDestroy(&pix); + pixInvert(binary_pix, binary_pix); + pix = binary_pix; + } +// if (base::GetFlag(FLAGS_v) >= 1) +// pixWrite(OutputNameToPath("rebuiltxor.png").c_str(), pix, IFF_PNG); + l_int32 pixcount; + pixCountPixels(pix, &pixcount, nullptr); + if (pixcount > max_diff) { + std::string outfile = OutputNameToPath("failedxor.png"); + LOG(INFO) << "outfile = " << outfile << "\n"; + pixWrite(outfile.c_str(), pix, IFF_PNG); + } + pixDestroy(&pix); + LOG(INFO) << absl::StrFormat("At level %d: pix diff = %d\n", level, pixcount); + EXPECT_LE(pixcount, max_diff); +// if (base::GetFlag(FLAGS_v) > 1) CHECK_LE(pixcount, max_diff); + } + + // Rebuilds the text from the iterator strings at the given level, and + // EXPECTs that the rebuild string exactly matches the truth string. 
+ void VerifyIteratorText(const std::string& truth, PageIteratorLevel level, + ResultIterator* it) { + LOG(INFO) << "Text Test Level " << level << "\n"; + it->Begin(); + std::string result; + do { + char* text = it->GetUTF8Text(level); + result += text; + delete[] text; + if ((level == tesseract::RIL_WORD || level == tesseract::RIL_SYMBOL) && + it->IsAtFinalElement(tesseract::RIL_WORD, level)) { + if (it->IsAtFinalElement(tesseract::RIL_TEXTLINE, level)) { + result += '\n'; + } else { + result += ' '; + } + if (it->IsAtFinalElement(tesseract::RIL_PARA, level) && + !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) + result += '\n'; + } + } while (it->Next(level)); + EXPECT_STREQ(truth.c_str(), result.c_str()) + << "Rebuild failed at Text Level " << level; + } + + void VerifyRebuilds(int block_limit, int para_limit, int line_limit, + int word_limit, int symbol_limit, PageIterator* it) { + VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it); + VerifyRebuild(para_limit, tesseract::RIL_PARA, it); + VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it); + VerifyRebuild(word_limit, tesseract::RIL_WORD, it); + VerifyRebuild(symbol_limit, tesseract::RIL_SYMBOL, it); + } + + void VerifyAllText(const std::string& truth, ResultIterator* it) { + VerifyIteratorText(truth, tesseract::RIL_BLOCK, it); + VerifyIteratorText(truth, tesseract::RIL_PARA, it); + VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it); + VerifyIteratorText(truth, tesseract::RIL_WORD, it); + VerifyIteratorText(truth, tesseract::RIL_SYMBOL, it); + } + + // Verifies that ResultIterator::CalculateTextlineOrder() produces the right + // results given an array of word directions (word_dirs[num_words]), an + // expected output reading order + // (expected_reading_order[num_reading_order_entries]) and a given reading + // context (ltr or rtl). + void ExpectTextlineReadingOrder(bool in_ltr_context, + const StrongScriptDirection* word_dirs, + int num_words, int* expected_reading_order, + int num_reading_order_entries) const { + std::vector<StrongScriptDirection> gv_word_dirs; + for (int i = 0; i < num_words; i++) { + gv_word_dirs.push_back(word_dirs[i]); + } + + std::vector<int> output; + ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, + &output); + // STL vector can be used with EXPECT_EQ, so convert... + std::vector<int> correct_order( + expected_reading_order, + expected_reading_order + num_reading_order_entries); + std::vector<int> calculated_order; + ToVector(output, &calculated_order); + EXPECT_EQ(correct_order, calculated_order); + } + + // Verify that ResultIterator::CalculateTextlineOrder() produces sane output + // for a given array of word_dirs[num_words] in ltr or rtl context. + // Sane means that the output contains some permutation of the indices + // 0..[num_words - 1] interspersed optionally with negative (marker) values. 
+ void VerifySaneTextlineOrder(bool in_ltr_context, + const StrongScriptDirection* word_dirs, + int num_words) const { + std::vector<StrongScriptDirection> gv_word_dirs; + for (int i = 0; i < num_words; i++) { + gv_word_dirs.push_back(word_dirs[i]); + } + + std::vector<int> output; + ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, + &output); + ASSERT_GE(output.size(), num_words); + std::vector<int> output_copy(output); + std::sort(output_copy.begin(), output_copy.end()); + bool sane = true; + int j = 0; + while (j < output_copy.size() && output_copy[j] < 0) j++; + for (int i = 0; i < num_words; i++, j++) { + if (output_copy[j] != i) { + sane = false; + break; + } + } + if (j != output_copy.size()) { + sane = false; + } + if (!sane) { + std::vector<int> output_copy2, empty; + ToVector(output, &output_copy2); + EXPECT_EQ(output_copy2, empty) + << " permutation of 0.." << num_words - 1 << " not found in " + << (in_ltr_context ? "ltr" : "rtl") << " context."; + } + } + + // Objects declared here can be used by all tests in the test case for Foo. + Pix* src_pix_; // Borrowed from api_. Do not destroy. + std::string ocr_text_; + tesseract::TessBaseAPI api_; +}; + +// Tests layout analysis output (and scrollview) on the UNLV page numbered +// 8087_054.3G.tif. (Dubrovnik), but only if --visual_test is true. +// +//TEST_F(ResultIteratorTest, VisualTest) { +// if (!FLAGS_visual_test) return; +// const char* kIms[] = {"8087_054.3G.tif", "8071_093.3B.tif", nullptr}; +// for (int i = 0; kIms[i] != nullptr; ++i) { +// SetImage(kIms[i]); +// // Just run layout analysis. +// PageIterator* it = api_.AnalyseLayout(); +// EXPECT_FALSE(it == nullptr); +// // Make a scrollview window for the display. +// int width = pixGetWidth(src_pix_); +// int height = pixGetHeight(src_pix_); +// ScrollView* win = +// new ScrollView(kIms[i], 100, 100, width / 2, height / 2, width, height); +// win->Image(src_pix_, 0, 0); +// it->Begin(); +// ScrollView::Color color = ScrollView::RED; +// win->Brush(ScrollView::NONE); +// do { +// Pta* pts = it->BlockPolygon(); +// if (pts != nullptr) { +// win->Pen(color); +// int num_pts = ptaGetCount(pts); +// l_float32 x, y; +// ptaGetPt(pts, num_pts - 1, &x, &y); +// win->SetCursor(static_cast<int>(x), static_cast<int>(y)); +// for (int p = 0; p < num_pts; ++p) { +// ptaGetPt(pts, p, &x, &y); +// win->DrawTo(static_cast<int>(x), static_cast<int>(y)); +// } +// } +// ptaDestroy(&pts); +// } while (it->Next(tesseract::RIL_BLOCK)); +// win->Update(); +// delete win->AwaitEvent(SVET_DESTROY); +// delete win; +// delete it; +// } +//} + +// Tests that Tesseract gets exactly the right answer on phototest. +TEST_F(ResultIteratorTest, EasyTest) { + SetImage("phototest.tif"); + // Just run layout analysis. + PageIterator* p_it = api_.AnalyseLayout(); + EXPECT_FALSE(p_it == nullptr); + // Check iterator position. + EXPECT_TRUE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK)); + // This should be a single block. + EXPECT_FALSE(p_it->Next(tesseract::RIL_BLOCK)); + EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK)); + + // The images should rebuild almost perfectly. + LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)" << "\n"; + VerifyRebuilds(10, 10, 0, 0, 0, p_it); + delete p_it; + + char* result = api_.GetUTF8Text(); + ocr_text_ = result; + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // The images should rebuild almost perfectly. 
+ LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" << "\n"; + VerifyRebuilds(8, 8, 0, 0, 40, r_it); + // Test the text. + LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" << "\n"; + VerifyAllText(ocr_text_, r_it); + + // The images should rebuild almost perfectly. + LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" << "\n"; + VerifyRebuilds(8, 8, 0, 0, 40, r_it); + + r_it->Begin(); + // Test baseline of the first line. + int x1, y1, x2, y2; + r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2); + LOG(INFO) << absl::StrFormat("Baseline (%d,%d)->(%d,%d)", x1, y1, x2, y2) << "\n"; + // Make sure we have a decent vector. + EXPECT_GE(x2, x1 + 400); + // The point 200,116 should be very close to the baseline. + // (x3,y3) is the vector from (x1,y1) to (200,116) + int x3 = 200 - x1; + int y3 = 116 - y1; + x2 -= x1; + y2 -= y1; + // The cross product (x2,y1)x(x3,y3) should be small. + int product = x2 * y3 - x3 * y2; + EXPECT_LE(abs(product), x2); + + // Test font attributes for each word. + do { + bool bold, italic, underlined, monospace, serif, smallcaps; + int pointsize, font_id; + const char* font = + r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, + &serif, &smallcaps, &pointsize, &font_id); + float confidence = r_it->Confidence(tesseract::RIL_WORD); + EXPECT_GE(confidence, 80.0f); + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g", + word_str, font, font_id, pointsize, confidence) << "\n"; + delete[] word_str; + EXPECT_FALSE(bold); + EXPECT_FALSE(italic); + EXPECT_FALSE(underlined); + EXPECT_FALSE(monospace); + EXPECT_FALSE(serif); + // The text is about 31 pixels high. Above we say the source is 200 ppi, + // which translates to: + // 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts + EXPECT_GE(pointsize, 11.16 - 1.50); + EXPECT_LE(pointsize, 11.16 + 1.50); + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; +} + +// Tests image rebuild on the UNLV page numbered 8087_054.3B.tif. (Dubrovnik) +TEST_F(ResultIteratorTest, ComplexTest) { + SetImage("8087_054.3B.tif"); + // Just run layout analysis. + PageIterator* it = api_.AnalyseLayout(); + EXPECT_FALSE(it == nullptr); + // The images should rebuild almost perfectly. + VerifyRebuilds(2073, 2073, 2080, 2081, 2090, it); + delete it; +} + +// Tests image rebuild on the UNLV page numbered 8087_054.3G.tif. (Dubrovnik) +TEST_F(ResultIteratorTest, GreyTest) { + SetImage("8087_054.3G.tif"); + // Just run layout analysis. + PageIterator* it = api_.AnalyseLayout(); + EXPECT_FALSE(it == nullptr); + // The images should rebuild almost perfectly. + VerifyRebuilds(600, 600, 600, 600, 600, it); + delete it; +} + +// Tests that Tesseract gets smallcaps and dropcaps. +TEST_F(ResultIteratorTest, SmallCapDropCapTest) { + SetImage("8071_093.3B.tif"); + char* result = api_.GetUTF8Text(); + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the words. + int found_dropcaps = 0; + int found_smallcaps = 0; + int false_positives = 0; + do { + bool bold, italic, underlined, monospace, serif, smallcaps; + int pointsize, font_id; + r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, + &smallcaps, &pointsize, &font_id); + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + if (word_str != nullptr) { + LOG(INFO) << absl::StrFormat("Word %s is %s", word_str, + smallcaps ? 
"SMALLCAPS" : "Normal") << "\n"; + if (r_it->SymbolIsDropcap()) { + ++found_dropcaps; + } + if (strcmp(word_str, "SHE") == 0 || strcmp(word_str, "MOPED") == 0 || + strcmp(word_str, "RALPH") == 0 || + strcmp(word_str, "KINNEY") == 0 || // Not working yet. + strcmp(word_str, "BENNETT") == 0) { + EXPECT_TRUE(smallcaps) << word_str; + ++found_smallcaps; + } else { + if (smallcaps) ++false_positives; + } + // No symbol other than the first of any word should be dropcap. + ResultIterator s_it(*r_it); + while (s_it.Next(tesseract::RIL_SYMBOL) && + !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) { + if (s_it.SymbolIsDropcap()) { + char* sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); + LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str, + word_str); + delete[] sym_str; + } + EXPECT_FALSE(s_it.SymbolIsDropcap()); + } + delete[] word_str; + } + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; + EXPECT_EQ(1, found_dropcaps); + EXPECT_GE(4, found_smallcaps); + EXPECT_LE(false_positives, 3); +} + +#if 0 +// TODO(rays) uncomment on the next change to layout analysis. +// CL 22736106 breaks it, but it is fixed in the change when +// the textline finders start to collapse. + +// Tests that Tesseract gets subscript and superscript. +// TODO(rays) This test is a bit feeble, due to bad textline finding on this +// image, so beef up the test a bit when we get less false positive subs. +TEST_F(ResultIteratorTest, SubSuperTest) { + SetImage("0146_281.3B.tif"); + char* result = api_.GetUTF8Text(); + delete [] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the symbols. + // Accuracy isn't great, so just count up and expect a decent count of + // positives and negatives. + const char kAllowedSupers[] = "O0123456789-"; + int found_subs = 0; + int found_supers = 0; + int found_normal = 0; + do { + if (r_it->SymbolIsSubscript()) { + ++found_subs; + } else if (r_it->SymbolIsSuperscript()) { + result = r_it->GetUTF8Text(tesseract::RIL_SYMBOL); + if (strchr(kAllowedSupers, result[0]) == nullptr) { + char* word = r_it->GetUTF8Text(tesseract::RIL_WORD); + LOG(ERROR) << absl::StrFormat("Char %s in word %s is unexpected super!", + result, word); + delete [] word; + EXPECT_TRUE(strchr(kAllowedSupers, result[0]) != nullptr); + } + delete [] result; + ++found_supers; + } else { + ++found_normal; + } + } while (r_it->Next(tesseract::RIL_SYMBOL)); + delete r_it; + LOG(INFO) << absl::StrFormat("Subs = %d, supers= %d, normal = %d", + found_subs, found_supers, found_normal) << "\n"; + EXPECT_GE(found_subs, 25); + EXPECT_GE(found_supers, 25); + EXPECT_GE(found_normal, 1350); +} +#endif + +static const StrongScriptDirection dL = DIR_LEFT_TO_RIGHT; +static const StrongScriptDirection dR = DIR_RIGHT_TO_LEFT; +static const StrongScriptDirection dN = DIR_NEUTRAL; + +// Test that a sequence of words that could be interpreted to start from +// the left side left-to-right or from the right side right-to-left is +// interpreted appropriately in different contexts. 
+TEST_F(ResultIteratorTest, DualStartTextlineOrderTest) { + const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR}; + int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart, + 0, 1, 2, 3, ResultIterator::kMinorRunEnd}; + int reading_order_ltr_context[] = {0, 1, + 2, 3, + 4, ResultIterator::kMinorRunStart, + 7, 6, + 5, ResultIterator::kMinorRunEnd}; + + ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_ltr_context, + ABSL_ARRAYSIZE(reading_order_ltr_context)); + ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_rtl_context, + ABSL_ARRAYSIZE(reading_order_rtl_context)); +} + +// Tests that clearly left-direction text (with no right-to-left indications) +// comes out strictly left to right no matter the context. +TEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) { + const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dN, dL, dL}; + // The order here is just left to right, nothing fancy. + int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7}; + // In the strange event that this shows up in an RTL paragraph, nonetheless + // just presume the whole thing is an LTR line. + int reading_order_rtl_context[] = { + ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7, + ResultIterator::kMinorRunEnd}; + + ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_ltr_context, + ABSL_ARRAYSIZE(reading_order_ltr_context)); + ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_rtl_context, + ABSL_ARRAYSIZE(reading_order_rtl_context)); +} + +// Test that right-direction text comes out strictly right-to-left in +// a right-to-left context. +TEST_F(ResultIteratorTest, RightwardTextlineOrderTest) { + const StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR}; + // The order here is just right-to-left, nothing fancy. + int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0}; + ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_rtl_context, + ABSL_ARRAYSIZE(reading_order_rtl_context)); +} + +TEST_F(ResultIteratorTest, TextlineOrderSanityCheck) { + // Iterate through all 7-word sequences and make sure that the output + // contains each of the indices 0..6 exactly once. + const int kNumWords(7); + const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations + StrongScriptDirection word_dirs[kNumWords]; + for (int i = 0; i < kNumCombos; i++) { + // generate the next combination. + int tmp = i; + for (int j = 0; j < kNumWords; j++) { + word_dirs[j] = static_cast<StrongScriptDirection>(tmp % 4); + tmp = tmp / 4; + } + VerifySaneTextlineOrder(true, word_dirs, kNumWords); + VerifySaneTextlineOrder(false, word_dirs, kNumWords); + } +} + +// TODO: Missing image +TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) { + SetImage("5318c4b679264.jpg"); + char* result = api_.GetUTF8Text(); + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the words. 
+ do { + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + if (word_str != nullptr) { + LOG(INFO) << absl::StrFormat("Word %s:", word_str) << "\n"; + ResultIterator s_it = *r_it; + do { + tesseract::ChoiceIterator c_it(s_it); + do { + const char* char_str = c_it.GetUTF8Text(); + if (char_str == nullptr) + LOG(INFO) << "Null char choice" << "\n"; + else + LOG(INFO) << "Char choice " << char_str << "\n"; + CHECK(char_str != nullptr); + } while (c_it.Next()); + } while ( + !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && + s_it.Next(tesseract::RIL_SYMBOL)); + delete[] word_str; + } + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; +} + +// TODO: Missing image +TEST_F(ResultIteratorTest, NonNullConfidencesTest) { +// SetImage("line6.tiff"); + SetImage("trainingitalline.tif"); + api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK); + // Force recognition so we can used the result iterator. + // We don't care about the return from GetUTF8Text. + char* result = api_.GetUTF8Text(); + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the words. + do { + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + if (word_str != nullptr) { + EXPECT_FALSE(r_it->Empty(tesseract::RIL_WORD)); + EXPECT_FALSE(r_it->Empty(tesseract::RIL_SYMBOL)); + ResultIterator s_it = *r_it; + do { + const char* char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); + CHECK(char_str != nullptr); + float confidence = s_it.Confidence(tesseract::RIL_SYMBOL); + LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str, + confidence); + delete[] char_str; + } while ( + !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && + s_it.Next(tesseract::RIL_SYMBOL)); + delete[] word_str; + } else { + LOG(INFO) << "Empty word found" << "\n"; + } + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; +} + +} // namespace diff --git a/tesseract/unittest/scanutils_test.cc b/tesseract/unittest/scanutils_test.cc new file mode 100644 index 00000000..e6917fce --- /dev/null +++ b/tesseract/unittest/scanutils_test.cc @@ -0,0 +1,114 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <iostream> // for cout + +#include "include_gunit.h" +#include "scanutils.h" + +namespace tesseract { + +class ScanutilsTest : public ::testing::Test { + protected: + void SetUp() override { + } +}; + +TEST_F(ScanutilsTest, DoesScanf) { + // This test verifies that tfscanf does Scanf the same as stdio fscanf. + // There are probably a gazillion more test cases that could be added, but + // these brought the tesseract and unittest test results in line. 
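+  // tfscanf() comes from scanutils.h (included above) and is used here as a
+  // drop-in stand-in for stdio fscanf: the same file is opened twice and each
+  // format string is parsed with both functions, which must agree on return
+  // counts and on the parsed values.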
+ std::string filename = file::JoinPath(TESTDATA_DIR, "scanftest.txt"); + FILE* fp1 = fopen(filename.c_str(), "r"); + if (fp1 == nullptr) { + std::cout << "Failed to open file " << filename << '\n'; + GTEST_SKIP(); + } + FILE* fp2 = fopen(filename.c_str(), "r"); + if (fp2 == nullptr) { + std::cout << "Failed to open file " << filename << '\n'; + fclose(fp1); + GTEST_SKIP(); + } + // The file contains this: + // 42.5 17 0.001000 -0.001000 + // 0 1 123 -123 0x100 + // abcdefghijklmnopqrstuvwxyz + // abcdefghijklmnopqrstuvwxyz + // MF 25 6.25e-2 0.5e5 -1e+4 + // 42 MF 25 6.25e-2 0.5 + // 24 + const int kNumFloats = 4; + float f1[kNumFloats], f2[kNumFloats]; + int r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]); + int r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]); + EXPECT_EQ(r1, kNumFloats); + EXPECT_EQ(r2, kNumFloats); + if (r1 == r2) { + for (int i = 0; i < r1; ++i) { + EXPECT_FLOAT_EQ(f1[i], f2[i]); + } + } + const int kNumInts = 5; + int i1[kNumInts], i2[kNumInts]; + r1 = fscanf(fp1, "%d %d %d %d %i", &i1[0], &i1[1], &i1[2], &i1[3], &i1[4]); + r2 = tfscanf(fp2, "%d %d %d %d %i", &i2[0], &i2[1], &i2[2], &i2[3], &i2[4]); + EXPECT_EQ(r1, kNumInts); + EXPECT_EQ(r2, kNumInts); + if (r1 == r2) { + for (int i = 0; i < kNumInts; ++i) { + EXPECT_EQ(i1[i], i2[i]); + } + } + const int kStrLen = 1024; + char s1[kStrLen]; + char s2[kStrLen]; + r1 = fscanf(fp1, "%1023s", s1); + r2 = tfscanf(fp2, "%1023s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(26, strlen(s2)); + r1 = fscanf(fp1, "%20s", s1); + r2 = tfscanf(fp2, "%20s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(20, strlen(s2)); + // Now read the rest of the alphabet. + r1 = fscanf(fp1, "%1023s", s1); + r2 = tfscanf(fp2, "%1023s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(6, strlen(s2)); + r1 = fscanf(fp1, "%1023s", s1); + r2 = tfscanf(fp2, "%1023s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(2, strlen(s2)); + r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]); + r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]); + EXPECT_EQ(r1, r2); + for (int i = 0; i < kNumFloats; ++i) EXPECT_FLOAT_EQ(f1[i], f2[i]); + // Test the * for field suppression. + r1 = fscanf(fp1, "%d %*s %*d %*f %*f", &i1[0]); + r2 = tfscanf(fp2, "%d %*s %*d %*f %*f", &i2[0]); + EXPECT_EQ(r1, r2); + EXPECT_EQ(i1[0], i2[0]); + // We should still see the next value and no phantoms. + r1 = fscanf(fp1, "%d %1023s", &i1[0], s1); + r2 = tfscanf(fp2, "%d %1023s", &i2[0], s2); + EXPECT_EQ(r1, r2); + EXPECT_EQ(1, r2); + EXPECT_EQ(i1[0], i2[0]); + fclose(fp2); + fclose(fp1); +} + +} // namespace diff --git a/tesseract/unittest/shapetable_test.cc b/tesseract/unittest/shapetable_test.cc new file mode 100644 index 00000000..285ed833 --- /dev/null +++ b/tesseract/unittest/shapetable_test.cc @@ -0,0 +1,182 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
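+
+// A minimal sketch of the Shape calls exercised below (illustrative only;
+// 3 and 101 are just the unichar id and font id these tests happen to use):
+//
+//   Shape shape;
+//   shape.AddToShape(3 /* unichar_id */, 101 /* font_id */);
+//   EXPECT_TRUE(shape.ContainsUnichar(3));
+//   EXPECT_TRUE(shape.ContainsUnicharAndFont(3, 101));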
+ +#include <string> +#include <utility> + +#include "absl/strings/str_format.h" // for absl::StrFormat + +#include "include_gunit.h" + +#include "serialis.h" +#include "shapetable.h" +#include "unicharset.h" + +namespace tesseract { + +#ifndef DISABLED_LEGACY_ENGINE + +static std::string TmpNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); +} + +// Sets up a simple shape with some unichars. +static void Setup352(int font_id, Shape* shape) { + shape->AddToShape(3, font_id); + shape->AddToShape(5, font_id); + shape->AddToShape(2, font_id); +} + +// Verifies some properties of the 352 shape. +static void Expect352(int font_id, const Shape& shape) { + EXPECT_EQ(3, shape.size()); + EXPECT_TRUE(shape.ContainsUnichar(2)); + EXPECT_TRUE(shape.ContainsUnichar(3)); + EXPECT_TRUE(shape.ContainsUnichar(5)); + EXPECT_FALSE(shape.ContainsUnichar(1)); + EXPECT_TRUE(shape.ContainsUnicharAndFont(2, font_id)); + EXPECT_FALSE(shape.ContainsUnicharAndFont(2, font_id - 1)); + EXPECT_FALSE(shape.ContainsUnicharAndFont(font_id, 2)); + // It should be a subset of itself. + EXPECT_TRUE(shape.IsSubsetOf(shape)); +} + +#endif + +// The fixture for testing Shape. +class ShapeTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } +}; + +// Tests that a Shape works as expected for all the basic functions. +TEST_F(ShapeTest, BasicTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because Shape is missing. + GTEST_SKIP(); +#else + Shape shape1; + EXPECT_EQ(0, shape1.size()); + Setup352(101, &shape1); + Expect352(101, shape1); + // It should still work after file I/O. + std::string filename = TmpNameToPath("shapefile"); + FILE* fp = fopen(filename.c_str(), "wb"); + ASSERT_TRUE(fp != nullptr); + EXPECT_TRUE(shape1.Serialize(fp)); + fclose(fp); + TFile tfp; + EXPECT_TRUE(tfp.Open(filename.c_str(), nullptr)); + Shape shape2; + EXPECT_TRUE(shape2.DeSerialize(&tfp)); + Expect352(101, shape2); + // They should be subsets of each other. + EXPECT_TRUE(shape1.IsSubsetOf(shape2)); + EXPECT_TRUE(shape2.IsSubsetOf(shape1)); + // They should be equal unichars. + EXPECT_TRUE(shape1.IsEqualUnichars(&shape2)); + // and still pass afterwards. + Expect352(101, shape1); + Expect352(101, shape2); +#endif +} + +// Tests AddShape separately, as it takes quite a bit of work. +TEST_F(ShapeTest, AddShapeTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because Shape is missing. + GTEST_SKIP(); +#else + Shape shape1; + Setup352(101, &shape1); + Expect352(101, shape1); + // Now setup a different shape with different content. + Shape shape2; + shape2.AddToShape(3, 101); // Duplicates shape1. + shape2.AddToShape(5, 110); // Different font to shape1. + shape2.AddToShape(7, 101); // Different unichar to shape1. + // They should NOT be subsets of each other. + EXPECT_FALSE(shape1.IsSubsetOf(shape2)); + EXPECT_FALSE(shape2.IsSubsetOf(shape1)); + // Now add shape2 to shape1. + shape1.AddShape(shape2); + // Test subsets again. + EXPECT_FALSE(shape1.IsSubsetOf(shape2)); + EXPECT_TRUE(shape2.IsSubsetOf(shape1)); + EXPECT_EQ(4, shape1.size()); + EXPECT_FALSE(shape1.ContainsUnichar(1)); + EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 101)); + EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 110)); + EXPECT_FALSE(shape1.ContainsUnicharAndFont(3, 110)); + EXPECT_FALSE(shape1.ContainsUnicharAndFont(7, 110)); + EXPECT_FALSE(shape1.IsEqualUnichars(&shape2)); +#endif +} + +// The fixture for testing Shape. 
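+// (The table-level calls exercised by FullTest below, as a sketch mirroring
+// that test rather than a full description of the API:
+//   ShapeTable st(unicharset);
+//   int id = st.AddShape(unichar_id, font_id);   // one-entry shape
+//   st.MergeShapes(id_a, id_b);                  // folds id_b into id_a's master
+//   int master = st.MasterDestinationIndex(id_b);
+// )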
+class ShapeTableTest : public testing::Test {}; + +// Tests that a Shape works as expected for all the basic functions. +TEST_F(ShapeTableTest, FullTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because Shape is missing. + GTEST_SKIP(); +#else + Shape shape1; + Setup352(101, &shape1); + // Build a shape table with the same data, but in separate shapes. + UNICHARSET unicharset; + unicharset.unichar_insert(" "); + for (int i = 1; i <= 10; ++i) { + std::string class_str = absl::StrFormat("class%d", i); + unicharset.unichar_insert(class_str.c_str()); + } + ShapeTable st(unicharset); + EXPECT_EQ(0, st.AddShape(3, 101)); + EXPECT_EQ(1, st.AddShape(5, 101)); + EXPECT_EQ(2, st.AddShape(2, 101)); + EXPECT_EQ(3, st.NumShapes()); + Expect352(101, shape1); + EXPECT_EQ(3, st.AddShape(shape1)); + for (int i = 0; i < 3; ++i) { + EXPECT_FALSE(st.MutableShape(i)->IsEqualUnichars(&shape1)); + } + EXPECT_TRUE(st.MutableShape(3)->IsEqualUnichars(&shape1)); + EXPECT_TRUE(st.AnyMultipleUnichars()); + st.DeleteShape(3); + EXPECT_FALSE(st.AnyMultipleUnichars()); + + // Now merge to make a single shape like shape1. + EXPECT_EQ(1, st.MasterUnicharCount(0)); + st.MergeShapes(0, 1); + EXPECT_EQ(3, st.MergedUnicharCount(1, 2)); + st.MergeShapes(1, 2); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(3, st.MasterUnicharCount(i)); + // Master font count is the sum of all the font counts in the shape, not + // the actual number of different fonts in the shape. + EXPECT_EQ(3, st.MasterFontCount(i)); + } + EXPECT_EQ(0, st.MasterDestinationIndex(1)); + EXPECT_EQ(0, st.MasterDestinationIndex(2)); + ShapeTable st2; + st2.AppendMasterShapes(st, nullptr); + EXPECT_EQ(1, st.NumMasterShapes()); + EXPECT_EQ(1, st2.NumShapes()); + EXPECT_TRUE(st2.MutableShape(0)->IsEqualUnichars(&shape1)); + EXPECT_TRUE(st2.AnyMultipleUnichars()); +#endif +} + +} // namespace diff --git a/tesseract/unittest/stats_test.cc b/tesseract/unittest/stats_test.cc new file mode 100644 index 00000000..58c3483d --- /dev/null +++ b/tesseract/unittest/stats_test.cc @@ -0,0 +1,59 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "genericvector.h" +#include "kdpair.h" +#include "statistc.h" + +#include "include_gunit.h" + +namespace tesseract { + +const int kTestData[] = {2, 0, 12, 1, 1, 2, 10, 1, 0, 0, 0, 2, 0, 4, 1, 1}; + +class STATSTest : public testing::Test { + public: + void SetUp() { + std::locale::global(std::locale("")); + stats_.set_range(0, 16); + for (size_t i = 0; i < ARRAYSIZE(kTestData); ++i) + stats_.add(i, kTestData[i]); + } + + void TearDown() {} + + STATS stats_; +}; + +// Tests some basic numbers from the stats_. +TEST_F(STATSTest, BasicStats) { + EXPECT_EQ(37, stats_.get_total()); + EXPECT_EQ(2, stats_.mode()); + EXPECT_EQ(12, stats_.pile_count(2)); +} + +// Tests the top_n_modes function. 
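+// (The expected modes can be reproduced by hand from kTestData; a sketch of
+// the arithmetic, not part of the API under test:
+//   mode 0 spans buckets 2..4 with counts 12, 1, 1 -> total 14,
+//     weighted mean = (2*12 + 3*1 + 4*1) / 14 = 31/14 = 2 + 3/14;
+//   mode 1 spans buckets 5..7 with counts 2, 10, 1 -> total 13,
+//     weighted mean = (5*2 + 6*10 + 7*1) / 13 = 77/13 = 5 + 12/13;
+//   mode 2 spans buckets 13..15 with counts 4, 1, 1 -> total 6,
+//     weighted mean = (13*4 + 14*1 + 15*1) / 6 = 81/6 = 13.5.)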
+TEST_F(STATSTest, TopNModes) { + GenericVector<tesseract::KDPairInc<float, int> > modes; + int num_modes = stats_.top_n_modes(3, &modes); + EXPECT_EQ(3, num_modes); + // Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14. + EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key()); + EXPECT_EQ(14, modes[0].data()); + // Mode 1 is 2 10 1 = 13 total count with a mean of 5 12/13. + EXPECT_FLOAT_EQ(5.0f + 12.0f / 13, modes[1].key()); + EXPECT_EQ(13, modes[1].data()); + // Mode 2 is 4 1 1 = 6 total count with a mean of 13.5. + EXPECT_FLOAT_EQ(13.5f, modes[2].key()); + EXPECT_EQ(6, modes[2].data()); +} + +} // namespace. diff --git a/tesseract/unittest/stridemap_test.cc b/tesseract/unittest/stridemap_test.cc new file mode 100644 index 00000000..fa1ef234 --- /dev/null +++ b/tesseract/unittest/stridemap_test.cc @@ -0,0 +1,219 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef INCLUDE_TENSORFLOW +#include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D +#else +#include <array> // std::array +#endif +#include "include_gunit.h" +#include "stridemap.h" + +namespace tesseract { + +#if !defined(INCLUDE_TENSORFLOW) && 0 +namespace xla { + +template <typename T> +class Array2D : public std::vector<T> { + public: + Array2D() : std::vector<T>(std::vector<int64_t>{0, 0}) {} + + Array2D(const int64_t n1, const int64_t n2) + : std::vector<T>(std::vector<int64_t>{n1, n2}) {} + + Array2D(const int64_t n1, const int64_t n2, const T value) + : std::vector<T>({n1, n2}, value) {} +}; +} +#endif + +class StridemapTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + +#ifdef INCLUDE_TENSORFLOW + // Sets up an Array2d object of the given size, initialized to increasing + // values starting with start. + std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) { + std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize)); + int value = start; + for (int y = 0; y < ysize; ++y) { + for (int x = 0; x < xsize; ++x) { +#ifdef INCLUDE_TENSORFLOW + (*a)(y, x) = value++; +#else + a[y][x] = value++; +#endif + } + } + return a; + } +#endif +}; + +TEST_F(StridemapTest, Indexing) { + // This test verifies that with a batch of arrays of different sizes, the + // iteration index each of them in turn, without going out of bounds. 
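+  // (A rough sketch of the expectation: StrideMap::SetStride records one
+  // (height, width) pair per batch entry, and StrideMap::Index then walks
+  // every (batch, y, x) position in turn, so the forward loop below should
+  // visit the values 0, 1, 2, ... that SetupArray wrote, in exactly that
+  // order, and the backward loop should visit them in reverse.)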
+#ifdef INCLUDE_TENSORFLOW + std::vector<std::unique_ptr<xla::Array2D<int>>> arrays; + arrays.push_back(SetupArray(3, 4, 0)); + arrays.push_back(SetupArray(4, 5, 12)); + arrays.push_back(SetupArray(4, 4, 32)); + arrays.push_back(SetupArray(3, 5, 48)); + std::vector<std::pair<int, int>> h_w_sizes; + for (size_t i = 0; i < arrays.size(); ++i) { + h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width()); + } + StrideMap stride_map; + stride_map.SetStride(h_w_sizes); + StrideMap::Index index(stride_map); + int pos = 0; + do { + EXPECT_GE(index.t(), pos); + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + pos); + EXPECT_EQ(index.IsLast(FD_BATCH), + index.index(FD_BATCH) == arrays.size() - 1); + EXPECT_EQ( + index.IsLast(FD_HEIGHT), + index.index(FD_HEIGHT) == arrays[index.index(FD_BATCH)]->height() - 1); + EXPECT_EQ( + index.IsLast(FD_WIDTH), + index.index(FD_WIDTH) == arrays[index.index(FD_BATCH)]->width() - 1); + EXPECT_TRUE(index.IsValid()); + ++pos; + } while (index.Increment()); + LOG(INFO) << "pos=" << pos; + index.InitToLast(); + do { + --pos; + EXPECT_GE(index.t(), pos); + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + pos); + StrideMap::Index copy(index); + // Since a change in batch index changes the height and width, it isn't + // necessarily true that the position is still valid, even when changing + // to another valid batch index. + if (index.IsLast(FD_BATCH)) { + EXPECT_FALSE(copy.AddOffset(1, FD_BATCH)); + } + copy = index; + EXPECT_EQ(index.IsLast(FD_HEIGHT), !copy.AddOffset(1, FD_HEIGHT)); + copy = index; + EXPECT_EQ(index.IsLast(FD_WIDTH), !copy.AddOffset(1, FD_WIDTH)); + copy = index; + if (index.index(FD_BATCH) == 0) { + EXPECT_FALSE(copy.AddOffset(-1, FD_BATCH)); + } + copy = index; + EXPECT_EQ(index.index(FD_HEIGHT) == 0, !copy.AddOffset(-1, FD_HEIGHT)); + copy = index; + EXPECT_EQ(index.index(FD_WIDTH) == 0, !copy.AddOffset(-1, FD_WIDTH)); + copy = index; + EXPECT_FALSE(copy.AddOffset(10, FD_WIDTH)); + copy = index; + EXPECT_FALSE(copy.AddOffset(-10, FD_HEIGHT)); + EXPECT_TRUE(index.IsValid()); + } while (index.Decrement()); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +TEST_F(StridemapTest, Scaling) { + // This test verifies that with a batch of arrays of different sizes, the + // scaling/reduction functions work as expected. +#ifdef INCLUDE_TENSORFLOW + std::vector<std::unique_ptr<xla::Array2D<int>>> arrays; + arrays.push_back(SetupArray(3, 4, 0)); // 0-11 + arrays.push_back(SetupArray(4, 5, 12)); // 12-31 + arrays.push_back(SetupArray(4, 4, 32)); // 32-47 + arrays.push_back(SetupArray(3, 5, 48)); // 48-62 + std::vector<std::pair<int, int>> h_w_sizes; + for (size_t i = 0; i < arrays.size(); ++i) { + h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width()); + } + StrideMap stride_map; + stride_map.SetStride(h_w_sizes); + + // Scale x by 2, keeping y the same. 
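+  // (Derivation of values_x2, as these expectations read: scaling x by 2
+  // halves each entry's width, rounding down, and iteration then visits the
+  // surviving (y, x) positions of the original arrays -- for the first 3x4
+  // array that is x in {0, 1} of each row, i.e. 0, 1, 4, 5, 8, 9, and the
+  // remaining arrays follow the same pattern.)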
+ std::vector<int> values_x2 = {0, 1, 4, 5, 8, 9, 12, 13, 17, 18, + 22, 23, 27, 28, 32, 33, 36, 37, 40, 41, + 44, 45, 48, 49, 53, 54, 58, 59}; + StrideMap test_map(stride_map); + test_map.ScaleXY(2, 1); + StrideMap::Index index(test_map); + int pos = 0; + do { + int expected_value = values_x2[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_x2.size()); + + test_map = stride_map; + // Scale y by 2, keeping x the same. + std::vector<int> values_y2 = {0, 1, 2, 3, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 32, 33, 34, 35, + 36, 37, 38, 39, 48, 49, 50, 51, 52}; + test_map.ScaleXY(1, 2); + index.InitToFirst(); + pos = 0; + do { + int expected_value = values_y2[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_y2.size()); + + test_map = stride_map; + // Scale x and y by 2. + std::vector<int> values_xy2 = {0, 1, 12, 13, 17, 18, 32, 33, 36, 37, 48, 49}; + test_map.ScaleXY(2, 2); + index.InitToFirst(); + pos = 0; + do { + int expected_value = values_xy2[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_xy2.size()); + + test_map = stride_map; + // Reduce Width to 1. + std::vector<int> values_x_to_1 = {0, 4, 8, 12, 17, 22, 27, + 32, 36, 40, 44, 48, 53, 58}; + test_map.ReduceWidthTo1(); + index.InitToFirst(); + pos = 0; + do { + int expected_value = values_x_to_1[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_x_to_1.size()); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +} // namespace diff --git a/tesseract/unittest/stringrenderer_test.cc b/tesseract/unittest/stringrenderer_test.cc new file mode 100644 index 00000000..8cba6e4f --- /dev/null +++ b/tesseract/unittest/stringrenderer_test.cc @@ -0,0 +1,564 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
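+
+// The call pattern repeated throughout these tests, as a minimal sketch
+// (the font name and 600x600 page size are simply the values the tests use):
+//
+//   StringRenderer renderer("Verdana 10", 600, 600);
+//   Pix* pix = nullptr;
+//   renderer.RenderToImage(text, strlen(text), &pix);
+//   const std::vector<BoxChar*>& boxes = renderer.GetBoxes();
+//   pixDestroy(&pix);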
+ +#include "include_gunit.h" + +#include "boxchar.h" +#include "boxread.h" +#include "commandlineflags.h" +#include "stringrenderer.h" +#include "strngs.h" + +#include "absl/strings/str_split.h" // for absl::StrSplit +#include "allheaders.h" + +#include <memory> +#include <string> + +BOOL_PARAM_FLAG(display, false, "Display image for inspection"); + +namespace tesseract { + +const char kEngText[] = "the quick brown fox jumps over the lazy dog"; +const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; + +const char kKorText[] = "이는 것으로 다시 넣을 1234 수는 있지만 선택의 의미는"; +const char kArabicText[] = + "والفكر والصراع ، بالتأمل والفهم والتحليل ، " + "بالعلم والفن ، وأخيرا بالضحك أوبالبكاء ، "; +const char kMixedText[] = "والفكر 123 والصراع abc"; + +const char kEngNonLigatureText[] = "fidelity"; +// Same as kEngNonLigatureText, but with "fi" replaced with its ligature. +const char kEngLigatureText[] = "fidelity"; + +static PangoFontMap* font_map; + +class StringRendererTest : public ::testing::Test { + protected: + void SetUp() override { + if (!font_map) { + font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); + } + pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map)); + } + + static void SetUpTestCase() { + static std::locale system_locale(""); + std::locale::global(system_locale); + + l_chooseDisplayProg(L_DISPLAY_WITH_XZGV); + FLAGS_fonts_dir = TESTING_DIR; + FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; + file::MakeTmpdir(); + PangoFontInfo::SoftInitFontConfig(); // init early + } + + void DisplayClusterBoxes(Pix* pix) { + if (!FLAGS_display) return; + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + Boxa* boxes = boxaCreate(0); + for (const auto& boxchar : boxchars) { + if (boxchar->box()) + boxaAddBox(boxes, const_cast<Box*>(boxchar->box()), L_CLONE); + } + Pix* box_pix = pixDrawBoxaRandom(pix, boxes, 1); + boxaDestroy(&boxes); + pixDisplay(box_pix, 0, 0); + pixDestroy(&box_pix); + } + std::unique_ptr<StringRenderer> renderer_; +}; + +TEST_F(StringRendererTest, DoesRenderToImage) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + renderer_.reset(new StringRenderer("UnBatang 10", 600, 600)); + EXPECT_EQ(strlen(kKorText), + renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + renderer_.reset(new StringRenderer("Lohit Hindi 10", 600, 600)); + EXPECT_EQ(strlen(kHinText), + renderer_->RenderToImage(kHinText, strlen(kHinText), &pix)); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + // RTL text + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + EXPECT_EQ(strlen(kArabicText), + renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + // Mixed direction Arabic + english text + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + EXPECT_EQ(strlen(kMixedText), + renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, 
DoesRenderToImageWithUnderline) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + // Underline all words but NOT intervening spaces. + renderer_->set_underline_start_prob(1.0); + renderer_->set_underline_continuation_prob(0); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + renderer_->ClearBoxes(); + + // Underline all words AND intervening spaces. + renderer_->set_underline_start_prob(1.0); + renderer_->set_underline_continuation_prob(1.0); + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + renderer_->ClearBoxes(); + + // Underline words and intervening spaces with 0.5 prob. + renderer_->set_underline_start_prob(0.5); + renderer_->set_underline_continuation_prob(0.5); + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesHandleNewlineCharacters) { + const char kRawText[] = "\n\n\n A \nB \nC \n\n\n"; + const char kStrippedText[] = " A B C "; // text with newline chars removed + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kRawText), + renderer_->RenderToImage(kRawText, strlen(kRawText), &pix)); + EXPECT_TRUE(pix != nullptr); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + // 3 characters + 4 spaces => 7 boxes + EXPECT_EQ(7, boxchars.size()); + if (boxchars.size() == 7) { + // Verify the text content of the boxchars + for (size_t i = 0; i < boxchars.size(); ++i) { + EXPECT_EQ(std::string(1, kStrippedText[i]), boxchars[i]->ch()); + } + } + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesRenderLigatures) { + renderer_.reset(new StringRenderer("Arab 12", 600, 250)); + const char kArabicLigature[] = "لا"; + + Pix* pix = nullptr; + EXPECT_EQ( + strlen(kArabicLigature), + renderer_->RenderToImage(kArabicLigature, strlen(kArabicLigature), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + const std::vector<BoxChar*>& boxes = renderer_->GetBoxes(); + EXPECT_EQ(1, boxes.size()); + EXPECT_TRUE(boxes[0]->box() != nullptr); + EXPECT_STREQ(kArabicLigature, boxes[0]->ch().c_str()); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + renderer_.reset(new StringRenderer("Arab 12", 600, 250)); + const char kArabicMixedText[] = "والفكر والصراع 1234,\nوالفكر لا والصراع"; + renderer_->RenderToImage(kArabicMixedText, strlen(kArabicMixedText), &pix); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +static int FindBoxCharXCoord(const std::vector<BoxChar*>& boxchars, + const std::string& ch) { + for (const auto& boxchar : boxchars) { + if (boxchar->ch() == ch) return boxchar->box()->x; + } + return INT_MAX; +} + +TEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) { + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + Pix* pix = nullptr; + // Arabic letters should be in decreasing x-coordinates + const char kArabicWord[] = "\u0644\u0627\u0641\u0643\u0631"; + const std::string kRevWord = "\u0631\u0643\u0641\u0627\u0644"; + renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix); 
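+  // GetBoxesStr() is expected to serialize the rendered boxchars as box-file
+  // style text, which ReadMemBoxes() parses back below; only the per-box text
+  // strings (&texts) are kept here and the geometry arguments are left null.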
+ std::string boxes_str = renderer_->GetBoxesStr(); + // Decode to get the box text strings. + EXPECT_FALSE(boxes_str.empty()); + std::vector<STRING> texts; + EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts, + nullptr, nullptr)); + std::string ltr_str; + for (size_t i = 0; i < texts.size(); ++i) { + ltr_str += texts[i].c_str(); + } + // The string should come out perfectly reversed, despite there being a + // ligature. + EXPECT_EQ(ltr_str, kRevWord); + // Just to prove there was a ligature, the number of texts is less than the + // number of unicodes. + EXPECT_LT(texts.size(), 5); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) { + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + Pix* pix = nullptr; + // Arabic letters should be in decreasing x-coordinates + const char kArabicWord[] = "والفكر"; + renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + for (size_t i = 1; i < boxchars.size(); ++i) { + EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) + << boxchars[i - 1]->ch(); + } + pixDestroy(&pix); + + // English letters should be in increasing x-coordinates + const char kEnglishWord[] = "Google"; + renderer_->ClearBoxes(); + renderer_->RenderToImage(kEnglishWord, strlen(kEnglishWord), &pix); + EXPECT_EQ(boxchars.size(), strlen(kEnglishWord)); + for (size_t i = 1; i < boxchars.size(); ++i) { + EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) + << boxchars[i - 1]->ch(); + } + pixDestroy(&pix); + + // Mixed text should satisfy both. + renderer_->ClearBoxes(); + renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix); + EXPECT_LT(FindBoxCharXCoord(boxchars, "a"), FindBoxCharXCoord(boxchars, "b")); + EXPECT_LT(FindBoxCharXCoord(boxchars, "1"), FindBoxCharXCoord(boxchars, "2")); + EXPECT_GT(FindBoxCharXCoord(boxchars, "و"), FindBoxCharXCoord(boxchars, "ر")); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesRenderVerticalText) { + Pix* pix = nullptr; + renderer_.reset(new StringRenderer("UnBatang 10", 600, 600)); + renderer_->set_vertical_text(true); + EXPECT_EQ(strlen(kKorText), + renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +// Checks that we preserve charboxes across RenderToImage calls, with +// appropriate page numbers. 
+TEST_F(StringRendererTest, DoesKeepAllImageBoxes) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + int num_boxes_per_page = 0; + const int kNumTrials = 2; + for (int i = 0; i < kNumTrials; ++i) { + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + pixDestroy(&pix); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + if (!num_boxes_per_page) { + num_boxes_per_page = renderer_->GetBoxes().size(); + } else { + EXPECT_EQ((i + 1) * num_boxes_per_page, renderer_->GetBoxes().size()); + } + for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page; + ++j) { + EXPECT_EQ(i, renderer_->GetBoxes()[j]->page()); + } + } +} + +TEST_F(StringRendererTest, DoesClearBoxes) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + pixDestroy(&pix); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + const int num_boxes_per_page = renderer_->GetBoxes().size(); + + renderer_->ClearBoxes(); + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + pixDestroy(&pix); + EXPECT_EQ(num_boxes_per_page, renderer_->GetBoxes().size()); +} + +TEST_F(StringRendererTest, DoesLigatureTextForRendering) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + renderer_->set_add_ligatures(true); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngNonLigatureText), + renderer_->RenderToImage(kEngNonLigatureText, + strlen(kEngNonLigatureText), &pix)); + pixDestroy(&pix); + // There should be one less box than letters due to the 'fi' ligature. + EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size()); + // The output box text should be ligatured. + EXPECT_STREQ("fi", renderer_->GetBoxes()[0]->ch().c_str()); +} + +TEST_F(StringRendererTest, DoesRetainInputLigatureForRendering) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngLigatureText), + renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText), + &pix)); + pixDestroy(&pix); + // There should be one less box than letters due to the 'fi' ligature. + EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size()); + // The output box text should be ligatured. + EXPECT_STREQ("\uFB01", renderer_->GetBoxes()[0]->ch().c_str()); +} + +TEST_F(StringRendererTest, DoesStripUnrenderableWords) { + // Verdana should only be able to render the english letters and numbers in + // the mixed text. 
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + std::string text(kMixedText); + EXPECT_GT(renderer_->StripUnrenderableWords(&text), 0); + EXPECT_EQ(" 123 abc", text); +} + +TEST_F(StringRendererTest, DoesRenderWordBoxes) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + renderer_->set_output_word_boxes(true); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + pixDestroy(&pix); + // Verify #boxchars = #words + #spaces + std::vector<std::string> words = + absl::StrSplit(kEngText, ' ', absl::SkipEmpty()); + const int kNumSpaces = words.size() - 1; + const int kExpectedNumBoxes = words.size() + kNumSpaces; + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + EXPECT_EQ(kExpectedNumBoxes, boxchars.size()); + // Verify content of words and spaces + for (size_t i = 0; i < boxchars.size(); i += 2) { + EXPECT_EQ(words[i / 2], boxchars[i]->ch()); + if (i < boxchars.size() - 1) { + EXPECT_EQ(" ", boxchars[i + 1]->ch()); + EXPECT_TRUE(boxchars[i + 1]->box() == nullptr); + } + } +} + +TEST_F(StringRendererTest, DoesRenderWordBoxesFromMultiLineText) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + renderer_->set_output_word_boxes(true); + Pix* pix = nullptr; + const char kMultlineText[] = "the quick brown fox\njumps over the lazy dog"; + EXPECT_EQ(strlen(kMultlineText), + renderer_->RenderToImage(kMultlineText, strlen(kEngText), &pix)); + pixDestroy(&pix); + // Verify #boxchars = #words + #spaces + #newlines + std::vector<std::string> words = + absl::StrSplit(kMultlineText, absl::ByAnyChar(" \n"), absl::SkipEmpty()); + const int kNumSeparators = words.size() - 1; + const int kExpectedNumBoxes = words.size() + kNumSeparators; + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + EXPECT_EQ(kExpectedNumBoxes, boxchars.size()); + // Verify content of words and spaces + for (size_t i = 0; i < boxchars.size(); i += 2) { + EXPECT_EQ(words[i / 2], boxchars[i]->ch()); + if (i + 1 < boxchars.size()) { + EXPECT_EQ(" ", boxchars[i + 1]->ch()); + EXPECT_TRUE(boxchars[i + 1]->box() == nullptr); + } + } +} + +TEST_F(StringRendererTest, DoesRenderAllFontsToImage) { + renderer_.reset(new StringRenderer("Verdana 10", 1200, 1200)); + size_t offset = 0; + std::string font_used; + do { + Pix* pix = nullptr; + font_used.clear(); + offset += renderer_->RenderAllFontsToImage( + 1.0, kEngText + offset, strlen(kEngText + offset), &font_used, &pix); + if (offset < strlen(kEngText)) { + EXPECT_TRUE(pix != nullptr); + EXPECT_STRNE("", font_used.c_str()); + } + if (FLAGS_display) pixDisplay(pix, 0, 0); + pixDestroy(&pix); + } while (offset < strlen(kEngText)); +} + +TEST_F(StringRendererTest, DoesNotRenderWordJoiner) { + renderer_.reset(new StringRenderer("Verdana 10", 500, 200)); + const std::string word = "A- -B C-D A BC"; + const std::string joined_word = StringRenderer::InsertWordJoiners(word); + Pix* pix = nullptr; + renderer_->RenderToImage(joined_word.c_str(), joined_word.length(), &pix); + pixDestroy(&pix); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + const std::string kWordJoinerUTF8 = "\u2060"; + ASSERT_EQ(word.length(), boxchars.size()); + for (size_t i = 0; i < boxchars.size(); ++i) { + EXPECT_NE(kWordJoinerUTF8, boxchars[i]->ch()); + EXPECT_EQ(word.substr(i, 1), boxchars[i]->ch()); + } +} + +TEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) { + renderer_.reset(new StringRenderer("Verdana 10", 500, 200)); + 
renderer_->set_drop_uncovered_chars(true); + const std::string kWord = "office"; + const std::string kCleanWord = "oice"; + Pix* pix = nullptr; + EXPECT_FALSE( + renderer_->font().CanRenderString(kWord.c_str(), kWord.length())); + EXPECT_FALSE(renderer_->font().CoversUTF8Text(kWord.c_str(), kWord.length())); + int offset = renderer_->RenderToImage(kWord.c_str(), kWord.length(), &pix); + pixDestroy(&pix); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + EXPECT_EQ(kWord.length(), offset); + ASSERT_EQ(kCleanWord.length(), boxchars.size()); + for (size_t i = 0; i < boxchars.size(); ++i) { + EXPECT_EQ(kCleanWord.substr(i, 1), boxchars[i]->ch()); + } +} + +// ------------ StringRenderer::ConvertBasicLatinToFullwidthLatin() ------------ + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesConvertBasicLatin) { + const std::string kHalfAlpha = "ABCD"; + const std::string kFullAlpha = "ABCD"; + EXPECT_EQ(kFullAlpha, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha)); + + const std::string kHalfDigit = "0123"; + const std::string kFullDigit = "0123"; + EXPECT_EQ(kFullDigit, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit)); + + const std::string kHalfSym = "()[]:;!?"; + const std::string kFullSym = "()[]:;!?"; + EXPECT_EQ(kFullSym, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym)); +} + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertFullwidthLatin) { + const std::string kFullAlpha = "ABCD"; + EXPECT_EQ(kFullAlpha, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha)); + + const std::string kFullDigit = "0123"; + EXPECT_EQ(kFullDigit, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit)); + + const std::string kFullSym = "()[]:;!?"; + EXPECT_EQ(kFullSym, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym)); +} + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertNonLatin) { + const std::string kHalfKana = "アイウエオ"; + const std::string kFullKana = "アイウエオ"; + EXPECT_EQ(kHalfKana, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana)); + EXPECT_EQ(kFullKana, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana)); +} + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) { + const std::string kHalfSpace = " "; + const std::string kFullSpace = " "; + EXPECT_EQ(kHalfSpace, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace)); + EXPECT_EQ(kFullSpace, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace)); +} + +// ------------ StringRenderer::ConvertFullwidthLatinToBasicLatin() ------------ + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesConvertFullwidthLatin) { + const std::string kHalfAlpha = "ABCD"; + const std::string kFullAlpha = "ABCD"; + EXPECT_EQ(kHalfAlpha, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha)); + + const std::string kHalfDigit = "0123"; + const std::string kFullDigit = "0123"; + EXPECT_EQ(kHalfDigit, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit)); + + const std::string kHalfSym = "()[]:;!?"; + const std::string kFullSym = "()[]:;!?"; + EXPECT_EQ(kHalfSym, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym)); +} + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertBasicLatin) { + const std::string kHalfAlpha = "ABCD"; + EXPECT_EQ(kHalfAlpha, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha)); + + const std::string kHalfDigit = "0123"; + EXPECT_EQ(kHalfDigit, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit)); + + const std::string kHalfSym = 
"()[]:;!?"; + EXPECT_EQ(kHalfSym, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym)); +} + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertNonLatin) { + const std::string kHalfKana = "アイウエオ"; + const std::string kFullKana = "アイウエオ"; + EXPECT_EQ(kHalfKana, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana)); + EXPECT_EQ(kFullKana, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana)); +} + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertSpace) { + const std::string kHalfSpace = " "; + const std::string kFullSpace = " "; + EXPECT_EQ(kHalfSpace, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace)); + EXPECT_EQ(kFullSpace, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace)); +} +} // namespace diff --git a/tesseract/unittest/syntaxnet/base.h b/tesseract/unittest/syntaxnet/base.h new file mode 100644 index 00000000..5dabbbda --- /dev/null +++ b/tesseract/unittest/syntaxnet/base.h @@ -0,0 +1,61 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef SYNTAXNET_BASE_H_ +#define SYNTAXNET_BASE_H_ + +#include <functional> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +#include "google/protobuf/util/message_differencer.h" + + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/default/integral_types.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/protobuf.h" + + + +using tensorflow::int8; +using tensorflow::int16; +using tensorflow::int32; +using tensorflow::int64; +using tensorflow::uint8; +using tensorflow::uint16; +using tensorflow::uint64; +using tensorflow::uint32; +using tensorflow::protobuf::TextFormat; +using tensorflow::mutex_lock; +using tensorflow::mutex; +using std::map; +using std::pair; +using std::vector; +using std::unordered_map; +using std::unordered_set; +typedef signed int char32; + +using tensorflow::StringPiece; +using std::string; + + + // namespace syntaxnet + +#endif // SYNTAXNET_BASE_H_ diff --git a/tesseract/unittest/tablefind_test.cc b/tesseract/unittest/tablefind_test.cc new file mode 100644 index 00000000..df6d511c --- /dev/null +++ b/tesseract/unittest/tablefind_test.cc @@ -0,0 +1,261 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "colpartition.h" +#include "colpartitiongrid.h" +#include "tablefind.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TestableTableFinder : public tesseract::TableFinder { + public: + using TableFinder::GapInXProjection; + using TableFinder::HasLeaderAdjacent; + using TableFinder::InsertLeaderPartition; + using TableFinder::InsertTextPartition; + using TableFinder::set_global_median_blob_width; + using TableFinder::set_global_median_ledding; + using TableFinder::set_global_median_xheight; + using TableFinder::SplitAndInsertFragmentedTextPartition; + + void ExpectPartition(const TBOX& box) { + tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_); + gsearch.SetUniqueMode(true); + gsearch.StartFullSearch(); + ColPartition* part = nullptr; + bool found = false; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (part->bounding_box().left() == box.left() && + part->bounding_box().bottom() == box.bottom() && + part->bounding_box().right() == box.right() && + part->bounding_box().top() == box.top()) { + found = true; + } + } + EXPECT_TRUE(found); + } + void ExpectPartitionCount(int expected_count) { + tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_); + gsearch.SetUniqueMode(true); + gsearch.StartFullSearch(); + ColPartition* part = nullptr; + int count = 0; + while ((part = gsearch.NextFullSearch()) != nullptr) { + ++count; + } + EXPECT_EQ(expected_count, count); + } +}; + +class TableFinderTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + free_boxes_it_.set_to_list(&free_boxes_); + finder_.reset(new TestableTableFinder()); + finder_->Init(1, ICOORD(0, 0), ICOORD(500, 500)); + // gap finding + finder_->set_global_median_xheight(5); + finder_->set_global_median_blob_width(5); + } + + void TearDown() { + if (partition_.get() != nullptr) partition_->DeleteBoxes(); + DeletePartitionListBoxes(); + finder_.reset(nullptr); + } + + void MakePartition(int x_min, int y_min, int x_max, int y_max) { + MakePartition(x_min, y_min, x_max, y_max, 0, 0); + } + + void MakePartition(int x_min, int y_min, int x_max, int y_max, + int first_column, int last_column) { + if (partition_.get() != nullptr) partition_->DeleteBoxes(); + TBOX box; + box.set_to_given_coords(x_min, y_min, x_max, y_max); + partition_.reset( + ColPartition::FakePartition(box, PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE)); + partition_->set_first_column(first_column); + partition_->set_last_column(last_column); + } + + void InsertTextPartition(ColPartition* part) { + finder_->InsertTextPartition(part); + free_boxes_it_.add_after_then_move(part); + } + + void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max) { + InsertLeaderPartition(x_min, y_min, x_max, y_max, 0, 0); + } + + void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max, + int first_column, int last_column) { + TBOX box; + box.set_to_given_coords(x_min, y_min, x_max, y_max); + ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, + BRT_UNKNOWN, BTFT_LEADER); + part->set_first_column(first_column); + part->set_last_column(last_column); + finder_->InsertLeaderPartition(part); + free_boxes_it_.add_after_then_move(part); + } + + void DeletePartitionListBoxes() { + for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list(); + free_boxes_it_.forward()) { + ColPartition* part = free_boxes_it_.data(); + 
part->DeleteBoxes(); + } + } + + std::unique_ptr<TestableTableFinder> finder_; + std::unique_ptr<ColPartition> partition_; + + private: + tesseract::ColPartition_CLIST free_boxes_; + tesseract::ColPartition_C_IT free_boxes_it_; +}; + +TEST_F(TableFinderTest, GapInXProjectionNoGap) { + int data[100]; + for (int i = 0; i < 100; ++i) data[i] = 10; + EXPECT_FALSE(finder_->GapInXProjection(data, 100)); +} + +TEST_F(TableFinderTest, GapInXProjectionEdgeGap) { + int data[100]; + for (int i = 0; i < 10; ++i) data[i] = 2; + for (int i = 10; i < 90; ++i) data[i] = 10; + for (int i = 90; i < 100; ++i) data[i] = 2; + EXPECT_FALSE(finder_->GapInXProjection(data, 100)); +} + +TEST_F(TableFinderTest, GapInXProjectionExists) { + int data[100]; + for (int i = 0; i < 10; ++i) data[i] = 10; + for (int i = 10; i < 90; ++i) data[i] = 2; + for (int i = 90; i < 100; ++i) data[i] = 10; + EXPECT_TRUE(finder_->GapInXProjection(data, 100)); +} + +TEST_F(TableFinderTest, HasLeaderAdjacentOverlapping) { + InsertLeaderPartition(90, 0, 150, 5); + MakePartition(0, 0, 100, 10); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 25, 100, 40); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(145, 0, 200, 20); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(40, 0, 50, 4); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); +} + +TEST_F(TableFinderTest, HasLeaderAdjacentNoOverlap) { + InsertLeaderPartition(90, 10, 150, 15); + MakePartition(0, 10, 85, 20); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 25, 100, 40); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 0, 100, 10); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + // TODO(nbeato): is this a useful metric? case fails + // MakePartition(160, 0, 200, 15); // leader is primarily above it + // EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); +} + +TEST_F(TableFinderTest, HasLeaderAdjacentPreservesColumns) { + InsertLeaderPartition(90, 0, 150, 5, 1, 2); + MakePartition(0, 0, 85, 10, 0, 0); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 0, 100, 10, 0, 1); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 0, 200, 10, 0, 5); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(155, 0, 200, 10, 5, 5); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); +} + +// TODO(nbeato): Only testing a splitting case. Add more... +// Also test non-split cases. +TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicPass) { + finder_->set_global_median_blob_width(3); + finder_->set_global_median_xheight(10); + + TBOX part_box(10, 5, 100, 15); + ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); + all->set_type(PT_FLOWING_TEXT); + all->set_blob_type(BRT_TEXT); + all->set_flow(BTFT_CHAIN); + all->set_left_margin(10); + all->set_right_margin(100); + TBOX blob_box = part_box; + for (int i = 10; i <= 20; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + for (int i = 35; i <= 55; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + for (int i = 80; i <= 95; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + // TODO(nbeato): Ray's newer code... 
+ // all->ClaimBoxes(); + all->ComputeLimits(); // This is to make sure median iinfo is set. + InsertTextPartition(all); // This is to delete blobs + ColPartition* fragment_me = all->CopyButDontOwnBlobs(); + + finder_->SplitAndInsertFragmentedTextPartition(fragment_me); + finder_->ExpectPartition(TBOX(11, 5, 24, 15)); + finder_->ExpectPartition(TBOX(36, 5, 59, 15)); + finder_->ExpectPartition(TBOX(81, 5, 99, 15)); + finder_->ExpectPartitionCount(3); +} + +TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicFail) { + finder_->set_global_median_blob_width(3); + finder_->set_global_median_xheight(10); + + TBOX part_box(10, 5, 100, 15); + ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); + all->set_type(PT_FLOWING_TEXT); + all->set_blob_type(BRT_TEXT); + all->set_flow(BTFT_CHAIN); + all->set_left_margin(10); + all->set_right_margin(100); + TBOX blob_box = part_box; + for (int i = 10; i <= 95; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + // TODO(nbeato): Ray's newer code... + // all->ClaimBoxes(); + all->ComputeLimits(); // This is to make sure median iinfo is set. + InsertTextPartition(all); // This is to delete blobs + ColPartition* fragment_me = all->CopyButDontOwnBlobs(); + + finder_->SplitAndInsertFragmentedTextPartition(fragment_me); + finder_->ExpectPartition(TBOX(11, 5, 99, 15)); + finder_->ExpectPartitionCount(1); +} + +} // namespace diff --git a/tesseract/unittest/tablerecog_test.cc b/tesseract/unittest/tablerecog_test.cc new file mode 100644 index 00000000..3dfb32c5 --- /dev/null +++ b/tesseract/unittest/tablerecog_test.cc @@ -0,0 +1,316 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
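+
+// The fixtures below wire a TableRecognizer (and a StructuredTable) to a text
+// ColPartitionGrid and a line ColPartitionGrid; a sketch of that setup,
+// mirroring the SetUp() methods further down rather than prescribing the API:
+//
+//   TableRecognizer recognizer;
+//   recognizer.Init();
+//   recognizer.set_text_grid(&text_grid);   // partitions built from text blobs
+//   recognizer.set_line_grid(&line_grid);   // partitions built from ruled lines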
+ +#include <memory> + +#include "colpartition.h" +#include "colpartitiongrid.h" +#include "tablerecog.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TestableTableRecognizer : public tesseract::TableRecognizer { + public: + using TableRecognizer::FindLinesBoundingBox; + using TableRecognizer::HasSignificantLines; + using TableRecognizer::RecognizeLinedTable; + using TableRecognizer::RecognizeTable; + using TableRecognizer::RecognizeWhitespacedTable; +}; + +class TestableStructuredTable : public tesseract::StructuredTable { + public: + using StructuredTable::CountHorizontalIntersections; + using StructuredTable::CountVerticalIntersections; + using StructuredTable::FindLinedStructure; + using StructuredTable::FindWhitespacedColumns; + using StructuredTable::FindWhitespacedStructure; + using StructuredTable::VerifyLinedTableCells; + + void InjectCellY(int y) { + cell_y_.push_back(y); + cell_y_.sort(); + } + void InjectCellX(int x) { + cell_x_.push_back(x); + cell_x_.sort(); + } + + void ExpectCellX(int x_min, int second, int add, int almost_done, int x_max) { + ASSERT_EQ(0, (almost_done - second) % add); + EXPECT_EQ(3 + (almost_done - second) / add, cell_x_.size()); + EXPECT_EQ(x_min, cell_x_.get(0)); + EXPECT_EQ(x_max, cell_x_.get(cell_x_.size() - 1)); + for (int i = 1; i < cell_x_.size() - 1; ++i) { + EXPECT_EQ(second + add * (i - 1), cell_x_.get(i)); + } + } + + void ExpectSortedX() { + EXPECT_GT(cell_x_.size(), 0); + for (int i = 1; i < cell_x_.size(); ++i) { + EXPECT_LT(cell_x_.get(i - 1), cell_x_.get(i)); + } + } +}; + +class SharedTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + ICOORD bleft(0, 0); + ICOORD tright(1000, 1000); + text_grid_.reset(new ColPartitionGrid(5, bleft, tright)); + line_grid_.reset(new ColPartitionGrid(5, bleft, tright)); + } + + void TearDown() { + tesseract::ColPartition_IT memory(&allocated_parts_); + for (memory.mark_cycle_pt(); !memory.cycled_list(); memory.forward()) { + memory.data()->DeleteBoxes(); + } + } + + void InsertPartitions() { + for (int row = 0; row < 800; row += 20) + for (int col = 0; col < 500; col += 25) + InsertPartition(col + 1, row + 1, col + 24, row + 19); + } + + void InsertPartition(int left, int bottom, int right, int top) { + TBOX box(left, bottom, right, top); + ColPartition* part = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part->set_median_width(3); + part->set_median_height(3); + text_grid_->InsertBBox(true, true, part); + + tesseract::ColPartition_IT add_it(&allocated_parts_); + add_it.add_after_stay_put(part); + } + + void InsertLines() { + line_box_.set_to_given_coords( + 100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(), + 450 + line_grid_->gridsize(), 50 + line_grid_->gridsize()); + for (int i = 10; i <= 50; i += 10) InsertHorizontalLine(100, 450, i); + for (int i = 100; i <= 450; i += 50) InsertVerticalLine(i, 10, 50); + + for (int i = 100; i <= 200; i += 20) InsertHorizontalLine(0, 100, i); + } + + void InsertHorizontalLine(int left, int right, int y) { + TBOX box(left, y - line_grid_->gridsize(), right, + y + line_grid_->gridsize()); + ColPartition* part = + ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE); + line_grid_->InsertBBox(true, true, part); + + tesseract::ColPartition_IT add_it(&allocated_parts_); + add_it.add_after_stay_put(part); + } + void InsertVerticalLine(int x, int bottom, int top) { + TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(), + top); + 
ColPartition* part = + ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE); + line_grid_->InsertBBox(true, true, part); + + tesseract::ColPartition_IT add_it(&allocated_parts_); + add_it.add_after_stay_put(part); + } + + void InsertCellsInLines() { + for (int y = 10; y <= 50; y += 10) + for (int x = 100; x <= 450; x += 50) + InsertPartition(x + 1, y + 1, x + 49, y + 9); + } + + TBOX line_box_; + std::unique_ptr<ColPartitionGrid> text_grid_; + std::unique_ptr<ColPartitionGrid> line_grid_; + ColPartition_LIST allocated_parts_; +}; + +class TableRecognizerTest : public SharedTest { + protected: + void SetUp() { + SharedTest::SetUp(); + recognizer_.reset(new TestableTableRecognizer()); + recognizer_->Init(); + recognizer_->set_text_grid(text_grid_.get()); + recognizer_->set_line_grid(line_grid_.get()); + } + + std::unique_ptr<TestableTableRecognizer> recognizer_; +}; + +class StructuredTableTest : public SharedTest { + protected: + void SetUp() { + SharedTest::SetUp(); + table_.reset(new TestableStructuredTable()); + table_->Init(); + table_->set_text_grid(text_grid_.get()); + table_->set_line_grid(line_grid_.get()); + } + + std::unique_ptr<TestableStructuredTable> table_; +}; + +TEST_F(TableRecognizerTest, HasSignificantLinesBasicPass) { + InsertLines(); + TBOX smaller_guess(120, 15, 370, 45); + TBOX larger_guess(90, 5, 490, 70); + EXPECT_TRUE(recognizer_->HasSignificantLines(line_box_)); + EXPECT_TRUE(recognizer_->HasSignificantLines(larger_guess)); + EXPECT_TRUE(recognizer_->HasSignificantLines(smaller_guess)); +} + +TEST_F(TableRecognizerTest, HasSignificantLinesBasicFail) { + InsertLines(); + TBOX box(370, 35, 500, 45); + EXPECT_FALSE(recognizer_->HasSignificantLines(box)); +} + +TEST_F(TableRecognizerTest, HasSignificantLinesHorizontalOnlyFails) { + InsertLines(); + TBOX box(0, 100, 200, 200); + EXPECT_FALSE(recognizer_->HasSignificantLines(box)); +} + +TEST_F(TableRecognizerTest, FindLinesBoundingBoxBasic) { + InsertLines(); + TBOX box(0, 0, 200, 50); + bool result = recognizer_->FindLinesBoundingBox(&box); + EXPECT_TRUE(result); + EXPECT_EQ(line_box_.left(), box.left()); + EXPECT_EQ(line_box_.right(), box.right()); + EXPECT_EQ(line_box_.bottom(), box.bottom()); + EXPECT_EQ(line_box_.top(), box.top()); +} + +TEST_F(TableRecognizerTest, RecognizeLinedTableBasic) { + InsertLines(); + TBOX guess(120, 15, 370, 45); + tesseract::StructuredTable table; + table.set_text_grid(text_grid_.get()); + table.set_line_grid(line_grid_.get()); + + EXPECT_TRUE(recognizer_->RecognizeLinedTable(guess, &table)); + EXPECT_EQ(line_box_.bottom(), table.bounding_box().bottom()); + EXPECT_EQ(line_box_.top(), table.bounding_box().top()); + EXPECT_EQ(line_box_.left(), table.bounding_box().left()); + EXPECT_EQ(line_box_.right(), table.bounding_box().right()); + EXPECT_EQ(line_box_.area(), table.bounding_box().area()); + EXPECT_EQ(7, table.column_count()); + EXPECT_EQ(4, table.row_count()); + EXPECT_EQ(28, table.cell_count()); + EXPECT_TRUE(table.is_lined()); +} + +TEST_F(TableRecognizerTest, RecognizeWhitespacedTableBasic) { + InsertPartitions(); + TBOX guess(0, 0, 500, 800); + + tesseract::StructuredTable table; + table.set_text_grid(text_grid_.get()); + table.set_line_grid(line_grid_.get()); + EXPECT_TRUE(recognizer_->RecognizeWhitespacedTable(guess, &table)); + EXPECT_EQ(1, table.bounding_box().bottom()); + EXPECT_EQ(799, table.bounding_box().top()); + EXPECT_EQ(1, table.bounding_box().left()); + EXPECT_EQ(499, table.bounding_box().right()); + EXPECT_EQ(798 * 498, table.bounding_box().area()); + 
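+  // Arithmetic behind the expectations below: InsertPartitions() places a
+  // partition every 25 px in x and every 20 px in y over a 500 x 800 region,
+  // so the table should come back with 500/25 = 20 columns, 800/20 = 40 rows
+  // and 20 * 40 = 800 cells.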
EXPECT_EQ(500 / 25, table.column_count()); + EXPECT_EQ(800 / 20, table.row_count()); + EXPECT_EQ(500 * 800 / 20 / 25, table.cell_count()); + EXPECT_FALSE(table.is_lined()); +} + +TEST_F(StructuredTableTest, CountVerticalIntersectionsAll) { + table_->set_bounding_box(TBOX(0, 0, 1000, 1000)); + InsertPartition(0, 0, 100, 10); + InsertPartition(1, 12, 43, 21); + EXPECT_EQ(2, table_->CountVerticalIntersections(4)); + EXPECT_EQ(2, table_->CountVerticalIntersections(20)); + EXPECT_EQ(2, table_->CountVerticalIntersections(40)); + EXPECT_EQ(1, table_->CountVerticalIntersections(50)); + EXPECT_EQ(1, table_->CountVerticalIntersections(60)); + EXPECT_EQ(1, table_->CountVerticalIntersections(80)); + EXPECT_EQ(1, table_->CountVerticalIntersections(95)); + EXPECT_EQ(0, table_->CountVerticalIntersections(104)); + EXPECT_EQ(0, table_->CountVerticalIntersections(150)); +} + +TEST_F(StructuredTableTest, CountHorizontalIntersectionsAll) { + table_->set_bounding_box(TBOX(0, 0, 1000, 1000)); + InsertPartition(0, 3, 100, 10); + InsertPartition(110, 5, 200, 16); + + EXPECT_EQ(0, table_->CountHorizontalIntersections(0)); + EXPECT_EQ(1, table_->CountHorizontalIntersections(4)); + EXPECT_EQ(2, table_->CountHorizontalIntersections(8)); + EXPECT_EQ(1, table_->CountHorizontalIntersections(12)); + EXPECT_EQ(0, table_->CountHorizontalIntersections(20)); +} + +TEST_F(StructuredTableTest, VerifyLinedTableBasicPass) { + for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + InsertLines(); + InsertCellsInLines(); + table_->set_bounding_box(line_box_); + EXPECT_TRUE(table_->VerifyLinedTableCells()); +} + +TEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) { + for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + InsertLines(); + InsertCellsInLines(); + InsertPartition(101, 11, 299, 19); + table_->set_bounding_box(line_box_); + EXPECT_FALSE(table_->VerifyLinedTableCells()); +} + +TEST_F(StructuredTableTest, VerifyLinedTableVerticalFail) { + for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + InsertLines(); + InsertCellsInLines(); + InsertPartition(151, 21, 199, 39); + table_->set_bounding_box(line_box_); + EXPECT_FALSE(table_->VerifyLinedTableCells()); +} + +TEST_F(StructuredTableTest, FindWhitespacedColumnsBasic) { + InsertPartitions(); + TBOX guess(0, 0, 500, 800); + table_->set_bounding_box(guess); + table_->FindWhitespacedColumns(); + table_->ExpectCellX(1, 25, 25, 475, 499); +} + +TEST_F(StructuredTableTest, FindWhitespacedColumnsSorted) { + InsertPartitions(); + TBOX guess(0, 0, 500, 800); + table_->set_bounding_box(guess); + table_->FindWhitespacedColumns(); + table_->ExpectSortedX(); +} + +// TODO(nbeato): check failure cases +// TODO(nbeato): check Recognize processes correctly on trivial real examples. + +} // namespace diff --git a/tesseract/unittest/tabvector_test.cc b/tesseract/unittest/tabvector_test.cc new file mode 100644 index 00000000..dab0ace8 --- /dev/null +++ b/tesseract/unittest/tabvector_test.cc @@ -0,0 +1,130 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "tabvector.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TabVectorTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + vector_.reset(); + } + + void TearDown() {} + + void MakeSimpleTabVector(int x1, int y1, int x2, int y2) { + vector_.reset(new TabVector()); + vector_->set_startpt(ICOORD(x1, y1)); + vector_->set_endpt(ICOORD(x2, y2)); + } + + std::unique_ptr<TabVector> vector_; +}; + +TEST_F(TabVectorTest, SetStartEndPointsMatch) { + vector_.reset(new TabVector()); + ICOORD start(51, 65); + ICOORD end(7568, 234); + // Test coordinates individually to avoid adding an ostream operator + // explicitly to the ICOORD class (Droid doesn't support it). + vector_->set_startpt(start); + EXPECT_EQ(start.x(), vector_->startpt().x()); + EXPECT_EQ(start.y(), vector_->startpt().y()); + vector_->set_endpt(end); + EXPECT_EQ(end.x(), vector_->endpt().x()); + EXPECT_EQ(end.y(), vector_->endpt().y()); +} + +TEST_F(TabVectorTest, XAtY45DegreeSlopeInRangeExact) { + MakeSimpleTabVector(0, 0, 100, 100); + for (int y = 0; y <= 100; ++y) { + int x = vector_->XAtY(y); + EXPECT_EQ(y, x); + } +} + +TEST_F(TabVectorTest, XAtYVerticalInRangeExact) { + const int x = 120; // Arbitrary choice + MakeSimpleTabVector(x, 0, x, 100); + for (int y = 0; y <= 100; ++y) { + int result_x = vector_->XAtY(y); + EXPECT_EQ(x, result_x); + } +} + +TEST_F(TabVectorTest, XAtYHorizontal) { + const int y = 76; // arbitrary + MakeSimpleTabVector(0, y, 100, y); + EXPECT_EQ(0, vector_->XAtY(y)); + // TODO(nbeato): What's the failure condition? + // Undefined! Should not pass! Allow until resolved answer. + EXPECT_EQ(0, vector_->XAtY(10)); +} + +TEST_F(TabVectorTest, XAtYRoundingSimple) { + MakeSimpleTabVector(0, 0, 2, 10000); + int x = vector_->XAtY(1); + EXPECT_EQ(0, x); + x = vector_->XAtY(4999); + EXPECT_EQ(0, x); + x = vector_->XAtY(5001); + EXPECT_EQ(1, x); + x = vector_->XAtY(9999); + EXPECT_EQ(1, x); +} + +TEST_F(TabVectorTest, XAtYLargeNumbers) { + // Assume a document is 800 DPI, + // the width of a page is 10 inches across (8000 pixels), and + // the height of the page is 15 inches (12000 pixels). 
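+  // Worked interpolation for the values used below: on the segment
+  // (7804, 504) -> (7968, 11768),
+  // XAtY(6136) = 7804 + (7968 - 7804) * (6136 - 504) / (11768 - 504)
+  //            = 7804 + 164 * 5632 / 11264 = 7804 + 82 = 7886.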
+ MakeSimpleTabVector(7804, 504, 7968, 11768); // Arbitrary for vertical line + int x = vector_->XAtY(6136); // test mid point + EXPECT_EQ(7886, x); +} + +TEST_F(TabVectorTest, XAtYHorizontalInRangeExact) { + const int y = 120; // Arbitrary choice + MakeSimpleTabVector(50, y, 150, y); + + int x = vector_->XAtY(y); + EXPECT_EQ(50, x); +} + +TEST_F(TabVectorTest, VOverlapInRangeSimple) { + MakeSimpleTabVector(0, 0, 100, 100); + int overlap = vector_->VOverlap(90, 10); + EXPECT_EQ(80, overlap); + overlap = vector_->VOverlap(100, 0); + EXPECT_EQ(100, overlap); +} + +TEST_F(TabVectorTest, VOverlapOutOfRange) { + MakeSimpleTabVector(0, 10, 100, 90); + int overlap = vector_->VOverlap(100, 0); + EXPECT_EQ(80, overlap); +} + +TEST_F(TabVectorTest, XYFlip) { + MakeSimpleTabVector(1, 2, 3, 4); + vector_->XYFlip(); + EXPECT_EQ(2, vector_->startpt().x()); + EXPECT_EQ(1, vector_->startpt().y()); + EXPECT_EQ(4, vector_->endpt().x()); + EXPECT_EQ(3, vector_->endpt().y()); +} + +} // namespace diff --git a/tesseract/unittest/tatweel_test.cc b/tesseract/unittest/tatweel_test.cc new file mode 100644 index 00000000..4bd8b337 --- /dev/null +++ b/tesseract/unittest/tatweel_test.cc @@ -0,0 +1,114 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(_WIN32) +#include <io.h> // for _access +#else +#include <unistd.h> // for access +#endif + +#include "include_gunit.h" +#include "dawg.h" +#include "trie.h" +#include "unicharset.h" +#ifdef INCLUDE_TENSORFLOW +#include "util/utf8/unicodetext.h" // for UnicodeText +#endif + +namespace tesseract { + +// Replacement for std::filesystem::exists (C++-17) +static bool file_exists(const char* filename) { +#if defined(_WIN32) + return _access(filename, 0) == 0; +#else + return access(filename, 0) == 0; +#endif +} + +class TatweelTest : public ::testing::Test { + protected: + void SetUp() override { + static std::locale system_locale(""); + std::locale::global(system_locale); + } + + TatweelTest() { +#ifdef INCLUDE_TENSORFLOW + std::string filename = TestDataNameToPath("ara.wordlist"); + if (file_exists(filename.c_str())) { + std::string wordlist(u8"\u0640"); + CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults())); + // Put all the unicodes in the unicharset_. + UnicodeText text; + text.PointToUTF8(wordlist.data(), wordlist.size()); + int num_tatweel = 0; + for (auto it = text.begin(); it != text.end(); ++it) { + std::string utf8 = it.get_utf8_string(); + if (utf8.find(u8"\u0640") != std::string::npos) ++num_tatweel; + unicharset_.unichar_insert(utf8.c_str()); + } + LOG(INFO) << "Num tatweels in source data=" << num_tatweel; + EXPECT_GT(num_tatweel, 0); + } +#endif + } + + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTDATA_DIR, name); + } + UNICHARSET unicharset_; +}; + +TEST_F(TatweelTest, UnicharsetIgnoresTatweel) { + // This test verifies that the unicharset ignores the Tatweel character. 
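+  // U+0640 is the Arabic tatweel (kashida) elongation mark. Insertion in the
+  // fixture is expected to have stripped it, so no unichar in the set should
+  // still contain that code point.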
+ for (int i = 0; i < unicharset_.size(); ++i) { + const char* utf8 = unicharset_.id_to_unichar(i); + EXPECT_EQ(strstr(utf8, u8"\u0640"), nullptr); + } +} + +TEST_F(TatweelTest, DictIgnoresTatweel) { + // This test verifies that the dictionary ignores the Tatweel character. + tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, + unicharset_.size(), 0); + std::string filename = TestDataNameToPath("ara.wordlist"); + if (!file_exists(filename.c_str())) { + LOG(INFO) << "Skip test because of missing " << filename; + GTEST_SKIP(); + } else { + EXPECT_TRUE(trie.read_and_add_word_list( + filename.c_str(), unicharset_, + tesseract::Trie::RRP_REVERSE_IF_HAS_RTL)); + EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false)); + } +} + +TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) { + // This test verifies that a load of an existing unicharset keeps any + // existing tatweel for backwards compatibility. + std::string filename = TestDataNameToPath("ara.unicharset"); + if (!file_exists(filename.c_str())) { + LOG(INFO) << "Skip test because of missing " << filename; + GTEST_SKIP(); + } else { + EXPECT_TRUE(unicharset_.load_from_file(filename.c_str())); + int num_tatweel = 0; + for (int i = 0; i < unicharset_.size(); ++i) { + const char* utf8 = unicharset_.id_to_unichar(i); + if (strstr(utf8, u8"\u0640") != nullptr) ++num_tatweel; + } + LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel; + EXPECT_EQ(num_tatweel, 4); + } +} + +} // namespace diff --git a/tesseract/unittest/tesseract_leaksanitizer.supp b/tesseract/unittest/tesseract_leaksanitizer.supp new file mode 100644 index 00000000..6cc39999 --- /dev/null +++ b/tesseract/unittest/tesseract_leaksanitizer.supp @@ -0,0 +1,12 @@ +# Suppress memory leaks. +# Use with LSAN_OPTIONS=suppressions=tesseract_lsan.supp +leak:FcLangSetCreate +leak:FcPatternObjectAddWithBinding +leak:FcPatternObjectInsertElt +leak:FcValueListAppend +leak:FcValueListDuplicate +leak:FcValueListPrepend +leak:IA__FcLangSetCreate +leak:IA__FcValueSave +leak:libfontconfig.so +leak:libfreetype.so diff --git a/tesseract/unittest/textlineprojection_test.cc b/tesseract/unittest/textlineprojection_test.cc new file mode 100644 index 00000000..f8423615 --- /dev/null +++ b/tesseract/unittest/textlineprojection_test.cc @@ -0,0 +1,262 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <allheaders.h> +#include <string> // for std::string + +#include "absl/strings/str_format.h" // for absl::StrFormat +#include "include_gunit.h" + +#include <tesseract/baseapi.h> +#include "colfind.h" +#include "log.h" // for LOG +#include "mutableiterator.h" +#include <tesseract/osdetect.h> +#include "pageres.h" +#include "tesseractclass.h" +#include "textlineprojection.h" + +namespace tesseract { + +// Minimum score for a STRONG_CHAIN textline. +// NOTE: Keep in sync with textlineprojection.cc. +const int kMinStrongTextValue = 6; + +// The fixture for testing Tesseract. 
+class TextlineProjectionTest : public testing::Test { + protected: + std::string OutputNameToPath(const std::string& name) { + file::MakeTmpdir(); + return file::JoinPath(FLAGS_test_tmpdir, name); + } + + TextlineProjectionTest() { + src_pix_ = nullptr; + bin_pix_ = nullptr; + tesseract_ = nullptr; + finder_ = nullptr; + denorm_ = nullptr; + projection_ = nullptr; + } + virtual ~TextlineProjectionTest() { + pixDestroy(&src_pix_); + pixDestroy(&bin_pix_); + delete finder_; + delete tesseract_; + } + + void SetImage(const char* filename) { + pixDestroy(&src_pix_); + src_pix_ = pixRead(file::JoinPath(TESTING_DIR, filename).c_str()); + api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY); + api_.SetPageSegMode(tesseract::PSM_AUTO_OSD); + api_.SetImage(src_pix_); + } + + // Ugly hacked-together function sets up projection_ and denorm_ by setting + // up for auto pagelayout, setting up a ColumnFinder, running it, and + // using accessors to get at the internal denorm and projection. + // If the coordinates have been rotated, the denorm should match + // correctly and transform coordinates back to the projection. + // We throw away all the blocks, blobs etc, and test the projection with + // the resultiterator from a separate BaseAPI run. + void SetupProjection() { + tesseract::TessdataManager mgr; + Tesseract* osd_tess = new Tesseract; + OSResults osr; + EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, nullptr, "osd", + tesseract::OEM_TESSERACT_ONLY, nullptr, 0, + nullptr, nullptr, false, &mgr), + 0); + tesseract_ = new Tesseract; + EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, nullptr, "eng", + tesseract::OEM_TESSERACT_ONLY, nullptr, 0, + nullptr, nullptr, false, &mgr), + 0); + bin_pix_ = api_.GetThresholdedImage(); + *tesseract_->mutable_pix_binary() = pixClone(bin_pix_); + osd_tess->set_source_resolution(api_.tesseract()->source_resolution()); + tesseract_->set_source_resolution(api_.tesseract()->source_resolution()); + int width = pixGetWidth(bin_pix_); + int height = pixGetHeight(bin_pix_); + // First make a single block covering the whole image. + BLOCK* block = new BLOCK("", true, 0, 0, 0, 0, width, height); + block->set_right_to_left(false); + BLOCK_LIST src_blocks; + BLOCK_IT block_it(&src_blocks); + block_it.add_to_end(block); + Pix* photomask_pix = nullptr; + // The blocks made by the ColumnFinder. Moved to blocks before return. + BLOCK_LIST found_blocks; + TO_BLOCK_LIST temp_blocks; + finder_ = tesseract_->SetupPageSegAndDetectOrientation( + tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess, &osr, &temp_blocks, + &photomask_pix, nullptr); + TO_BLOCK_IT to_block_it(&temp_blocks); + TO_BLOCK* to_block = to_block_it.data(); + denorm_ = finder_->denorm(); + TO_BLOCK_LIST to_blocks; + BLOBNBOX_LIST diacritic_blobs; + EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block, + photomask_pix, nullptr, nullptr, nullptr, + &found_blocks, &diacritic_blobs, &to_blocks), + 0); + projection_ = finder_->projection(); + pixDestroy(&photomask_pix); + delete osd_tess; + } + + // Helper evaluates the given box, expects the result to be greater_than + // or !greater_than the target_value and provides diagnostics if not. + void EvaluateBox(const TBOX& box, bool greater_or_equal, int target_value, + const char* text, const char* message) { + int value = projection_->EvaluateBox(box, denorm_, false); + if (greater_or_equal != (value > target_value)) { + LOG(INFO) << absl::StrFormat( + "EvaluateBox too %s:%d vs %d for %s word '%s' at:", + greater_or_equal ? 
"low" : "high", value, target_value, message, + text); + box.print(); + value = projection_->EvaluateBox(box, denorm_, true); + } else { + LOG(INFO) << absl::StrFormat("EvaluateBox OK(%d) for %s word '%s'", + value, message, text); + } + if (greater_or_equal) { + EXPECT_GE(value, target_value); + } else { + EXPECT_LT(value, target_value); + } + } + + // Helper evaluates the DistanceOfBoxFromBox function by expecting that + // box should be nearer to true_box than false_box. + void EvaluateDistance(const TBOX& box, const TBOX& true_box, + const TBOX& false_box, const char* text, + const char* message) { + int true_dist = + projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false); + int false_dist = + projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false); + if (false_dist <= true_dist) { + LOG(INFO) << absl::StrFormat( + "Distance wrong:%d vs %d for %s word '%s' at:", + false_dist, true_dist, message, text); + true_box.print(); + projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, true); + projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, true); + } else { + LOG(INFO) << absl::StrFormat("Distance OK(%d vs %d) for %s word '%s'", + false_dist, true_dist, message, text); + } + } + + // Tests the projection on the word boxes of the given image. + // line_height is the cap + descender size of the text. + void VerifyBoxes(const char* imagefile, int line_height) { + SetImage(imagefile); + api_.Recognize(nullptr); + SetupProjection(); + MutableIterator* it = api_.GetMutableIterator(); + do { + char* text = it->GetUTF8Text(tesseract::RIL_WORD); + const PAGE_RES_IT* pr_it = it->PageResIt(); + WERD_RES* word = pr_it->word(); + // The word_box refers to the internal, possibly rotated, coords. + TBOX word_box = word->word->bounding_box(); + bool small_word = word_box.height() * 1.5 < line_height; + bool tall_word = word_box.height() * 1.125 > line_height; + // We pad small and tall words differently because ascenders and + // descenders affect the position and size of the upper/lower boxes. + int padding; + if (small_word) { + padding = word_box.height(); + } else if (tall_word) { + padding = word_box.height() / 3; + } else { + padding = word_box.height() / 2; + } + // Test that the word box gets a good score. + EvaluateBox(word_box, true, kMinStrongTextValue, text, "Real Word"); + + // Now test a displaced box, both above and below the word. + TBOX upper_box(word_box); + upper_box.set_bottom(word_box.top()); + upper_box.set_top(word_box.top() + padding); + EvaluateBox(upper_box, false, kMinStrongTextValue, text, "Upper Word"); + EvaluateBox(upper_box, true, -1, text, "Upper Word not vertical"); + TBOX lower_box = word_box; + lower_box.set_top(word_box.bottom()); + lower_box.set_bottom(word_box.bottom() - padding); + if (tall_word) lower_box.move(ICOORD(0, padding / 2)); + EvaluateBox(lower_box, false, kMinStrongTextValue, text, "Lower Word"); + EvaluateBox(lower_box, true, -1, text, "Lower Word not vertical"); + + // Since some words have no text below and some words have no text above + // check that at least one of the boxes satisfies BoxOutOfTextline. 
+ bool upper_or_lower_out_of_textline = + projection_->BoxOutOfHTextline(upper_box, denorm_, false) || + projection_->BoxOutOfHTextline(lower_box, denorm_, false); + if (!upper_or_lower_out_of_textline) { + projection_->BoxOutOfHTextline(upper_box, denorm_, true); + projection_->BoxOutOfHTextline(lower_box, denorm_, true); + } + EXPECT_TRUE(upper_or_lower_out_of_textline); + + // Now test DistanceOfBoxFromBox by faking a challenger word, and asking + // that each pad box be nearer to its true textline than the + // challenger. Due to the tight spacing of latin text, getting + // the right position and size of these test boxes is quite fiddly. + padding = line_height / 4; + upper_box.set_top(upper_box.bottom() + padding); + TBOX target_box(word_box); + if (!small_word) { + upper_box.move(ICOORD(0, -padding * 3 / 2)); + } + target_box.set_top(upper_box.bottom()); + TBOX upper_challenger(upper_box); + upper_challenger.set_bottom(upper_box.top()); + upper_challenger.set_top(upper_box.top() + word_box.height()); + EvaluateDistance(upper_box, target_box, upper_challenger, text, + "Upper Word"); + if (tall_word) lower_box.move(ICOORD(0, padding / 2)); + lower_box.set_bottom(lower_box.top() - padding); + target_box = word_box; + target_box.set_bottom(lower_box.top()); + TBOX lower_challenger(lower_box); + lower_challenger.set_top(lower_box.bottom()); + lower_challenger.set_bottom(lower_box.bottom() - word_box.height()); + EvaluateDistance(lower_box, target_box, lower_challenger, text, + "Lower Word"); + + delete[] text; + } while (it->Next(tesseract::RIL_WORD)); + delete it; + } + + Pix* src_pix_; + Pix* bin_pix_; + BLOCK_LIST blocks_; + std::string ocr_text_; + tesseract::TessBaseAPI api_; + Tesseract* tesseract_; + ColumnFinder* finder_; + const DENORM* denorm_; + const TextlineProjection* projection_; +}; + +// Tests all word boxes on an unrotated image. +TEST_F(TextlineProjectionTest, Unrotated) { VerifyBoxes("phototest.tif", 31); } + +// Tests character-level applyboxes on italic Times New Roman. +TEST_F(TextlineProjectionTest, Rotated) { VerifyBoxes("phototestrot.tif", 31); } + +} // namespace diff --git a/tesseract/unittest/tfile_test.cc b/tesseract/unittest/tfile_test.cc new file mode 100644 index 00000000..166405ff --- /dev/null +++ b/tesseract/unittest/tfile_test.cc @@ -0,0 +1,179 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "genericvector.h" +#include "serialis.h" + +#include "include_gunit.h" + +namespace tesseract { + +// Tests TFile and std::vector serialization by serializing and +// writing/reading. + +class TfileTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + TfileTest() {} + + // Some data to serialize. + class MathData { + public: + MathData() : num_squares_(0), num_triangles_(0) {} + void Setup() { + // Setup some data. 
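+      // squares_ holds s * s for s in [0, 42); triangles_ holds the
+      // triangular numbers t * (t + 1) / 2 for t in [0, 52).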
+ for (int s = 0; s < 42; ++s) squares_.push_back(s * s); + num_squares_ = squares_.size(); + for (int t = 0; t < 52; ++t) triangles_.push_back(t * (t + 1) / 2); + num_triangles_ = triangles_.size(); + } + void ExpectEq(const MathData& other) { + // Check the data. + EXPECT_EQ(num_squares_, other.num_squares_); + for (int s = 0; s < squares_.size(); ++s) + EXPECT_EQ(squares_[s], other.squares_[s]); + EXPECT_EQ(num_triangles_, other.num_triangles_); + for (int s = 0; s < triangles_.size(); ++s) + EXPECT_EQ(triangles_[s], other.triangles_[s]); + } + bool Serialize(TFile* fp) { + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + if (!squares_.Serialize(fp)) return false; + if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (!triangles_.Serialize(fp)) return false; + return true; + } + bool DeSerialize(TFile* fp) { + if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) + return false; + if (!squares_.DeSerialize(fp)) return false; + if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (!triangles_.DeSerialize(fp)) return false; + return true; + } + bool SerializeBigEndian(TFile* fp) { + ReverseN(&num_squares_, sizeof(num_squares_)); + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + // Write an additional reversed size before the vector, which will get + // used as its size on reading. + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + for (int i = 0; i < squares_.size(); ++i) + ReverseN(&squares_[i], sizeof(squares_[i])); + if (!squares_.Serialize(fp)) return false; + ReverseN(&num_triangles_, sizeof(num_triangles_)); + if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + for (int i = 0; i < triangles_.size(); ++i) + ReverseN(&triangles_[i], sizeof(triangles_[i])); + return triangles_.Serialize(fp); + } + bool DeSerializeBigEndian(TFile* fp) { + if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) + return false; + if (!squares_.DeSerialize(fp)) return false; + // The first element is the size that was written, so we will delete it + // and read the last element separately. + int last_element; + if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1) + return false; + squares_.remove(0); + squares_.push_back(last_element); + if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (!triangles_.DeSerialize(fp)) return false; + if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1) + return false; + triangles_.remove(0); + triangles_.push_back(last_element); + return true; + } + + private: + GenericVector<int> squares_; + int num_squares_; + GenericVector<int> triangles_; + int num_triangles_; + }; +}; + +TEST_F(TfileTest, Serialize) { + // This test verifies that Tfile can serialize a class. + MathData m1; + m1.Setup(); + std::vector<char> data; + TFile fpw; + fpw.OpenWrite(&data); + EXPECT_TRUE(m1.Serialize(&fpw)); + TFile fpr; + EXPECT_TRUE(fpr.Open(&data[0], data.size())); + MathData m2; + EXPECT_TRUE(m2.DeSerialize(&fpr)); + m1.ExpectEq(m2); + MathData m3; + EXPECT_FALSE(m3.DeSerialize(&fpr)); + fpr.Rewind(); + EXPECT_TRUE(m3.DeSerialize(&fpr)); + m1.ExpectEq(m3); +} + +TEST_F(TfileTest, FGets) { + // This test verifies that Tfile can interleave FGets with binary data. 
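+  // Layout written below: one serialized MathData, then a plain text line,
+  // then the same MathData again. FGets has to pick the text line out from
+  // between the two binary blocks.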
+ MathData m1; + std::string line_str = "This is a textline with a newline\n"; + m1.Setup(); + std::vector<char> data; + TFile fpw; + fpw.OpenWrite(&data); + EXPECT_TRUE(m1.Serialize(&fpw)); + EXPECT_EQ(1, fpw.FWrite(line_str.data(), line_str.size(), 1)); + EXPECT_TRUE(m1.Serialize(&fpw)); + // Now get back the 2 copies of m1 with the line in between. + TFile fpr; + EXPECT_TRUE(fpr.Open(&data[0], data.size())); + MathData m2; + EXPECT_TRUE(m2.DeSerialize(&fpr)); + m1.ExpectEq(m2); + const int kBufsize = 1024; + char buffer[kBufsize + 1]; + EXPECT_EQ(buffer, fpr.FGets(buffer, kBufsize)); + EXPECT_STREQ(line_str.c_str(), buffer); + MathData m3; + EXPECT_TRUE(m3.DeSerialize(&fpr)); + m1.ExpectEq(m3); +} + +TEST_F(TfileTest, BigEndian) { + // This test verifies that Tfile can auto-reverse big-endian data. + MathData m1; + m1.Setup(); + std::vector<char> data; + TFile fpw; + fpw.OpenWrite(&data); + EXPECT_TRUE(m1.SerializeBigEndian(&fpw)); + TFile fpr; + EXPECT_TRUE(fpr.Open(&data[0], data.size())); + fpr.set_swap(true); + MathData m2; + EXPECT_TRUE(m2.DeSerializeBigEndian(&fpr)); + // That serialize was destructive, so test against a fresh MathData. + MathData m3; + m3.Setup(); + m3.ExpectEq(m2); +} + +} // namespace diff --git a/tesseract/unittest/third_party/utf/rune.c b/tesseract/unittest/third_party/utf/rune.c new file mode 100644 index 00000000..3d860570 --- /dev/null +++ b/tesseract/unittest/third_party/utf/rune.c @@ -0,0 +1,357 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#include <stdarg.h> +#include <string.h> +#include "third_party/utf/utf.h" +#include "third_party/utf/utfdef.h" + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + Bad = Runeerror, +}; + +/* + * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 + * This is a slower but "safe" version of the old chartorune + * that works on strings that are not necessarily null-terminated. + * + * If you know for sure that your string is null-terminated, + * chartorune will be a bit faster. + * + * It is guaranteed not to attempt to access "length" + * past the incoming pointer. This is to avoid + * possible access violations. 
If the string appears to be + * well-formed but incomplete (i.e., to get the whole Rune + * we'd need to read past str+length) then we'll set the Rune + * to Bad and return 0. + * + * Note that if we have decoding problems for other + * reasons, we return 1 instead of 0. + */ +int +charntorune(Rune *rune, const char *str, int length) +{ + int c, c1, c2, c3; + long l; + + /* When we're not allowed to read anything */ + if(length <= 0) { + goto badlen; + } + + /* + * one character sequence (7-bit value) + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + // If we can't read more than one character we must stop + if(length <= 1) { + goto badlen; + } + + /* + * two character sequence (11-bit value) + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + // If we can't read more than two characters we must stop + if(length <= 2) { + goto badlen; + } + + /* + * three character sequence (16-bit value) + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + if (length <= 3) + goto badlen; + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } + + // Support for 5-byte or longer UTF-8 would go here, but + // since we don't have that, we'll just fall through to bad. + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +badlen: + *rune = Bad; + return 0; + +} + + +/* + * This is the older "unsafe" version, which works fine on + * null-terminated strings. + */ +int +chartorune(Rune *rune, const char *str) +{ + int c, c1, c2, c3; + long l; + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { + *consumed = charntorune(rune, str, length); + return *rune != Runeerror || *consumed == 3; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. 
*/ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +runenlen(const Rune *r, int nrune) +{ + int nb; + ulong c; /* Rune is signed, so use unsigned for range check. */ + + nb = 0; + while(nrune--) { + c = *r++; + if (c <= Rune1) + nb++; + else if (c <= Rune2) + nb += 2; + else if (c <= Rune3) + nb += 3; + else if (c <= Runemax) + nb += 4; + else + nb += 3; /* Runeerror = 0xFFFD, see runetochar */ + } + return nb; +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(uchar*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} diff --git a/tesseract/unittest/third_party/utf/utf.h b/tesseract/unittest/third_party/utf/utf.h new file mode 100644 index 00000000..06982e58 --- /dev/null +++ b/tesseract/unittest/third_party/utf/utf.h @@ -0,0 +1,246 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#ifndef _UTFH_ +#define _UTFH_ 1 + +#include <stdint.h> + +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * rune routines + */ + +/* + * These routines were written by Rob Pike and Ken Thompson + * and first appeared in Plan 9. + * SEE ALSO + * utf (7) + * tcs (1) +*/ + +// runetochar copies (encodes) one rune, pointed to by r, to at most +// UTFmax bytes starting at s and returns the number of bytes generated. 
+ +int runetochar(char* s, const Rune* r); + + +// chartorune copies (decodes) at most UTFmax bytes starting at s to +// one rune, pointed to by r, and returns the number of bytes consumed. +// If the input is not exactly in UTF format, chartorune will set *r +// to Runeerror and return 1. +// +// Note: There is no special case for a "null-terminated" string. A +// string whose first byte has the value 0 is the UTF8 encoding of the +// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal +// anywhere else in a UTF sequence. + +int chartorune(Rune* r, const char* s); + + +// charntorune is like chartorune, except that it will access at most +// n bytes of s. If the UTF sequence is incomplete within n bytes, +// charntorune will set *r to Runeerror and return 0. If it is complete +// but not in UTF format, it will set *r to Runeerror and return 1. +// +// Added 2004-09-24 by Wei-Hwa Huang + +int charntorune(Rune* r, const char* s, int n); + +// isvalidcharntorune(str, n, r, consumed) +// is a convenience function that calls "*consumed = charntorune(r, str, n)" +// and returns an int (logically boolean) indicating whether the first +// n bytes of str was a valid and complete UTF sequence. + +int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed); + +// runelen returns the number of bytes required to convert r into UTF. + +int runelen(Rune r); + + +// runenlen returns the number of bytes required to convert the n +// runes pointed to by r into UTF. + +int runenlen(const Rune* r, int n); + + +// fullrune returns 1 if the string s of length n is long enough to be +// decoded by chartorune, and 0 otherwise. This does not guarantee +// that the string contains a legal UTF encoding. This routine is used +// by programs that obtain input one byte at a time and need to know +// when a full rune has arrived. + +int fullrune(const char* s, int n); + +// The following routines are analogous to the corresponding string +// routines with "utf" substituted for "str", and "rune" substituted +// for "chr". + +// utflen returns the number of runes that are represented by the UTF +// string s. (cf. strlen) + +int utflen(const char* s); + + +// utfnlen returns the number of complete runes that are represented +// by the first n bytes of the UTF string s. If the last few bytes of +// the string contain an incompletely coded rune, utfnlen will not +// count them; in this way, it differs from utflen, which includes +// every byte of the string. (cf. strnlen) + +int utfnlen(const char* s, long n); + + +// utfrune returns a pointer to the first occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strchr) + +const char* utfrune(const char* s, Rune r); + + +// utfrrune returns a pointer to the last occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strrchr) + +const char* utfrrune(const char* s, Rune r); + + +// utfutf returns a pointer to the first occurrence of the UTF string +// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the +// null string, utfutf returns s1. (cf. strstr) + +const char* utfutf(const char* s1, const char* s2); + + +// utfecpy copies UTF sequences until a null sequence has been copied, +// but writes no sequences beyond es1. 
If any sequences are copied, +// s1 is terminated by a null sequence, and a pointer to that sequence +// is returned. Otherwise, the original s1 is returned. (cf. strecpy) + +char* utfecpy(char *s1, char *es1, const char *s2); + + + +// These functions are rune-string analogues of the corresponding +// functions in strcat (3). +// +// These routines first appeared in Plan 9. +// SEE ALSO +// memmove (3) +// rune (3) +// strcat (2) +// +// BUGS: The outcome of overlapping moves varies among implementations. + +Rune* runestrcat(Rune* s1, const Rune* s2); +Rune* runestrncat(Rune* s1, const Rune* s2, long n); + +const Rune* runestrchr(const Rune* s, Rune c); + +int runestrcmp(const Rune* s1, const Rune* s2); +int runestrncmp(const Rune* s1, const Rune* s2, long n); + +Rune* runestrcpy(Rune* s1, const Rune* s2); +Rune* runestrncpy(Rune* s1, const Rune* s2, long n); +Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); + +Rune* runestrdup(const Rune* s); + +const Rune* runestrrchr(const Rune* s, Rune c); +long runestrlen(const Rune* s); +const Rune* runestrstr(const Rune* s1, const Rune* s2); + + + +// The following routines test types and modify cases for Unicode +// characters. Unicode defines some characters as letters and +// specifies three cases: upper, lower, and title. Mappings among the +// cases are also defined, although they are not exhaustive: some +// upper case letters have no lower case mapping, and so on. Unicode +// also defines several character properties, a subset of which are +// checked by these routines. These routines are based on Unicode +// version 3.0.0. +// +// NOTE: The routines are implemented in C, so the boolean functions +// (e.g., isupperrune) return 0 for false and 1 for true. +// +// +// toupperrune, tolowerrune, and totitlerune are the Unicode case +// mappings. These routines return the character unchanged if it has +// no defined mapping. + +Rune toupperrune(Rune r); +Rune tolowerrune(Rune r); +Rune totitlerune(Rune r); + + +// isupperrune tests for upper case characters, including Unicode +// upper case letters and targets of the toupper mapping. islowerrune +// and istitlerune are defined analogously. + +int isupperrune(Rune r); +int islowerrune(Rune r); +int istitlerune(Rune r); + + +// isalpharune tests for Unicode letters; this includes ideographs in +// addition to alphabetic characters. + +int isalpharune(Rune r); + + +// isdigitrune tests for digits. Non-digit numbers, such as Roman +// numerals, are not included. + +int isdigitrune(Rune r); + + +// isideographicrune tests for ideographic characters and numbers, as +// defined by the Unicode standard. + +int isideographicrune(Rune r); + + +// isspacerune tests for whitespace characters, including "C" locale +// whitespace, Unicode defined whitespace, and the "zero-width +// non-break space" character. + +int isspacerune(Rune r); + + +// (The comments in this file were copied from the manpage files rune.3, +// isalpharune.3, and runestrcat.3. Some formatting changes were also made +// to conform to Google style. 
/JRM 11/11/05) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tesseract/unittest/third_party/utf/utfdef.h b/tesseract/unittest/third_party/utf/utfdef.h new file mode 100644 index 00000000..4b58ae87 --- /dev/null +++ b/tesseract/unittest/third_party/utf/utfdef.h @@ -0,0 +1,14 @@ +#define uchar _utfuchar +#define ushort _utfushort +#define uint _utfuint +#define ulong _utfulong +#define vlong _utfvlong +#define uvlong _utfuvlong + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; + +#define nelem(x) (sizeof(x)/sizeof((x)[0])) +#define nil ((void*)0) diff --git a/tesseract/unittest/unichar_test.cc b/tesseract/unittest/unichar_test.cc new file mode 100644 index 00000000..54394436 --- /dev/null +++ b/tesseract/unittest/unichar_test.cc @@ -0,0 +1,43 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "gmock/gmock.h" // for testing::ElementsAreArray +#include <tesseract/unichar.h> + +namespace tesseract { + +TEST(UnicharTest, Conversion) { + // This test verifies that Unichar::UTF8ToUTF32 and Unichar::UTF32ToUTF8 + // show the required conversion properties. + // Test for round-trip utf8-32-8 for 1, 2, 3 and 4 byte codes. + const char* kUTF8Src = "a\u05d0\u0ca4\U0002a714"; + const std::vector<char32> kUTF32Src = {'a', 0x5d0, 0xca4, 0x2a714}; + // Check for round-trip conversion. + std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src); + EXPECT_THAT(utf32, testing::ElementsAreArray(kUTF32Src)); + std::string utf8 = UNICHAR::UTF32ToUTF8(utf32); + EXPECT_STREQ(kUTF8Src, utf8.c_str()); +} + +TEST(UnicharTest, InvalidText) { + // This test verifies that Unichar correctly deals with invalid text. + const char* kInvalidUTF8 = "a b\200d string"; + const std::vector<char32> kInvalidUTF32 = {'a', ' ', 0x200000, 'x'}; + // Invalid utf8 produces an empty vector. + std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8); + EXPECT_TRUE(utf32.empty()); + // Invalid utf32 produces an empty string. + std::string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32); + EXPECT_TRUE(utf8.empty()); +} + +} // namespace diff --git a/tesseract/unittest/unicharcompress_test.cc b/tesseract/unittest/unicharcompress_test.cc new file mode 100644 index 00000000..1777930e --- /dev/null +++ b/tesseract/unittest/unicharcompress_test.cc @@ -0,0 +1,257 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
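The length-bounded decoder in third_party/utf above is designed for buffers that are not null-terminated and may end mid-sequence. A short hedged sketch of how a caller might walk such a buffer with charntorune (the DumpRunes helper and its output format are illustrative only; the API names come from utf.h):

```cpp
#include <cstdio>

#include "third_party/utf/utf.h"

// Decodes a buffer that is not null-terminated and may be truncated.
// charntorune() never reads past buf + len: a return of 0 (with *r set to
// Runeerror) marks an incomplete trailing sequence, while a return of 1 with
// *r == Runeerror marks malformed input that is skipped one byte at a time.
static void DumpRunes(const char* buf, int len) {
  int i = 0;
  while (i < len) {
    Rune r;
    const int consumed = charntorune(&r, buf + i, len - i);
    if (consumed == 0) break;  // incomplete sequence at the end of the buffer
    std::printf("U+%04X (%d byte%s)\n", static_cast<unsigned>(r), consumed,
                consumed == 1 ? "" : "s");
    i += consumed;
  }
}
```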
+ +#include <string> + +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "allheaders.h" + +#include "include_gunit.h" +#include "log.h" // for LOG +#include "serialis.h" +#include "tprintf.h" +#include "unicharcompress.h" + +namespace tesseract { + +class UnicharcompressTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + // Loads and compresses the given unicharset. + void LoadUnicharset(const std::string& unicharset_name) { + std::string radical_stroke_file = + file::JoinPath(LANGDATA_DIR, "radical-stroke.txt"); + std::string unicharset_file = + file::JoinPath(TESTDATA_DIR, unicharset_name); + std::string radical_data; + CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, + file::Defaults())); + CHECK(unicharset_.load_from_file(unicharset_file.c_str())); + STRING radical_str(radical_data.c_str()); + null_char_ = + unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size(); + compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str); + // Get the encoding of the null char. + RecodedCharID code; + compressed_.EncodeUnichar(null_char_, &code); + encoded_null_char_ = code(0); + std::string output_name = file::JoinPath( + FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt")); + STRING encoding = compressed_.GetEncodingAsString(unicharset_); + std::string encoding_str(&encoding[0], encoding.size()); + CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults())); + LOG(INFO) << "Wrote encoding to:" << output_name; + } + // Serializes and de-serializes compressed_ over itself. + void SerializeAndUndo() { + std::vector<char> data; + TFile wfp; + wfp.OpenWrite(&data); + EXPECT_TRUE(compressed_.Serialize(&wfp)); + TFile rfp; + rfp.Open(&data[0], data.size()); + EXPECT_TRUE(compressed_.DeSerialize(&rfp)); + } + // Returns true if the lang is in CJK. + bool IsCJKLang(const std::string& lang) { + return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || + lang == "jpn"; + } + // Returns true if the lang is Indic. + bool IsIndicLang(const std::string& lang) { + return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || + lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" || + lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" || + lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" || + lang == "tel"; + } + + // Expects the appropriate results from the compressed_ unicharset_. + void ExpectCorrect(const std::string& lang) { + // Count the number of times each code is used in each element of + // RecodedCharID. + RecodedCharID zeros; + for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0); + int code_range = compressed_.code_range(); + std::vector<RecodedCharID> times_seen(code_range, zeros); + for (int u = 0; u <= unicharset_.size(); ++u) { + if (u != UNICHAR_SPACE && u != null_char_ && + (u == unicharset_.size() || (unicharset_.has_special_codes() && + u < SPECIAL_UNICHAR_CODES_COUNT))) { + continue; // Not used so not encoded. + } + RecodedCharID code; + int len = compressed_.EncodeUnichar(u, &code); + // Check round-trip encoding. + int unichar_id; + GenericVector<UNICHAR_ID> normed_ids; + if (u == null_char_ || u == unicharset_.size()) { + unichar_id = null_char_; + } else { + unichar_id = u; + } + EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code)); + // Check that the codes are valid. 
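+      // Each element of the code must lie in [0, code_range); times_seen
+      // tallies how often every value occurs at each position so that the
+      // coverage check after this loop can require each code to be used
+      // somewhere.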
+ for (int i = 0; i < len; ++i) { + int code_val = code(i); + EXPECT_GE(code_val, 0); + EXPECT_LT(code_val, code_range); + times_seen[code_val].Set(i, times_seen[code_val](i) + 1); + } + } + // Check that each code is used in at least one position. + for (int c = 0; c < code_range; ++c) { + int num_used = 0; + for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) { + if (times_seen[c](i) != 0) ++num_used; + } + EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range; + } + // Check that GetNextCodes/GetFinalCodes lists match the times_seen, + // and create valid codes. + RecodedCharID code; + CheckCodeExtensions(code, times_seen); + // Finally, we achieved all that using a codebook < 10% of the size of + // the original unicharset, for CK or Indic, and 20% with J, but just + // no bigger for all others. + if (IsCJKLang(lang) || IsIndicLang(lang)) { + EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10)); + } else { + EXPECT_LE(code_range, unicharset_.size() + 1); + } + LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " + << code_range; + } + // Checks for extensions of the current code that either finish a code, or + // extend it and checks those extensions recursively. + void CheckCodeExtensions(const RecodedCharID& code, + const std::vector<RecodedCharID>& times_seen) { + RecodedCharID extended = code; + int length = code.length(); + const GenericVector<int>* final_codes = compressed_.GetFinalCodes(code); + if (final_codes != nullptr) { + for (int i = 0; i < final_codes->size(); ++i) { + int ending = (*final_codes)[i]; + EXPECT_GT(times_seen[ending](length), 0); + extended.Set(length, ending); + int unichar_id = compressed_.DecodeUnichar(extended); + EXPECT_NE(INVALID_UNICHAR_ID, unichar_id); + } + } + const GenericVector<int>* next_codes = compressed_.GetNextCodes(code); + if (next_codes != nullptr) { + for (int i = 0; i < next_codes->size(); ++i) { + int extension = (*next_codes)[i]; + EXPECT_GT(times_seen[extension](length), 0); + extended.Set(length, extension); + CheckCodeExtensions(extended, times_seen); + } + } + } + + UnicharCompress compressed_; + UNICHARSET unicharset_; + int null_char_; + // The encoding of the null_char_. + int encoded_null_char_; +}; + +TEST_F(UnicharcompressTest, DoesChinese) { + LOG(INFO) << "Testing chi_tra"; + LoadUnicharset("chi_tra.unicharset"); + ExpectCorrect("chi_tra"); + LOG(INFO) << "Testing chi_sim"; + LoadUnicharset("chi_sim.unicharset"); + ExpectCorrect("chi_sim"); +} + +TEST_F(UnicharcompressTest, DoesJapanese) { + LOG(INFO) << "Testing jpn"; + LoadUnicharset("jpn.unicharset"); + ExpectCorrect("jpn"); +} + +TEST_F(UnicharcompressTest, DoesKorean) { + LOG(INFO) << "Testing kor"; + LoadUnicharset("kor.unicharset"); + ExpectCorrect("kor"); +} + +TEST_F(UnicharcompressTest, DoesKannada) { + LOG(INFO) << "Testing kan"; + LoadUnicharset("kan.unicharset"); + ExpectCorrect("kan"); + SerializeAndUndo(); + ExpectCorrect("kan"); +} + +TEST_F(UnicharcompressTest, DoesMarathi) { + LOG(INFO) << "Testing mar"; + LoadUnicharset("mar.unicharset"); + ExpectCorrect("mar"); +} + +TEST_F(UnicharcompressTest, DoesEnglish) { + LOG(INFO) << "Testing eng"; + LoadUnicharset("eng.unicharset"); + ExpectCorrect("eng"); +} + +// Tests that a unicharset that contains double-letter ligatures (eg ff) has +// no null char in the encoding at all. 
+TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) { + LOG(INFO) << "Testing por with ligatures"; + LoadUnicharset("por.unicharset"); + ExpectCorrect("por"); + // Check that any unichar-id that is encoded with multiple codes has the + // correct encoded_nulll_char_ in between. + for (int u = 0; u <= unicharset_.size(); ++u) { + RecodedCharID code; + int len = compressed_.EncodeUnichar(u, &code); + if (len > 1) { + // The should not be any null char in the code. + for (int i = 0; i < len; ++i) { + EXPECT_NE(encoded_null_char_, code(i)); + } + } + } +} + +// Tests that GetEncodingAsString returns the right result for a trivial +// unicharset. +TEST_F(UnicharcompressTest, GetEncodingAsString) { + LoadUnicharset("trivial.unicharset"); + ExpectCorrect("trivial"); + STRING encoding = compressed_.GetEncodingAsString(unicharset_); + std::string encoding_str(&encoding[0], encoding.length()); + std::vector<std::string> lines = + absl::StrSplit(encoding_str, "\n", absl::SkipEmpty()); + EXPECT_EQ(5, lines.size()); + // The first line is always space. + EXPECT_EQ("0\t ", lines[0]); + // Next we have i. + EXPECT_EQ("1\ti", lines[1]); + // Next we have f. + EXPECT_EQ("2\tf", lines[2]); + // Next we have the fi ligature: fi. There are no nulls in it, as there are no + // repeated letter ligatures in this unicharset, unlike por.unicharset above. + EXPECT_EQ("2,1\tfi", lines[3]); + // Finally the null character. + EXPECT_EQ("3\t<nul>", lines[4]); +} + +} // namespace tesseract diff --git a/tesseract/unittest/unicharset_test.cc b/tesseract/unittest/unicharset_test.cc new file mode 100644 index 00000000..401a34c1 --- /dev/null +++ b/tesseract/unittest/unicharset_test.cc @@ -0,0 +1,161 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> +#include "log.h" // for LOG +#include "unicharset.h" +#include "gmock/gmock.h" // for testing::ElementsAreArray +#include "include_gunit.h" + +using testing::ElementsAreArray; + +namespace tesseract { + +class UnicharsetTest : public ::testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } +}; + +TEST(UnicharsetTest, Basics) { + // This test verifies basic insertion, unichar_to_id, and encode. + UNICHARSET u; + u.unichar_insert("a"); + EXPECT_EQ(u.size(), 4); + u.unichar_insert("f"); + EXPECT_EQ(u.size(), 5); + u.unichar_insert("i"); + EXPECT_EQ(u.size(), 6); + // The fi ligature is NOT added because it can be encoded with a cleanup as f + // then i. + u.unichar_insert("\ufb01"); + EXPECT_EQ(u.size(), 6); + u.unichar_insert("e"); + EXPECT_EQ(u.size(), 7); + u.unichar_insert("n"); + EXPECT_EQ(u.size(), 8); + EXPECT_EQ(u.unichar_to_id("f"), 4); + EXPECT_EQ(u.unichar_to_id("i"), 5); + // The fi ligature has no valid id. + EXPECT_EQ(u.unichar_to_id("\ufb01"), INVALID_UNICHAR_ID); + // The fi pair has no valid id. 
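+ // (encode_string below still succeeds on ligature text because CleanupString
+ // first rewrites U+FB01 as the two letters f and i, which do have ids.)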
+ EXPECT_EQ(u.unichar_to_id("fi"), INVALID_UNICHAR_ID); + std::vector<int> labels; + EXPECT_TRUE(u.encode_string("affine", true, &labels, nullptr, nullptr)); + std::vector<int> v(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6})); + // With the fi ligature encoding fails without a pre-cleanup. + std::string lig_str = "af\ufb01ne"; + EXPECT_FALSE( + u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); + lig_str = u.CleanupString(lig_str.c_str()); + EXPECT_TRUE( + u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); + v = std::vector<int>(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6})); +} + +TEST(UnicharsetTest, Multibyte) { + // This test verifies basic insertion, unichar_to_id, and encode. + // The difference from Basic above is that now we are testing multi-byte + // unicodes instead of single byte. + UNICHARSET u; + // Insert some Arabic letters. + u.unichar_insert("\u0627"); + EXPECT_EQ(u.size(), 4); + u.unichar_insert("\u062c"); + EXPECT_EQ(u.size(), 5); + u.unichar_insert("\u062f"); + EXPECT_EQ(u.size(), 6); + u.unichar_insert("\ufb01"); // fi ligature is added as fi pair. + EXPECT_EQ(u.size(), 7); + u.unichar_insert("\u062b"); + EXPECT_EQ(u.size(), 8); + u.unichar_insert("\u0635"); + EXPECT_EQ(u.size(), 9); + EXPECT_EQ(u.unichar_to_id("\u0627"), 3); + EXPECT_EQ(u.unichar_to_id("\u062c"), 4); + // The first two bytes of this string is \u0627, which matches id 3; + EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3); + EXPECT_EQ(u.unichar_to_id("\u062f"), 5); + // Individual f and i are not present, but they are there as a pair. + EXPECT_EQ(u.unichar_to_id("f"), INVALID_UNICHAR_ID); + EXPECT_EQ(u.unichar_to_id("i"), INVALID_UNICHAR_ID); + EXPECT_EQ(u.unichar_to_id("fi"), 6); + // The fi ligature is findable. + EXPECT_EQ(u.unichar_to_id("\ufb01"), 6); + std::vector<int> labels; + EXPECT_TRUE(u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true, + &labels, nullptr, nullptr)); + std::vector<int> v(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7})); + // With the fi ligature the fi is picked out. + std::vector<char> lengths; + int encoded_length; + std::string src_str = "\u0627\u062c\ufb01\u0635\u062b"; + // src_str has to be pre-cleaned for lengths to be correct. + std::string cleaned = u.CleanupString(src_str.c_str()); + EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, + &encoded_length)); + EXPECT_EQ(encoded_length, cleaned.size()); + std::string len_str(&lengths[0], lengths.size()); + EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002"); + v = std::vector<int>(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7})); +} + +TEST(UnicharsetTest, MultibyteBigrams) { + // This test verifies basic insertion, unichar_to_id, and encode. + // The difference from Basic above is that now we are testing multi-byte + // unicodes instead of single byte. + UNICHARSET u; + // Insert some Arabic letters. + u.unichar_insert("\u0c9c"); + EXPECT_EQ(u.size(), 4); + u.unichar_insert("\u0cad"); + EXPECT_EQ(u.size(), 5); + u.unichar_insert("\u0ccd\u0c9c"); + EXPECT_EQ(u.size(), 6); + u.unichar_insert("\u0ccd"); + EXPECT_EQ(u.size(), 7); + // By default the encodable bigram is NOT added. + u.unichar_insert("\u0ccd\u0cad"); + EXPECT_EQ(u.size(), 7); + // It is added if we force it to be. 
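+ // Passing OldUncleanUnichars::kTrue keeps the entry even though it is
+ // encodable from existing unichars, mirroring old-style unicharsets such as
+ // the one loaded in the OldStyle test below.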
+ u.unichar_insert("\u0ccd\u0cad", OldUncleanUnichars::kTrue); + EXPECT_EQ(u.size(), 8); + std::vector<char> data; + tesseract::TFile fp; + fp.OpenWrite(&data); + u.save_to_file(&fp); + fp.Open(&data[0], data.size()); + UNICHARSET v; + v.load_from_file(&fp, false); + EXPECT_EQ(v.unichar_to_id("\u0c9c"), 3); + EXPECT_EQ(v.unichar_to_id("\u0cad"), 4); + EXPECT_EQ(v.unichar_to_id("\u0ccd\u0c9c"), 5); + EXPECT_EQ(v.unichar_to_id("\u0ccd"), 6); + EXPECT_EQ(v.unichar_to_id("\u0ccd\u0cad"), 7); +} + +TEST(UnicharsetTest, OldStyle) { + // This test verifies an old unicharset that contains fi/fl ligatures loads + // and keeps all the entries. + std::string filename = + file::JoinPath(TESTDATA_DIR, "eng.unicharset"); + UNICHARSET u; + LOG(INFO) << "Filename=" << filename; + EXPECT_TRUE(u.load_from_file(filename.c_str())); + EXPECT_EQ(u.size(), 111); +} + +} // namespace diff --git a/tesseract/unittest/util/utf8/unicodetext.cc b/tesseract/unittest/util/utf8/unicodetext.cc new file mode 100644 index 00000000..1a884dd1 --- /dev/null +++ b/tesseract/unittest/util/utf8/unicodetext.cc @@ -0,0 +1,507 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/utf8/unicodetext.h" + +#include <string.h> // for memcpy, NULL, memcmp, etc +#include <algorithm> // for max + +//#include "base/logging.h" // for operator<<, CHECK, etc +//#include "base/stringprintf.h" // for StringPrintf, StringAppendF +//#include "strings/stringpiece.h" // for StringPiece, etc + +#include "third_party/utf/utf.h" // for isvalidcharntorune, etc +#include "util/utf8/unilib.h" // for IsInterchangeValid, etc +#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen + +static int CodepointDistance(const char* start, const char* end) { + int n = 0; + // Increment n on every non-trail-byte. + for (const char* p = start; p < end; ++p) { + n += (*reinterpret_cast<const signed char*>(p) >= -0x40); + } + return n; +} + +static int CodepointCount(const char* utf8, int len) { + return CodepointDistance(utf8, utf8 + len); +} + +UnicodeText::const_iterator::difference_type +distance(const UnicodeText::const_iterator& first, + const UnicodeText::const_iterator& last) { + return CodepointDistance(first.it_, last.it_); +} + +// ---------- Utility ---------- + +static int ConvertToInterchangeValid(char* start, int len) { + // This routine is called only when we've discovered that a UTF-8 buffer + // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 + // was not interchange valid. This indicates a bug in the caller, and + // a LOG(WARNING) is done in that case. + // This is similar to CoerceToInterchangeValid, but it replaces each + // structurally valid byte with a space, and each non-interchange + // character with a space, even when that character requires more + // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is + // structurally valid UTF8, but U+FDD0 is not an interchange-valid + // code point. The result should contain one space, not three. 
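+ // For example, the 7-byte input "ab\xEF\xB7\x90cd" comes out as the 5-byte
+ // string "ab cd": the three bytes of U+FDD0 collapse to a single space.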
+ // + // Since the conversion never needs to write more data than it + // reads, it is safe to change the buffer in place. It returns the + // number of bytes written. + char* const in = start; + char* out = start; + char* const end = start + len; + while (start < end) { + int good = UniLib::SpanInterchangeValid(start, end - start); + if (good > 0) { + if (out != start) { + memmove(out, start, good); + } + out += good; + start += good; + if (start == end) { + break; + } + } + // Is the current string invalid UTF8 or just non-interchange UTF8? + char32 rune; + int n; + if (isvalidcharntorune(start, end - start, &rune, &n)) { + // structurally valid UTF8, but not interchange valid + start += n; // Skip over the whole character. + } else { // bad UTF8 + start += 1; // Skip over just one byte + } + *out++ = ' '; + } + return out - in; +} + + +// *************** Data representation ********** + +// Note: the copy constructor is undefined. + +// After reserve(), resize(), or clear(), we're an owner, not an alias. + +void UnicodeText::Repr::reserve(int new_capacity) { + // If there's already enough capacity, and we're an owner, do nothing. + if (capacity_ >= new_capacity && ours_) return; + + // Otherwise, allocate a new buffer. + capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20); + char* new_data = new char[capacity_]; + + // If there is an old buffer, copy it into the new buffer. + if (data_) { + memcpy(new_data, data_, size_); + if (ours_) delete[] data_; // If we owned the old buffer, free it. + } + data_ = new_data; + ours_ = true; // We own the new buffer. + // size_ is unchanged. +} + +void UnicodeText::Repr::resize(int new_size) { + if (new_size == 0) { + clear(); + } else { + if (!ours_ || new_size > capacity_) reserve(new_size); + // Clear the memory in the expanded part. + if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); + size_ = new_size; + ours_ = true; + } +} + +// This implementation of clear() deallocates the buffer if we're an owner. +// That's not strictly necessary; we could just set size_ to 0. +void UnicodeText::Repr::clear() { + if (ours_) delete[] data_; + data_ = nullptr; + size_ = capacity_ = 0; + ours_ = true; +} + +void UnicodeText::Repr::Copy(const char* data, int size) { + resize(size); + memcpy(data_, data, size); +} + +void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { + if (data == data_) return; // We already own this memory. (Weird case.) + if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. + data_ = data; + size_ = size; + capacity_ = capacity; + ours_ = true; +} + +void UnicodeText::Repr::PointTo(const char* data, int size) { + if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. + data_ = const_cast<char*>(data); + size_ = size; + capacity_ = size; + ours_ = false; +} + +void UnicodeText::Repr::append(const char* bytes, int byte_length) { + reserve(size_ + byte_length); + memcpy(data_ + size_, bytes, byte_length); + size_ += byte_length; +} + +string UnicodeText::Repr::DebugString() const { + return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", + this, + data_, size_, capacity_, + ours_ ? 
"Owned" : "Alias"); +} + + + +// *************** UnicodeText ****************** + +// ----- Constructors ----- + +// Default constructor +UnicodeText::UnicodeText() { +} + +// Copy constructor +UnicodeText::UnicodeText(const UnicodeText& src) { + Copy(src); +} + +// Substring constructor +UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, + const UnicodeText::const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.append(first.it_, last.it_ - first.it_); +} + +string UnicodeText::UTF8Substring(const const_iterator& first, + const const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + return string(first.it_, last.it_ - first.it_); +} + + +// ----- Copy ----- + +UnicodeText& UnicodeText::operator=(const UnicodeText& src) { + if (this != &src) { + Copy(src); + } + return *this; +} + +UnicodeText& UnicodeText::Copy(const UnicodeText& src) { + repr_.Copy(src.repr_.data_, src.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { + repr_.Copy(buffer, byte_length); + if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, + int byte_length) { + repr_.Copy(buffer, byte_length); + return *this; +} + +// ----- TakeOwnershipOf ----- + +UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, + int byte_length, + int byte_capacity) { + repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); + if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, + int byte_length, + int byte_capacity) { + repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); + return *this; +} + +// ----- PointTo ----- + +UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { + if (UniLib:: IsInterchangeValid(buffer, byte_length)) { + repr_.PointTo(buffer, byte_length); + } else { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.Copy(buffer, byte_length); + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, + int byte_length) { + repr_.PointTo(buffer, byte_length); + return *this; +} + +UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { + repr_.PointTo(src.repr_.data_, src.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::PointTo(const const_iterator &first, + const const_iterator &last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); + return *this; +} + +// ----- Append ----- + +UnicodeText& UnicodeText::append(const UnicodeText& u) { + repr_.append(u.repr_.data_, u.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::append(const const_iterator& first, + const const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.append(first.it_, last.it_ - first.it_); + return *this; +} + +UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { + repr_.append(utf8, len); + return *this; +} + +// ----- substring searching ----- + +UnicodeText::const_iterator UnicodeText::find(const 
UnicodeText& look, + const_iterator start_pos) const { + CHECK_GE(start_pos.utf8_data(), utf8_data()); + CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length()); + return UnsafeFind(look, start_pos); +} + +UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { + return UnsafeFind(look, begin()); +} + +UnicodeText::const_iterator UnicodeText::UnsafeFind( + const UnicodeText& look, const_iterator start_pos) const { + // Due to the magic of the UTF8 encoding, searching for a sequence of + // letters is equivalent to substring search. + StringPiece searching(utf8_data(), utf8_length()); + StringPiece look_piece(look.utf8_data(), look.utf8_length()); + LOG(FATAL) << "Not implemented"; + //StringPiece::size_type found = + // searching.find(look_piece, start_pos.utf8_data() - utf8_data()); + StringPiece::size_type found = StringPiece::npos; + if (found == StringPiece::npos) return end(); + return const_iterator(utf8_data() + found); +} + +bool UnicodeText::HasReplacementChar() const { + // Equivalent to: + // UnicodeText replacement_char; + // replacement_char.push_back(0xFFFD); + // return find(replacement_char) != end(); + StringPiece searching(utf8_data(), utf8_length()); + StringPiece looking_for("\xEF\xBF\xBD", 3); + LOG(FATAL) << "Not implemented"; + //return searching.find(looking_for) != StringPiece::npos; + return false; +} + +// ----- other methods ----- + +// Clear operator +void UnicodeText::clear() { + repr_.clear(); +} + +// Destructor +UnicodeText::~UnicodeText() {} + + +void UnicodeText::push_back(char32 c) { + if (UniLib::IsValidCodepoint(c)) { + char buf[UTFmax]; + int len = runetochar(buf, &c); + if (UniLib::IsInterchangeValid(buf, len)) { + repr_.append(buf, len); + } else { + LOG(WARNING) << "Unicode value 0x" << std::hex << c + << " is not valid for interchange"; + repr_.append(" ", 1); + } + } else { + LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c; + repr_.append(" ", 1); + } +} + +int UnicodeText::size() const { + return CodepointCount(repr_.data_, repr_.size_); +} + +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { + if (&lhs == &rhs) return true; + if (lhs.repr_.size_ != rhs.repr_.size_) return false; + return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; +} + +string UnicodeText::DebugString() const { + return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", + this, + size(), + repr_.DebugString().c_str()); +} + + +// ******************* UnicodeText::const_iterator ********************* + +// The implementation of const_iterator would be nicer if it +// inherited from boost::iterator_facade +// (http://boost.org/libs/iterator/doc/iterator_facade.html). 
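+
+// A short usage sketch (illustrative only): building a text one codepoint at
+// a time and comparing the codepoint count with the UTF-8 byte count.
+//
+//   UnicodeText text;
+//   text.push_back(0x61);    // 'a', 1 byte in UTF-8
+//   text.push_back(0x4E2D);  // U+4E2D, 3 bytes in UTF-8
+//   // text.size() == 2, text.utf8_length() == 4
+//   for (UnicodeText::const_iterator it = text.begin(); it != text.end(); ++it) {
+//     char32 c = *it;  // decoded directly from the UTF-8 bytes, as below
+//   }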
+ +UnicodeText::const_iterator::const_iterator() : it_(nullptr) {} + +UnicodeText::const_iterator::const_iterator(const const_iterator& other) + : it_(other.it_) { +} + +UnicodeText::const_iterator& +UnicodeText::const_iterator::operator=(const const_iterator& other) { + if (&other != this) + it_ = other.it_; + return *this; +} + +UnicodeText::const_iterator UnicodeText::begin() const { + return const_iterator(repr_.data_); +} + +UnicodeText::const_iterator UnicodeText::end() const { + return const_iterator(repr_.data_ + repr_.size_); +} + +bool operator<(const UnicodeText::const_iterator& lhs, + const UnicodeText::const_iterator& rhs) { + return lhs.it_ < rhs.it_; +} + +char32 UnicodeText::const_iterator::operator*() const { + // (We could call chartorune here, but that does some + // error-checking, and we're guaranteed that our data is valid + // UTF-8. Also, we expect this routine to be called very often. So + // for speed, we do the calculation ourselves.) + + // Convert from UTF-8 + unsigned char byte1 = it_[0]; + if (byte1 < 0x80) + return byte1; + + unsigned char byte2 = it_[1]; + if (byte1 < 0xE0) + return ((byte1 & 0x1F) << 6) + | (byte2 & 0x3F); + + unsigned char byte3 = it_[2]; + if (byte1 < 0xF0) + return ((byte1 & 0x0F) << 12) + | ((byte2 & 0x3F) << 6) + | (byte3 & 0x3F); + + unsigned char byte4 = it_[3]; + return ((byte1 & 0x07) << 18) + | ((byte2 & 0x3F) << 12) + | ((byte3 & 0x3F) << 6) + | (byte4 & 0x3F); +} + +UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { + it_ += UniLib::OneCharLen(it_); + return *this; +} + +UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { + while (UniLib::IsTrailByte(*--it_)); + return *this; +} + +int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { + utf8_output[0] = it_[0]; if ((it_[0] & 0xff) < 0x80) return 1; + utf8_output[1] = it_[1]; if ((it_[0] & 0xff) < 0xE0) return 2; + utf8_output[2] = it_[2]; if ((it_[0] & 0xff) < 0xF0) return 3; + utf8_output[3] = it_[3]; + return 4; +} + +string UnicodeText::const_iterator::get_utf8_string() const { + return string(utf8_data(), utf8_length()); +} + +int UnicodeText::const_iterator::utf8_length() const { + if ((it_[0] & 0xff) < 0x80) { + return 1; + } else if ((it_[0] & 0xff) < 0xE0) { + return 2; + } else if ((it_[0] & 0xff) < 0xF0) { + return 3; + } else { + return 4; + } +} + +UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { + CHECK(p != nullptr); + const char* start = utf8_data(); + int len = utf8_length(); + const char* end = start + len; + CHECK(p >= start); + CHECK(p <= end); + CHECK(p == end || !UniLib::IsTrailByte(*p)); + return const_iterator(p); +} + +string UnicodeText::const_iterator::DebugString() const { + return tensorflow::strings::Printf("{iter %p}", it_); +} + + +// *************************** Utilities ************************* + +string CodepointString(const UnicodeText& t) { + string s; + UnicodeText::const_iterator it = t.begin(), end = t.end(); + while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++); + return s; +} diff --git a/tesseract/unittest/util/utf8/unicodetext.h b/tesseract/unittest/util/utf8/unicodetext.h new file mode 100644 index 00000000..4e25d3ee --- /dev/null +++ b/tesseract/unittest/util/utf8/unicodetext.h @@ -0,0 +1,477 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_ +#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_ + +#include <stddef.h> // for NULL, ptrdiff_t +#include <iterator> // for bidirectional_iterator_tag, etc +#include <string> // for string +#include <utility> // for pair + +#include "syntaxnet/base.h" + +// ***************************** UnicodeText ************************** +// +// A UnicodeText object is a container for a sequence of Unicode +// codepoint values. It has default, copy, and assignment constructors. +// Data can be appended to it from another UnicodeText, from +// iterators, or from a single codepoint. +// +// The internal representation of the text is UTF-8. Since UTF-8 is a +// variable-width format, UnicodeText does not provide random access +// to the text, and changes to the text are permitted only at the end. +// +// The UnicodeText class defines a const_iterator. The dereferencing +// operator (*) returns a codepoint (char32). The iterator is a +// bidirectional, read-only iterator. It becomes invalid if the text +// is changed. +// +// There are methods for appending and retrieving UTF-8 data directly. +// The 'utf8_data' method returns a const char* that contains the +// UTF-8-encoded version of the text; 'utf8_length' returns the number +// of bytes in the UTF-8 data. An iterator's 'get' method stores up to +// 4 bytes of UTF-8 data in a char array and returns the number of +// bytes that it stored. +// +// Codepoints are integers in the range [0, 0xD7FF] or [0xE000, +// 0x10FFFF], but UnicodeText has the additional restriction that it +// can contain only those characters that are valid for interchange on +// the Web. This excludes all of the control codes except for carriage +// return, line feed, and horizontal tab. It also excludes +// non-characters, but codepoints that are in the Private Use regions +// are allowed, as are codepoints that are unassigned. (See the +// Unicode reference for details.) The function UniLib::IsInterchangeValid +// can be used as a test for this property. +// +// UnicodeTexts are safe. Every method that constructs or modifies a +// UnicodeText tests for interchange-validity, and will substitute a +// space for the invalid data. Such cases are reported via +// LOG(WARNING). +// +// MEMORY MANAGEMENT: copy, take ownership, or point to +// +// A UnicodeText is either an "owner", meaning that it owns the memory +// for the data buffer and will free it when the UnicodeText is +// destroyed, or it is an "alias", meaning that it does not. +// +// There are three methods for storing UTF-8 data in a UnicodeText: +// +// CopyUTF8(buffer, len) copies buffer. +// +// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. +// +// PointToUTF8(buffer, size) creates an alias pointing to buffer. +// +// All three methods perform a validity check on the buffer. There are +// private, "unsafe" versions of these functions that bypass the +// validity check. They are used internally and by friend-functions +// that are handling UTF-8 data that has already been validated. 
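+//
+// For example (illustrative):
+//
+//   const char* buf = "abc";
+//   UnicodeText owner;
+//   owner.CopyUTF8(buf, 3);      // copies the three bytes; owner frees them
+//   UnicodeText alias;
+//   alias.PointToUTF8(buf, 3);   // no copy; alias points at buf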
+// +// The purpose of an alias is to avoid making an unnecessary copy of a +// UTF-8 buffer while still providing access to the Unicode values +// within that text through iterators or the fast scanners that are +// based on UTF-8 state tables. The lifetime of an alias must not +// exceed the lifetime of the buffer from which it was constructed. +// +// The semantics of an alias might be described as "copy on write or +// repair." The source data is never modified. If push_back() or +// append() is called on an alias, a copy of the data will be created, +// and the UnicodeText will become an owner. If clear() is called on +// an alias, it becomes an (empty) owner. +// +// The copy constructor and the assignment operator produce an owner. +// That is, after direct initialization ("UnicodeText x(y);") or copy +// initialization ("UnicodeText x = y;") x will be an owner, even if y +// was an alias. The assignment operator ("x = y;") also produces an +// owner unless x and y are the same object and y is an alias. +// +// Aliases should be used with care. If the source from which an alias +// was created is freed, or if the contents are changed, while the +// alias is still in use, fatal errors could result. But it can be +// quite useful to have a UnicodeText "window" through which to see a +// UTF-8 buffer without having to pay the price of making a copy. +// +// UTILITIES +// +// The interfaces in util/utf8/public/textutils.h provide higher-level +// utilities for dealing with UnicodeTexts, including routines for +// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or +// strings, creating strings from UnicodeTexts, normalizing text for +// efficient matching or display, and others. + +class UnicodeText { + public: + class const_iterator; + + typedef char32 value_type; + + // Constructors. These always produce owners. + UnicodeText(); // Create an empty text. + UnicodeText(const UnicodeText& src); // copy constructor + // Construct a substring (copies the data). + UnicodeText(const const_iterator& first, const const_iterator& last); + + // Assignment operator. This copies the data and produces an owner + // unless this == &src, e.g., "x = x;", which is a no-op. + UnicodeText& operator=(const UnicodeText& src); + + // x.Copy(y) copies the data from y into x. + UnicodeText& Copy(const UnicodeText& src); + inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } + + // x.PointTo(y) changes x so that it points to y's data. + // It does not copy y or take ownership of y's data. + UnicodeText& PointTo(const UnicodeText& src); + UnicodeText& PointTo(const const_iterator& first, + const const_iterator& last); + + ~UnicodeText(); + + void clear(); // Clear text. + bool empty() const { return repr_.size_ == 0; } // Test if text is empty. + + // Add a codepoint to the end of the text. + // If the codepoint is not interchange-valid, add a space instead + // and log a warning. + void push_back(char32 codepoint); + + // Generic appending operation. + // iterator_traits<ForwardIterator>::value_type must be implicitly + // convertible to char32. 
Typical uses of this method might include: + // char32 chars[] = {0x1, 0x2, ...}; + // vector<char32> more_chars = ...; + // utext.append(chars, chars+arraysize(chars)); + // utext.append(more_chars.begin(), more_chars.end()); + template<typename ForwardIterator> + UnicodeText& append(ForwardIterator first, const ForwardIterator last) { + while (first != last) { push_back(*first++); } + return *this; + } + + // A specialization of the generic append() method. + UnicodeText& append(const const_iterator& first, const const_iterator& last); + + // An optimization of append(source.begin(), source.end()). + UnicodeText& append(const UnicodeText& source); + + int size() const; // the number of Unicode characters (codepoints) + + friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); + + class const_iterator { + typedef const_iterator CI; + public: + typedef std::bidirectional_iterator_tag iterator_category; + typedef char32 value_type; + typedef ptrdiff_t difference_type; + typedef void pointer; // (Not needed.) + typedef const char32 reference; // (Needed for const_reverse_iterator) + + // Iterators are default-constructible. + const_iterator(); + + // It's safe to make multiple passes over a UnicodeText. + const_iterator(const const_iterator& other); + const_iterator& operator=(const const_iterator& other); + + char32 operator*() const; // Dereference + + const_iterator& operator++(); // Advance (++iter) + const_iterator operator++(int) { // (iter++) + const_iterator result(*this); + ++*this; + return result; + } + + const_iterator& operator--(); // Retreat (--iter) + const_iterator operator--(int) { // (iter--) + const_iterator result(*this); + --*this; + return result; + } + + // We love relational operators. + friend bool operator==(const CI& lhs, const CI& rhs) { + return lhs.it_ == rhs.it_; } + friend bool operator!=(const CI& lhs, const CI& rhs) { + return !(lhs == rhs); } + friend bool operator<(const CI& lhs, const CI& rhs); + friend bool operator>(const CI& lhs, const CI& rhs) { + return rhs < lhs; } + friend bool operator<=(const CI& lhs, const CI& rhs) { + return !(rhs < lhs); } + friend bool operator>=(const CI& lhs, const CI& rhs) { + return !(lhs < rhs); } + + friend difference_type distance(const CI& first, const CI& last); + + // UTF-8-specific methods + // Store the UTF-8 encoding of the current codepoint into buf, + // which must be at least 4 bytes long. Return the number of + // bytes written. + int get_utf8(char* buf) const; + // Return the UTF-8 character that the iterator points to. + string get_utf8_string() const; + // Return the byte length of the UTF-8 character the iterator points to. + int utf8_length() const; + // Return the iterator's pointer into the UTF-8 data. 
+ const char* utf8_data() const { return it_; } + + string DebugString() const; + + private: + friend class UnicodeText; + friend class UnicodeTextUtils; + friend class UTF8StateTableProperty; + explicit const_iterator(const char* it) : it_(it) {} + + const char* it_; + }; + + const_iterator begin() const; + const_iterator end() const; + + class const_reverse_iterator : public std::reverse_iterator<const_iterator> { + public: + explicit const_reverse_iterator(const_iterator it) : + std::reverse_iterator<const_iterator>(it) {} + const char* utf8_data() const { + const_iterator tmp_it = base(); + return (--tmp_it).utf8_data(); + } + int get_utf8(char* buf) const { + const_iterator tmp_it = base(); + return (--tmp_it).get_utf8(buf); + } + string get_utf8_string() const { + const_iterator tmp_it = base(); + return (--tmp_it).get_utf8_string(); + } + int utf8_length() const { + const_iterator tmp_it = base(); + return (--tmp_it).utf8_length(); + } + }; + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + // Substring searching. Returns the beginning of the first + // occurrence of "look", or end() if not found. + const_iterator find(const UnicodeText& look, const_iterator start_pos) const; + // Equivalent to find(look, begin()) + const_iterator find(const UnicodeText& look) const; + + // Returns whether this contains the character U+FFFD. This can + // occur, for example, if the input to Encodings::Decode() had byte + // sequences that were invalid in the source encoding. + bool HasReplacementChar() const; + + // UTF-8-specific methods + // + // Return the data, length, and capacity of UTF-8-encoded version of + // the text. Length and capacity are measured in bytes. + const char* utf8_data() const { return repr_.data_; } + int utf8_length() const { return repr_.size_; } + int utf8_capacity() const { return repr_.capacity_; } + + // Return the UTF-8 data as a string. + static string UTF8Substring(const const_iterator& first, + const const_iterator& last); + + // There are three methods for initializing a UnicodeText from UTF-8 + // data. They vary in details of memory management. In all cases, + // the data is tested for interchange-validity. If it is not + // interchange-valid, a LOG(WARNING) is issued, and each + // structurally invalid byte and each interchange-invalid codepoint + // is replaced with a space. + + // x.CopyUTF8(buf, len) copies buf into x. + UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); + + // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of + // buf. buf is not copied. + UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, + int byte_length, + int byte_capacity); + + // x.PointToUTF8(buf,len) changes x so that it points to buf + // ("becomes an alias"). It does not take ownership or copy buf. + // If the buffer is not valid, this has the same effect as + // CopyUTF8(utf8_buffer, byte_length). + UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); + + // Occasionally it is necessary to use functions that operate on the + // pointer returned by utf8_data(). MakeIterator(p) provides a way + // to get back to the UnicodeText level. It uses CHECK to ensure + // that p is a pointer within this object's UTF-8 data, and that it + // points to the beginning of a character. 
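+ // For example (illustrative): if n is a byte offset that falls on a
+ // character boundary, MakeIterator(utf8_data() + n) returns an iterator
+ // positioned on the character that starts at that byte.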
+ const_iterator MakeIterator(const char* p) const; + + string DebugString() const; + + private: + friend class const_iterator; + friend class UnicodeTextUtils; + + class Repr { // A byte-string. + public: + char* data_; + int size_; + int capacity_; + bool ours_; // Do we own data_? + + Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {} + ~Repr() { if (ours_) delete[] data_; } + + void clear(); + void reserve(int capacity); + void resize(int size); + + void append(const char* bytes, int byte_length); + void Copy(const char* data, int size); + void TakeOwnershipOf(char* data, int size, int capacity); + void PointTo(const char* data, int size); + + string DebugString() const; + + private: + Repr& operator=(const Repr&); + Repr(const Repr& other); + }; + + Repr repr_; + + // UTF-8-specific private methods. + // These routines do not perform a validity check when compiled + // in opt mode. + // It is an error to call these methods with UTF-8 data that + // is not interchange-valid. + // + UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeTakeOwnershipOfUTF8( + char* utf8_buffer, int byte_length, int byte_capacity); + UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); + const_iterator UnsafeFind(const UnicodeText& look, + const_iterator start_pos) const; +}; + +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + +inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { + return !(lhs == rhs); +} + +// UnicodeTextRange is a pair of iterators, useful for specifying text +// segments. If the iterators are ==, the segment is empty. +typedef pair<UnicodeText::const_iterator, + UnicodeText::const_iterator> UnicodeTextRange; + +inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { + return r.first == r.second; +} + + +// *************************** Utilities ************************* + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It +// is an "owner.") +// +// Each byte that is structurally invalid will be replaced with a +// space. Each codepoint that is interchange-invalid will also be +// replaced with a space, even if the codepoint was represented with a +// multibyte sequence in the UTF-8 data. +// +inline UnicodeText MakeUnicodeTextAcceptingOwnership( + char* utf8_buffer, int byte_length, int byte_capacity) { + return UnicodeText().TakeOwnershipOfUTF8( + utf8_buffer, byte_length, byte_capacity); +} + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText does not take ownership of the +// buffer. (It is an "alias.") +// +inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( + const char* utf8_buffer, int byte_length) { + return UnicodeText().PointToUTF8(utf8_buffer, byte_length); +} + +// Create a UnicodeText from a UTF-8 string or buffer. +// +// If do_copy is true, then a copy of the string is made. The copy is +// owned by the resulting UnicodeText object and will be freed when +// the object is destroyed. This UnicodeText object is referred to +// as an "owner." +// +// If do_copy is false, then no copy is made. The resulting +// UnicodeText object does NOT take ownership of the string; in this +// case, the lifetime of the UnicodeText object must not exceed the +// lifetime of the string. This Unicodetext object is referred to as +// an "alias." 
This is the same as MakeUnicodeTextWithoutAcceptingOwnership. +// +// If the input string does not contain valid UTF-8, then a copy is +// made (as if do_copy were true) and coerced to valid UTF-8 by +// replacing each invalid byte with a space. +// +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, + bool do_copy) { + UnicodeText t; + if (do_copy) { + t.CopyUTF8(utf8_buf, len); + } else { + t.PointToUTF8(utf8_buf, len); + } + return t; +} + +inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { + return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); +} + +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { + return UTF8ToUnicodeText(utf8_buf, len, true); +} +inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { + return UTF8ToUnicodeText(utf8_string, true); +} + +// Return a string containing the UTF-8 encoded version of all the +// Unicode characters in t. +inline string UnicodeTextToUTF8(const UnicodeText& t) { + return string(t.utf8_data(), t.utf8_length()); +} + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template <typename T, size_t N> +char (&ArraySizeHelper(T (&array)[N]))[N]; +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// For debugging. Return a string of integers, written in uppercase +// hex (%X), corresponding to the codepoints within the text. Each +// integer is followed by a space. E.g., "61 62 6A 3005 ". +string CodepointString(const UnicodeText& t); + +#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_ diff --git a/tesseract/unittest/util/utf8/unilib.cc b/tesseract/unittest/util/utf8/unilib.cc new file mode 100644 index 00000000..c00759ae --- /dev/null +++ b/tesseract/unittest/util/utf8/unilib.cc @@ -0,0 +1,58 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#include "util/utf8/unilib.h" + +#include "syntaxnet/base.h" +#include "third_party/utf/utf.h" + +namespace UniLib { + +// Codepoints not allowed for interchange are: +// C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020), +// Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A), +// Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D) +// C1 controls: U+007F to U+009F +// Surrogates: U+D800 to U+DFFF +// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx +bool IsInterchangeValid(char32 c) { + return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) || + (c >= 0x7F && c <= 0x9F) || + (c >= 0xD800 && c <= 0xDFFF) || + (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE); +} + +int SpanInterchangeValid(const char* begin, int byte_length) { + char32 rune; + const char* p = begin; + const char* end = begin + byte_length; + while (p < end) { + int bytes_consumed = charntorune(&rune, p, end - p); + // We want to accept Runeerror == U+FFFD as a valid char, but it is used + // by chartorune to indicate error. Luckily, the real codepoint is size 3 + // while errors return bytes_consumed <= 1. + if ((rune == Runeerror && bytes_consumed <= 1) || + !IsInterchangeValid(rune)) { + break; // Found + } + p += bytes_consumed; + } + return p - begin; +} + +} // namespace UniLib diff --git a/tesseract/unittest/util/utf8/unilib.h b/tesseract/unittest/util/utf8/unilib.h new file mode 100644 index 00000000..e99895a2 --- /dev/null +++ b/tesseract/unittest/util/utf8/unilib.h @@ -0,0 +1,63 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Routines to do manipulation of Unicode characters or text +// +// The StructurallyValid routines accept buffers of arbitrary bytes. +// For CoerceToStructurallyValid(), the input buffer and output buffers may +// point to exactly the same memory. +// +// In all other cases, the UTF-8 string must be structurally valid and +// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. +// Debug builds take a fatal error for invalid UTF-8 input. +// The input and output buffers may not overlap at all. +// +// The char32 routines are here only for convenience; they convert to UTF-8 +// internally and use the UTF-8 routines. + +#ifndef UTIL_UTF8_UNILIB_H__ +#define UTIL_UTF8_UNILIB_H__ + +#include <string> +#include "syntaxnet/base.h" + +// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here, +// but they are defined in unilib_utf8_utils.h. 
+//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export + +namespace UniLib { + +// Returns the length in bytes of the prefix of src that is all +// interchange valid UTF-8 +int SpanInterchangeValid(const char* src, int byte_length); +inline int SpanInterchangeValid(const std::string& src) { + return SpanInterchangeValid(src.data(), src.size()); +} + +// Returns true if the source is all interchange valid UTF-8 +// "Interchange valid" is a stronger than structurally valid -- +// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. +bool IsInterchangeValid(char32 codepoint); +inline bool IsInterchangeValid(const char* src, int byte_length) { + return (byte_length == SpanInterchangeValid(src, byte_length)); +} +inline bool IsInterchangeValid(const std::string& src) { + return IsInterchangeValid(src.data(), src.size()); +} + +} // namespace UniLib + +#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ diff --git a/tesseract/unittest/util/utf8/unilib_utf8_utils.h b/tesseract/unittest/util/utf8/unilib_utf8_utils.h new file mode 100644 index 00000000..a9c10166 --- /dev/null +++ b/tesseract/unittest/util/utf8/unilib_utf8_utils.h @@ -0,0 +1,66 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ +#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ + +// These definitions are self-contained and have no dependencies. +// They are also exported from unilib.h for legacy reasons. + +#include "syntaxnet/base.h" +#include "third_party/utf/utf.h" + +namespace UniLib { + +// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF] +// (i.e., is not a surrogate codepoint). See also +// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h. +inline bool IsValidCodepoint(char32 c) { + return (static_cast<uint32>(c) < 0xD800) + || (c >= 0xE000 && c <= 0x10FFFF); +} + +// Returns true if 'str' is the start of a structurally valid UTF-8 +// sequence and is not a surrogate codepoint. Returns false if str.empty() +// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function +// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]). +inline bool IsUTF8ValidCodepoint(StringPiece str) { + char32 c; + int consumed; + // It's OK if str.length() > consumed. + return !str.empty() + && isvalidcharntorune(str.data(), str.size(), &c, &consumed) + && IsValidCodepoint(c); +} + +// Returns the length (number of bytes) of the Unicode code point +// starting at src, based on inspecting just that one byte. This +// requires that src point to a well-formed UTF-8 string; the result +// is undefined otherwise. 
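+// For example (illustrative): OneCharLen("a") == 1, OneCharLen("\xC3\xA9") == 2
+// (U+00E9), OneCharLen("\xE4\xB8\xAD") == 3 (U+4E2D), and a lead byte in the
+// range 0xF0..0xF4 gives 4.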
+inline int OneCharLen(const char* src) { + return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; +} + +// Returns true if this byte is a trailing UTF-8 byte (10xx xxxx) +inline bool IsTrailByte(char x) { + // return (x & 0xC0) == 0x80; + // Since trail bytes are always in [0x80, 0xBF], we can optimize: + return static_cast<signed char>(x) < -0x40; +} + +} // namespace UniLib + +#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ diff --git a/tesseract/unittest/validate_grapheme_test.cc b/tesseract/unittest/validate_grapheme_test.cc new file mode 100644 index 00000000..54e2f490 --- /dev/null +++ b/tesseract/unittest/validate_grapheme_test.cc @@ -0,0 +1,179 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) { + std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E. + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + // It made 3 graphemes. + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f")); + EXPECT_EQ(glyphs[1], std::string("\u0c15")); + EXPECT_EQ(glyphs[2], std::string("\u0c0e")); +} + +TEST(ValidateGraphemeTest, SingleConsonantOK) { + std::string str = "\u0cb9"; // HA + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); +} + +TEST(ValidateGraphemeTest, SimpleCV) { + std::string str = "\u0cb9\u0cbf"; // HA I + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); +} + +TEST(ValidateGraphemeTest, SubscriptConjunct) { + std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95")); +} + +TEST(ValidateGraphemeTest, HalfFormJoiner) { + std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + 
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d")); +} + +TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) { + std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d")); +} + +TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) { + std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d")); + // Malaylam only, so not allowed in Telugu. 
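+ // (The same sequence of joiners, but with the consonants taken from the
+ // Telugu block U+0C00..U+0C7F instead of Malayalam U+0D00..U+0D7F, so the
+ // validator rejects it below.)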
+ str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); +} + +TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) { + std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 2); + EXPECT_EQ(glyphs[1], std::string("\u0d24")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c")); +} + +TEST(ValidateGraphemeTest, ThaiGraphemes) { + // This is a single grapheme unless in glyph split mode + std::string str = "\u0e14\u0e38\u0e4a"; + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("\u0e14")); +} + +TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) { + std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d"; + std::vector<std::string> glyphs; + // Returns true, but the joiner is gone. + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 5); + EXPECT_EQ(glyphs[0], std::string("'")); + EXPECT_EQ(glyphs[1], std::string("\u0d24")); + EXPECT_EQ(glyphs[2], std::string("\u0d23")); + EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c")); + EXPECT_EQ(glyphs[4], std::string("'")); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validate_indic_test.cc b/tesseract/unittest/validate_indic_test.cc new file mode 100644 index 00000000..d317198b --- /dev/null +++ b/tesseract/unittest/validate_indic_test.cc @@ -0,0 +1,231 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +// Though the unicode example for Telugu in section 12.7: +// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf +// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to +// suppress a conjugate that would otherwise occur. 
If a consonant is followed +// by a virama and then by a non-Indic character, OpenType will presume that +// the user simply meant to suppress the inherent vowel of the consonant +// and render it as the consonant with an explicit virama, the same as if +// a ZWNJ had followed. Since this is confusing to an OCR engine, the +// normalizer always puts a terminating ZWNJ on the end if not present, +// and accepts the string as valid. +TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) { + std::string str = "\u0c15\u0c4d"; // KA - virama + std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str); + // Same result if we started with the normalized string. + ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, + target_str); +} + +// Only one dependent vowel is allowed. +TEST(ValidateIndicTest, OnlyOneDependentVowel) { + std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); +} + +// [c26][c4d][c01] +// A consonant (DA) followed by the virama followed by a bindu. +// Syllable modifiers [c01][c02][c03] all modify the pronunciation of +// the vowel in a syllable, as does the virama [c4d]. You can only +// have one of these on a syllable. +// +// References: +// http://www.omniglot.com/writing/telugu.htm +TEST(ValidateIndicTest, OnlyOneVowelModifier) { + std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // It made 1 grapheme of 4 chars, by terminating the explicit virama. + EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result); + + str = "\u0995\u0983\u0981"; // KA visarga candrabindu + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + + // Exception: Malayalam allows multiple anusvara. + str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + EXPECT_EQ(str, result); +} + +// [c28][c02][c3f] +// A consonant (NA) followed by the Anusvara/sunna and another matra (I). +// The anusvara [c02] is a pronunciation directive +// for a whole syllable and only appears at the end of the syllable. +// References: +// + Unicode v9, 12.1 "Modifier Mark Rules R10," +// and the Microsoft page +// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx +TEST(ValidateIndicTest, VowelModifierMustBeLast) { + std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); + // Swap c02/c3f and all is ok. + str = "\u0c28\u0c3f\u0c02"; // NA I Sunna + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +// [c05][c47] +// A Vowel (A) followed by a combining vowel/matra (EE). +// In Telugu, matras are only put on consonants, not independent +// vowels. +// References: +// + Unicode v9, 12.1: +// Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
+// + http://varamozhi.sourceforge.net/iscii91.pdf +TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) { + std::string str = "\u0c05\u0c47"; // A EE + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); + str = "\u0c1e\u0c3e"; // NYA AA + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +// Sub-graphemes are allowed if GraphemeNorm is turned off. +TEST(ValidateIndicTest, SubGraphemes) { + std::string str = "\u0d3e"; // AA + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNone, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +TEST(ValidateIndicTest, Nukta) { + std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9")); + // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it. + std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA + ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str); +} + +// Sinhala has some of its own specific rules. See www.macciato.com/sinhala +TEST(ValidateIndicTest, SinhalaRakaransaya) { + std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna + std::string dest; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 2); + EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb")); + // Can be followed by a dependent vowel. + str += "\u0dd9"; // E + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +TEST(ValidateIndicTest, SinhalaYansaya) { + std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna + std::string dest; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); + // Can be followed by a dependent vowel. 
+ str += "\u0ddd"; // OO + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba")); +} + +TEST(ValidateIndicTest, SinhalaRepaya) { + std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 2); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); +} + +TEST(ValidateIndicTest, SinhalaSpecials) { + // Sinhala has some exceptions from the usual rules. + std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d"; + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(glyphs[0], std::string("\u0dc0")); + EXPECT_EQ(glyphs[1], std::string("\u0d9c")); + EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb")); + EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d")); + EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d")); + str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf"; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(glyphs[0], std::string("\u0dc3")); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[3], std::string("\u0dcf")); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validate_khmer_test.cc b/tesseract/unittest/validate_khmer_test.cc new file mode 100644 index 00000000..74b87e61 --- /dev/null +++ b/tesseract/unittest/validate_khmer_test.cc @@ -0,0 +1,50 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +// Test some random Khmer words. 
+TEST(ValidateKhmerTest, GoodKhmerWords) { + std::string str = "ព័ត៏មានប្លែកៗ"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 13, 12, 7, str); + str = "ទំនុកច្រៀង"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 10, 9, 5, str); + str = "កាលីហ្វូញ៉ា"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 4, str); + str = "ចាប់ពីផ្លូវ"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 5, str); +} + +// Test some random Khmer words with dotted circles. +TEST(ValidateKhmerTest, BadKhmerWords) { + std::string result; + // Multiple dependent vowels not allowed + std::string str = "\u1796\u17b6\u17b7"; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // Multiple shifters not allowed + str = "\u1798\u17c9\u17ca"; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // Multiple signs not allowed + str = "\u1780\u17b6\u17cb\u17cd"; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validate_myanmar_test.cc b/tesseract/unittest/validate_myanmar_test.cc new file mode 100644 index 00000000..262e04b6 --- /dev/null +++ b/tesseract/unittest/validate_myanmar_test.cc @@ -0,0 +1,54 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +// Test some random Myanmar words. +TEST(ValidateMyanmarTest, GoodMyanmarWords) { + std::string str = "လျှာကသိသည် "; // No viramas in this one. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 11, 5, str); + str = "တုန္လႈပ္မႈ "; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 9, 4, str); +} + +// Test some random Myanmar words with dotted circles. +TEST(ValidateMyanmarTest, BadMyanmarWords) { + std::string str = "က်န္းမာေရး"; + std::vector<std::string> glyphs; + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)); + std::string result; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // It works if the grapheme normalization is turned off. + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNone, str.c_str(), &result)); + EXPECT_EQ(str, result); + str = "ခုႏွစ္"; + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // It works if the grapheme normalization is turned off. 
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNone, str.c_str(), &result)); + EXPECT_EQ(str, result); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validator_test.cc b/tesseract/unittest/validator_test.cc new file mode 100644 index 00000000..84cb42af --- /dev/null +++ b/tesseract/unittest/validator_test.cc @@ -0,0 +1,76 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "validator.h" + +#include "gmock/gmock.h" // for testing::ElementsAreArray +#include "include_gunit.h" + +namespace tesseract { + +class TestableValidator : public Validator { + public: + static ViramaScript TestableMostFrequentViramaScript( + const std::vector<char32>& utf32) { + return MostFrequentViramaScript(utf32); + } +}; + +// The majority of Validator is tested by the script-specific tests of its +// subclasses, but the MostFrequentViramaScript function is worth a unittest. +TEST(ValidatorTest, MostFrequentViramaScript) { + // The most frequent virama script should come out correct, despite + // distractions from other scripts. + EXPECT_EQ(ViramaScript::kTelugu, + TestableValidator::TestableMostFrequentViramaScript({0xc05})); + // It is still Telugu surrounded by Latin. + EXPECT_EQ(ViramaScript::kTelugu, + TestableValidator::TestableMostFrequentViramaScript( + {'a', 0xc05, 'b', 'c'})); + // But not still Telugu surrounded by Devanagari. + EXPECT_EQ(ViramaScript::kDevanagari, + TestableValidator::TestableMostFrequentViramaScript( + {0x905, 0xc05, 0x906, 0x907})); + EXPECT_EQ(ViramaScript::kKannada, + TestableValidator::TestableMostFrequentViramaScript( + {0xc85, 0xc05, 0xc86, 0xc87})); + EXPECT_EQ(ViramaScript::kBengali, + TestableValidator::TestableMostFrequentViramaScript( + {0x985, 0xc05, 0x986, 0x987})); + // Danda and double Danda don't count as Devanagari, as they are common. + EXPECT_EQ(ViramaScript::kTelugu, + TestableValidator::TestableMostFrequentViramaScript( + {0x964, 0xc05, 0x965, 0x965})); +} + +// ValidateCleanAndSegment doesn't modify the input by much, but its +// transformation should be idempotent. (Doesn't change again if re-applied.) 
+TEST(ValidatorTest, Idempotency) { + std::vector<char32> str1( + {0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c}); + std::vector<char32> str2( + {0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''}); + std::vector<std::vector<char32>> result1, result2, result3, result4; + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, str1, &result1)); + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, result1[0], &result2)); + EXPECT_EQ(result1.size(), result2.size()); + EXPECT_THAT(result2[0], testing::ElementsAreArray(result1[0])); + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, str2, &result3)); + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, result3[0], &result4)); + EXPECT_EQ(result3.size(), result4.size()); + EXPECT_THAT(result4[0], testing::ElementsAreArray(result3[0])); +} + +} // namespace tesseract
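
The validator test files above all drive the same small surface of normstrngs.h. As a reader's key, the sketch below (illustrative only, not part of the patch; the test name is hypothetical) shows the three entry points they exercise: NormalizeUTF8String, NormalizeCleanAndSegmentUTF8, and the ExpectGraphemeModeResults helper from normstrngs_test.h. The strings and expected values are copied from the Telugu terminal-virama cases asserted in validate_indic_test.cc, so the sketch claims nothing that the tests in this diff do not already check.

```
// Illustrative sketch; not part of the patch above. Test name is hypothetical,
// all calls and expected values mirror the existing validate_indic_test.cc.
#include <string>
#include <vector>

#include "include_gunit.h"
#include "normstrngs.h"
#include "normstrngs_test.h"

namespace tesseract {

TEST(NormstrngsUsageSketch, TeluguTerminalVirama) {
  // Telugu KA followed by a bare virama; the grapheme normalizer appends a
  // terminating ZWNJ (see AddsJoinerToTerminalVirama above).
  std::string str = "\u0c15\u0c4d";

  // 1. Whole-string normalization: returns true for a valid grapheme stream
  //    and writes the normalized text to the output parameter.
  std::string normalized;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &normalized));
  EXPECT_EQ(std::string("\u0c15\u0c4d\u200c"), normalized);  // KA virama ZWNJ

  // 2. Normalization plus segmentation into pieces; kCombined keeps whole
  //    graphemes together, kGlyphSplit breaks them into renderable glyphs.
  std::vector<std::string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
      str.c_str(), &glyphs));

  // 3. Helper from normstrngs_test.h that checks the piece count in each
  //    grapheme mode plus the normalized target; the 3/2/1 counts are the
  //    ones asserted by AddsJoinerToTerminalVirama.
  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1,
                            "\u0c15\u0c4d\u200c");
}

}  // namespace tesseract
```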