Diffstat (limited to 'tesseract/unittest')
-rw-r--r--  tesseract/unittest/README.md | 88
-rw-r--r--  tesseract/unittest/apiexample_test.cc | 119
-rw-r--r--  tesseract/unittest/applybox_test.cc | 128
-rw-r--r--  tesseract/unittest/baseapi_test.cc | 402
-rw-r--r--  tesseract/unittest/baseapi_thread_test.cc | 229
-rw-r--r--  tesseract/unittest/bitvector_test.cc | 166
-rw-r--r--  tesseract/unittest/capiexample_c_test.c | 21
-rw-r--r--  tesseract/unittest/capiexample_test.cc | 19
-rw-r--r--  tesseract/unittest/cleanapi_test.cc | 28
-rw-r--r--  tesseract/unittest/colpartition_test.cc | 76
-rw-r--r--  tesseract/unittest/commandlineflags_test.cc | 158
-rw-r--r--  tesseract/unittest/cycletimer.h | 61
-rw-r--r--  tesseract/unittest/dawg_test.cc | 115
-rw-r--r--  tesseract/unittest/denorm_test.cc | 99
-rw-r--r--  tesseract/unittest/doubleptr.h | 93
-rw-r--r--  tesseract/unittest/equationdetect_test.cc | 549
-rw-r--r--  tesseract/unittest/fileio_test.cc | 66
-rw-r--r--  tesseract/unittest/fuzzers/fuzzer-api.cpp | 101
-rwxr-xr-x  tesseract/unittest/fuzzers/oss-fuzz-build.sh | 59
-rw-r--r--  tesseract/unittest/heap_test.cc | 202
-rw-r--r--  tesseract/unittest/imagedata_test.cc | 131
-rw-r--r--  tesseract/unittest/include_gunit.h | 76
-rw-r--r--  tesseract/unittest/indexmapbidi_test.cc | 117
-rw-r--r--  tesseract/unittest/intfeaturemap_test.cc | 129
-rw-r--r--  tesseract/unittest/intsimdmatrix_test.cc | 135
-rw-r--r--  tesseract/unittest/lang_model_test.cc | 217
-rw-r--r--  tesseract/unittest/layout_test.cc | 234
-rw-r--r--  tesseract/unittest/ligature_table_test.cc | 111
-rw-r--r--  tesseract/unittest/linlsq_test.cc | 118
-rw-r--r--  tesseract/unittest/list_test.cc | 68
-rw-r--r--  tesseract/unittest/loadlang_test.cc | 251
-rw-r--r--  tesseract/unittest/log.h | 67
-rw-r--r--  tesseract/unittest/lstm_recode_test.cc | 45
-rw-r--r--  tesseract/unittest/lstm_squashed_test.cc | 31
-rw-r--r--  tesseract/unittest/lstm_test.cc | 221
-rw-r--r--  tesseract/unittest/lstm_test.h | 189
-rw-r--r--  tesseract/unittest/lstmtrainer_test.cc | 106
-rw-r--r--  tesseract/unittest/mastertrainer_test.cc | 298
-rw-r--r--  tesseract/unittest/matrix_test.cc | 137
-rw-r--r--  tesseract/unittest/networkio_test.cc | 217
-rw-r--r--  tesseract/unittest/normstrngs_test.cc | 422
-rw-r--r--  tesseract/unittest/normstrngs_test.h | 84
-rw-r--r--  tesseract/unittest/nthitem_test.cc | 120
-rw-r--r--  tesseract/unittest/osd_test.cc | 133
-rw-r--r--  tesseract/unittest/pagesegmode_test.cc | 114
-rw-r--r--  tesseract/unittest/pango_font_info_test.cc | 334
-rw-r--r--  tesseract/unittest/paragraphs_test.cc | 705
-rw-r--r--  tesseract/unittest/params_model_test.cc | 75
-rw-r--r--  tesseract/unittest/progress_test.cc | 165
-rw-r--r--  tesseract/unittest/qrsequence_test.cc | 69
-rw-r--r--  tesseract/unittest/recodebeam_test.cc | 483
-rw-r--r--  tesseract/unittest/rect_test.cc | 176
-rw-r--r--  tesseract/unittest/resultiterator_test.cc | 612
-rw-r--r--  tesseract/unittest/scanutils_test.cc | 114
-rw-r--r--  tesseract/unittest/shapetable_test.cc | 182
-rw-r--r--  tesseract/unittest/stats_test.cc | 59
-rw-r--r--  tesseract/unittest/stridemap_test.cc | 219
-rw-r--r--  tesseract/unittest/stringrenderer_test.cc | 564
-rw-r--r--  tesseract/unittest/syntaxnet/base.h | 61
-rw-r--r--  tesseract/unittest/tablefind_test.cc | 261
-rw-r--r--  tesseract/unittest/tablerecog_test.cc | 316
-rw-r--r--  tesseract/unittest/tabvector_test.cc | 130
-rw-r--r--  tesseract/unittest/tatweel_test.cc | 114
-rw-r--r--  tesseract/unittest/tesseract_leaksanitizer.supp | 12
-rw-r--r--  tesseract/unittest/textlineprojection_test.cc | 262
-rw-r--r--  tesseract/unittest/tfile_test.cc | 179
-rw-r--r--  tesseract/unittest/third_party/utf/rune.c | 357
-rw-r--r--  tesseract/unittest/third_party/utf/utf.h | 246
-rw-r--r--  tesseract/unittest/third_party/utf/utfdef.h | 14
-rw-r--r--  tesseract/unittest/unichar_test.cc | 43
-rw-r--r--  tesseract/unittest/unicharcompress_test.cc | 257
-rw-r--r--  tesseract/unittest/unicharset_test.cc | 161
-rw-r--r--  tesseract/unittest/util/utf8/unicodetext.cc | 507
-rw-r--r--  tesseract/unittest/util/utf8/unicodetext.h | 477
-rw-r--r--  tesseract/unittest/util/utf8/unilib.cc | 58
-rw-r--r--  tesseract/unittest/util/utf8/unilib.h | 63
-rw-r--r--  tesseract/unittest/util/utf8/unilib_utf8_utils.h | 66
-rw-r--r--  tesseract/unittest/validate_grapheme_test.cc | 179
-rw-r--r--  tesseract/unittest/validate_indic_test.cc | 231
-rw-r--r--  tesseract/unittest/validate_khmer_test.cc | 50
-rw-r--r--  tesseract/unittest/validate_myanmar_test.cc | 54
-rw-r--r--  tesseract/unittest/validator_test.cc | 76
82 files changed, 14481 insertions, 0 deletions
diff --git a/tesseract/unittest/README.md b/tesseract/unittest/README.md
new file mode 100644
index 00000000..bf4f83fe
--- /dev/null
+++ b/tesseract/unittest/README.md
@@ -0,0 +1,88 @@
+# Unit Testing for Tesseract
+
+
+## Requirements
+
+### Files and structure
+```
+
+├── langdata_lstm
+│   ├── common.punc
+│   ├── common.unicharambigs
+│   ├── desired_bigrams.txt
+│   ├── eng
+│   │   ├── desired_characters
+│   │   ├── eng.config
+│   │   ├── eng.numbers
+│   │   ├── eng.punc
+│   │   ├── eng.singles_text
+│   │   ├── eng.training_text
+│   │   ├── eng.unicharambigs
+│   │   ├── eng.wordlist
+│   │   └── okfonts.txt
+│   ├── extended
+│   │   └── extended.config
+│   ├── extendedhin
+│   │   └── extendedhin.config
+│   ├── font_properties
+│   ├── forbidden_characters_default
+│   ├── hin
+│   │   ├── hin.config
+│   │   ├── hin.numbers
+│   │   ├── hin.punc
+│   │   └── hin.wordlist
+│   ├── kan
+│   │   └── kan.config
+│   ├── kor
+│   │   └── kor.config
+│   ├── osd
+│   │   └── osd.unicharset
+│   └── radical-stroke.txt
+├── tessdata
+│   ├── ara.traineddata
+│   ├── chi_tra.traineddata
+│   ├── eng.traineddata
+│   ├── heb.traineddata
+│   ├── hin.traineddata
+│   ├── jpn.traineddata
+│   ├── kmr.traineddata
+│   ├── osd.traineddata
+│   └── vie.traineddata
+├── tessdata_best
+│   ├── eng.traineddata
+│   ├── fra.traineddata
+│   ├── kmr.traineddata
+│   └── osd.traineddata
+├── tessdata_fast
+│   ├── eng.traineddata
+│   ├── kmr.traineddata
+│   ├── osd.traineddata
+│   └── script
+│       └── Latin.traineddata
+└── tesseract
+    ├── abseil
+    ...
+    ├── test
+    ├── unittest
+    └── VERSION
+```
+
+### Fonts
+
+* Microsoft fonts: arialbi.ttf, times.ttf, verdana.ttf - [installation guide](https://www.makeuseof.com/tag/how-to-install-microsoft-core-fonts-in-ubuntu-linux/) (see the example command after this list)
+* [ae_Arab.ttf](https://www.wfonts.com/download/data/2014/12/03/ae-arab/ae-arab.zip)
+* dejavu-fonts: [DejaVuSans-ExtraLight.ttf](https://dejavu-fonts.github.io/Download.html)
+* [Lohit-Hindi.ttf](https://raw.githubusercontent.com/pratul/packageofpractices/master/assets/fonts/Lohit-Hindi.ttf)
+* [UnBatang.ttf](https://raw.githubusercontent.com/byrongibson/fonts/master/backup/truetype.original/unfonts-core/UnBatang.ttf)
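+
+On Debian or Ubuntu systems the Microsoft core fonts can typically be installed
+from the package manager and the font cache refreshed afterwards. The package
+name below is an assumption based on current Ubuntu repositories; check the
+linked installation guide if it differs on your system:
+
+```
+sudo apt-get install ttf-mscorefonts-installer
+fc-cache -f
+```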
+
+
+## Run tests
+
+To run the tests, run the following commands in the tesseract folder:
+
+```
+autoreconf -fiv
+git submodule update --init
+export TESSDATA_PREFIX=/prefix/to/path/to/tessdata
+make check
+```
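+
+Individual test programs can also be run directly once `make check` has built
+them. For example, the script tests that are disabled by default in
+`apiexample_test` can be enabled with the standard GoogleTest flag (the path
+below assumes the default build layout):
+
+```
+./unittest/apiexample_test --gtest_also_run_disabled_tests
+```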
diff --git a/tesseract/unittest/apiexample_test.cc b/tesseract/unittest/apiexample_test.cc
new file mode 100644
index 00000000..5a721fa3
--- /dev/null
+++ b/tesseract/unittest/apiexample_test.cc
@@ -0,0 +1,119 @@
+///////////////////////////////////////////////////////////////////////
+// File: apiexample_test.cc
+// Description: API test for Tesseract using test fixtures and parameters.
+// Tests for Devanagari, Latin and Arabic scripts are disabled by default.
+// Disabled tests can be run when required by using the
+// --gtest_also_run_disabled_tests argument.
+// ./unittest/apiexample_test --gtest_also_run_disabled_tests
+//
+// Author: ShreeDevi Kumar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+// expects clone of tessdata_fast repo in ../../tessdata_fast
+
+//#include "log.h"
+#include <time.h>
+#include <fstream>
+#include <iostream>
+#include <locale>
+#include <memory> // std::unique_ptr
+#include <string>
+#include <tesseract/baseapi.h>
+#include "include_gunit.h"
+#include "allheaders.h"
+
+namespace tesseract {
+
+class QuickTest : public testing::Test {
+ protected:
+ virtual void SetUp() { start_time_ = time(nullptr); }
+ virtual void TearDown() {
+#ifndef NDEBUG
+ // Debug builds can be very slow, so allow 4 min for OCR of a test image.
+ // apitest_example including disabled tests takes about 18 min on ARMv7.
+ const time_t MAX_SECONDS_FOR_TEST = 240;
+#else
+ // Release builds typically need less than 10 s for OCR of a test image,
+ // apitest_example including disabled tests takes about 90 s on ARMv7.
+ const time_t MAX_SECONDS_FOR_TEST = 55;
+#endif
+ const time_t end_time = time(nullptr);
+ EXPECT_TRUE(end_time - start_time_ <= MAX_SECONDS_FOR_TEST)
+ << "The test took too long - "
+ << ::testing::PrintToString(end_time - start_time_);
+ }
+ time_t start_time_;
+};
+
+void OCRTester(const char* imgname, const char* groundtruth,
+ const char* tessdatadir, const char* lang) {
+ // log.info() << tessdatadir << " for language: " << lang << std::endl;
+ char* outText;
+ std::locale loc("C"); // You can also use "" for the default system locale
+ std::ifstream file(groundtruth);
+ file.imbue(loc); // Use it for file input
+ std::string gtText((std::istreambuf_iterator<char>(file)),
+ std::istreambuf_iterator<char>());
+ std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
+ ASSERT_FALSE(api->Init(tessdatadir, lang))
+ << "Could not initialize tesseract.";
+ Pix* image = pixRead(imgname);
+ ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
+ api->SetImage(image);
+ outText = api->GetUTF8Text();
+ EXPECT_EQ(gtText, outText)
+ << "Phototest.tif OCR does not match ground truth for "
+ << ::testing::PrintToString(lang);
+ api->End();
+ delete[] outText;
+ pixDestroy(&image);
+}
+
+class MatchGroundTruth : public QuickTest,
+ public ::testing::WithParamInterface<const char*> {};
+
+TEST_P(MatchGroundTruth, FastPhototestOCR) {
+ OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt",
+ TESSDATA_DIR "_fast", GetParam());
+}
+
+TEST_P(MatchGroundTruth, BestPhototestOCR) {
+ OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt",
+ TESSDATA_DIR "_best", GetParam());
+}
+
+TEST_P(MatchGroundTruth, TessPhototestOCR) {
+ OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt",
+ TESSDATA_DIR, GetParam());
+}
+
+INSTANTIATE_TEST_SUITE_P(Eng, MatchGroundTruth, ::testing::Values("eng"));
+INSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth,
+ ::testing::Values("script/Latin"));
+INSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth,
+ ::testing::Values("script/Devanagari"));
+INSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth,
+ ::testing::Values("script/Arabic"));
+
+class EuroText : public QuickTest {};
+
+TEST_F(EuroText, FastLatinOCR) {
+ OCRTester(TESTING_DIR "/eurotext.tif", TESTING_DIR "/eurotext.txt",
+ TESSDATA_DIR "_fast", "script/Latin");
+}
+
+// script/Latin OCR of eurotext.tif does not match the groundtruth
+// for tessdata and tessdata_best, so those combinations are not
+// tested here.
+
+} // namespace
diff --git a/tesseract/unittest/applybox_test.cc b/tesseract/unittest/applybox_test.cc
new file mode 100644
index 00000000..055172d7
--- /dev/null
+++ b/tesseract/unittest/applybox_test.cc
@@ -0,0 +1,128 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include "allheaders.h"
+#include <tesseract/baseapi.h>
+#include "boxread.h"
+#include "rect.h"
+#include <tesseract/resultiterator.h>
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+const char* kTruthTextWords = "To simple burn running of goods lately.\n";
+const char* kTruthTextLine = "Tosimpleburnrunningofgoodslately.\n";
+
+// The fixture for testing Tesseract.
+class ApplyBoxTest : public testing::Test {
+ protected:
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR, name);
+ }
+ std::string TessdataPath() { return TESSDATA_DIR; }
+
+ ApplyBoxTest() { src_pix_ = nullptr; }
+ ~ApplyBoxTest() { pixDestroy(&src_pix_); }
+
+ bool SetImage(const char* filename) {
+ bool found = false;
+ pixDestroy(&src_pix_);
+ src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
+ if (api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
+ api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
+ api_.SetImage(src_pix_);
+ api_.SetVariable("tessedit_make_boxes_from_boxes", "1");
+ api_.SetInputName(TestDataNameToPath(filename).c_str());
+ found = true;
+ }
+ return found;
+ }
+
+ // Runs ApplyBoxes (via setting the appropriate variables and Recognize)
+ // and checks that the output ocr text matches the truth_str, and that
+ // the boxes match the given box file well enough.
+ // If line_mode is true, ApplyBoxes is run in line segmentation mode,
+ // otherwise the input box file is assumed to have character-level boxes.
+ void VerifyBoxesAndText(const char* imagefile, const char* truth_str,
+ const char* target_box_file, bool line_mode) {
+ if (!SetImage(imagefile)) {
+ // eng.traineddata not found or other problem during Init.
+ GTEST_SKIP();
+ return;
+ }
+ if (line_mode)
+ api_.SetVariable("tessedit_resegment_from_line_boxes", "1");
+ else
+ api_.SetVariable("tessedit_resegment_from_boxes", "1");
+ api_.Recognize(nullptr);
+ char* ocr_text = api_.GetUTF8Text();
+ EXPECT_STREQ(truth_str, ocr_text);
+ delete[] ocr_text;
+ // Test the boxes by reading the target box file in parallel with the
+ // bounding boxes in the ocr output.
+ std::string box_filename = TestDataNameToPath(target_box_file);
+ FILE* box_file = OpenBoxFile(box_filename.c_str());
+ ASSERT_TRUE(box_file != nullptr);
+ int height = pixGetHeight(src_pix_);
+ ResultIterator* it = api_.GetIterator();
+ do {
+ int left, top, right, bottom;
+ EXPECT_TRUE(
+ it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom));
+ TBOX ocr_box(ICOORD(left, height - bottom), ICOORD(right, height - top));
+ int line_number = 0;
+ TBOX truth_box;
+ STRING box_text;
+ EXPECT_TRUE(
+ ReadNextBox(0, &line_number, box_file, &box_text, &truth_box));
+ // Testing for major overlap is a bit weak, but if they all
+ // major overlap successfully, then it has to be fairly close.
+ EXPECT_TRUE(ocr_box.major_overlap(truth_box));
+ // Also check that the symbol text matches the box text.
+ char* symbol_text = it->GetUTF8Text(tesseract::RIL_SYMBOL);
+ EXPECT_STREQ(box_text.c_str(), symbol_text);
+ delete[] symbol_text;
+ } while (it->Next(tesseract::RIL_SYMBOL));
+ delete it;
+ }
+
+ Pix* src_pix_;
+ std::string ocr_text_;
+ tesseract::TessBaseAPI api_;
+};
+
+// Tests character-level applyboxes on normal Times New Roman.
+TEST_F(ApplyBoxTest, TimesCharLevel) {
+ VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box",
+ false);
+}
+
+// Tests character-level applyboxes on italic Times New Roman.
+TEST_F(ApplyBoxTest, ItalicCharLevel) {
+ VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box",
+ false);
+}
+
+// Tests line-level applyboxes on normal Times New Roman.
+TEST_F(ApplyBoxTest, TimesLineLevel) {
+ VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine,
+ "trainingtimes.box", true);
+}
+
+// Tests line-level applyboxes on italic Times New Roman.
+TEST_F(ApplyBoxTest, ItalLineLevel) {
+ VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box",
+ true);
+}
+
+} // namespace
diff --git a/tesseract/unittest/baseapi_test.cc b/tesseract/unittest/baseapi_test.cc
new file mode 100644
index 00000000..285172e3
--- /dev/null
+++ b/tesseract/unittest/baseapi_test.cc
@@ -0,0 +1,402 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+
+#include "cycletimer.h" // for CycleTimer
+#include "log.h" // for LOG
+#include "ocrblock.h" // for class BLOCK
+#include "pageres.h"
+
+#include <tesseract/baseapi.h>
+
+#include "allheaders.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "gmock/gmock-matchers.h"
+
+#include <memory>
+#include <regex>
+#include <string>
+#include <vector>
+
+namespace tesseract {
+
+using ::testing::ContainsRegex;
+using ::testing::HasSubstr;
+
+static const char* langs[] = {"eng", "vie", "hin", "ara", nullptr};
+static const char* image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif",
+ "arabic.tif", nullptr};
+static const char* gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67",
+ "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
+ "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a",
+ nullptr};
+
+class FriendlyTessBaseAPI : public tesseract::TessBaseAPI {
+ FRIEND_TEST(TesseractTest, LSTMGeometryTest);
+};
+
+std::string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) {
+ tess->SetImage(pix);
+ char* result = tess->GetUTF8Text();
+ std::string ocr_result = result;
+ delete[] result;
+ absl::StripAsciiWhitespace(&ocr_result);
+ return ocr_result;
+}
+
+// The fixture for testing Tesseract.
+class TesseractTest : public testing::Test {
+ protected:
+ static std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR, name);
+ }
+ static std::string TessdataPath() {
+ return TESSDATA_DIR;
+ }
+};
+
+// Tests that Tesseract gets exactly the right answer on phototest.
+TEST_F(TesseractTest, BasicTesseractTest) {
+ tesseract::TessBaseAPI api;
+ std::string truth_text;
+ std::string ocr_text;
+ if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
+ Pix* src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str());
+ CHECK(src_pix);
+ ocr_text = GetCleanedTextResult(&api, src_pix);
+ CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"),
+ &truth_text, file::Defaults()));
+ absl::StripAsciiWhitespace(&truth_text);
+ EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
+ pixDestroy(&src_pix);
+ } else {
+ // eng.traineddata not found.
+ GTEST_SKIP();
+ }
+}
+
+// Test that api.GetComponentImages() will return a set of images for
+// paragraphs even if text recognition was not run.
+TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) {
+ tesseract::TessBaseAPI api;
+ if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
+ api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
+ api.SetVariable("paragraph_debug_level", "3");
+#if 0 // TODO: b622.png is missing
+ Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str());
+ CHECK(src_pix);
+ api.SetImage(src_pix);
+ Boxa* para_boxes =
+ api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr);
+ EXPECT_TRUE(para_boxes != nullptr);
+ Boxa* block_boxes =
+ api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr);
+ EXPECT_TRUE(block_boxes != nullptr);
+ // TODO(eger): Get paragraphs out of this page pre-text.
+ EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes));
+ boxaDestroy(&block_boxes);
+ boxaDestroy(&para_boxes);
+ pixDestroy(&src_pix);
+#endif
+ } else {
+ // eng.traineddata not found.
+ GTEST_SKIP();
+ }
+}
+
+// We should get hOCR output and not seg fault, even if the api caller doesn't
+// call SetInputName().
+TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) {
+ tesseract::TessBaseAPI api;
+ if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
+ // eng.traineddata not found.
+ GTEST_SKIP();
+ return;
+ }
+ Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
+ CHECK(src_pix);
+ api.SetImage(src_pix);
+ char* result = api.GetHOCRText(0);
+ EXPECT_TRUE(result != nullptr);
+ EXPECT_THAT(result, HasSubstr("Hello"));
+ EXPECT_THAT(result, HasSubstr("<div class='ocr_page'"));
+ delete[] result;
+ pixDestroy(&src_pix);
+}
+
+// hOCR output should contain baseline info for upright textlines.
+TEST_F(TesseractTest, HOCRContainsBaseline) {
+ tesseract::TessBaseAPI api;
+ if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
+ // eng.traineddata not found.
+ GTEST_SKIP();
+ return;
+ }
+ Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
+ CHECK(src_pix);
+ api.SetInputName("HelloGoogle.tif");
+ api.SetImage(src_pix);
+ char* result = api.GetHOCRText(0);
+ EXPECT_TRUE(result != nullptr);
+ EXPECT_THAT(result, HasSubstr("Hello"));
+ EXPECT_TRUE(std::regex_search(result, std::regex{ "<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+" }));
+
+ delete[] result;
+ pixDestroy(&src_pix);
+}
+
+// Tests that Tesseract gets exactly the right answer on some page numbers.
+TEST_F(TesseractTest, AdaptToWordStrTest) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because TessBaseAPI::AdaptToWordStr is missing.
+ GTEST_SKIP();
+#else
+ static const char* kTrainingPages[] = {
+ "136.tif", "256.tif", "410.tif", "432.tif", "540.tif",
+ "692.tif", "779.tif", "793.tif", "808.tif", "815.tif",
+ "12.tif", "12.tif", nullptr};
+ static const char* kTrainingText[] = {
+ "1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0", "6 9 2", "7 7 9",
+ "7 9 3", "8 0 8", "8 1 5", "1 2", "1 2", nullptr};
+ static const char* kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr};
+ static const char* kTestText[] = {"324", "433", "12", nullptr};
+ tesseract::TessBaseAPI api;
+ std::string truth_text;
+ std::string ocr_text;
+ if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
+ // eng.traineddata not found.
+ GTEST_SKIP();
+ return;
+ }
+ api.SetVariable("matcher_sufficient_examples_for_prototyping", "1");
+ api.SetVariable("classify_class_pruner_threshold", "220");
+ // Train on the training text.
+ for (int i = 0; kTrainingPages[i] != nullptr; ++i) {
+ std::string image_file = TestDataNameToPath(kTrainingPages[i]);
+ Pix* src_pix = pixRead(image_file.c_str());
+ CHECK(src_pix);
+ api.SetImage(src_pix);
+ EXPECT_TRUE(
+ api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i]))
+ << "Failed to adapt to text \"" << kTrainingText[i] << "\" on image "
+ << image_file;
+ pixDestroy(&src_pix);
+ }
+ // Test the test text.
+ api.SetVariable("tess_bn_matching", "1");
+ api.SetPageSegMode(tesseract::PSM_SINGLE_WORD);
+ for (int i = 0; kTestPages[i] != nullptr; ++i) {
+ Pix* src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str());
+ CHECK(src_pix);
+ ocr_text = GetCleanedTextResult(&api, src_pix);
+ absl::StripAsciiWhitespace(&truth_text);
+ EXPECT_STREQ(kTestText[i], ocr_text.c_str());
+ pixDestroy(&src_pix);
+ }
+#endif
+}
+
+// Tests that LSTM gets exactly the right answer on phototest.
+TEST_F(TesseractTest, BasicLSTMTest) {
+ tesseract::TessBaseAPI api;
+ std::string truth_text;
+ std::string ocr_text;
+ if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
+ // eng.traineddata not found.
+ GTEST_SKIP();
+ return;
+ }
+ Pix* src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str());
+ CHECK(src_pix);
+ ocr_text = GetCleanedTextResult(&api, src_pix);
+ CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"),
+ &truth_text, file::Defaults()));
+ absl::StripAsciiWhitespace(&truth_text);
+ EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
+ pixDestroy(&src_pix);
+}
+
+// Test that LSTM's character bounding boxes are properly converted to
+// Tesseract structures. Note that we can't guarantee that LSTM's
+// character boxes fall completely within Tesseract's word box because
+// the baseline denormalization/normalization transforms may introduce
+// errors due to float/int conversions (e.g., see OUTLINE::move() in
+// ccstruct/poutline.h) Instead, we do a loose check.
+TEST_F(TesseractTest, LSTMGeometryTest) {
+ Pix* src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str());
+ FriendlyTessBaseAPI api;
+ if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
+ // eng.traineddata not found.
+ GTEST_SKIP();
+ return;
+ }
+ api.SetImage(src_pix);
+ ASSERT_EQ(api.Recognize(nullptr), 0);
+
+ const PAGE_RES* page_res = api.GetPageRes();
+ PAGE_RES_IT page_res_it(const_cast<PAGE_RES*>(page_res));
+ page_res_it.restart_page();
+ BLOCK* block = page_res_it.block()->block;
+ CHECK(block);
+
+ // extract word and character boxes for each word
+ for (page_res_it.restart_page(); page_res_it.word() != nullptr;
+ page_res_it.forward()) {
+ WERD_RES* word = page_res_it.word();
+ CHECK(word);
+ CHECK(word->best_choice);
+ CHECK_GT(word->best_choice->length(), 0);
+ CHECK(word->word);
+ CHECK(word->box_word);
+ // tesseract's word box
+ TBOX tess_blob_box;
+ tess_blob_box = word->word->bounding_box();
+ tess_blob_box.rotate(block->re_rotation());
+ // verify that each of LSTM's character boxes lies close to within
+ // tesseract's word box
+ for (int i = 0; i < word->box_word->length(); ++i) {
+ TBOX lstm_blob_box = word->box_word->BlobBox(i);
+ // LSTM character box should not spill out of tesseract word box
+ // by more than a few pixels in any direction
+ EXPECT_LT(tess_blob_box.left() - lstm_blob_box.left(), 5);
+ EXPECT_LT(lstm_blob_box.right() - tess_blob_box.right(), 5);
+ EXPECT_LT(tess_blob_box.bottom() - lstm_blob_box.bottom(), 5);
+ EXPECT_LT(lstm_blob_box.top() - tess_blob_box.top(), 5);
+ }
+ }
+ pixDestroy(&src_pix);
+}
+
+TEST_F(TesseractTest, InitConfigOnlyTest) {
+ // Languages for testing initialization.
+ const char* langs[] = {"eng", "chi_tra", "jpn", "vie"};
+ std::unique_ptr<tesseract::TessBaseAPI> api;
+ CycleTimer timer;
+ for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
+ api.reset(new tesseract::TessBaseAPI);
+ timer.Restart();
+ EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
+ tesseract::OEM_TESSERACT_ONLY));
+ timer.Stop();
+ LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs()
+ << "ms in regular init";
+ }
+ // Init variables to set for config-only initialization.
+ std::vector<std::string> vars_vec, vars_values;
+ vars_vec.push_back("tessedit_init_config_only");
+ vars_values.push_back("1");
+ LOG(INFO) << "Switching to config only initialization:";
+ for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
+ api.reset(new tesseract::TessBaseAPI);
+ timer.Restart();
+ EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
+ tesseract::OEM_TESSERACT_ONLY, nullptr, 0, &vars_vec,
+ &vars_values, false));
+ timer.Stop();
+ LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs()
+ << "ms in config-only init";
+ }
+}
+
+// Tests if two instances of Tesseract/LSTM can co-exist in the same thread.
+// NOTE: This is not an exhaustive test and current support for multiple
+// instances in Tesseract is fragile. This test is intended largely as a means
+// of detecting and guarding against the existing support being possibly broken
+// by future CLs. TessBaseAPI instances are initialized using the default
+// OEM_DEFAULT mode.
+TEST(TesseractInstanceTest, TestMultipleTessInstances) {
+ int num_langs = 0;
+ while (langs[num_langs] != nullptr) ++num_langs;
+
+ const std::string kTessdataPath = TESSDATA_DIR;
+
+ // Preload images and verify that OCR is correct on them individually.
+ std::vector<Pix*> pix(num_langs);
+ for (int i = 0; i < num_langs; ++i) {
+ SCOPED_TRACE(absl::StrCat("Single instance test with lang = ", langs[i]));
+ std::string path = file::JoinPath(TESTING_DIR, image_files[i]);
+ pix[i] = pixRead(path.c_str());
+ QCHECK(pix[i] != nullptr) << "Could not read " << path;
+
+ tesseract::TessBaseAPI tess;
+ EXPECT_EQ(0, tess.Init(kTessdataPath.c_str(), langs[i]));
+ std::string ocr_result = GetCleanedTextResult(&tess, pix[i]);
+ EXPECT_STREQ(gt_text[i], ocr_result.c_str());
+ }
+
+ // Process the images in all pairwise combinations of associated languages.
+ std::string ocr_result[2];
+ for (int i = 0; i < num_langs; ++i) {
+ for (int j = i + 1; j < num_langs; ++j) {
+ tesseract::TessBaseAPI tess1, tess2;
+ tess1.Init(kTessdataPath.c_str(), langs[i]);
+ tess2.Init(kTessdataPath.c_str(), langs[j]);
+
+ ocr_result[0] = GetCleanedTextResult(&tess1, pix[i]);
+ ocr_result[1] = GetCleanedTextResult(&tess2, pix[j]);
+
+ EXPECT_FALSE(strcmp(gt_text[i], ocr_result[0].c_str()) ||
+ strcmp(gt_text[j], ocr_result[1].c_str()))
+ << "OCR failed on language pair " << langs[i] << "-" << langs[j];
+ }
+ }
+
+ for (int i = 0; i < num_langs; ++i) pixDestroy(&pix[i]);
+}
+
+// Tests whether Tesseract parameters are correctly set for the two instances.
+TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {
+ std::string illegal_name = "an_illegal_name";
+ std::string langs[2] = {"eng", "hin"};
+ std::string int_param_name = "tessedit_pageseg_mode";
+ int int_param[2] = {1, 2};
+ std::string int_param_str[2] = {"1", "2"};
+ std::string bool_param_name = "tessedit_ambigs_training";
+ bool bool_param[2] = {false, true};
+ std::string bool_param_str[2] = {"F", "T"};
+ std::string str_param_name = "tessedit_char_blacklist";
+ std::string str_param[2] = {"abc", "def"};
+ std::string double_param_name = "segment_penalty_dict_frequent_word";
+ std::string double_param_str[2] = {"0.01", "2"};
+ double double_param[2] = {0.01, 2};
+
+ const std::string kTessdataPath = TESSDATA_DIR;
+
+ tesseract::TessBaseAPI tess1, tess2;
+ for (int i = 0; i < 2; ++i) {
+ tesseract::TessBaseAPI* api = (i == 0) ? &tess1 : &tess2;
+ api->Init(kTessdataPath.c_str(), langs[i].c_str());
+ api->SetVariable(illegal_name.c_str(), "none");
+ api->SetVariable(int_param_name.c_str(), int_param_str[i].c_str());
+ api->SetVariable(bool_param_name.c_str(), bool_param_str[i].c_str());
+ api->SetVariable(str_param_name.c_str(), str_param[i].c_str());
+ api->SetVariable(double_param_name.c_str(), double_param_str[i].c_str());
+ }
+ for (int i = 0; i < 2; ++i) {
+ tesseract::TessBaseAPI* api = (i == 0) ? &tess1 : &tess2;
+ EXPECT_FALSE(api->GetStringVariable(illegal_name.c_str()));
+ int intvar;
+ EXPECT_TRUE(api->GetIntVariable(int_param_name.c_str(), &intvar));
+ EXPECT_EQ(int_param[i], intvar);
+ bool boolvar;
+ EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar));
+ EXPECT_EQ(bool_param[i], boolvar);
+ EXPECT_STREQ(str_param[i].c_str(),
+ api->GetStringVariable(str_param_name.c_str()));
+ double doublevar;
+ EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar));
+ EXPECT_EQ(double_param[i], doublevar);
+ }
+}
+
+} // namespace
diff --git a/tesseract/unittest/baseapi_thread_test.cc b/tesseract/unittest/baseapi_thread_test.cc
new file mode 100644
index 00000000..3608a748
--- /dev/null
+++ b/tesseract/unittest/baseapi_thread_test.cc
@@ -0,0 +1,229 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Unit test to run Tesseract instances in parallel threads and verify
+// the OCR result.
+
+// Note that success of running this test as-is does NOT verify
+// thread-safety. For that, you need to run this binary under TSAN using the
+// associated baseapi_thread_test_with_tsan.sh script.
+//
+// The tests are partitioned by instance to allow running Tesseract/Cube/both
+// and by stage to run initialization/recognition/both. See flag descriptions
+// for details.
+
+#include <functional>
+#include <memory>
+#include <string>
+#ifdef INCLUDE_TENSORFLOW
+#include <tensorflow/core/lib/core/threadpool.h>
+#endif
+#include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace
+#include "allheaders.h"
+#include "include_gunit.h"
+#include <tesseract/baseapi.h>
+#include "commandlineflags.h"
+#include "log.h"
+
+// Run with Tesseract instances.
+BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances");
+// Run with Cube instances.
+// Note that with TSAN, Cube typically takes much longer to test. Ignoring
+// std::string operations using the associated tess_tsan.ignore file when
+// testing Cube significantly reduces testing time.
+BOOL_PARAM_FLAG(test_cube, true, "Test Cube instances");
+
+// When used with TSAN, having more repetitions can help in finding hidden
+// thread-safety violations at the expense of increased testing time.
+INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run.");
+
+INT_PARAM_FLAG(max_concurrent_instances, 0,
+ "Maximum number of instances to run in parallel at any given "
+ "instant. The number of concurrent instances cannot exceed "
+ "reps * number_of_langs_tested, which is also the default value.");
+
+namespace tesseract {
+
+static const char* kTessLangs[] = {"eng", "vie", nullptr};
+static const char* kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr};
+static const char* kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67",
+ nullptr};
+
+static const char* kCubeLangs[] = {"hin", "ara", nullptr};
+static const char* kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr};
+static const char* kCubeTruthText[] = {
+ "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
+ "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr};
+
+class BaseapiThreadTest : public ::testing::Test {
+ protected:
+ static void SetUpTestCase() {
+ CHECK(FLAGS_test_tesseract || FLAGS_test_cube)
+ << "Need to test at least one of Tesseract/Cube!";
+ // Form a list of langs/gt_text/image_files we will work with.
+ std::vector<std::string> image_files;
+ if (FLAGS_test_tesseract) {
+ int i = 0;
+ while (kTessLangs[i] && kTessTruthText[i] && kTessImages[i]) {
+ langs_.push_back(kTessLangs[i]);
+ gt_text_.push_back(kTessTruthText[i]);
+ image_files.push_back(kTessImages[i]);
+ ++i;
+ }
+ LOG(INFO) << "Testing Tesseract on " << i << " languages.";
+ }
+ if (FLAGS_test_cube) {
+ int i = 0;
+ while (kCubeLangs[i] && kCubeTruthText[i] && kCubeImages[i]) {
+ langs_.push_back(kCubeLangs[i]);
+ gt_text_.push_back(kCubeTruthText[i]);
+ image_files.push_back(kCubeImages[i]);
+ ++i;
+ }
+ LOG(INFO) << "Testing Cube on " << i << " languages.";
+ }
+ num_langs_ = langs_.size();
+
+ // Pre-load the images into an array. We will be making multiple copies of
+ // an image here if FLAGS_reps > 1 and that is intentional. In this test, we
+ // wish to not make any assumptions about the thread-safety of Pix objects,
+ // and so entirely disallow concurrent access of a Pix instance.
+ const int n = num_langs_ * FLAGS_reps;
+ for (int i = 0; i < n; ++i) {
+ std::string path = TESTING_DIR "/" + image_files[i % num_langs_];
+ Pix* new_pix = pixRead(path.c_str());
+ QCHECK(new_pix != nullptr) << "Could not read " << path;
+ pix_.push_back(new_pix);
+ }
+
+#ifdef INCLUDE_TENSORFLOW
+ pool_size_ = (FLAGS_max_concurrent_instances < 1)
+ ? num_langs_ * FLAGS_reps
+ : FLAGS_max_concurrent_instances;
+#endif
+ }
+
+ static void TearDownTestCase() {
+ for (auto& pix : pix_) {
+ pixDestroy(&pix);
+ }
+ }
+
+#ifdef INCLUDE_TENSORFLOW
+ void ResetPool() {
+ pool_.reset(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_));
+ }
+
+ void WaitForPoolWorkers() { pool_.reset(nullptr); }
+
+ std::unique_ptr<tensorflow::thread::ThreadPool> pool_;
+ static int pool_size_;
+#endif
+ static std::vector<Pix*> pix_;
+ static std::vector<std::string> langs_;
+ static std::vector<std::string> gt_text_;
+ static int num_langs_;
+};
+
+// static member variable declarations.
+#ifdef INCLUDE_TENSORFLOW
+int BaseapiThreadTest::pool_size_;
+#endif
+std::vector<Pix*> BaseapiThreadTest::pix_;
+std::vector<std::string> BaseapiThreadTest::langs_;
+std::vector<std::string> BaseapiThreadTest::gt_text_;
+int BaseapiThreadTest::num_langs_;
+
+static void InitTessInstance(TessBaseAPI* tess, const std::string& lang) {
+ CHECK(tess != nullptr);
+ EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str()));
+}
+
+static void GetCleanedText(TessBaseAPI* tess, Pix* pix, std::string* ocr_text) {
+ tess->SetImage(pix);
+ char* result = tess->GetUTF8Text();
+ *ocr_text = result;
+ delete[] result;
+ absl::StripAsciiWhitespace(ocr_text);
+}
+
+static void VerifyTextResult(TessBaseAPI* tess, Pix* pix, const std::string& lang,
+ const std::string& expected_text) {
+ TessBaseAPI* tess_local = nullptr;
+ if (tess) {
+ tess_local = tess;
+ } else {
+ tess_local = new TessBaseAPI;
+ InitTessInstance(tess_local, lang);
+ }
+ std::string ocr_text;
+ GetCleanedText(tess_local, pix, &ocr_text);
+ EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str());
+ if (tess_local != tess) delete tess_local;
+}
+
+// Check that Tesseract/Cube produce the correct results in single-threaded
+// operation. If not, it is pointless to run the real multi-threaded tests.
+TEST_F(BaseapiThreadTest, TestBasicSanity) {
+ for (int i = 0; i < num_langs_; ++i) {
+ TessBaseAPI tess;
+ InitTessInstance(&tess, langs_[i]);
+ std::string ocr_text;
+ GetCleanedText(&tess, pix_[i], &ocr_text);
+ CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0)
+ << "Failed with lang = " << langs_[i];
+ }
+}
+
+// Test concurrent instance initialization.
+TEST_F(BaseapiThreadTest, TestInit) {
+#ifdef INCLUDE_TENSORFLOW
+ const int n = num_langs_ * FLAGS_reps;
+ ResetPool();
+ std::vector<TessBaseAPI> tess(n);
+ for (int i = 0; i < n; ++i) {
+ pool_->Schedule(std::bind(InitTessInstance, &tess[i], langs_[i % num_langs_]));
+ }
+ WaitForPoolWorkers();
+#endif
+}
+
+// Test concurrent recognition.
+TEST_F(BaseapiThreadTest, TestRecognition) {
+#ifdef INCLUDE_TENSORFLOW
+ const int n = num_langs_ * FLAGS_reps;
+ std::vector<TessBaseAPI> tess(n);
+ // Initialize api instances in a single thread.
+ for (int i = 0; i < n; ++i) {
+ InitTessInstance(&tess[i], langs_[i % num_langs_]);
+ }
+
+ ResetPool();
+ for (int i = 0; i < n; ++i) {
+ pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i],
+ langs_[i % num_langs_], gt_text_[i % num_langs_]));
+ }
+ WaitForPoolWorkers();
+#endif
+}
+
+TEST_F(BaseapiThreadTest, TestAll) {
+#ifdef INCLUDE_TENSORFLOW
+ const int n = num_langs_ * FLAGS_reps;
+ ResetPool();
+ for (int i = 0; i < n; ++i) {
+ pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i],
+ langs_[i % num_langs_], gt_text_[i % num_langs_]));
+ }
+ WaitForPoolWorkers();
+#endif
+}
+} // namespace
diff --git a/tesseract/unittest/bitvector_test.cc b/tesseract/unittest/bitvector_test.cc
new file mode 100644
index 00000000..9be718a0
--- /dev/null
+++ b/tesseract/unittest/bitvector_test.cc
@@ -0,0 +1,166 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+
+#include "bitvector.h"
+
+#include "include_gunit.h"
+
+const int kPrimeLimit = 1000;
+
+namespace tesseract {
+
+class BitVectorTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ public:
+ std::string OutputNameToPath(const std::string& name) {
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+ }
+ // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes.
+ void ComputePrimes(BitVector* map) {
+ map->Init(kPrimeLimit + 1);
+ TestAll(*map, false);
+ map->SetBit(2);
+ // Set all the odds to true.
+ for (int i = 3; i <= kPrimeLimit; i += 2) map->SetValue(i, true);
+ int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit));
+ for (int f = 3; f <= factor_limit; f += 2) {
+ if (map->At(f)) {
+ for (int m = 2; m * f <= kPrimeLimit; ++m) map->ResetBit(f * m);
+ }
+ }
+ }
+
+ void TestPrimes(const BitVector& map) {
+ // Now all primes in the vector are true, and all others false.
+ // According to Wikipedia, there are 168 primes under 1000, the last
+ // of which is 997.
+ int total_primes = 0;
+ for (int i = 0; i <= kPrimeLimit; ++i) {
+ if (map[i]) ++total_primes;
+ }
+ EXPECT_EQ(168, total_primes);
+ EXPECT_TRUE(map[997]);
+ EXPECT_FALSE(map[998]);
+ EXPECT_FALSE(map[999]);
+ }
+ // Test that all bits in the vector have the given value.
+ void TestAll(const BitVector& map, bool value) {
+ for (int i = 0; i < map.size(); ++i) {
+ EXPECT_EQ(value, map[i]);
+ }
+ }
+
+ // Sets up a BitVector with bit patterns for byte values in
+ // [start_byte, end_byte) positioned every spacing bytes (for spacing >= 1)
+ // with spacing-1 zero bytes in between the pattern bytes.
+ void SetBitPattern(int start_byte, int end_byte, int spacing, BitVector* bv) {
+ bv->Init((end_byte - start_byte) * 8 * spacing);
+ for (int byte_value = start_byte; byte_value < end_byte; ++byte_value) {
+ for (int bit = 0; bit < 8; ++bit) {
+ if (byte_value & (1 << bit))
+ bv->SetBit((byte_value - start_byte) * 8 * spacing + bit);
+ }
+ }
+ }
+
+ // Expects that every return from NextSetBit is really set and that all others
+ // are really not set. Checks the return from NumSetBits also.
+ void ExpectCorrectBits(const BitVector& bv) {
+ int bit_index = -1;
+ int prev_bit_index = -1;
+ int num_bits_tested = 0;
+ while ((bit_index = bv.NextSetBit(bit_index)) >= 0) {
+ EXPECT_LT(bit_index, bv.size());
+ // All bits in between must be 0.
+ for (int i = prev_bit_index + 1; i < bit_index; ++i) {
+ EXPECT_EQ(0, bv[i]) << "i = " << i << " prev = " << prev_bit_index;
+ }
+ // This bit must be 1.
+ EXPECT_EQ(1, bv[bit_index]) << "Bit index = " << bit_index;
+ ++num_bits_tested;
+ prev_bit_index = bit_index;
+ }
+ // Check the bits between the last and the end.
+ for (int i = prev_bit_index + 1; i < bv.size(); ++i) {
+ EXPECT_EQ(0, bv[i]);
+ }
+ EXPECT_EQ(num_bits_tested, bv.NumSetBits());
+ }
+};
+
+// Tests the sieve of Eratosthenes as a way of testing set/reset and I/O.
+TEST_F(BitVectorTest, Primes) {
+ BitVector map;
+ ComputePrimes(&map);
+ TestPrimes(map);
+ // It still works if we use the copy constructor.
+ BitVector map2(map);
+ TestPrimes(map2);
+ // Or if we assign it.
+ BitVector map3;
+ map3 = map;
+ TestPrimes(map3);
+ // Test file i/o too.
+ std::string filename = OutputNameToPath("primesbitvector");
+ FILE* fp = fopen(filename.c_str(), "wb");
+ ASSERT_TRUE(fp != nullptr);
+ EXPECT_TRUE(map.Serialize(fp));
+ fclose(fp);
+ fp = fopen(filename.c_str(), "rb");
+ ASSERT_TRUE(fp != nullptr);
+ BitVector read_map;
+ EXPECT_TRUE(read_map.DeSerialize(false, fp));
+ fclose(fp);
+ TestPrimes(read_map);
+}
+
+// Tests construction with a size plus the SetAllTrue/SetAllFalse operations.
+TEST_F(BitVectorTest, SetAll) {
+ // Test the default constructor and set/resetall.
+ BitVector map(42);
+ TestAll(map, false);
+ map.SetAllTrue();
+ TestAll(map, true);
+ map.SetAllFalse();
+ TestAll(map, false);
+}
+
+// Tests the values in the tables offset_table_, next_table_, hamming_table_
+// by setting all possible byte patterns and verifying that the NextSetBit and
+// NumSetBits functions return the correct values.
+TEST_F(BitVectorTest, TestNextSetBit) {
+ BitVector bv;
+ for (int spacing = 1; spacing <= 5; ++spacing) {
+ SetBitPattern(0, 256, spacing, &bv);
+ ExpectCorrectBits(bv);
+ }
+}
+
+// Tests the values in hamming_table_ more thoroughly by setting single byte
+// patterns for each byte individually.
+TEST_F(BitVectorTest, TestNumSetBits) {
+ BitVector bv;
+ for (int byte = 0; byte < 256; ++byte) {
+ SetBitPattern(byte, byte + 1, 1, &bv);
+ ExpectCorrectBits(bv);
+ }
+}
+
+} // namespace.
diff --git a/tesseract/unittest/capiexample_c_test.c b/tesseract/unittest/capiexample_c_test.c
new file mode 100644
index 00000000..5917f0c4
--- /dev/null
+++ b/tesseract/unittest/capiexample_c_test.c
@@ -0,0 +1,21 @@
+///////////////////////////////////////////////////////////////////////
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+// Verifies that a C compiler is able to include the capi header.
+#include <tesseract/capi.h>
+
+#include <stdio.h> // for printf
+
+// Verifies that the libtesseract library has C API symbols.
+int main()
+{
+ printf("%s\n", TessVersion());
+ return 0;
+}
diff --git a/tesseract/unittest/capiexample_test.cc b/tesseract/unittest/capiexample_test.cc
new file mode 100644
index 00000000..3c843056
--- /dev/null
+++ b/tesseract/unittest/capiexample_test.cc
@@ -0,0 +1,19 @@
+///////////////////////////////////////////////////////////////////////
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+// Verifies that C++ is able to include the capi header.
+#include <tesseract/capi.h>
+
+#include <gtest/gtest.h>
+
+// Verifies that the libtesseract library has C API symbols.
+TEST(C, VersionTest) { TessVersion(); }
diff --git a/tesseract/unittest/cleanapi_test.cc b/tesseract/unittest/cleanapi_test.cc
new file mode 100644
index 00000000..4d284af0
--- /dev/null
+++ b/tesseract/unittest/cleanapi_test.cc
@@ -0,0 +1,28 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <tesseract/baseapi.h>
+
+// Dummy enum in the global namespace that checks for collision with awkward
+// names.
+// If this test fails to compile, clean up the includes in tesseract/baseapi.h!
+// They are not supposed to drag in definitions of any of the tesseract
+// types included in this enum!
+enum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD };
+
+#include "gtest/gtest.h"
+
+namespace tesseract {
+
+// Verifies that the global namespace is clean.
+TEST(CleanNamespaceTess, DummyTest) { tesseract::TessBaseAPI api; }
+
+} // namespace.
diff --git a/tesseract/unittest/colpartition_test.cc b/tesseract/unittest/colpartition_test.cc
new file mode 100644
index 00000000..caebe605
--- /dev/null
+++ b/tesseract/unittest/colpartition_test.cc
@@ -0,0 +1,76 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "colpartition.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class TestableColPartition : public ColPartition {
+ public:
+ void SetColumnRange(int first, int last) {
+ set_first_column(first);
+ set_last_column(last);
+ }
+};
+
+class ColPartitionTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ void TearDown() {}
+};
+
+TEST_F(ColPartitionTest, IsInSameColumnAsReflexive) {
+ TestableColPartition a, b;
+ a.SetColumnRange(1, 2);
+ b.SetColumnRange(3, 3);
+
+ EXPECT_TRUE(a.IsInSameColumnAs(a));
+ EXPECT_TRUE(b.IsInSameColumnAs(b));
+}
+
+TEST_F(ColPartitionTest, IsInSameColumnAsBorders) {
+ TestableColPartition a, b, c, d;
+ a.SetColumnRange(0, 1);
+ b.SetColumnRange(1, 2);
+ c.SetColumnRange(2, 3);
+ d.SetColumnRange(4, 5);
+
+ EXPECT_TRUE(a.IsInSameColumnAs(b));
+ EXPECT_TRUE(b.IsInSameColumnAs(a));
+ EXPECT_FALSE(c.IsInSameColumnAs(d));
+ EXPECT_FALSE(d.IsInSameColumnAs(c));
+ EXPECT_FALSE(a.IsInSameColumnAs(d));
+}
+
+TEST_F(ColPartitionTest, IsInSameColumnAsSuperset) {
+ TestableColPartition a, b;
+ a.SetColumnRange(4, 7);
+ b.SetColumnRange(2, 8);
+
+ EXPECT_TRUE(a.IsInSameColumnAs(b));
+ EXPECT_TRUE(b.IsInSameColumnAs(a));
+}
+
+TEST_F(ColPartitionTest, IsInSameColumnAsPartialOverlap) {
+ TestableColPartition a, b;
+ a.SetColumnRange(3, 8);
+ b.SetColumnRange(6, 10);
+
+ EXPECT_TRUE(a.IsInSameColumnAs(b));
+ EXPECT_TRUE(b.IsInSameColumnAs(a));
+}
+
+} // namespace
diff --git a/tesseract/unittest/commandlineflags_test.cc b/tesseract/unittest/commandlineflags_test.cc
new file mode 100644
index 00000000..7b16fbdd
--- /dev/null
+++ b/tesseract/unittest/commandlineflags_test.cc
@@ -0,0 +1,158 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "commandlineflags.h"
+
+#include "include_gunit.h"
+
+// Flags used for testing parser.
+INT_PARAM_FLAG(foo_int, 0, "Integer flag for testing");
+INT_PARAM_FLAG(bar_int, 0, "Integer flag for testing");
+DOUBLE_PARAM_FLAG(foo_double, 0.1, "Double flag for testing");
+DOUBLE_PARAM_FLAG(bar_double, 0.2, "Double flag for testing");
+STRING_PARAM_FLAG(foo_string, "foo", "String flag for testing");
+STRING_PARAM_FLAG(bar_string, "bar", "String flag for testing");
+BOOL_PARAM_FLAG(foo_bool, false, "Bool flag for testing");
+BOOL_PARAM_FLAG(bar_bool, false, "Bool flag for testing");
+// A flag whose name is a single character, tested for backward
+// compatibility. This should be selected to not conflict with existing flags
+// in commontraining.cpp.
+STRING_PARAM_FLAG(q, "", "Single character name");
+
+namespace tesseract {
+
+class CommandlineflagsTest : public ::testing::Test {
+ protected:
+ void TestParser(int argc, const char** const_argv) {
+ TestParser("", argc, const_argv);
+ }
+ void TestParser(const char* usage, int argc, const char** const_argv) {
+ // Make a copy of the pointer since it can be altered by the function.
+ char** argv = const_cast<char**>(const_argv);
+ tesseract::ParseCommandLineFlags(usage, &argc, &argv, true);
+ }
+};
+
+TEST_F(CommandlineflagsTest, RemoveFlags) {
+ const char* const_argv[] = {"Progname", "--foo_int", "3", "file1.h",
+ "file2.h"};
+ int argc = ARRAYSIZE(const_argv);
+ char** argv = const_cast<char**>(const_argv);
+ tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
+
+ // argv should be rearranged to look like { "Progname", "file1.h", "file2.h" }
+ EXPECT_EQ(3, argc);
+ EXPECT_STREQ("Progname", argv[0]);
+ EXPECT_STREQ("file1.h", argv[1]);
+ EXPECT_STREQ("file2.h", argv[2]);
+}
+
+#if 0 // TODO: this test needs an update (it currently fails).
+TEST_F(CommandlineflagsTest, PrintUsageAndExit) {
+ const char* argv[] = { "Progname", "--help" };
+ EXPECT_EXIT(TestParser("Progname [flags]", ARRAYSIZE(argv), argv),
+ ::testing::ExitedWithCode(0),
+ "USAGE: Progname \\[flags\\]");
+}
+#endif
+
+TEST_F(CommandlineflagsTest, ExitsWithErrorOnInvalidFlag) {
+ const char* argv[] = {"", "--test_nonexistent_flag"};
+ EXPECT_EXIT(TestParser(ARRAYSIZE(argv), argv), ::testing::ExitedWithCode(1),
+ "ERROR: Non-existent flag");
+}
+
+TEST_F(CommandlineflagsTest, ParseIntegerFlags) {
+ const char* argv[] = {"", "--foo_int=3", "--bar_int", "-4"};
+ TestParser(ARRAYSIZE(argv), argv);
+ EXPECT_EQ(3, FLAGS_foo_int);
+ EXPECT_EQ(-4, FLAGS_bar_int);
+
+ const char* arg_no_value[] = {"", "--bar_int"};
+ EXPECT_EXIT(TestParser(ARRAYSIZE(arg_no_value), arg_no_value),
+ ::testing::ExitedWithCode(1), "ERROR");
+
+ const char* arg_invalid_value[] = {"", "--bar_int", "--foo_int=3"};
+ EXPECT_EXIT(TestParser(ARRAYSIZE(arg_invalid_value), arg_invalid_value),
+ ::testing::ExitedWithCode(1), "ERROR");
+
+ const char* arg_bad_format[] = {"", "--bar_int="};
+ EXPECT_EXIT(TestParser(ARRAYSIZE(arg_bad_format), arg_bad_format),
+ ::testing::ExitedWithCode(1), "ERROR");
+}
+
+TEST_F(CommandlineflagsTest, ParseDoubleFlags) {
+ const char* argv[] = {"", "--foo_double=3.14", "--bar_double", "1.2"};
+ TestParser(ARRAYSIZE(argv), argv);
+
+ EXPECT_EQ(3.14, FLAGS_foo_double);
+ EXPECT_EQ(1.2, FLAGS_bar_double);
+
+ const char* arg_no_value[] = {"", "--bar_double"};
+ EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1),
+ "ERROR");
+
+ const char* arg_bad_format[] = {"", "--bar_double="};
+ EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1),
+ "ERROR");
+}
+
+TEST_F(CommandlineflagsTest, ParseStringFlags) {
+ const char* argv[] = {"", "--foo_string=abc", "--bar_string", "def"};
+ TestParser(ARRAYSIZE(argv), argv);
+
+ EXPECT_STREQ("abc", FLAGS_foo_string.c_str());
+ EXPECT_STREQ("def", FLAGS_bar_string.c_str());
+
+ const char* arg_no_value[] = {"", "--bar_string"};
+ EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1),
+ "ERROR");
+
+ FLAGS_bar_string.set_value("bar");
+ const char* arg_empty_string[] = {"", "--bar_string="};
+ TestParser(2, arg_empty_string);
+ EXPECT_STREQ("", FLAGS_bar_string.c_str());
+}
+
+TEST_F(CommandlineflagsTest, ParseBoolFlags) {
+ const char* argv[] = {"", "--foo_bool=true", "--bar_bool=1"};
+ FLAGS_foo_bool.set_value(false);
+ FLAGS_bar_bool.set_value(false);
+ TestParser(ARRAYSIZE(argv), argv);
+ // Verify changed value
+ EXPECT_TRUE(FLAGS_foo_bool);
+ EXPECT_TRUE(FLAGS_bar_bool);
+
+ const char* inv_argv[] = {"", "--foo_bool=false", "--bar_bool=0"};
+ FLAGS_foo_bool.set_value(true);
+ FLAGS_bar_bool.set_value(true);
+ TestParser(3, inv_argv);
+ // Verify changed value
+ EXPECT_FALSE(FLAGS_foo_bool);
+ EXPECT_FALSE(FLAGS_bar_bool);
+
+ const char* arg_implied_true[] = {"", "--bar_bool"};
+ FLAGS_bar_bool.set_value(false);
+ TestParser(2, arg_implied_true);
+ EXPECT_TRUE(FLAGS_bar_bool);
+
+ const char* arg_missing_val[] = {"", "--bar_bool="};
+ EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1),
+ "ERROR");
+}
+
+TEST_F(CommandlineflagsTest, ParseOldFlags) {
+ EXPECT_STREQ("", FLAGS_q.c_str());
+ const char* argv[] = {"", "-q", "text"};
+ TestParser(ARRAYSIZE(argv), argv);
+ EXPECT_STREQ("text", FLAGS_q.c_str());
+}
+} // namespace
diff --git a/tesseract/unittest/cycletimer.h b/tesseract/unittest/cycletimer.h
new file mode 100644
index 00000000..e1a13719
--- /dev/null
+++ b/tesseract/unittest/cycletimer.h
@@ -0,0 +1,61 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Portability include to match the Google test environment.
+
+#ifndef TESSERACT_UNITTEST_CYCLETIMER_H
+#define TESSERACT_UNITTEST_CYCLETIMER_H
+
+#include "absl/time/clock.h" // for GetCurrentTimeNanos
+
+// See https://github.com/google/or-tools/blob/master/ortools/base/timer.h
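+//
+// Typical usage (illustrative only):
+//   CycleTimer timer;
+//   timer.Start();
+//   ... timed work ...
+//   timer.Stop();
+//   int64_t elapsed_ms = timer.GetInMs();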
+class CycleTimer {
+ public:
+ CycleTimer() {
+ Reset();
+ }
+
+ void Reset() {
+ running_ = false;
+ sum_ = 0;
+ start_ = 0;
+ }
+
+ // When Start() is called multiple times, only the most recent is used.
+ void Start() {
+ running_ = true;
+ start_ = absl::GetCurrentTimeNanos();
+ }
+
+ void Restart() {
+ sum_ = 0;
+ Start();
+ }
+
+ void Stop() {
+ if (running_) {
+ sum_ += absl::GetCurrentTimeNanos() - start_;
+ running_ = false;
+ }
+ }
+ int64_t GetInMs() const { return GetNanos() / 1000000; }
+
+ protected:
+ int64_t GetNanos() const {
+ return running_ ? absl::GetCurrentTimeNanos() - start_ + sum_ : sum_;
+ }
+
+ private:
+ bool running_;
+ int64_t start_;
+ int64_t sum_;
+};
+
+#endif // TESSERACT_UNITTEST_CYCLETIMER_H
diff --git a/tesseract/unittest/dawg_test.cc b/tesseract/unittest/dawg_test.cc
new file mode 100644
index 00000000..4a40b050
--- /dev/null
+++ b/tesseract/unittest/dawg_test.cc
@@ -0,0 +1,115 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+
+#include "ratngs.h"
+#include "unicharset.h"
+#include "trie.h"
+
+#include <cstdlib> // for system
+#include <fstream> // for ifstream
+#include <set>
+#include <string>
+#include <vector>
+#include <sys/stat.h>
+
+#ifndef SW_TESTING
+#define wordlist2dawg_prog "wordlist2dawg"
+#define dawg2wordlist_prog "dawg2wordlist"
+#endif
+
+namespace tesseract {
+
+// Test some basic functionality dealing with Dawgs (compressed dictionaries,
+// aka Directed Acyclic Word Graphs).
+class DawgTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ void LoadWordlist(const std::string& filename, std::set<std::string>* words) const {
+ std::ifstream file(filename);
+ if (file.is_open()) {
+ std::string line;
+ while (getline(file, line)) {
+ // Remove trailing line terminators from line.
+ while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
+ line.resize(line.size() - 1);
+ }
+ // Add line to set.
+ words->insert(line.c_str());
+ }
+ file.close();
+ }
+ }
+ std::string TessBinaryPath(const std::string& name) const {
+ return file::JoinPath(TESSBIN_DIR, name);
+ }
+ std::string OutputNameToPath(const std::string& name) const {
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+ }
+ int RunCommand(const std::string& program, const std::string& arg1,
+ const std::string& arg2, const std::string& arg3) const {
+ std::string cmdline =
+ TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
+ return system(cmdline.c_str());
+ }
+ // Test that we are able to convert a wordlist file (one "word" per line) to
+ // a dawg (a compressed format) and then extract the original wordlist back
+ // out using the tools "wordlist2dawg" and "dawg2wordlist."
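+  // The equivalent command lines are, roughly (paths illustrative):
+  //   wordlist2dawg <wordlist> <out.dawg> <unicharset>
+  //   dawg2wordlist <unicharset> <in.dawg> <out.wordlist>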
+ void TestDawgRoundTrip(const std::string& unicharset_filename,
+ const std::string& wordlist_filename) const {
+ std::set<std::string> orig_words, roundtrip_words;
+ std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename);
+ std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename);
+ std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
+ std::string output_wordlist = OutputNameToPath(wordlist_filename);
+ LoadWordlist(orig_wordlist, &orig_words);
+ EXPECT_EQ(
+ RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);
+ EXPECT_EQ(
+ RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist),
+ 0);
+ LoadWordlist(output_wordlist, &roundtrip_words);
+ EXPECT_EQ(orig_words, roundtrip_words);
+ }
+};
+
+TEST_F(DawgTest, TestDawgConversion) {
+ TestDawgRoundTrip("eng.unicharset", "eng.wordlist.clean.freq");
+}
+
+TEST_F(DawgTest, TestMatching) {
+ UNICHARSET unicharset;
+ unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str());
+ tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM,
+ unicharset.size(), 0);
+ WERD_CHOICE space_apos(" '", unicharset);
+ trie.add_word_to_dawg(space_apos);
+
+ WERD_CHOICE space(" ", unicharset);
+
+  // With a partial match allowed, the prefix " " is found.
+  EXPECT_TRUE(trie.prefix_in_dawg(space, false));
+  // Requiring a complete match fails: " " alone is not a word in the dawg.
+ EXPECT_FALSE(trie.word_in_dawg(space));
+ EXPECT_FALSE(trie.prefix_in_dawg(space, true));
+
+ // partial or complete match ok for full word:
+ EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));
+ EXPECT_TRUE(trie.word_in_dawg(space_apos));
+ EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
+}
+
+} // namespace
diff --git a/tesseract/unittest/denorm_test.cc b/tesseract/unittest/denorm_test.cc
new file mode 100644
index 00000000..28328b15
--- /dev/null
+++ b/tesseract/unittest/denorm_test.cc
@@ -0,0 +1,99 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "blobs.h"
+#include "normalis.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class DENORMTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ public:
+ void TearDown() {}
+
+ void ExpectCorrectTransform(const DENORM& denorm, const TPOINT& src,
+ const TPOINT& result, bool local) {
+ TPOINT normed;
+ if (local)
+ denorm.LocalNormTransform(src, &normed);
+ else
+ denorm.NormTransform(nullptr, src, &normed);
+ EXPECT_EQ(result.x, normed.x);
+ EXPECT_EQ(result.y, normed.y);
+ // Now undo
+ TPOINT denormed;
+ if (local)
+ denorm.LocalDenormTransform(normed, &denormed);
+ else
+ denorm.DenormTransform(nullptr, normed, &denormed);
+ EXPECT_EQ(src.x, denormed.x);
+ EXPECT_EQ(src.y, denormed.y);
+ }
+};
+
+// Tests a simple baseline-style normalization.
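+// With no rotation, the expected points follow directly from
+//   normed = (src - origin) * scale + final_shift,
+// e.g. x: (1100 - 1000) * 2 = 200 and y: (2000 - 2000) * 3 + kBlnBaselineOffset.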
+TEST_F(DENORMTest, NoRotations) {
+ DENORM denorm;
+ denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f,
+ 0.0f, static_cast<float>(kBlnBaselineOffset));
+ TPOINT pt1(1100, 2000);
+ TPOINT result1(200, kBlnBaselineOffset);
+ ExpectCorrectTransform(denorm, pt1, result1, true);
+ ExpectCorrectTransform(denorm, pt1, result1, false);
+ TPOINT pt2(900, 2100);
+ TPOINT result2(-200, 300 + kBlnBaselineOffset);
+ ExpectCorrectTransform(denorm, pt2, result2, true);
+ ExpectCorrectTransform(denorm, pt2, result2, false);
+}
+
+// Tests a simple baseline-style normalization with a rotation.
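+// Here the origin-relative point is scaled, then rotated by 90 degrees
+// ((x, y) -> (-y, x)), then shifted, so (1100, 2000) maps via
+// (200, 0) -> (0, 200) -> (0, 200 + kBlnBaselineOffset).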
+TEST_F(DENORMTest, WithRotations) {
+ DENORM denorm;
+ FCOORD rotation90(0.0f, 1.0f);
+ denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f,
+ 3.0f, 0.0f, static_cast<float>(kBlnBaselineOffset));
+
+ TPOINT pt1(1100, 2000);
+ TPOINT result1(0, 200 + kBlnBaselineOffset);
+ ExpectCorrectTransform(denorm, pt1, result1, true);
+ ExpectCorrectTransform(denorm, pt1, result1, false);
+ TPOINT pt2(900, 2100);
+ TPOINT result2(-300, kBlnBaselineOffset - 200);
+ ExpectCorrectTransform(denorm, pt2, result2, true);
+ ExpectCorrectTransform(denorm, pt2, result2, false);
+}
+
+// Tests a simple baseline-style normalization with a second rotation & scale.
+TEST_F(DENORMTest, Multiple) {
+ DENORM denorm;
+ denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f,
+ 0.0f, static_cast<float>(kBlnBaselineOffset));
+
+ DENORM denorm2;
+ FCOORD rotation90(0.0f, 1.0f);
+ denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f,
+ 0.25f, 0.0f, 0.0f);
+ TPOINT pt1(1050, 2000);
+ TPOINT result1(100, kBlnBaselineOffset);
+ ExpectCorrectTransform(denorm, pt1, result1, true);
+ ExpectCorrectTransform(denorm, pt1, result1, false);
+ TPOINT result2(kBlnBaselineOffset / 4, -14);
+ ExpectCorrectTransform(denorm2, result1, result2, true);
+ ExpectCorrectTransform(denorm2, pt1, result2, false);
+}
+
+} // namespace.
diff --git a/tesseract/unittest/doubleptr.h b/tesseract/unittest/doubleptr.h
new file mode 100644
index 00000000..38628b5f
--- /dev/null
+++ b/tesseract/unittest/doubleptr.h
@@ -0,0 +1,93 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File: doubleptr.h
+// Description: Double-ended pointer that keeps pointing correctly even
+// when reallocated or copied.
+// Author: Ray Smith
+// Created: Wed Mar 14 12:22:57 PDT 2012
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_DOUBLEPTR_H_
+#define TESSERACT_CCUTIL_DOUBLEPTR_H_
+
+#include "errcode.h"
+
+namespace tesseract {
+
+// A smart pointer class that implements a double-ended pointer. Each end
+// points to the other end. The copy constructor and operator= have MOVE
+// semantics, meaning that the relationship with the other end moves to the
+// destination of the copy, leaving the source unattached.
+// Although the copy constructor and operator= are declared with const
+// reference arguments, they cast the const away and modify the source.
+// DoublePtr is useful to incorporate into structures that are part of a
+// collection such as GenericVector or STL containers, where reallocs can
+// relocate the members. DoublePtr is also useful in a GenericHeap, where it
+// can correctly maintain the pointer to an element of the heap despite it
+// getting moved around on the heap.
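+//
+// Example (illustrative only):
+//   DoublePtr a, b;
+//   a.Connect(&b);   // a.OtherEnd() == &b and b.OtherEnd() == &a.
+//   DoublePtr c(a);  // Move: c is now b's partner; a is unattached.
+//   // b.OtherEnd() == &c, a.OtherEnd() == nullptr.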
+class DoublePtr {
+ public:
+ DoublePtr() : other_end_(nullptr) {}
+  // Copy constructor steals the partner off src, leaving src unattached.
+  // src is passed by const reference, but its const-ness is cast away below.
+ DoublePtr(const DoublePtr& src) {
+ other_end_ = src.other_end_;
+ if (other_end_ != nullptr) {
+ other_end_->other_end_ = this;
+ ((DoublePtr&)src).other_end_ = nullptr;
+ }
+ }
+  // Operator= also steals the partner off src, leaving src unattached;
+  // as in the copy constructor, the const-ness of src is cast away.
+ void operator=(const DoublePtr& src) {
+ Disconnect();
+ other_end_ = src.other_end_;
+ if (other_end_ != nullptr) {
+ other_end_->other_end_ = this;
+ ((DoublePtr&)src).other_end_ = nullptr;
+ }
+ }
+
+ // Connects this and other, discarding any existing connections.
+ void Connect(DoublePtr* other) {
+ other->Disconnect();
+ Disconnect();
+ other->other_end_ = this;
+ other_end_ = other;
+ }
+ // Disconnects this and other, making OtherEnd() return nullptr for both.
+ void Disconnect() {
+ if (other_end_ != nullptr) {
+ other_end_->other_end_ = nullptr;
+ other_end_ = nullptr;
+ }
+ }
+ // Returns the pointer to the other end of the double pointer.
+ DoublePtr* OtherEnd() const {
+ return other_end_;
+ }
+
+ private:
+ // Pointer to the other end of the link. It is always true that either
+ // other_end_ == nullptr or other_end_->other_end_ == this.
+ DoublePtr* other_end_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CCUTIL_DOUBLEPTR_H_
diff --git a/tesseract/unittest/equationdetect_test.cc b/tesseract/unittest/equationdetect_test.cc
new file mode 100644
index 00000000..eb52231e
--- /dev/null
+++ b/tesseract/unittest/equationdetect_test.cc
@@ -0,0 +1,549 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+
+#include "colpartitiongrid.h"
+#include "equationdetect.h"
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#define ENABLE_IdentifySpecialText_TEST 0
+#if ENABLE_IdentifySpecialText_TEST
+#define EQU_TRAINEDDATA_NAME "equ"
+#else
+#define EQU_TRAINEDDATA_NAME "equINTENTIONALLY_MISSING_FILE"
+#endif
+
+namespace tesseract {
+
+class TestableEquationDetect : public EquationDetect {
+ public:
+ TestableEquationDetect(const char* tessdata, Tesseract* lang_tesseract)
+ : EquationDetect(tessdata, EQU_TRAINEDDATA_NAME) {
+ SetLangTesseract(lang_tesseract);
+ }
+
+  // Inserts the given numbers of math and digit blobs into part, padding
+  // with plain (BSTT_NONE) blobs up to total_blobs.
+ void AddMathDigitBlobs(const int math_blobs, const int digit_blobs,
+ const int total_blobs, ColPartition* part) {
+ CHECK(part != nullptr);
+ CHECK_LE(math_blobs + digit_blobs, total_blobs);
+ int count = 0;
+ for (int i = 0; i < math_blobs; i++, count++) {
+ BLOBNBOX* blob = new BLOBNBOX();
+ blob->set_special_text_type(BSTT_MATH);
+ part->AddBox(blob);
+ }
+ for (int i = 0; i < digit_blobs; i++, count++) {
+ BLOBNBOX* blob = new BLOBNBOX();
+ blob->set_special_text_type(BSTT_DIGIT);
+ part->AddBox(blob);
+ }
+ for (int i = count; i < total_blobs; i++) {
+ BLOBNBOX* blob = new BLOBNBOX();
+ blob->set_special_text_type(BSTT_NONE);
+ part->AddBox(blob);
+ }
+ }
+
+ // Set up pix_binary for lang_tesseract_.
+ void SetPixBinary(Pix* pix) {
+ CHECK_EQ(1, pixGetDepth(pix));
+ *(lang_tesseract_->mutable_pix_binary()) = pix;
+ }
+
+ void RunIdentifySpecialText(BLOBNBOX* blob, const int height_th) {
+ IdentifySpecialText(blob, height_th);
+ }
+
+ BlobSpecialTextType RunEstimateTypeForUnichar(const char* val) {
+ const UNICHARSET& unicharset = lang_tesseract_->unicharset;
+ return EstimateTypeForUnichar(unicharset, unicharset.unichar_to_id(val));
+ }
+
+ EquationDetect::IndentType RunIsIndented(ColPartitionGrid* part_grid,
+ ColPartition* part) {
+ this->part_grid_ = part_grid;
+ return IsIndented(part);
+ }
+
+ bool RunIsNearSmallNeighbor(const TBOX& seed_box, const TBOX& part_box) {
+ return IsNearSmallNeighbor(seed_box, part_box);
+ }
+
+ bool RunCheckSeedBlobsCount(ColPartition* part) {
+ return CheckSeedBlobsCount(part);
+ }
+
+ float RunComputeForegroundDensity(const TBOX& tbox) {
+ return ComputeForegroundDensity(tbox);
+ }
+
+ int RunCountAlignment(const GenericVector<int>& sorted_vec, const int val) {
+ return CountAlignment(sorted_vec, val);
+ }
+
+ void RunSplitCPHorLite(ColPartition* part,
+ GenericVector<TBOX>* splitted_boxes) {
+ SplitCPHorLite(part, splitted_boxes);
+ }
+
+ void RunSplitCPHor(ColPartition* part,
+ GenericVector<ColPartition*>* parts_splitted) {
+ SplitCPHor(part, parts_splitted);
+ }
+
+ void TestComputeCPsSuperBBox(const TBOX& box, ColPartitionGrid* part_grid) {
+ CHECK(part_grid != nullptr);
+ part_grid_ = part_grid;
+ ComputeCPsSuperBBox();
+ EXPECT_TRUE(*cps_super_bbox_ == box);
+ }
+};
+
+class EquationFinderTest : public testing::Test {
+ protected:
+ std::unique_ptr<TestableEquationDetect> equation_det_;
+ std::unique_ptr<Tesseract> tesseract_;
+
+  // The directory for testdata.
+ std::string testdata_dir_;
+
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ tesseract_.reset(new Tesseract());
+ tesseract_->init_tesseract(TESSDATA_DIR, "eng", OEM_TESSERACT_ONLY);
+ tesseract_->set_source_resolution(300);
+ equation_det_.reset(
+ new TestableEquationDetect(TESSDATA_DIR, tesseract_.get()));
+ equation_det_->SetResolution(300);
+
+ testdata_dir_ = TESTDATA_DIR;
+ }
+
+ void TearDown() {
+ tesseract_.reset(nullptr);
+ equation_det_.reset(nullptr);
+ }
+
+ // Add a BLOCK covering the whole page.
+ void AddPageBlock(Pix* pix, BLOCK_LIST* blocks) {
+ CHECK(pix != nullptr);
+ CHECK(blocks != nullptr);
+ BLOCK_IT block_it(blocks);
+ BLOCK* block =
+ new BLOCK("", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix));
+ block_it.add_to_end(block);
+ }
+
+ // Create col partitions, add into part_grid, and put them into all_parts.
+ void CreateColParts(const int rows, const int cols,
+ ColPartitionGrid* part_grid,
+ std::vector<ColPartition*>* all_parts) {
+ const int kWidth = 10, kHeight = 10;
+ ClearParts(all_parts);
+ for (int y = 0; y < rows; ++y) {
+ for (int x = 0; x < cols; ++x) {
+ int left = x * kWidth * 2, bottom = y * kHeight * 2;
+ TBOX box(left, bottom, left + kWidth, bottom + kHeight);
+ ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT,
+ BRT_TEXT, BTFT_NONE);
+ part_grid->InsertBBox(true, true, part);
+ all_parts->push_back(part);
+ }
+ }
+ }
+
+ void ClearParts(std::vector<ColPartition*>* all_parts) {
+ for (size_t i = 0; i < all_parts->size(); ++i) {
+ (*all_parts)[i]->DeleteBoxes();
+ delete ((*all_parts)[i]);
+ }
+ }
+
+ // Create a BLOBNBOX object with bounding box tbox, and add it into part.
+ void AddBlobIntoPart(const TBOX& tbox, ColPartition* part) {
+ CHECK(part != nullptr);
+ BLOBNBOX* blob = new BLOBNBOX();
+ blob->set_bounding_box(tbox);
+ part->AddBox(blob);
+ }
+};
+
+TEST_F(EquationFinderTest, IdentifySpecialText) {
+#if !ENABLE_IdentifySpecialText_TEST
+ GTEST_SKIP();
+#else // TODO: missing equ_gt1.tif
+ // Load Image.
+ std::string imagefile = file::JoinPath(testdata_dir_, "equ_gt1.tif");
+ Pix* pix_binary = pixRead(imagefile.c_str());
+ CHECK(pix_binary != nullptr && pixGetDepth(pix_binary) == 1);
+
+ // Get components.
+ BLOCK_LIST blocks;
+ TO_BLOCK_LIST to_blocks;
+ AddPageBlock(pix_binary, &blocks);
+ Textord* textord = tesseract_->mutable_textord();
+ textord->find_components(pix_binary, &blocks, &to_blocks);
+
+ // Identify special texts from to_blocks.
+ TO_BLOCK_IT to_block_it(&to_blocks);
+ std::map<int, int> stt_count;
+ for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list();
+ to_block_it.forward()) {
+ TO_BLOCK* to_block = to_block_it.data();
+ BLOBNBOX_IT blob_it(&(to_block->blobs));
+ for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+ BLOBNBOX* blob = blob_it.data();
+ // blob->set_special_text_type(BSTT_NONE);
+ equation_det_->RunIdentifySpecialText(blob, 0);
+ tensorflow::gtl::InsertIfNotPresent(&stt_count, blob->special_text_type(), 0);
+ stt_count[blob->special_text_type()]++;
+ }
+ }
+
+ // Verify the number, but allow a range of +/- kCountRange before squealing.
+ const int kCountRange = 3;
+ EXPECT_GE(39 + kCountRange, stt_count[BSTT_NONE]);
+ EXPECT_LE(39 - kCountRange, stt_count[BSTT_NONE]);
+
+ // if you count all the subscripts etc, there are ~45 italic chars.
+ EXPECT_GE(45 + kCountRange, stt_count[BSTT_ITALIC]);
+ EXPECT_LE(45 - kCountRange, stt_count[BSTT_ITALIC]);
+ EXPECT_GE(41 + kCountRange, stt_count[BSTT_DIGIT]);
+ EXPECT_LE(41 - kCountRange, stt_count[BSTT_DIGIT]);
+ EXPECT_GE(50 + kCountRange, stt_count[BSTT_MATH]);
+ EXPECT_LE(50 - kCountRange, stt_count[BSTT_MATH]);
+ EXPECT_GE(10 + kCountRange, stt_count[BSTT_UNCLEAR]);
+ EXPECT_LE(10 - kCountRange, stt_count[BSTT_UNCLEAR]);
+
+ // Release memory.
+ pixDestroy(&pix_binary);
+#endif
+}
+
+TEST_F(EquationFinderTest, EstimateTypeForUnichar) {
+ // Test abc characters.
+ EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("a"));
+ EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("c"));
+
+ // Test punctuation characters.
+ EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("'"));
+ EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar(","));
+
+ // Test digits.
+ EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("1"));
+ EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("4"));
+ EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("|"));
+
+ // Test math symbols.
+ EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar("("));
+ EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar("+"));
+}
+
+TEST_F(EquationFinderTest, IsIndented) {
+ ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000));
+
+ // Create five ColPartitions:
+ // part 1: ************
+ // part 2: *********
+ // part 3: *******
+ // part 4: *****
+ //
+ // part 5: ********
+ TBOX box1(0, 950, 999, 999);
+ ColPartition* part1 =
+ ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part_grid.InsertBBox(true, true, part1);
+ TBOX box2(300, 920, 900, 940);
+ ColPartition* part2 =
+ ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part_grid.InsertBBox(true, true, part2);
+ TBOX box3(0, 900, 600, 910);
+ ColPartition* part3 =
+ ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part_grid.InsertBBox(true, true, part3);
+ TBOX box4(300, 890, 600, 899);
+ ColPartition* part4 =
+ ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part_grid.InsertBBox(true, true, part4);
+ TBOX box5(300, 500, 900, 510);
+ ColPartition* part5 =
+ ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part_grid.InsertBBox(true, true, part5);
+
+ // Test
+ // part1 should be no indent.
+ EXPECT_EQ(EquationDetect::NO_INDENT,
+ equation_det_->RunIsIndented(&part_grid, part1));
+ // part2 should be left indent in terms of part1.
+ EXPECT_EQ(EquationDetect::LEFT_INDENT,
+ equation_det_->RunIsIndented(&part_grid, part2));
+ // part3 should be right indent.
+ EXPECT_EQ(EquationDetect::RIGHT_INDENT,
+ equation_det_->RunIsIndented(&part_grid, part3));
+ // part4 should be both indented.
+ EXPECT_EQ(EquationDetect::BOTH_INDENT,
+ equation_det_->RunIsIndented(&part_grid, part4));
+ // part5 should be no indent because it is too far from part1.
+ EXPECT_EQ(EquationDetect::NO_INDENT,
+ equation_det_->RunIsIndented(&part_grid, part5));
+
+ // Release memory.
+ part1->DeleteBoxes();
+ delete (part1);
+ part2->DeleteBoxes();
+ delete (part2);
+ part3->DeleteBoxes();
+ delete (part3);
+ part4->DeleteBoxes();
+ delete (part4);
+ part5->DeleteBoxes();
+ delete (part5);
+}
+
+TEST_F(EquationFinderTest, IsNearSmallNeighbor) {
+ // Create four tboxes:
+ // part 1, part 2
+ // ***** *****
+ // part 3: *****
+ //
+ // part 4: *****************
+ TBOX box1(0, 950, 499, 999);
+ TBOX box2(500, 950, 999, 998);
+ TBOX box3(0, 900, 499, 949);
+ TBOX box4(0, 550, 499, 590);
+
+ // Test
+ // box2 should be box1's near neighbor but not vice versa.
+ EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box2));
+ EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box1));
+  // box3 should be box1's near neighbor.
+  EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box3));
+  // box2 and box3 should not be near neighbors of each other.
+  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box3));
+  EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box2));
+
+ // box4 should not be the near neighbor of any one.
+ EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box1, box4));
+ EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box4));
+ EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box4));
+}
+
+TEST_F(EquationFinderTest, CheckSeedBlobsCount) {
+ TBOX box(0, 950, 999, 999);
+ ColPartition* part1 =
+ ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ ColPartition* part2 =
+ ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ ColPartition* part3 =
+ ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ ColPartition* part4 =
+ ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+
+ // Part 1: 8 math, 0 digit, 20 total.
+ equation_det_->AddMathDigitBlobs(8, 0, 20, part1);
+ EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part1));
+
+ // Part 2: 1 math, 8 digit, 20 total.
+ equation_det_->AddMathDigitBlobs(1, 8, 20, part2);
+ EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part2));
+
+  // Part 3: 3 math, 8 digit, 20 total.
+ equation_det_->AddMathDigitBlobs(3, 8, 20, part3);
+ EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part3));
+
+  // Part 4: 0 math, 0 digit, 8 total.
+ equation_det_->AddMathDigitBlobs(0, 0, 8, part4);
+ EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part4));
+
+ // Release memory.
+ part1->DeleteBoxes();
+ delete (part1);
+ part2->DeleteBoxes();
+ delete (part2);
+ part3->DeleteBoxes();
+ delete (part3);
+ part4->DeleteBoxes();
+ delete (part4);
+}
+
+TEST_F(EquationFinderTest, ComputeForegroundDensity) {
+ // Create the pix with top half foreground, bottom half background.
+ int width = 1024, height = 768;
+ Pix* pix = pixCreate(width, height, 1);
+ pixRasterop(pix, 0, 0, width, height / 2, PIX_SET, nullptr, 0, 0);
+ TBOX box1(100, 0, 140, 140), box2(100, height / 2 - 20, 140, height / 2 + 20),
+ box3(100, height - 40, 140, height);
+ equation_det_->SetPixBinary(pix);
+
+ // Verify
+ EXPECT_NEAR(0.0, equation_det_->RunComputeForegroundDensity(box1), 0.0001f);
+ EXPECT_NEAR(0.5, equation_det_->RunComputeForegroundDensity(box2), 0.0001f);
+ EXPECT_NEAR(1.0, equation_det_->RunComputeForegroundDensity(box3), 0.0001f);
+}
+
+TEST_F(EquationFinderTest, CountAlignment) {
+ GenericVector<int> vec;
+ vec.push_back(1);
+ vec.push_back(1);
+ vec.push_back(1);
+ vec.push_back(100);
+ vec.push_back(200);
+ vec.push_back(200);
+
+ // Test the right point.
+ EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 1));
+ EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 100));
+ EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 200));
+
+ // Test the near neighbors.
+ EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 3));
+ EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 99));
+ EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 202));
+
+ // Test the far neighbors.
+ EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 150));
+ EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 50));
+ EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 250));
+}
+
+TEST_F(EquationFinderTest, ComputeCPsSuperBBox) {
+ Pix* pix = pixCreate(1001, 1001, 1);
+ equation_det_->SetPixBinary(pix);
+ ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000));
+
+ TBOX box1(0, 0, 999, 99);
+ ColPartition* part1 =
+ ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ TBOX box2(0, 100, 499, 199);
+ ColPartition* part2 =
+ ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ TBOX box3(500, 100, 999, 199);
+ ColPartition* part3 =
+ ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ TBOX box4(0, 200, 999, 299);
+ ColPartition* part4 =
+ ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ TBOX box5(0, 900, 999, 999);
+ ColPartition* part5 =
+ ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+
+ // Add part1->part3 into part_grid and test.
+ part_grid.InsertBBox(true, true, part1);
+ part_grid.InsertBBox(true, true, part2);
+ part_grid.InsertBBox(true, true, part3);
+ TBOX super_box(0, 0, 999, 199);
+ equation_det_->TestComputeCPsSuperBBox(super_box, &part_grid);
+
+ // Add part4 and test.
+ part_grid.InsertBBox(true, true, part4);
+ TBOX super_box2(0, 0, 999, 299);
+ equation_det_->TestComputeCPsSuperBBox(super_box2, &part_grid);
+
+ // Add part5 and test.
+ part_grid.InsertBBox(true, true, part5);
+ TBOX super_box3(0, 0, 999, 999);
+ equation_det_->TestComputeCPsSuperBBox(super_box3, &part_grid);
+
+ // Release memory.
+ part1->DeleteBoxes();
+ delete (part1);
+ part2->DeleteBoxes();
+ delete (part2);
+ part3->DeleteBoxes();
+ delete (part3);
+ part4->DeleteBoxes();
+ delete (part4);
+ part5->DeleteBoxes();
+ delete (part5);
+}
+
+TEST_F(EquationFinderTest, SplitCPHorLite) {
+ TBOX box(0, 0, 999, 99);
+ ColPartition* part =
+ ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part->DeleteBoxes();
+ part->set_median_width(10);
+ GenericVector<TBOX> splitted_boxes;
+
+ // Test an empty part.
+ equation_det_->RunSplitCPHorLite(part, &splitted_boxes);
+ EXPECT_TRUE(splitted_boxes.empty());
+
+ // Test with one blob.
+ AddBlobIntoPart(TBOX(0, 0, 10, 50), part);
+ equation_det_->RunSplitCPHorLite(part, &splitted_boxes);
+ EXPECT_EQ(1, splitted_boxes.size());
+ EXPECT_TRUE(TBOX(0, 0, 10, 50) == splitted_boxes[0]);
+
+  // Add more blobs and test.
+ AddBlobIntoPart(TBOX(11, 0, 20, 60), part);
+ AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point.
+ AddBlobIntoPart(TBOX(100, 0, 110, 15), part);
+ AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point.
+ AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point.
+ equation_det_->RunSplitCPHorLite(part, &splitted_boxes);
+ // Verify.
+ EXPECT_EQ(3, splitted_boxes.size());
+ EXPECT_TRUE(TBOX(0, 0, 30, 60) == splitted_boxes[0]);
+ EXPECT_TRUE(TBOX(100, 0, 140, 45) == splitted_boxes[1]);
+ EXPECT_TRUE(TBOX(500, 0, 540, 35) == splitted_boxes[2]);
+
+ part->DeleteBoxes();
+ delete (part);
+}
+
+TEST_F(EquationFinderTest, SplitCPHor) {
+ TBOX box(0, 0, 999, 99);
+ ColPartition* part =
+ ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part->DeleteBoxes();
+ part->set_median_width(10);
+ GenericVector<ColPartition*> parts_splitted;
+
+ // Test an empty part.
+ equation_det_->RunSplitCPHor(part, &parts_splitted);
+ EXPECT_TRUE(parts_splitted.empty());
+ // Test with one blob.
+ AddBlobIntoPart(TBOX(0, 0, 10, 50), part);
+
+ equation_det_->RunSplitCPHor(part, &parts_splitted);
+ EXPECT_EQ(1, parts_splitted.size());
+ EXPECT_TRUE(TBOX(0, 0, 10, 50) == parts_splitted[0]->bounding_box());
+
+  // Add more blobs and test.
+ AddBlobIntoPart(TBOX(11, 0, 20, 60), part);
+ AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point.
+ AddBlobIntoPart(TBOX(100, 0, 110, 15), part);
+ AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point.
+ AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point.
+ equation_det_->RunSplitCPHor(part, &parts_splitted);
+
+ // Verify.
+ EXPECT_EQ(3, parts_splitted.size());
+ EXPECT_TRUE(TBOX(0, 0, 30, 60) == parts_splitted[0]->bounding_box());
+ EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box());
+ EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box());
+
+ parts_splitted.delete_data_pointers();
+ part->DeleteBoxes();
+ delete (part);
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/fileio_test.cc b/tesseract/unittest/fileio_test.cc
new file mode 100644
index 00000000..00488918
--- /dev/null
+++ b/tesseract/unittest/fileio_test.cc
@@ -0,0 +1,66 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <stdio.h>
+#include <memory>
+
+#include "absl/strings/str_split.h"
+
+#include "fileio.h"
+#include "include_gunit.h"
+
+namespace tesseract {
+
+TEST(FileTest, JoinPath) {
+ EXPECT_EQ("/abc/def", File::JoinPath("/abc", "def"));
+ EXPECT_EQ("/abc/def", File::JoinPath("/abc/", "def"));
+ EXPECT_EQ("def", File::JoinPath("", "def"));
+}
+
+TEST(OutputBufferTest, WriteString) {
+ const int kMaxBufSize = 128;
+ char buffer[kMaxBufSize];
+ for (int i = 0; i < kMaxBufSize; ++i) buffer[i] = '\0';
+ FILE* fp = tmpfile();
+ CHECK(fp != nullptr);
+
+ std::unique_ptr<OutputBuffer> output(new OutputBuffer(fp));
+ output->WriteString("Hello ");
+ output->WriteString("world!");
+
+ rewind(fp);
+ auto s = "Hello world!";
+ fread(buffer, strlen(s), 1, fp);
+ EXPECT_STREQ(s, buffer);
+}
+
+TEST(InputBufferTest, Read) {
+ const int kMaxBufSize = 128;
+ char buffer[kMaxBufSize];
+ auto s = "Hello\n world!";
+ strncpy(buffer, s, kMaxBufSize);
+ EXPECT_STREQ(s, buffer);
+ FILE* fp = tmpfile();
+ CHECK(fp != nullptr);
+ fwrite(buffer, strlen(s), 1, fp);
+ rewind(fp);
+
+ std::string str;
+ std::unique_ptr<InputBuffer> input(new InputBuffer(fp));
+ EXPECT_TRUE(input->Read(&str));
+ std::vector<std::string> lines = absl::StrSplit(str, '\n', absl::SkipEmpty());
+ EXPECT_EQ(2, lines.size());
+ EXPECT_EQ("Hello", lines[0]);
+ EXPECT_EQ(" world!", lines[1]);
+}
+
+} // namespace
diff --git a/tesseract/unittest/fuzzers/fuzzer-api.cpp b/tesseract/unittest/fuzzers/fuzzer-api.cpp
new file mode 100644
index 00000000..a1e4e7c4
--- /dev/null
+++ b/tesseract/unittest/fuzzers/fuzzer-api.cpp
@@ -0,0 +1,101 @@
+#include <tesseract/baseapi.h>
+#include <allheaders.h>
+
+#include <libgen.h> // for dirname
+#include <cstdio> // for printf
+#include <cstdlib> // for std::getenv, std::setenv
+#include <string> // for std::string
+
+#ifndef TESSERACT_FUZZER_WIDTH
+#define TESSERACT_FUZZER_WIDTH 100
+#endif
+
+#ifndef TESSERACT_FUZZER_HEIGHT
+#define TESSERACT_FUZZER_HEIGHT 100
+#endif
+
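+// Turns the fuzzer input into a stream of single bits (least-significant bit
+// first), returning 0 once the input bytes are exhausted.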
+class BitReader {
+ private:
+ uint8_t const* data;
+ size_t size;
+ size_t shift;
+
+ public:
+ BitReader(const uint8_t* data, size_t size)
+ : data(data), size(size), shift(0) {}
+
+ int Read(void) {
+ if (size == 0) {
+ return 0;
+ }
+
+ const int ret = ((*data) >> shift) & 1;
+
+ shift++;
+ if (shift >= 8) {
+ shift = 0;
+ data++;
+ size--;
+ }
+
+ return ret;
+ }
+};
+
+static tesseract::TessBaseAPI* api = nullptr;
+
+extern "C" int LLVMFuzzerInitialize(int* /*pArgc*/, char*** pArgv) {
+ if (std::getenv("TESSDATA_PREFIX") == nullptr) {
+ std::string binary_path = *pArgv[0];
+ const std::string filepath = dirname(&binary_path[0]);
+
+ const std::string tessdata_path = filepath + "/" + "tessdata";
+ if (setenv("TESSDATA_PREFIX", tessdata_path.c_str(), 1) != 0) {
+ printf("Setenv failed\n");
+ std::abort();
+ }
+ }
+
+ api = new tesseract::TessBaseAPI();
+ if (api->Init(nullptr, "eng") != 0) {
+ printf("Cannot initialize API\n");
+ abort();
+ }
+
+ /* Silence output */
+ api->SetVariable("debug_file", "/dev/null");
+
+ return 0;
+}
+
+static PIX* createPix(BitReader& BR, const size_t width, const size_t height) {
+ Pix* pix = pixCreate(width, height, 1);
+
+ if (pix == nullptr) {
+ printf("pix creation failed\n");
+ abort();
+ }
+
+ for (size_t i = 0; i < width; i++) {
+ for (size_t j = 0; j < height; j++) {
+ pixSetPixel(pix, i, j, BR.Read());
+ }
+ }
+
+ return pix;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ BitReader BR(data, size);
+
+ auto pix = createPix(BR, TESSERACT_FUZZER_WIDTH, TESSERACT_FUZZER_HEIGHT);
+
+ api->SetImage(pix);
+
+ char* outText = api->GetUTF8Text();
+
+ pixDestroy(&pix);
+ delete[] outText;
+
+ return 0;
+}
diff --git a/tesseract/unittest/fuzzers/oss-fuzz-build.sh b/tesseract/unittest/fuzzers/oss-fuzz-build.sh
new file mode 100755
index 00000000..d10f2d80
--- /dev/null
+++ b/tesseract/unittest/fuzzers/oss-fuzz-build.sh
@@ -0,0 +1,59 @@
+#!/bin/bash -eu
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+cd $SRC/leptonica
+./autogen.sh
+./configure --disable-shared
+make SUBDIRS=src install -j$(nproc)
+ldconfig
+
+cd $SRC/tesseract
+./autogen.sh
+CXXFLAGS="$CXXFLAGS -D_GLIBCXX_DEBUG" ./configure --disable-graphics --disable-shared
+make -j$(nproc)
+
+cp -R $SRC/tessdata $OUT
+
+$CXX $CXXFLAGS \
+ -I $SRC/tesseract/include \
+ -I/usr/local/include/leptonica \
+ $SRC/tesseract/unittest/fuzzers/fuzzer-api.cpp -o $OUT/fuzzer-api \
+ $SRC/tesseract/.libs/libtesseract.a \
+ /usr/local/lib/liblept.a \
+ /usr/lib/x86_64-linux-gnu/libtiff.a \
+ /usr/lib/x86_64-linux-gnu/libpng.a \
+ /usr/lib/x86_64-linux-gnu/libjpeg.a \
+ /usr/lib/x86_64-linux-gnu/libjbig.a \
+ /usr/lib/x86_64-linux-gnu/liblzma.a \
+ -lz \
+ $LIB_FUZZING_ENGINE
+
+$CXX $CXXFLAGS \
+ -DTESSERACT_FUZZER_WIDTH=512 \
+ -DTESSERACT_FUZZER_HEIGHT=256 \
+ -I $SRC/tesseract/include \
+ -I/usr/local/include/leptonica \
+ $SRC/tesseract/unittest/fuzzers/fuzzer-api.cpp -o $OUT/fuzzer-api-512x256 \
+ $SRC/tesseract/.libs/libtesseract.a \
+ /usr/local/lib/liblept.a \
+ /usr/lib/x86_64-linux-gnu/libtiff.a \
+ /usr/lib/x86_64-linux-gnu/libpng.a \
+ /usr/lib/x86_64-linux-gnu/libjpeg.a \
+ /usr/lib/x86_64-linux-gnu/libjbig.a \
+ /usr/lib/x86_64-linux-gnu/liblzma.a \
+ -lz \
+ $LIB_FUZZING_ENGINE
diff --git a/tesseract/unittest/heap_test.cc b/tesseract/unittest/heap_test.cc
new file mode 100644
index 00000000..c2754181
--- /dev/null
+++ b/tesseract/unittest/heap_test.cc
@@ -0,0 +1,202 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include_gunit.h"
+
+#include "doubleptr.h"
+#include "genericheap.h"
+#include "genericvector.h"
+#include "kdpair.h"
+
+#include <string>
+#include <utility>
+
+namespace tesseract {
+
+int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0};
+
+// The fixture for testing GenericHeap and DoublePtr.
+class HeapTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ public:
+ virtual ~HeapTest();
+ // Pushes the test data onto both the heap and the KDVector.
+ void PushTestData(GenericHeap<IntKDPair>* heap, KDVector* v) {
+ for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
+ IntKDPair pair(test_data[i], i);
+ heap->Push(&pair);
+ v->push_back(pair);
+ }
+ }
+ // Verifies that the data in the heap matches the vector (after sorting) by
+ // popping everything off the heap.
+ void VerifyHeapVectorMatch(GenericHeap<IntKDPair>* heap, KDVector* v) {
+ EXPECT_FALSE(heap->empty());
+ EXPECT_EQ(heap->size(), v->size());
+ // Sort the vector and check that the keys come out of the heap in the same
+ // order as v.
+ // Also check that the indices match, except for 9, which is duplicated.
+ v->sort();
+ // Check that we have increasing order.
+ EXPECT_LT((*v)[0].key(), v->back().key());
+ for (int i = 0; i < v->size(); ++i) {
+ EXPECT_EQ((*v)[i].key(), heap->PeekTop().key());
+ // Indices don't necessarily match for equal keys, so don't test them.
+ if (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) {
+ while (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) {
+ heap->Pop(nullptr);
+ ++i;
+ EXPECT_FALSE(heap->empty());
+ EXPECT_EQ((*v)[i].key(), heap->PeekTop().key());
+ }
+ } else {
+ // The indices must also match if the key is unique.
+ EXPECT_EQ((*v)[i].data(), heap->PeekTop().data());
+ }
+ EXPECT_FALSE(heap->empty());
+ EXPECT_TRUE(heap->Pop(nullptr));
+ }
+ EXPECT_TRUE(heap->empty());
+ }
+};
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of a weak vtable (fixes compiler warning).
+HeapTest::~HeapTest() = default;
+
+// Tests that a sort using a GenericHeap matches the result of a sort using
+// a KDVector.
+TEST_F(HeapTest, SortTest) {
+ GenericHeap<IntKDPair> heap;
+ EXPECT_TRUE(heap.empty());
+ KDVector v;
+ EXPECT_EQ(heap.size(), v.size());
+ // Push the test data onto both the heap and the KDVector.
+ PushTestData(&heap, &v);
+ VerifyHeapVectorMatch(&heap, &v);
+}
+
+// Tests that pushing some stuff, popping some stuff, and then pushing more
+// stuff results in output that matches the sort using a KDVector.
+TEST_F(HeapTest, MixedTest) {
+ GenericHeap<IntKDPair> heap;
+ KDVector v;
+ // Push the test data onto both the heap and the KDVector.
+ PushTestData(&heap, &v);
+ // Sort the vector and remove the first 5 values from both heap and v.
+ v.sort();
+ for (int i = 0; i < 5; ++i) {
+ heap.Pop(nullptr);
+ v.remove(0);
+ }
+ // Push the test data onto both the heap and the KDVector.
+ PushTestData(&heap, &v);
+ // Heap and vector should still match!
+ VerifyHeapVectorMatch(&heap, &v);
+}
+
+// Tests that PopWorst still leaves the heap in a state such that it still
+// matches a sorted KDVector.
+TEST_F(HeapTest, PopWorstTest) {
+ GenericHeap<IntKDPair> heap;
+ KDVector v;
+ // Push the test data onto both the heap and the KDVector.
+ PushTestData(&heap, &v);
+ // Get the worst element off the heap.
+ IntKDPair pair;
+ heap.PopWorst(&pair);
+ EXPECT_EQ(pair.key(), 65536);
+ EXPECT_EQ(pair.data(), 6);
+ // Sort and remove the worst element from the vector.
+ v.sort();
+ v.truncate(v.size() - 1);
+ // After that they should still match!
+ VerifyHeapVectorMatch(&heap, &v);
+}
+
+// Tests that Reshuffle works and the heap still matches a KDVector with the
+// same value changed. Doubles up as a test of DoublePtr.
+TEST_F(HeapTest, RevalueTest) {
+ // Here the data element of the pair is a DoublePtr, which links the entries
+ // in the vector and heap, and we test a MAX heap.
+ typedef KDPairDec<int, DoublePtr> PtrPair;
+ GenericHeap<PtrPair> heap;
+ GenericVector<PtrPair> v;
+ // Push the test data onto both the heap and the vector.
+ for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
+ PtrPair h_pair;
+ h_pair.key() = test_data[i];
+ PtrPair v_pair;
+ v_pair.key() = test_data[i];
+ h_pair.data().Connect(&v_pair.data());
+ heap.Push(&h_pair);
+ v.push_back(v_pair);
+ }
+ // Test changes both ways. Index 0 is 8, so change it to -1.
+ v[0].key() = -1;
+  // v[0].data().OtherEnd() is a pointer to the data element in the appropriate
+ // heap entry, wherever it may be. We can change its value via that pointer.
+ // Without Reshuffle, that would be a terribly bad thing to do, as it violates
+ // the heap invariant, making the heap corrupt.
+ PtrPair* pair_ptr = reinterpret_cast<PtrPair*>(v[0].data().OtherEnd());
+ pair_ptr->key() = v[0].key();
+ heap.Reshuffle(pair_ptr);
+ // Index 1 is 1. Change to 32767.
+ v[1].key() = 32767;
+ pair_ptr = reinterpret_cast<PtrPair*>(v[1].data().OtherEnd());
+ pair_ptr->key() = v[1].key();
+ heap.Reshuffle(pair_ptr);
+ // After the changes, popping the heap should still match the sorted order
+ // of the vector.
+ v.sort();
+ EXPECT_GT(v[0].key(), v.back().key());
+ for (int i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i].key(), heap.PeekTop().key());
+ EXPECT_FALSE(heap.empty());
+ heap.Pop(nullptr);
+ }
+ EXPECT_TRUE(heap.empty());
+}
+
+#if 0
+// Helper checks that the compiler rejects use of a copy constructor with
+// a const argument and the default copy constructor is properly hidden by
+// the non-const version.
+static void ConstRefTest(const DoublePtr& ptr1) {
+ DoublePtr ptr2(ptr1); // Compiler error here.
+ EXPECT_EQ(&ptr2, ptr2.OtherEnd()->OtherEnd());
+ EXPECT_TRUE(ptr1.OtherEnd() == nullptr);
+}
+#endif
+
+// Tests that DoublePtr works as expected.
+TEST_F(HeapTest, DoublePtrTest) {
+ DoublePtr ptr1;
+ DoublePtr ptr2;
+ ptr1.Connect(&ptr2);
+ // Check that the correct copy constructor is used.
+ DoublePtr ptr3(ptr1);
+ EXPECT_EQ(&ptr3, ptr3.OtherEnd()->OtherEnd());
+ EXPECT_TRUE(ptr1.OtherEnd() == nullptr);
+ // Check that the correct operator= is used.
+ ptr1 = ptr3;
+ EXPECT_EQ(&ptr1, ptr1.OtherEnd()->OtherEnd());
+ EXPECT_TRUE(ptr3.OtherEnd() == nullptr);
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/imagedata_test.cc b/tesseract/unittest/imagedata_test.cc
new file mode 100644
index 00000000..31bd2f24
--- /dev/null
+++ b/tesseract/unittest/imagedata_test.cc
@@ -0,0 +1,131 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+
+#include "imagedata.h"
+#include "include_gunit.h"
+#include "log.h"
+
+namespace tesseract {
+
+// Tests the caching mechanism of DocumentData/ImageData.
+
+class ImagedataTest : public ::testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ ImagedataTest() {}
+
+ // Creates a fake DocumentData, writes it to a file, and returns the filename.
+ std::string MakeFakeDoc(int num_pages, unsigned doc_id,
+ std::vector<std::string>* page_texts) {
+ // The size of the fake images that we will use.
+ const int kImageSize = 1048576;
+ // Not using a real image here - just an array of zeros! We are just testing
+ // that the truth text matches.
+ std::vector<char> fake_image(kImageSize, 0);
+ DocumentData write_doc("My document");
+ for (int p = 0; p < num_pages; ++p) {
+ // Make some fake text that is different for each page and save it.
+ page_texts->push_back(
+ absl::StrFormat("Page %d of %d in doc %u", p, num_pages, doc_id));
+ // Make an imagedata and put it in the document.
+ ImageData* imagedata =
+ ImageData::Build("noname", p, "eng", fake_image.data(),
+ fake_image.size(), (*page_texts)[p].c_str(), nullptr);
+ EXPECT_EQ(kImageSize, imagedata->MemoryUsed());
+ write_doc.AddPageToDocument(imagedata);
+ }
+ // Write it to a file.
+ std::string filename = file::JoinPath(
+ FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf"));
+ EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr));
+ return filename;
+ }
+};
+
+TEST_F(ImagedataTest, CachesProperly) {
+ // This test verifies that Imagedata can be stored in a DocumentData and a
+ // collection of them is cached correctly given limited memory.
+ // Number of pages to put in the fake document.
+ const int kNumPages = 12;
+ // Allowances to read the document. Big enough for 1, 3, 0, all pages.
+ const int kMemoryAllowances[] = {2000000, 4000000, 1000000, 100000000, 0};
+ // Order in which to read the pages, with some sequential and some seeks.
+ const int kPageReadOrder[] = {0, 1, 2, 3, 8, 4, 5, 6, 7, 11, 10, 9, -1};
+
+ std::vector<std::string> page_texts;
+ std::string filename = MakeFakeDoc(kNumPages, 0, &page_texts);
+ // Now try getting it back with different memory allowances and check that
+ // the pages can still be read.
+ for (int m = 0; kMemoryAllowances[m] > 0; ++m) {
+ DocumentData read_doc("My document");
+ EXPECT_TRUE(
+ read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr));
+ LOG(ERROR) << "Allowance = " << kMemoryAllowances[m];
+ // Read the pages in a specific order.
+ for (int p = 0; kPageReadOrder[p] >= 0; ++p) {
+ int page = kPageReadOrder[p];
+ const ImageData* imagedata = read_doc.GetPage(page);
+ EXPECT_NE(nullptr, imagedata);
+ //EXPECT_NE(reinterpret_cast<ImageData*>(nullptr), imagedata);
+ // Check that this is the right page.
+ EXPECT_STREQ(page_texts[page].c_str(),
+ imagedata->transcription().c_str());
+ }
+ }
+}
+
+TEST_F(ImagedataTest, CachesMultiDocs) {
+ // This test verifies that DocumentCache works to store multiple DocumentData
+ // and the two caching strategies read images in the right order.
+ // Number of pages in each document.
+ const std::vector<int> kNumPages = {6, 5, 7};
+ std::vector<std::vector<std::string>> page_texts;
+ std::vector<STRING> filenames;
+ for (size_t d = 0; d < kNumPages.size(); ++d) {
+ page_texts.emplace_back(std::vector<std::string>());
+ std::string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back());
+ filenames.push_back(STRING(filename.c_str()));
+ }
+ // Now try getting them back with different cache strategies and check that
+ // the pages come out in the right order.
+ DocumentCache robin_cache(8000000);
+ robin_cache.LoadDocuments(filenames, tesseract::CS_ROUND_ROBIN, nullptr);
+ DocumentCache serial_cache(8000000);
+ serial_cache.LoadDocuments(filenames, tesseract::CS_SEQUENTIAL, nullptr);
+ for (int p = 0; p <= 21; ++p) {
+ LOG(INFO) << "Page " << p;
+ const ImageData* robin_data = robin_cache.GetPageBySerial(p);
+ const ImageData* serial_data = serial_cache.GetPageBySerial(p);
+ CHECK(robin_data != nullptr);
+ CHECK(serial_data != nullptr);
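+    // With round-robin loading, serial page p should come from document
+    // p % num_docs; the sequential cache is expected to walk the documents
+    // in order.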
+ int robin_doc = p % kNumPages.size();
+ int robin_page = p / kNumPages.size() % kNumPages[robin_doc];
+ // Check that this is the right page.
+ EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(),
+ robin_data->transcription().c_str());
+ int serial_doc = p / kNumPages[0] % kNumPages.size();
+ int serial_page = p % kNumPages[0] % kNumPages[serial_doc];
+ EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(),
+ serial_data->transcription().c_str());
+ }
+}
+
+} // namespace.
diff --git a/tesseract/unittest/include_gunit.h b/tesseract/unittest/include_gunit.h
new file mode 100644
index 00000000..568326cb
--- /dev/null
+++ b/tesseract/unittest/include_gunit.h
@@ -0,0 +1,76 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// Portability include to match the Google test environment.
+
+#ifndef TESSERACT_UNITTEST_INCLUDE_GUNIT_H_
+#define TESSERACT_UNITTEST_INCLUDE_GUNIT_H_
+
+#include "errcode.h" // for ASSERT_HOST
+#include "fileio.h" // for tesseract::File
+#include "log.h" // for LOG
+#include "gtest/gtest.h"
+
+const char* FLAGS_test_tmpdir = "./tmp";
+
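+// Minimal stand-in for Google's internal file:: helpers, providing only what
+// these unit tests need (temp-dir creation, whole-file reads/writes, JoinPath).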
+class file : public tesseract::File {
+public:
+
+ static void MakeTmpdir() {
+#if defined(_WIN32)
+ _mkdir(FLAGS_test_tmpdir);
+#else
+ mkdir(FLAGS_test_tmpdir, S_IRWXU | S_IRWXG);
+#endif
+ }
+
+  // Create a file and write a string to it.
+ static bool WriteStringToFile(const std::string& contents, const std::string& filename) {
+ File::WriteStringToFileOrDie(contents, filename);
+ return true;
+ }
+
+ static bool GetContents(const std::string& filename, std::string* out, int) {
+ return File::ReadFileToString(filename, out);
+ }
+
+ static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) {
+ return WriteStringToFile(contents, name);
+ }
+
+ static int Defaults() {
+ return 0;
+ }
+
+ static std::string JoinPath(const std::string& s1, const std::string& s2) {
+ return tesseract::File::JoinPath(s1, s2);
+ }
+
+ static std::string JoinPath(const std::string& s1, const std::string& s2,
+ const std::string& s3) {
+ return JoinPath(JoinPath(s1, s2), s3);
+ }
+};
+
+#define ARRAYSIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+
+// /usr/include/tensorflow/core/platform/default/logging.h defines the CHECK* macros.
+#if !defined(CHECK)
+#define CHECK(condition) \
+ if (!(condition)) \
+ LOG(FATAL) << "Check failed: " #condition " "
+#define CHECK_EQ(test, value) CHECK((test) == (value))
+#define CHECK_GT(test, value) CHECK((test) > (value))
+#define CHECK_LT(test, value) CHECK((test) < (value))
+#define CHECK_LE(test, value) CHECK((test) <= (value))
+#define CHECK_OK(test) CHECK(test)
+#endif
+
+#endif // TESSERACT_UNITTEST_INCLUDE_GUNIT_H_
diff --git a/tesseract/unittest/indexmapbidi_test.cc b/tesseract/unittest/indexmapbidi_test.cc
new file mode 100644
index 00000000..bdd3c895
--- /dev/null
+++ b/tesseract/unittest/indexmapbidi_test.cc
@@ -0,0 +1,117 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+
+#include "indexmapbidi.h"
+
+#include "include_gunit.h"
+
+const int kPrimeLimit = 1000;
+
+namespace tesseract {
+
+class IndexMapBiDiTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ public:
+ std::string OutputNameToPath(const std::string& name) {
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+ }
+ // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes.
+ void ComputePrimes(IndexMapBiDi* map) {
+ map->Init(kPrimeLimit + 1, false);
+ map->SetMap(2, true);
+ // Set all the odds to true.
+ for (int i = 3; i <= kPrimeLimit; i += 2) map->SetMap(i, true);
+ int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit));
+ for (int f = 3; f <= factor_limit; f += 2) {
+ if (map->SparseToCompact(f) >= 0) {
+ for (int m = 2; m * f <= kPrimeLimit; ++m) map->SetMap(f * m, false);
+ }
+ }
+ map->Setup();
+ }
+
+ void TestPrimes(const IndexMap& map) {
+ // Now all primes are mapped in the sparse map to their index.
+ // According to Wikipedia, the 168th prime is 997, and it has compact
+ // index 167 because we are indexing from 0.
+ EXPECT_EQ(167, map.SparseToCompact(997));
+ EXPECT_EQ(997, map.CompactToSparse(167));
+ // 995, 996, 998, 999 are not prime.
+ EXPECT_EQ(-1, map.SparseToCompact(995));
+ EXPECT_EQ(-1, map.SparseToCompact(996));
+ EXPECT_EQ(-1, map.SparseToCompact(998));
+ EXPECT_EQ(-1, map.SparseToCompact(999));
+ // The 167th prime is 991.
+ EXPECT_EQ(991, map.CompactToSparse(166));
+ // There are 168 primes in 0..1000.
+ EXPECT_EQ(168, map.CompactSize());
+ EXPECT_EQ(kPrimeLimit + 1, map.SparseSize());
+ }
+};
+
+// Tests the sieve of Eratosthenes as a way of testing setup.
+TEST_F(IndexMapBiDiTest, Primes) {
+ IndexMapBiDi map;
+ ComputePrimes(&map);
+ TestPrimes(map);
+ // It still works if we assign it to another.
+ IndexMapBiDi map2;
+ map2.CopyFrom(map);
+ TestPrimes(map2);
+ // Or if we assign it to a base class.
+ IndexMap base_map;
+ base_map.CopyFrom(map);
+ TestPrimes(base_map);
+ // Test file i/o too.
+ std::string filename = OutputNameToPath("primesmap");
+ FILE* fp = fopen(filename.c_str(), "wb");
+ CHECK(fp != nullptr);
+ EXPECT_TRUE(map.Serialize(fp));
+ fclose(fp);
+ fp = fopen(filename.c_str(), "rb");
+ CHECK(fp != nullptr);
+ IndexMapBiDi read_map;
+ EXPECT_TRUE(read_map.DeSerialize(false, fp));
+ fclose(fp);
+ TestPrimes(read_map);
+}
+
+// Tests the many-to-one setup feature.
+TEST_F(IndexMapBiDiTest, ManyToOne) {
+ // Test the example in the comment on CompleteMerges.
+ IndexMapBiDi map;
+ map.Init(13, false);
+ map.SetMap(2, true);
+ map.SetMap(4, true);
+ map.SetMap(7, true);
+ map.SetMap(9, true);
+ map.SetMap(11, true);
+ map.Setup();
+ map.Merge(map.SparseToCompact(2), map.SparseToCompact(9));
+ map.Merge(map.SparseToCompact(4), map.SparseToCompact(11));
+ map.CompleteMerges();
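+ // The merges should collapse the compact space to three groups, presumably
+ // {2, 9} -> 0, {4, 11} -> 1 and {7} -> 2, as partially checked below.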
+ EXPECT_EQ(3, map.CompactSize());
+ EXPECT_EQ(13, map.SparseSize());
+ EXPECT_EQ(1, map.SparseToCompact(4));
+ EXPECT_EQ(4, map.CompactToSparse(1));
+ EXPECT_EQ(1, map.SparseToCompact(11));
+}
+
+} // namespace.
diff --git a/tesseract/unittest/intfeaturemap_test.cc b/tesseract/unittest/intfeaturemap_test.cc
new file mode 100644
index 00000000..e95aa0c3
--- /dev/null
+++ b/tesseract/unittest/intfeaturemap_test.cc
@@ -0,0 +1,129 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "intfeaturemap.h"
+#include "intfeaturespace.h"
+
+#include "include_gunit.h"
+
+// Deliberately awkward bucket counts for the re-quantization, to test that
+// they don't have to divide the feature space evenly.
+// WARNING! Change these and change the expected_misses calculation below.
+const int kXBuckets = 16;
+const int kYBuckets = 24;
+const int kThetaBuckets = 13;
+
+namespace tesseract {
+
+class IntFeatureMapTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ public:
+ // Expects that the given vector has contiguous integer values in the
+ // range [start, end).
+ void ExpectContiguous(const GenericVector<int>& v, int start, int end) {
+ for (int i = start; i < end; ++i) {
+ EXPECT_EQ(i, v[i - start]);
+ }
+ }
+};
+
+// Tests the IntFeatureMap and implicitly the IntFeatureSpace underneath.
+TEST_F(IntFeatureMapTest, Exhaustive) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because IntFeatureSpace is missing.
+ GTEST_SKIP();
+#else
+ IntFeatureSpace space;
+ space.Init(kXBuckets, kYBuckets, kThetaBuckets);
+ IntFeatureMap map;
+ map.Init(space);
+ int total_size = kIntFeatureExtent * kIntFeatureExtent * kIntFeatureExtent;
+ std::unique_ptr<INT_FEATURE_STRUCT[]> features(
+ new INT_FEATURE_STRUCT[total_size]);
+ // Fill the features with every value.
+ for (int y = 0; y < kIntFeatureExtent; ++y) {
+ for (int x = 0; x < kIntFeatureExtent; ++x) {
+ for (int theta = 0; theta < kIntFeatureExtent; ++theta) {
+ int f_index = (y * kIntFeatureExtent + x) * kIntFeatureExtent + theta;
+ features[f_index].X = x;
+ features[f_index].Y = y;
+ features[f_index].Theta = theta;
+ }
+ }
+ }
+ GenericVector<int> index_features;
+ map.IndexAndSortFeatures(features.get(), total_size, &index_features);
+ EXPECT_EQ(total_size, index_features.size());
+ int total_buckets = kXBuckets * kYBuckets * kThetaBuckets;
+ GenericVector<int> map_features;
+ int misses = map.MapIndexedFeatures(index_features, &map_features);
+ EXPECT_EQ(0, misses);
+ EXPECT_EQ(total_buckets, map_features.size());
+ ExpectContiguous(map_features, 0, total_buckets);
+ EXPECT_EQ(total_buckets, map.compact_size());
+ EXPECT_EQ(total_buckets, map.sparse_size());
+
+ // Every offset should be within dx, dy, dtheta of the start point.
+ int dx = kIntFeatureExtent / kXBuckets + 1;
+ int dy = kIntFeatureExtent / kYBuckets + 1;
+ int dtheta = kIntFeatureExtent / kThetaBuckets + 1;
+ int bad_offsets = 0;
+ for (int index = 0; index < total_buckets; ++index) {
+ for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps;
+ ++dir) {
+ int offset_index = map.OffsetFeature(index, dir);
+ if (dir == 0) {
+ EXPECT_EQ(index, offset_index);
+ } else if (offset_index >= 0) {
+ INT_FEATURE_STRUCT f = map.InverseIndexFeature(index);
+ INT_FEATURE_STRUCT f2 = map.InverseIndexFeature(offset_index);
+ EXPECT_TRUE(f.X != f2.X || f.Y != f2.Y || f.Theta != f2.Theta);
+ EXPECT_LE(abs(f.X - f2.X), dx);
+ EXPECT_LE(abs(f.Y - f2.Y), dy);
+ int theta_delta = abs(f.Theta - f2.Theta);
+ if (theta_delta > kIntFeatureExtent / 2)
+ theta_delta = kIntFeatureExtent - theta_delta;
+ EXPECT_LE(theta_delta, dtheta);
+ } else {
+ ++bad_offsets;
+ INT_FEATURE_STRUCT f = map.InverseIndexFeature(index);
+ }
+ }
+ }
+ EXPECT_LE(bad_offsets, (kXBuckets + kYBuckets) * kThetaBuckets);
+
+ // To test the mapping further, delete the first and last map features and
+ // test again.
+ map.DeleteMapFeature(0);
+ map.DeleteMapFeature(total_buckets - 1);
+ map.FinalizeMapping(nullptr);
+ map.IndexAndSortFeatures(features.get(), total_size, &index_features);
+ // Has no effect on index features.
+ EXPECT_EQ(total_size, index_features.size());
+ misses = map.MapIndexedFeatures(index_features, &map_features);
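+ // Each deleted map feature loses roughly one bucket's worth of raw values,
+ // i.e. about kIntFeatureExtent / buckets per dimension; the +1 terms allow
+ // for the remainder absorbed by the edge buckets.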
+ int expected_misses = (kIntFeatureExtent / kXBuckets) *
+ (kIntFeatureExtent / kYBuckets) *
+ (kIntFeatureExtent / kThetaBuckets + 1);
+ expected_misses += (kIntFeatureExtent / kXBuckets) *
+ (kIntFeatureExtent / kYBuckets + 1) *
+ (kIntFeatureExtent / kThetaBuckets);
+ EXPECT_EQ(expected_misses, misses);
+ EXPECT_EQ(total_buckets - 2, map_features.size());
+ ExpectContiguous(map_features, 0, total_buckets - 2);
+ EXPECT_EQ(total_buckets - 2, map.compact_size());
+ EXPECT_EQ(total_buckets, map.sparse_size());
+#endif
+}
+
+} // namespace.
diff --git a/tesseract/unittest/intsimdmatrix_test.cc b/tesseract/unittest/intsimdmatrix_test.cc
new file mode 100644
index 00000000..cdfbaa2c
--- /dev/null
+++ b/tesseract/unittest/intsimdmatrix_test.cc
@@ -0,0 +1,135 @@
+///////////////////////////////////////////////////////////////////////
+// File: intsimdmatrix_test.cc
+// Author: rays@google.com (Ray Smith)
+//
+// Copyright 2017 Google Inc. All Rights Reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "intsimdmatrix.h"
+#include <memory>
+#include <vector>
+#include <gtest/gtest.h>
+#include <gtest/internal/gtest-port.h>
+#include "include_gunit.h"
+#include "matrix.h"
+#include "simddetect.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+class IntSimdMatrixTest : public ::testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ // Makes a random weights matrix of the given size.
+ GENERIC_2D_ARRAY<int8_t> InitRandom(int no, int ni) {
+ GENERIC_2D_ARRAY<int8_t> a(no, ni, 0);
+ for (int i = 0; i < no; ++i) {
+ for (int j = 0; j < ni; ++j) {
+ a(i, j) = static_cast<int8_t>(random_.SignedRand(INT8_MAX));
+ }
+ }
+ return a;
+ }
+ // Makes a random input vector of the given size, rounded up to the input
+ // multiple required by the matrix.
+ std::vector<int8_t> RandomVector(int size, const IntSimdMatrix& matrix) {
+ int rounded_size = matrix.RoundInputs(size);
+ std::vector<int8_t> v(rounded_size, 0);
+ for (int i = 0; i < size; ++i) {
+ v[i] = static_cast<int8_t>(random_.SignedRand(INT8_MAX));
+ }
+ return v;
+ }
+ // Makes a random scales vector of the given size.
+ std::vector<double> RandomScales(int size) {
+ std::vector<double> v(size);
+ for (int i = 0; i < size; ++i) {
+ v[i] = (1.0 + random_.SignedRand(1.0)) / INT8_MAX;
+ }
+ return v;
+ }
+ // Tests a range of sizes and compares the results against the generic version.
+ void ExpectEqualResults(const IntSimdMatrix& matrix) {
+ double total = 0.0;
+ for (int num_out = 1; num_out < 130; ++num_out) {
+ for (int num_in = 1; num_in < 130; ++num_in) {
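+ // The weights matrix gets one extra input column, presumably for the bias.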
+ GENERIC_2D_ARRAY<int8_t> w = InitRandom(num_out, num_in + 1);
+ std::vector<int8_t> u = RandomVector(num_in, matrix);
+ std::vector<double> scales = RandomScales(num_out);
+ int ro = num_out;
+ if (IntSimdMatrix::intSimdMatrix)
+ ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
+ std::vector<double> base_result(ro);
+ base_result.resize(num_out);
+ IntSimdMatrix::MatrixDotVector(w, scales, u.data(), base_result.data());
+ std::vector<double> test_result(ro);
+ test_result.resize(num_out);
+ std::vector<int8_t> shaped_wi;
+ int32_t rounded_num_out;
+ matrix.Init(w, shaped_wi, rounded_num_out);
+ scales.reserve(rounded_num_out);
+ if (matrix.matrixDotVectorFunction) {
+ matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0],
+ &scales[0], &u[0], &test_result[0]);
+ } else {
+ IntSimdMatrix::MatrixDotVector(w, scales, u.data(), test_result.data());
+ }
+ for (int i = 0; i < num_out; ++i) {
+ EXPECT_FLOAT_EQ(base_result[i], test_result[i]) << "i=" << i;
+ total += base_result[i];
+ }
+ }
+ }
+ // Compare sum of all results with expected value.
+ EXPECT_FLOAT_EQ(total, 337849.39354684710);
+ }
+
+ TRand random_;
+};
+
+// Test the C++ implementation without SIMD.
+TEST_F(IntSimdMatrixTest, C) {
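+ // A null matrixDotVectorFunction makes ExpectEqualResults fall back to the
+ // generic MatrixDotVector path.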
+ static const IntSimdMatrix matrix = {nullptr, 1, 1, 1, 1};
+ ExpectEqualResults(matrix);
+}
+
+// Tests that the SSE implementation gets the same result as the vanilla.
+TEST_F(IntSimdMatrixTest, SSE) {
+#if defined(HAVE_SSE4_1)
+ if (!SIMDDetect::IsSSEAvailable()) {
+ GTEST_LOG_(INFO) << "No SSE found! Not tested!";
+ GTEST_SKIP();
+ }
+ ExpectEqualResults(IntSimdMatrix::intSimdMatrixSSE);
+#else
+ GTEST_LOG_(INFO) << "SSE unsupported! Not tested!";
+ GTEST_SKIP();
+#endif
+}
+
+// Tests that the AVX2 implementation gets the same result as the vanilla.
+TEST_F(IntSimdMatrixTest, AVX2) {
+#if defined(HAVE_AVX2)
+ if (!SIMDDetect::IsAVX2Available()) {
+ GTEST_LOG_(INFO) << "No AVX2 found! Not tested!";
+ GTEST_SKIP();
+ }
+ ExpectEqualResults(IntSimdMatrix::intSimdMatrixAVX2);
+#else
+ GTEST_LOG_(INFO) << "AVX2 unsupported! Not tested!";
+ GTEST_SKIP();
+#endif
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/lang_model_test.cc b/tesseract/unittest/lang_model_test.cc
new file mode 100644
index 00000000..b059c18c
--- /dev/null
+++ b/tesseract/unittest/lang_model_test.cc
@@ -0,0 +1,217 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string> // for std::string
+
+#include "absl/strings/str_cat.h"
+
+#include "gmock/gmock.h" // for testing::ElementsAreArray
+
+#include "include_gunit.h"
+#include "lang_model_helpers.h"
+#include "log.h" // for LOG
+#include "lstmtrainer.h"
+#include "unicharset_training_utils.h"
+
+namespace tesseract {
+
+std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR, name);
+}
+
+// This is an integration test that verifies that CombineLangModel works to
+// the extent that an LSTMTrainer can be initialized with the result, and it
+// can encode strings. More importantly, the test verifies that adding an extra
+// character to the unicharset does not change the encoding of strings.
+TEST(LangModelTest, AddACharacter) {
+ constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&";
+ constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹";
+ // Setup the arguments.
+ std::string script_dir = LANGDATA_DIR;
+ std::string eng_dir = file::JoinPath(script_dir, "eng");
+ std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
+ UNICHARSET unicharset;
+ EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
+ std::string version_str = "TestVersion";
+ file::MakeTmpdir();
+ std::string output_dir = FLAGS_test_tmpdir;
+ LOG(INFO) << "Output dir=" << output_dir << "\n";
+ std::string lang1 = "eng";
+ bool pass_through_recoder = false;
+ std::vector<STRING> words, puncs, numbers;
+ // If these reads fail, we get a warning message and an empty list of words.
+ ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr)
+ .split('\n', &words);
+ EXPECT_GT(words.size(), 0);
+ ReadFile(file::JoinPath(eng_dir, "eng.punc"), nullptr).split('\n', &puncs);
+ EXPECT_GT(puncs.size(), 0);
+ ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr)
+ .split('\n', &numbers);
+ EXPECT_GT(numbers.size(), 0);
+ bool lang_is_rtl = false;
+ // Generate the traineddata file.
+ EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang1, pass_through_recoder, words, puncs,
+ numbers, lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata1 =
+ file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
+ LSTMTrainer trainer1;
+ trainer1.InitCharSet(traineddata1);
+ std::vector<int> labels1;
+ EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
+ STRING test1_decoded = trainer1.DecodeLabels(labels1);
+ std::string test1_str(&test1_decoded[0], test1_decoded.length());
+ LOG(INFO) << "Labels1=" << test1_str << "\n";
+
+ // Add a new character to the unicharset and try again.
+ int size_before = unicharset.size();
+ unicharset.unichar_insert("₹");
+ SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
+ &unicharset);
+ EXPECT_EQ(size_before + 1, unicharset.size());
+ // Generate the traineddata file.
+ std::string lang2 = "extended";
+ EXPECT_EQ(EXIT_SUCCESS,
+ CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang2, pass_through_recoder, words, puncs, numbers,
+ lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata2 =
+ file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
+ LSTMTrainer trainer2;
+ trainer2.InitCharSet(traineddata2);
+ std::vector<int> labels2;
+ EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
+ STRING test2_decoded = trainer2.DecodeLabels(labels2);
+ std::string test2_str(&test2_decoded[0], test2_decoded.length());
+ LOG(INFO) << "Labels2=" << test2_str << "\n";
+ // encode kTestStringRupees.
+ std::vector<int> labels3;
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
+ STRING test3_decoded = trainer2.DecodeLabels(labels3);
+ std::string test3_str(&test3_decoded[0], test3_decoded.length());
+ LOG(INFO) << "labels3=" << test3_str << "\n";
+ // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
+ // Since TensorFlow's CTC implementation insists on having the null be the
+ // last label, and we want to be compatible, null has to be renumbered when
+ // we add a class.
+ int null1 = trainer1.null_char();
+ int null2 = trainer2.null_char();
+ EXPECT_EQ(null1 + 1, null2);
+ std::vector<int> labels1_v(labels1.size());
+ for (int i = 0; i < labels1.size(); ++i) {
+ if (labels1[i] == null1)
+ labels1_v[i] = null2;
+ else
+ labels1_v[i] = labels1[i];
+ }
+ EXPECT_THAT(labels1_v,
+ testing::ElementsAreArray(&labels2[0], labels2.size()));
+ // To make sure we are not cheating somehow, we can now encode the Rupee
+ // symbol, which we could not do before.
+ EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
+}
+
+// Same as above test, for hin instead of eng
+TEST(LangModelTest, AddACharacterHindi) {
+ constexpr char kTestString[] = "हिन्दी में एक लाइन लिखें";
+ constexpr char kTestStringRupees[] = "हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००";
+ // Setup the arguments.
+ std::string script_dir = LANGDATA_DIR;
+ std::string hin_dir = file::JoinPath(script_dir, "hin");
+ std::string unicharset_path = TestDataNameToPath("hin_beam.unicharset");
+ UNICHARSET unicharset;
+ EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
+ std::string version_str = "TestVersion";
+ file::MakeTmpdir();
+ std::string output_dir = FLAGS_test_tmpdir;
+ LOG(INFO) << "Output dir=" << output_dir << "\n";
+ std::string lang1 = "hin";
+ bool pass_through_recoder = false;
+ std::vector<STRING> words, puncs, numbers;
+ // If these reads fail, we get a warning message and an empty list of words.
+ ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr)
+ .split('\n', &words);
+ EXPECT_GT(words.size(), 0);
+ ReadFile(file::JoinPath(hin_dir, "hin.punc"), nullptr).split('\n', &puncs);
+ EXPECT_GT(puncs.size(), 0);
+ ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr)
+ .split('\n', &numbers);
+ EXPECT_GT(numbers.size(), 0);
+ bool lang_is_rtl = false;
+ // Generate the traineddata file.
+ EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang1, pass_through_recoder, words, puncs,
+ numbers, lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata1 =
+ file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
+ LSTMTrainer trainer1;
+ trainer1.InitCharSet(traineddata1);
+ std::vector<int> labels1;
+ EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
+ STRING test1_decoded = trainer1.DecodeLabels(labels1);
+ std::string test1_str(&test1_decoded[0], test1_decoded.length());
+ LOG(INFO) << "Labels1=" << test1_str << "\n";
+
+ // Add a new character to the unicharset and try again.
+ int size_before = unicharset.size();
+ unicharset.unichar_insert("₹");
+ SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
+ &unicharset);
+ EXPECT_EQ(size_before + 1, unicharset.size());
+ // Generate the traineddata file.
+ std::string lang2 = "extendedhin";
+ EXPECT_EQ(EXIT_SUCCESS,
+ CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang2, pass_through_recoder, words, puncs, numbers,
+ lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata2 =
+ file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
+ LSTMTrainer trainer2;
+ trainer2.InitCharSet(traineddata2);
+ std::vector<int> labels2;
+ EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
+ STRING test2_decoded = trainer2.DecodeLabels(labels2);
+ std::string test2_str(&test2_decoded[0], test2_decoded.length());
+ LOG(INFO) << "Labels2=" << test2_str << "\n";
+ // encode kTestStringRupees.
+ std::vector<int> labels3;
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
+ STRING test3_decoded = trainer2.DecodeLabels(labels3);
+ std::string test3_str(&test3_decoded[0], test3_decoded.length());
+ LOG(INFO) << "labels3=" << test3_str << "\n";
+ // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
+ // Since TensorFlow's CTC implementation insists on having the null be the
+ // last label, and we want to be compatible, null has to be renumbered when
+ // we add a class.
+ int null1 = trainer1.null_char();
+ int null2 = trainer2.null_char();
+ EXPECT_EQ(null1 + 1, null2);
+ std::vector<int> labels1_v(labels1.size());
+ for (int i = 0; i < labels1.size(); ++i) {
+ if (labels1[i] == null1)
+ labels1_v[i] = null2;
+ else
+ labels1_v[i] = labels1[i];
+ }
+ EXPECT_THAT(labels1_v,
+ testing::ElementsAreArray(&labels2[0], labels2.size()));
+ // To make sure we are not cheating somehow, we can now encode the Rupee
+ // symbol, which we could not do before.
+ EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/layout_test.cc b/tesseract/unittest/layout_test.cc
new file mode 100644
index 00000000..8a20c908
--- /dev/null
+++ b/tesseract/unittest/layout_test.cc
@@ -0,0 +1,234 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+
+#include "include_gunit.h"
+
+#include "allheaders.h"
+#include <tesseract/baseapi.h>
+#include "coutln.h"
+#include "log.h" // for LOG
+#include "mutableiterator.h"
+#include "ocrblock.h" // for class BLOCK
+#include "pageres.h"
+#include "polyblk.h"
+#include <tesseract/resultiterator.h>
+#include "stepblob.h"
+
+namespace tesseract {
+
+/** String name for each block type. Keep in sync with PolyBlockType. */
+static const char* kPolyBlockNames[] = {
+ "Unknown",
+ "Flowing Text",
+ "Heading Text",
+ "Pullout Text",
+ "Equation",
+ "Inline Equation",
+ "Table",
+ "Vertical Text",
+ "Caption Text",
+ "Flowing Image",
+ "Heading Image",
+ "Pullout Image",
+ "Horizontal Line",
+ "Vertical Line",
+ "Noise",
+ "" // End marker for testing that sizes match.
+};
+
+const char* kStrings8087_054[] = {
+ "dat", "Dalmatian", "", "DAMAGED DURING", "margarine,", nullptr};
+const PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT,
+ PT_PULLOUT_IMAGE, PT_CAPTION_TEXT,
+ PT_FLOWING_TEXT};
+
+// The fixture for testing Tesseract.
+class LayoutTest : public testing::Test {
+ protected:
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR, "/" + name);
+ }
+ std::string TessdataPath() {
+ return file::JoinPath(TESSDATA_DIR, "");
+ }
+
+ LayoutTest() { src_pix_ = nullptr; }
+ ~LayoutTest() { pixDestroy(&src_pix_); }
+
+ void SetImage(const char* filename, const char* lang) {
+ pixDestroy(&src_pix_);
+ src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
+ api_.Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY);
+ api_.SetPageSegMode(tesseract::PSM_AUTO);
+ api_.SetImage(src_pix_);
+ }
+
+ // Tests reading order and block finding (very roughly) by iterating
+ // over the blocks, expecting that they contain the strings in order,
+ // allowing for other blocks in between.
+ // An empty string should match an image block, and a nullptr string
+ // indicates the end of the array.
+ void VerifyBlockTextOrder(const char* strings[], const PolyBlockType* blocks,
+ ResultIterator* it) {
+ it->Begin();
+ int string_index = 0;
+ int block_index = 0;
+ do {
+ char* block_text = it->GetUTF8Text(tesseract::RIL_BLOCK);
+ if (block_text != nullptr && it->BlockType() == blocks[string_index] &&
+ strstr(block_text, strings[string_index]) != nullptr) {
+ LOG(INFO) << "Found string " << strings[string_index]
+ << " in block " << block_index
+ << " of type " << kPolyBlockNames[blocks[string_index]] << "\n";
+ // Found this one.
+ ++string_index;
+ } else if (it->BlockType() == blocks[string_index] &&
+ block_text == nullptr && strings[string_index][0] == '\0') {
+ LOG(INFO) << "Found block of type " << kPolyBlockNames[blocks[string_index]]
+ << " at block " << block_index << "\n";
+ // Found this one.
+ ++string_index;
+ } else {
+ LOG(INFO) << "No match found in block with text:\n" << block_text;
+ }
+ delete[] block_text;
+ ++block_index;
+ if (strings[string_index] == nullptr) break;
+ } while (it->Next(tesseract::RIL_BLOCK));
+ EXPECT_TRUE(strings[string_index] == nullptr);
+ }
+
+ // Tests that the approximate order of the biggest text blocks is correct.
+ // Correctness is tested by the following simple rules:
+ // If a block overlaps its predecessor in x, then it must be below it.
+ // otherwise, if the block is not below its predecessor, then it must
+ // be to the left of it if right_to_left is true, or to the right otherwise.
+ void VerifyRoughBlockOrder(bool right_to_left, ResultIterator* it) {
+ int prev_left = 0;
+ int prev_right = 0;
+ int prev_bottom = 0;
+ it->Begin();
+ do {
+ int left, top, right, bottom;
+ if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) &&
+ PTIsTextType(it->BlockType()) && right - left > 800 &&
+ bottom - top > 200) {
+ if (prev_right > prev_left) {
+ if (std::min(right, prev_right) > std::max(left, prev_left)) {
+ EXPECT_GE(top, prev_bottom) << "Overlapping block should be below";
+ } else if (top < prev_bottom) {
+ if (right_to_left) {
+ EXPECT_GE(prev_left, right) << "Block should be to the left";
+ } else {
+ EXPECT_GE(left, prev_right) << "Block should be to the right";
+ }
+ }
+ }
+ prev_left = left;
+ prev_right = right;
+ prev_bottom = bottom;
+ }
+ } while (it->Next(tesseract::RIL_BLOCK));
+ }
+
+ // Tests that every blob assigned to the biggest text blocks is contained
+ // fully within its block by testing that the block polygon winds around
+ // the center of the bounding boxes of the outlines in the blob.
+ void VerifyTotalContainment(int winding_target, MutableIterator* it) {
+ it->Begin();
+ do {
+ int left, top, right, bottom;
+ if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) &&
+ PTIsTextType(it->BlockType()) && right - left > 800 &&
+ bottom - top > 200) {
+ const PAGE_RES_IT* pr_it = it->PageResIt();
+ POLY_BLOCK* pb = pr_it->block()->block->pdblk.poly_block();
+ CHECK(pb != nullptr);
+ FCOORD skew = pr_it->block()->block->skew();
+ EXPECT_GT(skew.x(), 0.0f);
+ EXPECT_GT(skew.y(), 0.0f);
+ // Iterate the words in the block.
+ MutableIterator word_it = *it;
+ do {
+ const PAGE_RES_IT* w_it = word_it.PageResIt();
+ // Iterate the blobs in the word.
+ C_BLOB_IT b_it(w_it->word()->word->cblob_list());
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+ C_BLOB* blob = b_it.data();
+ // Iterate the outlines in the blob.
+ C_OUTLINE_IT ol_it(blob->out_list());
+ for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
+ C_OUTLINE* ol = ol_it.data();
+ TBOX box = ol->bounding_box();
+ ICOORD middle((box.left() + box.right()) / 2,
+ (box.top() + box.bottom()) / 2);
+ EXPECT_EQ(winding_target, pb->winding_number(middle));
+ }
+ }
+ } while (word_it.Next(tesseract::RIL_WORD) &&
+ !word_it.IsAtBeginningOf(tesseract::RIL_BLOCK));
+ }
+ } while (it->Next(tesseract::RIL_BLOCK));
+ }
+
+ Pix* src_pix_;
+ std::string ocr_text_;
+ tesseract::TessBaseAPI api_;
+};
+
+// Tests that array sizes match their intended size.
+TEST_F(LayoutTest, ArraySizeTest) {
+ int size = 0;
+ for (size = 0; kPolyBlockNames[size][0] != '\0'; ++size)
+ ;
+ EXPECT_EQ(size, PT_COUNT);
+}
+
+// Tests that Tesseract gets the important blocks and in the right order
+// on a UNLV page numbered 8087_054.3B.tif. (Dubrovnik)
+TEST_F(LayoutTest, UNLV8087_054) {
+ SetImage("8087_054.3B.tif", "eng");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ // Check iterator position.
+ tesseract::ResultIterator* it = api_.GetIterator();
+ VerifyBlockTextOrder(kStrings8087_054, kBlocks8087_054, it);
+ delete it;
+}
+
+// Tests that Tesseract gets the important blocks and in the right order
+// on GOOGLE:13510798882202548:74:84.sj-79.tif (Hebrew image)
+// TODO: replace hebrew.png with the Google image referred to above
+TEST_F(LayoutTest, HebrewOrderingAndSkew) {
+ SetImage("hebrew.png", "eng");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ tesseract::MutableIterator* it = api_.GetMutableIterator();
+ // In eng mode, block order should not be RTL.
+ VerifyRoughBlockOrder(false, it);
+ VerifyTotalContainment(1, it);
+ delete it;
+ // Now try again using Hebrew.
+ SetImage("hebrew.png", "heb");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ it = api_.GetMutableIterator();
+ // In heb mode, block order should be RTL.
+ VerifyRoughBlockOrder(true, it);
+ // And blobs should still be fully contained.
+ VerifyTotalContainment(-1, it);
+ delete it;
+}
+
+} // namespace
diff --git a/tesseract/unittest/ligature_table_test.cc b/tesseract/unittest/ligature_table_test.cc
new file mode 100644
index 00000000..0047f857
--- /dev/null
+++ b/tesseract/unittest/ligature_table_test.cc
@@ -0,0 +1,111 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "commandlineflags.h"
+#include "fileio.h"
+#include "include_gunit.h"
+#include "ligature_table.h"
+#include "pango_font_info.h"
+
+namespace tesseract {
+
+const char kEngNonLigatureText[] = "fidelity effigy ſteep";
+// Same as above text, but with "fi" in the first word and "ffi" in the second
+// word replaced with their respective ligatures.
+const char kEngLigatureText[] = "ﬁdelity eﬃgy ﬅeep";
+// Same as kEngLigatureText but with "fi" in both words replaced with their
+// ligature. The test Verdana font does not support the "ffi" or "ſt" ligature.
+const char kRenderableEngLigatureText[] = "ﬁdelity efﬁgy ſteep";
+
+static PangoFontMap* font_map;
+
+class LigatureTableTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ lig_table_ = LigatureTable::Get();
+ if (!font_map) {
+ font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
+ }
+ pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
+ }
+
+ static void SetUpTestCase() {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+
+ FLAGS_fonts_dir = TESTING_DIR;
+ FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
+ file::MakeTmpdir();
+ PangoFontInfo::SoftInitFontConfig(); // init early
+ }
+ LigatureTable* lig_table_;
+};
+
+TEST_F(LigatureTableTest, DoesFillLigatureTables) {
+ EXPECT_GT(lig_table_->norm_to_lig_table().size(), 0);
+ EXPECT_GT(lig_table_->lig_to_norm_table().size(), 0);
+}
+
+TEST_F(LigatureTableTest, DoesAddLigatures) {
+ EXPECT_STREQ(kEngLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesAddLigaturesWithSupportedFont) {
+ PangoFontInfo font;
+ EXPECT_TRUE(font.ParseFontDescriptionName("Verdana"));
+printf("1:%s\n", kRenderableEngLigatureText);
+printf("2:%s\n", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+ EXPECT_STREQ(kRenderableEngLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesNotAddLigaturesWithUnsupportedFont) {
+ PangoFontInfo font;
+ EXPECT_TRUE(font.ParseFontDescriptionName("Lohit Hindi"));
+ EXPECT_STREQ(kEngNonLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesRemoveLigatures) {
+ EXPECT_STREQ(kEngNonLigatureText,
+ lig_table_->RemoveLigatures(kEngLigatureText).c_str());
+}
+
+TEST_F(LigatureTableTest, TestCustomLigatures) {
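+ // The \uE0xx codes are private-use codepoints, presumably standing in for
+ // custom ligature glyphs that have no Unicode encoding.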
+ const char* kTestCases[] = {
+ "act", "a\uE003", "publiſh", "publi\uE006", "ſince",
+ "\uE007nce", "aſleep", "a\uE008eep", "neceſſary", "nece\uE009ary",
+ };
+ for (size_t i = 0; i < ARRAYSIZE(kTestCases); i += 2) {
+ EXPECT_STREQ(kTestCases[i + 1],
+ lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
+ EXPECT_STREQ(kTestCases[i],
+ lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str());
+ EXPECT_STREQ(kTestCases[i],
+ lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
+ }
+}
+
+TEST_F(LigatureTableTest, TestRemovesCustomLigatures) {
+ const char* kTestCases[] = {
+ "fiction",
+ "fi\uE003ion",
+ "fiction",
+ };
+ for (size_t i = 0; i < ARRAYSIZE(kTestCases); i += 3) {
+ EXPECT_STREQ(kTestCases[i + 1],
+ lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
+ EXPECT_STREQ(kTestCases[i + 2],
+ lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
+ }
+}
+} // namespace
diff --git a/tesseract/unittest/linlsq_test.cc b/tesseract/unittest/linlsq_test.cc
new file mode 100644
index 00000000..2ca0ea9e
--- /dev/null
+++ b/tesseract/unittest/linlsq_test.cc
@@ -0,0 +1,118 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "linlsq.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class LLSQTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ public:
+ void TearDown() {}
+
+ void ExpectCorrectLine(const LLSQ& llsq, double m, double c, double rms,
+ double pearson, double tolerance) {
+ EXPECT_NEAR(m, llsq.m(), tolerance);
+ EXPECT_NEAR(c, llsq.c(llsq.m()), tolerance);
+ EXPECT_NEAR(rms, llsq.rms(llsq.m(), llsq.c(llsq.m())), tolerance);
+ EXPECT_NEAR(pearson, llsq.pearson(), tolerance);
+ }
+ FCOORD PtsMean(const std::vector<FCOORD>& pts) {
+ FCOORD total(0, 0);
+ for (const auto& p : pts) {
+ total += p;
+ }
+ return (pts.size() > 0) ? total / pts.size() : total;
+ }
+ void VerifyRmsOrth(const std::vector<FCOORD>& pts, const FCOORD& orth) {
+ LLSQ llsq;
+ FCOORD xavg = PtsMean(pts);
+ FCOORD nvec = !orth;
+ nvec.normalise();
+ double expected_answer = 0;
+ for (const auto& p : pts) {
+ llsq.add(p.x(), p.y());
+ double dot = nvec % (p - xavg);
+ expected_answer += dot * dot;
+ }
+ expected_answer /= pts.size();
+ expected_answer = sqrt(expected_answer);
+ EXPECT_NEAR(expected_answer, llsq.rms_orth(orth), 0.0001);
+ }
+ void ExpectCorrectVector(const LLSQ& llsq, FCOORD correct_mean_pt,
+ FCOORD correct_vector, float tolerance) {
+ FCOORD mean_pt = llsq.mean_point();
+ FCOORD vector = llsq.vector_fit();
+ EXPECT_NEAR(correct_mean_pt.x(), mean_pt.x(), tolerance);
+ EXPECT_NEAR(correct_mean_pt.y(), mean_pt.y(), tolerance);
+ EXPECT_NEAR(correct_vector.x(), vector.x(), tolerance);
+ EXPECT_NEAR(correct_vector.y(), vector.y(), tolerance);
+ }
+};
+
+// Tests a simple baseline-style normalization.
+TEST_F(LLSQTest, BasicLines) {
+ LLSQ llsq;
+ llsq.add(1.0, 1.0);
+ llsq.add(2.0, 2.0);
+ ExpectCorrectLine(llsq, 1.0, 0.0, 0.0, 1.0, 1e-6);
+ float half_root_2 = sqrt(2.0) / 2.0f;
+ ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f),
+ FCOORD(half_root_2, half_root_2), 1e-6);
+ llsq.remove(2.0, 2.0);
+ llsq.add(1.0, 2.0);
+ llsq.add(10.0, 1.0);
+ llsq.add(-8.0, 1.0);
+ // The point at 1,2 pulls the result away from what would otherwise be a
+ // perfect fit to a horizontal line by 0.25 unit, with rms error of 0.433.
+ ExpectCorrectLine(llsq, 0.0, 1.25, 0.433, 0.0, 1e-2);
+ ExpectCorrectVector(llsq, FCOORD(1.0f, 1.25f), FCOORD(1.0f, 0.0f), 1e-3);
+ llsq.add(1.0, 2.0, 10.0);
+ // With a heavy weight, the point at 1,2 pulls the line nearer.
+ ExpectCorrectLine(llsq, 0.0, 1.786, 0.41, 0.0, 1e-2);
+ ExpectCorrectVector(llsq, FCOORD(1.0f, 1.786f), FCOORD(1.0f, 0.0f), 1e-3);
+}
+
+// Tests a simple baseline-style normalization with a rotation.
+TEST_F(LLSQTest, Vectors) {
+ LLSQ llsq;
+ llsq.add(1.0, 1.0);
+ llsq.add(1.0, -1.0);
+ ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-6);
+ llsq.add(0.9, -2.0);
+ llsq.add(1.1, -3.0);
+ llsq.add(0.9, 2.0);
+ llsq.add(1.10001, 3.0);
+ ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-3);
+}
+
+// Verify that rms_orth() actually calculates:
+// sqrt( sum (!nvec * (x_i - x_avg))^2 / n)
+TEST_F(LLSQTest, RmsOrthWorksAsIntended) {
+ std::vector<FCOORD> pts;
+ pts.push_back(FCOORD(0.56, 0.95));
+ pts.push_back(FCOORD(0.09, 0.09));
+ pts.push_back(FCOORD(0.13, 0.77));
+ pts.push_back(FCOORD(0.16, 0.83));
+ pts.push_back(FCOORD(0.45, 0.79));
+ VerifyRmsOrth(pts, FCOORD(1, 0));
+ VerifyRmsOrth(pts, FCOORD(1, 1));
+ VerifyRmsOrth(pts, FCOORD(1, 2));
+ VerifyRmsOrth(pts, FCOORD(2, 1));
+}
+
+} // namespace.
diff --git a/tesseract/unittest/list_test.cc b/tesseract/unittest/list_test.cc
new file mode 100644
index 00000000..e6a2bf1d
--- /dev/null
+++ b/tesseract/unittest/list_test.cc
@@ -0,0 +1,68 @@
+// (C) Copyright 2020, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+#if 0 // TODO: add tests for CLIST
+#include "clst.h"
+#endif
+#include "elst.h"
+#if 0 // TODO: add tests for ELIST2
+#include "elst2.h"
+#endif
+
+namespace tesseract {
+
+class ListTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+ }
+};
+
+class Elst : public ELIST_LINK {
+ public:
+ Elst(unsigned n) : value(n) {
+ }
+ unsigned value;
+};
+
+ELISTIZEH(Elst)
+ELISTIZE(Elst)
+
+TEST_F(ListTest, TestELIST) {
+ Elst_LIST list;
+ auto it = ELIST_ITERATOR(&list);
+ for (unsigned i = 0; i < 10; i++) {
+ auto* elst = new Elst(i);
+ //EXPECT_TRUE(elst->empty());
+ //EXPECT_EQ(elst->length(), 0);
+ it.add_to_end(elst);
+ }
+ it.move_to_first();
+ unsigned n = 0;
+ for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+ auto* elst = reinterpret_cast<Elst*>(it.data());
+ EXPECT_EQ(elst->value, n);
+ n++;
+ }
+ it.forward();
+ n++;
+ for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+ auto* elst = reinterpret_cast<Elst*>(it.extract());
+ EXPECT_EQ(elst->value, n % 10);
+ n++;
+ delete elst;
+ }
+ // TODO: add more tests for ELIST
+}
+
+} // namespace tesseract.
diff --git a/tesseract/unittest/loadlang_test.cc b/tesseract/unittest/loadlang_test.cc
new file mode 100644
index 00000000..ba7a9f6d
--- /dev/null
+++ b/tesseract/unittest/loadlang_test.cc
@@ -0,0 +1,251 @@
+///////////////////////////////////////////////////////////////////////
+// File: loadlang_test.cc
+// Description: Tests loading of all languages and scripts for Tesseract.
+// Tests for all languages and scripts are disabled by default.
+// Force the disabled tests to run if required by using the
+// --gtest_also_run_disabled_tests argument.
+// Author: Shree Devi Kumar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include <memory> // std::unique_ptr
+#include <time.h>
+#include <tesseract/baseapi.h>
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class QuickTest : public testing::Test {
+ protected:
+ virtual void SetUp() { start_time_ = time(nullptr); }
+ virtual void TearDown() {
+ const time_t end_time = time(nullptr);
+ EXPECT_TRUE(end_time - start_time_ <= 25)
+ << "The test took too long - "
+ << ::testing::PrintToString(end_time - start_time_);
+ }
+ time_t start_time_;
+};
+
+void LangLoader(const char* lang, const char* tessdatadir) {
+ std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
+ ASSERT_FALSE(api->Init(tessdatadir, lang))
+ << "Could not initialize tesseract for $lang.";
+ api->End();
+}
+
+// For all languages
+
+class LoadLanguage : public QuickTest,
+ public ::testing::WithParamInterface<const char*> {};
+
+TEST_P(LoadLanguage, afr) { LangLoader("afr", GetParam()); }
+TEST_P(LoadLanguage, amh) { LangLoader("amh", GetParam()); }
+TEST_P(LoadLanguage, ara) { LangLoader("ara", GetParam()); }
+TEST_P(LoadLanguage, asm) { LangLoader("asm", GetParam()); }
+TEST_P(LoadLanguage, aze) { LangLoader("aze", GetParam()); }
+TEST_P(LoadLanguage, aze_cyrl) { LangLoader("aze_cyrl", GetParam()); }
+TEST_P(LoadLanguage, bel) { LangLoader("bel", GetParam()); }
+TEST_P(LoadLanguage, ben) { LangLoader("ben", GetParam()); }
+TEST_P(LoadLanguage, bod) { LangLoader("bod", GetParam()); }
+TEST_P(LoadLanguage, bos) { LangLoader("bos", GetParam()); }
+TEST_P(LoadLanguage, bre) { LangLoader("bre", GetParam()); }
+TEST_P(LoadLanguage, bul) { LangLoader("bul", GetParam()); }
+TEST_P(LoadLanguage, cat) { LangLoader("cat", GetParam()); }
+TEST_P(LoadLanguage, ceb) { LangLoader("ceb", GetParam()); }
+TEST_P(LoadLanguage, ces) { LangLoader("ces", GetParam()); }
+TEST_P(LoadLanguage, chi_sim) { LangLoader("chi_sim", GetParam()); }
+TEST_P(LoadLanguage, chi_sim_vert) { LangLoader("chi_sim_vert", GetParam()); }
+TEST_P(LoadLanguage, chi_tra) { LangLoader("chi_tra", GetParam()); }
+TEST_P(LoadLanguage, chi_tra_vert) { LangLoader("chi_tra_vert", GetParam()); }
+TEST_P(LoadLanguage, chr) { LangLoader("chr", GetParam()); }
+TEST_P(LoadLanguage, cos) { LangLoader("cos", GetParam()); }
+TEST_P(LoadLanguage, cym) { LangLoader("cym", GetParam()); }
+TEST_P(LoadLanguage, dan) { LangLoader("dan", GetParam()); }
+TEST_P(LoadLanguage, deu) { LangLoader("deu", GetParam()); }
+TEST_P(LoadLanguage, div) { LangLoader("div", GetParam()); }
+TEST_P(LoadLanguage, dzo) { LangLoader("dzo", GetParam()); }
+TEST_P(LoadLanguage, ell) { LangLoader("ell", GetParam()); }
+TEST_P(LoadLanguage, eng) { LangLoader("eng", GetParam()); }
+TEST_P(LoadLanguage, enm) { LangLoader("enm", GetParam()); }
+TEST_P(LoadLanguage, epo) { LangLoader("epo", GetParam()); }
+TEST_P(LoadLanguage, est) { LangLoader("est", GetParam()); }
+TEST_P(LoadLanguage, eus) { LangLoader("eus", GetParam()); }
+TEST_P(LoadLanguage, fao) { LangLoader("fao", GetParam()); }
+TEST_P(LoadLanguage, fas) { LangLoader("fas", GetParam()); }
+TEST_P(LoadLanguage, fil) { LangLoader("fil", GetParam()); }
+TEST_P(LoadLanguage, fin) { LangLoader("fin", GetParam()); }
+TEST_P(LoadLanguage, fra) { LangLoader("fra", GetParam()); }
+TEST_P(LoadLanguage, frk) { LangLoader("frk", GetParam()); }
+TEST_P(LoadLanguage, frm) { LangLoader("frm", GetParam()); }
+TEST_P(LoadLanguage, fry) { LangLoader("fry", GetParam()); }
+TEST_P(LoadLanguage, gla) { LangLoader("gla", GetParam()); }
+TEST_P(LoadLanguage, gle) { LangLoader("gle", GetParam()); }
+TEST_P(LoadLanguage, glg) { LangLoader("glg", GetParam()); }
+TEST_P(LoadLanguage, grc) { LangLoader("grc", GetParam()); }
+TEST_P(LoadLanguage, guj) { LangLoader("guj", GetParam()); }
+TEST_P(LoadLanguage, hat) { LangLoader("hat", GetParam()); }
+TEST_P(LoadLanguage, heb) { LangLoader("heb", GetParam()); }
+TEST_P(LoadLanguage, hin) { LangLoader("hin", GetParam()); }
+TEST_P(LoadLanguage, hrv) { LangLoader("hrv", GetParam()); }
+TEST_P(LoadLanguage, hun) { LangLoader("hun", GetParam()); }
+TEST_P(LoadLanguage, hye) { LangLoader("hye", GetParam()); }
+TEST_P(LoadLanguage, iku) { LangLoader("iku", GetParam()); }
+TEST_P(LoadLanguage, ind) { LangLoader("ind", GetParam()); }
+TEST_P(LoadLanguage, isl) { LangLoader("isl", GetParam()); }
+TEST_P(LoadLanguage, ita) { LangLoader("ita", GetParam()); }
+TEST_P(LoadLanguage, ita_old) { LangLoader("ita_old", GetParam()); }
+TEST_P(LoadLanguage, jav) { LangLoader("jav", GetParam()); }
+TEST_P(LoadLanguage, jpn) { LangLoader("jpn", GetParam()); }
+TEST_P(LoadLanguage, jpn_vert) { LangLoader("jpn_vert", GetParam()); }
+TEST_P(LoadLanguage, kan) { LangLoader("kan", GetParam()); }
+TEST_P(LoadLanguage, kat) { LangLoader("kat", GetParam()); }
+TEST_P(LoadLanguage, kat_old) { LangLoader("kat_old", GetParam()); }
+TEST_P(LoadLanguage, kaz) { LangLoader("kaz", GetParam()); }
+TEST_P(LoadLanguage, khm) { LangLoader("khm", GetParam()); }
+TEST_P(LoadLanguage, kir) { LangLoader("kir", GetParam()); }
+// TEST_P(LoadLanguage, kmr) {LangLoader("kmr" , GetParam());}
+TEST_P(LoadLanguage, kor) { LangLoader("kor", GetParam()); }
+TEST_P(LoadLanguage, kor_vert) { LangLoader("kor_vert", GetParam()); }
+TEST_P(LoadLanguage, lao) { LangLoader("lao", GetParam()); }
+TEST_P(LoadLanguage, lat) { LangLoader("lat", GetParam()); }
+TEST_P(LoadLanguage, lav) { LangLoader("lav", GetParam()); }
+TEST_P(LoadLanguage, lit) { LangLoader("lit", GetParam()); }
+TEST_P(LoadLanguage, ltz) { LangLoader("ltz", GetParam()); }
+TEST_P(LoadLanguage, mal) { LangLoader("mal", GetParam()); }
+TEST_P(LoadLanguage, mar) { LangLoader("mar", GetParam()); }
+TEST_P(LoadLanguage, mkd) { LangLoader("mkd", GetParam()); }
+TEST_P(LoadLanguage, mlt) { LangLoader("mlt", GetParam()); }
+TEST_P(LoadLanguage, mon) { LangLoader("mon", GetParam()); }
+TEST_P(LoadLanguage, mri) { LangLoader("mri", GetParam()); }
+TEST_P(LoadLanguage, msa) { LangLoader("msa", GetParam()); }
+TEST_P(LoadLanguage, mya) { LangLoader("mya", GetParam()); }
+TEST_P(LoadLanguage, nep) { LangLoader("nep", GetParam()); }
+TEST_P(LoadLanguage, nld) { LangLoader("nld", GetParam()); }
+TEST_P(LoadLanguage, nor) { LangLoader("nor", GetParam()); }
+TEST_P(LoadLanguage, oci) { LangLoader("oci", GetParam()); }
+TEST_P(LoadLanguage, ori) { LangLoader("ori", GetParam()); }
+TEST_P(LoadLanguage, osd) { LangLoader("osd", GetParam()); }
+TEST_P(LoadLanguage, pan) { LangLoader("pan", GetParam()); }
+TEST_P(LoadLanguage, pol) { LangLoader("pol", GetParam()); }
+TEST_P(LoadLanguage, por) { LangLoader("por", GetParam()); }
+TEST_P(LoadLanguage, pus) { LangLoader("pus", GetParam()); }
+TEST_P(LoadLanguage, que) { LangLoader("que", GetParam()); }
+TEST_P(LoadLanguage, ron) { LangLoader("ron", GetParam()); }
+TEST_P(LoadLanguage, rus) { LangLoader("rus", GetParam()); }
+TEST_P(LoadLanguage, san) { LangLoader("san", GetParam()); }
+TEST_P(LoadLanguage, sin) { LangLoader("sin", GetParam()); }
+TEST_P(LoadLanguage, slk) { LangLoader("slk", GetParam()); }
+TEST_P(LoadLanguage, slv) { LangLoader("slv", GetParam()); }
+TEST_P(LoadLanguage, snd) { LangLoader("snd", GetParam()); }
+TEST_P(LoadLanguage, spa) { LangLoader("spa", GetParam()); }
+TEST_P(LoadLanguage, spa_old) { LangLoader("spa_old", GetParam()); }
+TEST_P(LoadLanguage, sqi) { LangLoader("sqi", GetParam()); }
+TEST_P(LoadLanguage, srp) { LangLoader("srp", GetParam()); }
+TEST_P(LoadLanguage, srp_latn) { LangLoader("srp_latn", GetParam()); }
+TEST_P(LoadLanguage, sun) { LangLoader("sun", GetParam()); }
+TEST_P(LoadLanguage, swa) { LangLoader("swa", GetParam()); }
+TEST_P(LoadLanguage, swe) { LangLoader("swe", GetParam()); }
+TEST_P(LoadLanguage, syr) { LangLoader("syr", GetParam()); }
+TEST_P(LoadLanguage, tam) { LangLoader("tam", GetParam()); }
+TEST_P(LoadLanguage, tat) { LangLoader("tat", GetParam()); }
+TEST_P(LoadLanguage, tel) { LangLoader("tel", GetParam()); }
+TEST_P(LoadLanguage, tgk) { LangLoader("tgk", GetParam()); }
+TEST_P(LoadLanguage, tha) { LangLoader("tha", GetParam()); }
+TEST_P(LoadLanguage, tir) { LangLoader("tir", GetParam()); }
+TEST_P(LoadLanguage, ton) { LangLoader("ton", GetParam()); }
+TEST_P(LoadLanguage, tur) { LangLoader("tur", GetParam()); }
+TEST_P(LoadLanguage, uig) { LangLoader("uig", GetParam()); }
+TEST_P(LoadLanguage, ukr) { LangLoader("ukr", GetParam()); }
+TEST_P(LoadLanguage, urd) { LangLoader("urd", GetParam()); }
+TEST_P(LoadLanguage, uzb) { LangLoader("uzb", GetParam()); }
+TEST_P(LoadLanguage, uzb_cyrl) { LangLoader("uzb_cyrl", GetParam()); }
+TEST_P(LoadLanguage, vie) { LangLoader("vie", GetParam()); }
+TEST_P(LoadLanguage, yid) { LangLoader("yid", GetParam()); }
+TEST_P(LoadLanguage, yor) { LangLoader("yor", GetParam()); }
+
+INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadLanguage,
+ ::testing::Values(TESSDATA_DIR "_fast"));
+INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadLanguage,
+ ::testing::Values(TESSDATA_DIR "_best"));
+INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage,
+ ::testing::Values(TESSDATA_DIR));
+
+// For all scripts
+
+class LoadScript : public QuickTest,
+ public ::testing::WithParamInterface<const char*> {};
+
+TEST_P(LoadScript, Arabic) { LangLoader("script/Arabic", GetParam()); }
+TEST_P(LoadScript, Armenian) { LangLoader("script/Armenian", GetParam()); }
+TEST_P(LoadScript, Bengali) { LangLoader("script/Bengali", GetParam()); }
+TEST_P(LoadScript, Canadian_Aboriginal) {
+ LangLoader("script/Canadian_Aboriginal", GetParam());
+}
+TEST_P(LoadScript, Cherokee) { LangLoader("script/Cherokee", GetParam()); }
+TEST_P(LoadScript, Cyrillic) { LangLoader("script/Cyrillic", GetParam()); }
+TEST_P(LoadScript, Devanagari) { LangLoader("script/Devanagari", GetParam()); }
+TEST_P(LoadScript, Ethiopic) { LangLoader("script/Ethiopic", GetParam()); }
+TEST_P(LoadScript, Fraktur) { LangLoader("script/Fraktur", GetParam()); }
+TEST_P(LoadScript, Georgian) { LangLoader("script/Georgian", GetParam()); }
+TEST_P(LoadScript, Greek) { LangLoader("script/Greek", GetParam()); }
+TEST_P(LoadScript, Gujarati) { LangLoader("script/Gujarati", GetParam()); }
+TEST_P(LoadScript, Gurmukhi) { LangLoader("script/Gurmukhi", GetParam()); }
+TEST_P(LoadScript, HanS) { LangLoader("script/HanS", GetParam()); }
+TEST_P(LoadScript, HanS_vert) { LangLoader("script/HanS_vert", GetParam()); }
+TEST_P(LoadScript, HanT) { LangLoader("script/HanT", GetParam()); }
+TEST_P(LoadScript, HanT_vert) { LangLoader("script/HanT_vert", GetParam()); }
+TEST_P(LoadScript, Hangul) { LangLoader("script/Hangul", GetParam()); }
+TEST_P(LoadScript, Hangul_vert) {
+ LangLoader("script/Hangul_vert", GetParam());
+}
+TEST_P(LoadScript, Hebrew) { LangLoader("script/Hebrew", GetParam()); }
+TEST_P(LoadScript, Japanese) { LangLoader("script/Japanese", GetParam()); }
+TEST_P(LoadScript, Japanese_vert) {
+ LangLoader("script/Japanese_vert", GetParam());
+}
+TEST_P(LoadScript, Kannada) { LangLoader("script/Kannada", GetParam()); }
+TEST_P(LoadScript, Khmer) { LangLoader("script/Khmer", GetParam()); }
+TEST_P(LoadScript, Lao) { LangLoader("script/Lao", GetParam()); }
+TEST_P(LoadScript, Latin) { LangLoader("script/Latin", GetParam()); }
+TEST_P(LoadScript, Malayalam) { LangLoader("script/Malayalam", GetParam()); }
+TEST_P(LoadScript, Myanmar) { LangLoader("script/Myanmar", GetParam()); }
+TEST_P(LoadScript, Oriya) { LangLoader("script/Oriya", GetParam()); }
+TEST_P(LoadScript, Sinhala) { LangLoader("script/Sinhala", GetParam()); }
+TEST_P(LoadScript, Syriac) { LangLoader("script/Syriac", GetParam()); }
+TEST_P(LoadScript, Tamil) { LangLoader("script/Tamil", GetParam()); }
+TEST_P(LoadScript, Telugu) { LangLoader("script/Telugu", GetParam()); }
+TEST_P(LoadScript, Thaana) { LangLoader("script/Thaana", GetParam()); }
+TEST_P(LoadScript, Thai) { LangLoader("script/Thai", GetParam()); }
+TEST_P(LoadScript, Tibetan) { LangLoader("script/Tibetan", GetParam()); }
+TEST_P(LoadScript, Vietnamese) { LangLoader("script/Vietnamese", GetParam()); }
+
+INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadScript,
+ ::testing::Values(TESSDATA_DIR "_fast"));
+INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadScript,
+ ::testing::Values(TESSDATA_DIR "_best"));
+INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript,
+ ::testing::Values(TESSDATA_DIR));
+
+class LoadLang : public QuickTest {};
+
+// Test Load of English here, as the parameterized tests are disabled by
+// default.
+TEST_F(LoadLang, engFast) { LangLoader("eng", TESSDATA_DIR "_fast"); }
+TEST_F(LoadLang, engBest) { LangLoader("eng", TESSDATA_DIR "_best"); }
+TEST_F(LoadLang, engBestInt) { LangLoader("eng", TESSDATA_DIR); }
+
+// Use class LoadLang for languages which are NOT there in all three repos
+TEST_F(LoadLang, kmrFast) { LangLoader("kmr", TESSDATA_DIR "_fast"); }
+TEST_F(LoadLang, kmrBest) { LangLoader("kmr", TESSDATA_DIR "_best"); }
+// TEST_F(LoadLang, kmrBestInt) {LangLoader("kmr" , TESSDATA_DIR);}
+
+} // namespace
diff --git a/tesseract/unittest/log.h b/tesseract/unittest/log.h
new file mode 100644
index 00000000..0b21f3ee
--- /dev/null
+++ b/tesseract/unittest/log.h
@@ -0,0 +1,67 @@
+///////////////////////////////////////////////////////////////////////
+// File: log.h
+// Description: Minimal logging support with custom log messages for the
+// tesseract unit tests, based on
+// https://stackoverflow.com/questions/16491675/how-to-send-custom-message-in-google-c-testing-framework
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_UNITTEST_LOG_H_
+#define TESSERACT_UNITTEST_LOG_H_
+
+// This is a minimal implementation of the TensorFlow logging API
+// which is sufficient for the Tesseract unit tests.
+
+// See tensorflow/core/platform/default/logging.h for the original code.
+
+#include <iostream>
+
+enum LogLevel {
+ INFO, WARNING, ERROR, FATAL
+};
+
+// Avoid conflict with logging.h from TensorFlow.
+#undef LOG
+
+static inline std::ostream& LOG(enum LogLevel level)
+{
+ switch (level) {
+ case INFO:
+ std::cout << "[INFO] ";
+ break;
+ case WARNING:
+ std::cout << "[WARN] ";
+ break;
+ case ERROR:
+ std::cout << "[ERROR] ";
+ break;
+ case FATAL:
+ std::cout << "[FATAL] ";
+ break;
+ }
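+ // Note: unlike the full TensorFlow/glog implementation, LOG(FATAL) does not
+ // abort here; it only prefixes the message.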
+ return std::cout;
+}
+
+// Avoid conflict with logging.h from TensorFlow.
+#undef QCHECK
+
+// https://github.com/google/ion/blob/master/ion/base/logging.h
+static inline std::ostream& QCHECK(bool condition)
+{
+ if (condition) {
+ static std::ostream null_stream(nullptr);
+ return null_stream;
+ }
+ return std::cout;
+}
+
+#endif // TESSERACT_UNITTEST_LOG_H_
diff --git a/tesseract/unittest/lstm_recode_test.cc b/tesseract/unittest/lstm_recode_test.cc
new file mode 100644
index 00000000..5365bf4b
--- /dev/null
+++ b/tesseract/unittest/lstm_recode_test.cc
@@ -0,0 +1,45 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lstm_test.h"
+
+namespace tesseract {
+
+// Tests that training with unicharset recoding learns faster than without,
+// for Korean. This test is split in two, so it can be run sharded.
+
+TEST_F(LSTMTrainerTest, RecodeTestKorBase) {
+ // A basic single-layer, bi-di 1d LSTM on Korean.
+ SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-full", "kor/kor.unicharset",
+ "kor.Arial_Unicode_MS.exp0.lstmf", false, true, 5e-4, false, "kor");
+ double kor_full_err = TrainIterations(kTrainerIterations * 2);
+ EXPECT_LT(kor_full_err, 88);
+// EXPECT_GT(kor_full_err, 85);
+ LOG(INFO) << "********** Expected < 88 ************\n" ;
+}
+
+TEST_F(LSTMTrainerTest, RecodeTestKor) {
+ // A basic single-layer, bi-di 1d LSTM on Korean.
+ SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-recode", "kor/kor.unicharset",
+ "kor.Arial_Unicode_MS.exp0.lstmf", true, true, 5e-4, false, "kor");
+ double kor_recode_err = TrainIterations(kTrainerIterations);
+ EXPECT_LT(kor_recode_err, 60);
+ LOG(INFO) << "********** Expected < 60 ************\n" ;
+}
+
+// Tests that the given string encodes and decodes back to the same
+// with both recode on and off for Korean.
+
+TEST_F(LSTMTrainerTest, EncodeDecodeBothTestKor) {
+ TestEncodeDecodeBoth("kor", "한국어 위키백과에 오신 것을 환영합니다!");
+}
+
+} // namespace tesseract.
diff --git a/tesseract/unittest/lstm_squashed_test.cc b/tesseract/unittest/lstm_squashed_test.cc
new file mode 100644
index 00000000..1dd08746
--- /dev/null
+++ b/tesseract/unittest/lstm_squashed_test.cc
@@ -0,0 +1,31 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lstm_test.h"
+
+namespace tesseract {
+
+// Tests that a Squashed network learns correctly.
+// Almost as fast as the 2d-lstm.
+TEST_F(LSTMTrainerTest, TestSquashed) {
+ // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom, and
+ // a small convolution/maxpool below that.
+ // Match training conditions to those typically used with this spec:
+ // recoding on, adam on.
+ SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]",
+ "SQU-2-layer-lstm", /*recode*/ true, /*adam*/ true);
+ double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2);
+ EXPECT_LT(lstm_2d_err, 80);
+ LOG(INFO) << "********** < 80 ************\n" ;
+ TestIntMode(kTrainerIterations);
+}
+
+} // namespace tesseract.
diff --git a/tesseract/unittest/lstm_test.cc b/tesseract/unittest/lstm_test.cc
new file mode 100644
index 00000000..930384a6
--- /dev/null
+++ b/tesseract/unittest/lstm_test.cc
@@ -0,0 +1,221 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Generating the training data:
+// If the format of the lstmf (ImageData) file changes, the training data will
+// have to be regenerated as follows:
+//
+// Use --xsize 800 for text2image to be similar to original training data.
+//
+// src/training/tesstrain.sh --fonts_dir /usr/share/fonts --lang eng \
+// --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \
+// --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \
+// --fontlist "Arial" --maxpages 10
+//
+
+#include "lstm_test.h"
+
+namespace tesseract {
+
+// Tests that some simple networks can learn Arial and meet accuracy targets.
+TEST_F(LSTMTrainerTest, BasicTest) {
+ // A Convolver sliding window classifier without LSTM.
+ SetupTrainer(
+ "[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 "
+ "Ct1,1,64O1c1]",
+ "no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false,
+ 2e-4, false, "eng");
+ double non_lstm_err = TrainIterations(kTrainerIterations * 4);
+ EXPECT_LT(non_lstm_err, 98);
+ LOG(INFO) << "********** Expected < 98 ************\n" ;
+
+ // A basic single-layer, single direction LSTM.
+ SetupTrainerEng("[1,1,0,32 Lfx100 O1c1]", "1D-lstm", false, false);
+ double lstm_uni_err = TrainIterations(kTrainerIterations * 2);
+ EXPECT_LT(lstm_uni_err, 86);
+ LOG(INFO) << "********** Expected < 86 ************\n" ;
+ // Beats the convolver. (Although it does have a lot more weights, it still
+ // iterates faster.)
+ EXPECT_LT(lstm_uni_err, non_lstm_err);
+}
+
+// Color learns almost as fast as normalized grey/2D.
+TEST_F(LSTMTrainerTest, ColorTest) {
+ // A basic single-layer, single direction LSTM.
+ SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
+ "2D-color-lstm", true, true);
+ double lstm_uni_err = TrainIterations(kTrainerIterations);
+ EXPECT_LT(lstm_uni_err, 85);
+// EXPECT_GT(lstm_uni_err, 66);
+ LOG(INFO) << "********** Expected < 85 ************\n" ;
+}
+
+TEST_F(LSTMTrainerTest, BidiTest) {
+ // A basic single-layer, bi-di 1d LSTM.
+ SetupTrainerEng("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", false, false);
+ double lstm_bi_err = TrainIterations(kTrainerIterations);
+ EXPECT_LT(lstm_bi_err, 75);
+ LOG(INFO) << "********** Expected < 75 ************\n" ;
+ // Int mode training is dead, so convert the trained network to int and check
+ // that its error rate is close to the float version.
+ TestIntMode(kTrainerIterations);
+}
+
+// Tests that a 2d-2-layer network learns correctly.
+// It takes a lot of iterations to get there.
+TEST_F(LSTMTrainerTest, Test2D) {
+ // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom.
+ SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
+ "2-D-2-layer-lstm", false, false);
+ double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2 );
+ EXPECT_LT(lstm_2d_err, 98);
+// EXPECT_GT(lstm_2d_err, 90);
+ LOG(INFO) << "********** Expected < 98 ************\n" ;
+ // Int mode training is dead, so convert the trained network to int and check
+ // that its error rate is close to the float version.
+ TestIntMode(kTrainerIterations);
+}
+
+// Tests that a 2d-2-layer network with Adam does *a lot* better than
+// without it.
+TEST_F(LSTMTrainerTest, TestAdam) {
+ // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom.
+ SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
+ "2-D-2-layer-lstm", false, true);
+ double lstm_2d_err = TrainIterations(kTrainerIterations);
+ EXPECT_LT(lstm_2d_err, 70);
+ LOG(INFO) << "********** Expected < 70 ************\n" ;
+ TestIntMode(kTrainerIterations);
+}
+
+// Trivial test of training speed on a fairly complex network.
+TEST_F(LSTMTrainerTest, SpeedTest) {
+ SetupTrainerEng(
+ "[1,30,0,1 Ct5,5,16 Mp2,2 L2xy24 Ct1,1,48 Mp5,1 Ct1,1,32 S3,1 Lbx64 "
+ "O1c1]",
+ "2-D-2-layer-lstm", false, true);
+ TrainIterations(kTrainerIterations);
+ LOG(INFO) << "********** *** ************\n" ;
+}
+
+// Tests that two identical networks trained the same get the same results.
+// Also tests that the same happens with a serialize/deserialize in the middle.
+TEST_F(LSTMTrainerTest, DeterminismTest) {
+ SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
+ "2-D-2-layer-lstm", false, false);
+ double lstm_2d_err_a = TrainIterations(kTrainerIterations);
+ double act_error_a = trainer_->ActivationError();
+ double char_error_a = trainer_->CharError();
+ std::vector<char> trainer_a_data;
+ EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(),
+ &trainer_a_data));
+ SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
+ "2-D-2-layer-lstm", false, false);
+ double lstm_2d_err_b = TrainIterations(kTrainerIterations);
+ double act_error_b = trainer_->ActivationError();
+ double char_error_b = trainer_->CharError();
+ EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);
+ EXPECT_FLOAT_EQ(act_error_a, act_error_b);
+ EXPECT_FLOAT_EQ(char_error_a, char_error_b);
+ // Now train some more iterations.
+ lstm_2d_err_b = TrainIterations(kTrainerIterations / 3);
+ act_error_b = trainer_->ActivationError();
+ char_error_b = trainer_->CharError();
+ // Unpack into a new trainer and train that some more too.
+ SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]",
+ "2-D-2-layer-lstm", false, false);
+ EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, trainer_.get()));
+ lstm_2d_err_a = TrainIterations(kTrainerIterations / 3);
+ act_error_a = trainer_->ActivationError();
+ char_error_a = trainer_->CharError();
+ EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b);
+ EXPECT_FLOAT_EQ(act_error_a, act_error_b);
+ EXPECT_FLOAT_EQ(char_error_a, char_error_b);
+ LOG(INFO) << "********** *** ************\n" ;
+}
+
+// The baseline network against which to test the built-in softmax.
+TEST_F(LSTMTrainerTest, SoftmaxBaselineTest) {
+ // A basic single-layer, single direction LSTM.
+ SetupTrainerEng("[1,1,0,32 Lfx96 O1c1]", "1D-lstm", false, true);
+ double lstm_uni_err = TrainIterations(kTrainerIterations * 2);
+ EXPECT_LT(lstm_uni_err, 60);
+// EXPECT_GT(lstm_uni_err, 48);
+ LOG(INFO) << "********** Expected < 60 ************\n" ;
+ // Check that it works in int mode too.
+ TestIntMode(kTrainerIterations);
+ // If we run TestIntMode again, it tests that int_mode networks can
+ // serialize and deserialize correctly.
+ double delta = TestIntMode(kTrainerIterations);
+ // The two tests (both of int mode this time) should be almost identical.
+ LOG(INFO) << "Delta in Int mode error rates = " << delta << "\n";
+ EXPECT_LT(delta, 0.01);
+}
+
+// Tests that the built-in softmax does better than the external one,
+// which has an error rate slightly less than 55%, as tested by
+// SoftmaxBaselineTest.
+TEST_F(LSTMTrainerTest, SoftmaxTest) {
+ // LSTM with a built-in softmax can beat the external softmax.
+ SetupTrainerEng("[1,1,0,32 LS96]", "Lstm-+-softmax", false, true);
+ double lstm_sm_err = TrainIterations(kTrainerIterations * 2);
+ EXPECT_LT(lstm_sm_err, 49.0);
+ LOG(INFO) << "********** Expected < 49 ************\n" ;
+ // Check that it works in int mode too.
+ TestIntMode(kTrainerIterations);
+}
+
+// Tests that the built-in encoded softmax does better than the external one.
+// It takes a lot of iterations to get there.
+TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) {
+ // LSTM with a built-in encoded softmax can beat the external softmax.
+ SetupTrainerEng("[1,1,0,32 LE96]", "Lstm-+-softmax", false, true);
+ double lstm_sm_err = TrainIterations(kTrainerIterations * 2);
+ EXPECT_LT(lstm_sm_err, 62.0);
+ LOG(INFO) << "********** Expected < 62 ************\n" ;
+ // Check that it works in int mode too.
+ TestIntMode(kTrainerIterations);
+}
+
+// Tests that layer access methods work correctly.
+TEST_F(LSTMTrainerTest, TestLayerAccess) {
+ // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom.
+ SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm",
+ false, false);
+ // Number of layers.
+ const int kNumLayers = 8;
+ // Expected layer names.
+ const char* kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2",
+ ":3:0", ":4:0", ":4:1:0", ":5"};
+ const char* kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL",
+ "Maxpool", "Lfys32", "Lbx128LTR",
+ "Lbx128", "Output"};
+ // Expected number of weights.
+ const int kNumWeights[kNumLayers] = {0,
+ 0,
+ 16 * (25 + 1),
+ 0,
+ 32 * (4 * (32 + 16 + 1)),
+ 128 * (4 * (128 + 32 + 1)),
+ 128 * (4 * (128 + 32 + 1)),
+ 112 * (2 * 128 + 1)};
+
+ auto layers = trainer_->EnumerateLayers();
+ EXPECT_EQ(kNumLayers, layers.size());
+ for (int i = 0; i < kNumLayers && i < layers.size(); ++i) {
+ EXPECT_STREQ(kLayerIds[i], layers[i].c_str());
+ EXPECT_STREQ(kLayerNames[i],
+ trainer_->GetLayer(layers[i])->name().c_str());
+ EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights());
+ }
+}
+
+} // namespace tesseract.
diff --git a/tesseract/unittest/lstm_test.h b/tesseract/unittest/lstm_test.h
new file mode 100644
index 00000000..4f3d9572
--- /dev/null
+++ b/tesseract/unittest/lstm_test.h
@@ -0,0 +1,189 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TESSERACT_UNITTEST_LSTM_TEST_H_
+#define TESSERACT_UNITTEST_LSTM_TEST_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "include_gunit.h"
+
+#include "absl/strings/str_cat.h"
+#include "tprintf.h"
+#include "helpers.h"
+
+#include "functions.h"
+#include "lang_model_helpers.h"
+#include "log.h" // for LOG
+#include "lstmtrainer.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+#if DEBUG_DETAIL == 0
+// Number of iterations to run all the trainers.
+const int kTrainerIterations = 600;
+// Number of iterations between accuracy checks.
+const int kBatchIterations = 100;
+#else
+// Number of iterations to run all the trainers.
+const int kTrainerIterations = 2;
+// Number of iterations between accuracy checks.
+const int kBatchIterations = 1;
+#endif
+
+// The fixture for testing LSTMTrainer.
+class LSTMTrainerTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ LSTMTrainerTest() {}
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTDATA_DIR,
+ "" + name);
+ }
+ std::string TessDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESSDATA_DIR,
+ "" + name);
+ }
+ std::string TestingNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR,
+ "" + name);
+ }
+
+ void SetupTrainerEng(const std::string& network_spec, const std::string& model_name,
+ bool recode, bool adam) {
+ SetupTrainer(network_spec, model_name, "eng/eng.unicharset",
+ "eng.Arial.exp0.lstmf", recode, adam, 5e-4, false, "eng");
+ }
+ void SetupTrainer(const std::string& network_spec, const std::string& model_name,
+ const std::string& unicharset_file, const std::string& lstmf_file,
+ bool recode, bool adam, double learning_rate,
+ bool layer_specific, const std::string& kLang) {
+// constexpr char kLang[] = "eng"; // Exact value doesn't matter.
+ std::string unicharset_name = TestDataNameToPath(unicharset_file);
+ UNICHARSET unicharset;
+ ASSERT_TRUE(unicharset.load_from_file(unicharset_name.c_str(), false));
+ std::string script_dir = file::JoinPath(
+ LANGDATA_DIR, "");
+ std::vector<STRING> words;
+ EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, "", FLAGS_test_tmpdir,
+ kLang, !recode, words, words, words, false,
+ nullptr, nullptr));
+ std::string model_path = file::JoinPath(FLAGS_test_tmpdir, model_name);
+ std::string checkpoint_path = model_path + "_checkpoint";
+ trainer_.reset(new LSTMTrainer(model_path.c_str(), checkpoint_path.c_str(),
+ 0, 0));
+ trainer_->InitCharSet(file::JoinPath(FLAGS_test_tmpdir, kLang,
+ absl::StrCat(kLang, ".traineddata")));
+ int net_mode = adam ? NF_ADAM : 0;
+ // Adam needs a higher learning rate, due to not multiplying the effective
+ // rate by 1/(1-momentum).
+ if (adam) learning_rate *= 20.0;
+ if (layer_specific) net_mode |= NF_LAYER_SPECIFIC_LR;
+ EXPECT_TRUE(trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1,
+ learning_rate, 0.9, 0.999));
+ std::vector<STRING> filenames;
+ filenames.push_back(STRING(TestDataNameToPath(lstmf_file).c_str()));
+ EXPECT_TRUE(trainer_->LoadAllTrainingData(filenames, CS_SEQUENTIAL, false));
+ LOG(INFO) << "Setup network:" << model_name << "\n" ;
+ }
+ // Trains for a given number of iterations and returns the char error rate.
+ double TrainIterations(int max_iterations) {
+ int iteration = trainer_->training_iteration();
+ int iteration_limit = iteration + max_iterations;
+ double best_error = 100.0;
+ do {
+ STRING log_str;
+ int target_iteration = iteration + kBatchIterations;
+ // Train a few.
+ double mean_error = 0.0;
+ while (iteration < target_iteration && iteration < iteration_limit) {
+ trainer_->TrainOnLine(trainer_.get(), false);
+ iteration = trainer_->training_iteration();
+ mean_error += trainer_->LastSingleError(ET_CHAR_ERROR);
+ }
+ trainer_->MaintainCheckpoints(nullptr, &log_str);
+ iteration = trainer_->training_iteration();
+ mean_error *= 100.0 / kBatchIterations;
+ if (mean_error < best_error) best_error = mean_error;
+ } while (iteration < iteration_limit);
+ LOG(INFO) << "Trainer error rate = " << best_error << "\n";
+ return best_error;
+ }
+ // Tests for a given number of iterations and returns the char error rate.
+ double TestIterations(int max_iterations) {
+ CHECK_GT(max_iterations, 0);
+ int iteration = trainer_->sample_iteration();
+ double mean_error = 0.0;
+ int error_count = 0;
+ while (error_count < max_iterations) {
+ const ImageData& trainingdata =
+ *trainer_->mutable_training_data()->GetPageBySerial(iteration);
+ NetworkIO fwd_outputs, targets;
+ if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) !=
+ UNENCODABLE) {
+ mean_error += trainer_->NewSingleError(ET_CHAR_ERROR);
+ ++error_count;
+ }
+ trainer_->SetIteration(++iteration);
+ }
+ mean_error *= 100.0 / max_iterations;
+ LOG(INFO) << "Tester error rate = " << mean_error << "\n" ;
+ return mean_error;
+ }
+  // Tests that the current trainer_ can be converted to int mode with an error
+  // rate that stays within 1% of the float error rate. Returns the increase in
+  // error from float to int.
+ double TestIntMode(int test_iterations) {
+ std::vector<char> trainer_data;
+ EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(),
+ &trainer_data));
+ // Get the error on the next few iterations in float mode.
+ double float_err = TestIterations(test_iterations);
+ // Restore the dump, convert to int and test error on that.
+ EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_data, trainer_.get()));
+ trainer_->ConvertToInt();
+ double int_err = TestIterations(test_iterations);
+ EXPECT_LT(int_err, float_err + 1.0);
+ return int_err - float_err;
+ }
+ // Sets up a trainer with the given language and given recode+ctc condition.
+ // It then verifies that the given str encodes and decodes back to the same
+ // string.
+ void TestEncodeDecode(const std::string& lang, const std::string& str, bool recode) {
+ std::string unicharset_name = lang + "/" + lang + ".unicharset";
+ std::string lstmf_name = lang + ".Arial_Unicode_MS.exp0.lstmf";
+ SetupTrainer("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", unicharset_name,
+ lstmf_name, recode, true, 5e-4, true, lang);
+ std::vector<int> labels;
+ EXPECT_TRUE(trainer_->EncodeString(str.c_str(), &labels));
+ STRING decoded = trainer_->DecodeLabels(labels);
+ std::string decoded_str(&decoded[0], decoded.length());
+ EXPECT_EQ(str, decoded_str);
+ }
+  // Calls TestEncodeDecode with both recode on and off.
+ void TestEncodeDecodeBoth(const std::string& lang, const std::string& str) {
+ TestEncodeDecode(lang, str, false);
+ TestEncodeDecode(lang, str, true);
+ }
+
+ std::unique_ptr<LSTMTrainer> trainer_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_UNITTEST_LSTM_TEST_H_
diff --git a/tesseract/unittest/lstmtrainer_test.cc b/tesseract/unittest/lstmtrainer_test.cc
new file mode 100644
index 00000000..967d1fe5
--- /dev/null
+++ b/tesseract/unittest/lstmtrainer_test.cc
@@ -0,0 +1,106 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "allheaders.h"
+#include <tesseract/baseapi.h>
+#include "lstm_test.h"
+
+namespace tesseract {
+
+TEST_F(LSTMTrainerTest, EncodesEng) {
+ TestEncodeDecodeBoth("eng",
+ "The quick brown 'fox' jumps over: the lazy dog!");
+}
+
+TEST_F(LSTMTrainerTest, EncodesKan) {
+ TestEncodeDecodeBoth("kan", "ಫ್ರಬ್ರವರಿ ತತ್ವಾಂಶಗಳೆಂದರೆ ಮತ್ತು ಜೊತೆಗೆ ಕ್ರಮವನ್ನು");
+}
+
+TEST_F(LSTMTrainerTest, EncodesKor) {
+ TestEncodeDecodeBoth("kor",
+ "이는 것으로 다시 넣을 수는 있지만 선택의 의미는");
+}
+
+TEST_F(LSTMTrainerTest, MapCoder) {
+ LSTMTrainer fra_trainer;
+ fra_trainer.InitCharSet(TestDataNameToPath("fra/fra.traineddata"));
+ LSTMTrainer deu_trainer;
+ deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata"));
+ // A string that uses characters common to French and German.
+ std::string kTestStr = "The quick brown 'fox' jumps over: the lazy dog!";
+ std::vector<int> deu_labels;
+ EXPECT_TRUE(deu_trainer.EncodeString(kTestStr.c_str(), &deu_labels));
+ // The french trainer cannot decode them correctly.
+ STRING badly_decoded = fra_trainer.DecodeLabels(deu_labels);
+ std::string bad_str(&badly_decoded[0], badly_decoded.length());
+ LOG(INFO) << "bad_str fra=" << bad_str << "\n";
+ EXPECT_NE(kTestStr, bad_str);
+ // Encode the string as fra.
+ std::vector<int> fra_labels;
+ EXPECT_TRUE(fra_trainer.EncodeString(kTestStr.c_str(), &fra_labels));
+ // Use the mapper to compute what the labels are as deu.
+ std::vector<int> mapping = fra_trainer.MapRecoder(deu_trainer.GetUnicharset(),
+ deu_trainer.GetRecoder());
+ std::vector<int> mapped_fra_labels(fra_labels.size(), -1);
+ for (int i = 0; i < fra_labels.size(); ++i) {
+ mapped_fra_labels[i] = mapping[fra_labels[i]];
+ EXPECT_NE(-1, mapped_fra_labels[i]) << "i=" << i << ", ch=" << kTestStr[i];
+ EXPECT_EQ(mapped_fra_labels[i], deu_labels[i])
+ << "i=" << i << ", ch=" << kTestStr[i]
+ << " has deu label=" << deu_labels[i] << ", but mapped to "
+ << mapped_fra_labels[i];
+ }
+ // The german trainer can now decode them correctly.
+ STRING decoded = deu_trainer.DecodeLabels(mapped_fra_labels);
+ std::string ok_str(&decoded[0], decoded.length());
+ LOG(INFO) << "ok_str deu=" << ok_str << "\n";
+ EXPECT_EQ(kTestStr, ok_str);
+}
+
+// Tests that the actual fra model can be converted to the deu character set
+// and still read an eng image with 100% accuracy.
+TEST_F(LSTMTrainerTest, ConvertModel) {
+ // Setup a trainer with a deu charset.
+ LSTMTrainer deu_trainer;
+ deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata"));
+ // Load the fra traineddata, strip out the model, and save to a tmp file.
+ TessdataManager mgr;
+ std::string fra_data =
+ file::JoinPath(TESSDATA_DIR "_best", "fra.traineddata");
+ CHECK(mgr.Init(fra_data.c_str()));
+ LOG(INFO) << "Load " << fra_data << "\n";
+ file::MakeTmpdir();
+ std::string model_path = file::JoinPath(FLAGS_test_tmpdir, "fra.lstm");
+ CHECK(mgr.ExtractToFile(model_path.c_str()));
+ LOG(INFO) << "Extract " << model_path << "\n";
+ // Load the fra model into the deu_trainer, and save the converted model.
+ CHECK(deu_trainer.TryLoadingCheckpoint(model_path.c_str(), fra_data.c_str()));
+ LOG(INFO) << "Checkpoint load for " << model_path << " and " << fra_data << "\n";
+ std::string deu_data = file::JoinPath(FLAGS_test_tmpdir, "deu.traineddata");
+ CHECK(deu_trainer.SaveTraineddata(deu_data.c_str()));
+ LOG(INFO) << "Save " << deu_data << "\n";
+ // Now run the saved model on phototest. (See BasicTesseractTest in
+ // baseapi_test.cc).
+ TessBaseAPI api;
+ api.Init(FLAGS_test_tmpdir, "deu", tesseract::OEM_LSTM_ONLY);
+ Pix* src_pix = pixRead(TestingNameToPath("phototest.tif").c_str());
+ CHECK(src_pix);
+ api.SetImage(src_pix);
+ std::unique_ptr<char[]> result(api.GetUTF8Text());
+ std::string truth_text;
+ CHECK_OK(file::GetContents(TestingNameToPath("phototest.gold.txt"),
+ &truth_text, file::Defaults()));
+
+ EXPECT_STREQ(truth_text.c_str(), result.get());
+ pixDestroy(&src_pix);
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/mastertrainer_test.cc b/tesseract/unittest/mastertrainer_test.cc
new file mode 100644
index 00000000..0f93e221
--- /dev/null
+++ b/tesseract/unittest/mastertrainer_test.cc
@@ -0,0 +1,298 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Although this is a trivial-looking test, it exercises a lot of code:
+// SampleIterator has to correctly iterate over the correct characters, or
+// it will fail.
+// The canonical and cloud features computed by TrainingSampleSet need to
+// be correct, along with the distance caches, organizing samples by font
+// and class, indexing of features, distance calculations.
+// IntFeatureDist has to work, or the canonical samples won't work.
+// The MasterTrainer's ability to read tr files and set itself up is also
+// tested.
+// Finally the serialize/deserialize test ensures that MasterTrainer,
+// TrainingSampleSet, TrainingSample can all serialize/deserialize correctly
+// enough to reproduce the same results.
+
+#include "include_gunit.h"
+
+#include "log.h" // for LOG
+#include "unicharset.h"
+#include "errorcounter.h"
+#include "mastertrainer.h"
+#include "shapeclassifier.h"
+#include "shapetable.h"
+#include "trainingsample.h"
+#include "commontraining.h"
+
+#include "absl/strings/numbers.h" // for safe_strto32
+#include "absl/strings/str_split.h" // for absl::StrSplit
+
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace tesseract;
+
+// Specs of the MockClassifier.
+static const int kNumTopNErrs = 10;
+static const int kNumTop2Errs = kNumTopNErrs + 20;
+static const int kNumTop1Errs = kNumTop2Errs + 30;
+static const int kNumTopTopErrs = kNumTop1Errs + 25;
+static const int kNumNonReject = 1000;
+static const int kNumCorrect = kNumNonReject - kNumTop1Errs;
+// The total number of answers is given by the number of non-rejects plus
+// all the multiple answers.
+static const int kNumAnswers = kNumNonReject + 2 * (kNumTop2Errs - kNumTopNErrs) +
+ (kNumTop1Errs - kNumTop2Errs) +
+ (kNumTopTopErrs - kNumTop1Errs);
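+// For reference (a derived sanity check, not used by the code): with the
+// values above, kNumTopNErrs=10, kNumTop2Errs=30, kNumTop1Errs=60,
+// kNumTopTopErrs=85, kNumCorrect=940, and
+// kNumAnswers = 1000 + 2*(30-10) + (60-30) + (85-60) = 1095.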
+
+#ifndef DISABLED_LEGACY_ENGINE
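+// Minimal local stand-in for a safe string-to-int conversion: it parses with
+// strtol and always reports success, which is sufficient for the well-formed
+// numeric fields of the accuracy report parsed in ErrorCounterTest below.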
+static bool safe_strto32(const std::string& str, int* pResult)
+{
+ long n = strtol(str.c_str(), nullptr, 0);
+ *pResult = n;
+ return true;
+}
+#endif
+
+// Mock ShapeClassifier that cheats by looking at the correct answer, and
+// creates a specific pattern of errors that can be tested.
+class MockClassifier : public ShapeClassifier {
+ public:
+ explicit MockClassifier(ShapeTable* shape_table)
+ : shape_table_(shape_table), num_done_(0), done_bad_font_(false) {
+ // Add a false font answer to the shape table. We pick a random unichar_id,
+ // add a new shape for it with a false font. Font must actually exist in
+ // the font table, but not match anything in the first 1000 samples.
+ false_unichar_id_ = 67;
+ false_shape_ = shape_table_->AddShape(false_unichar_id_, 25);
+ }
+ virtual ~MockClassifier() {}
+
+ // Classifies the given [training] sample, writing to results.
+ // If debug is non-zero, then various degrees of classifier dependent debug
+ // information is provided.
+ // If keep_this (a shape index) is >= 0, then the results should always
+ // contain keep_this, and (if possible) anything of intermediate confidence.
+ // The return value is the number of classes saved in results.
+ int ClassifySample(const TrainingSample& sample, Pix* page_pix,
+ int debug, UNICHAR_ID keep_this,
+ std::vector<ShapeRating>* results) override {
+ results->clear();
+ // Everything except the first kNumNonReject is a reject.
+ if (++num_done_ > kNumNonReject) return 0;
+
+ int class_id = sample.class_id();
+ int font_id = sample.font_id();
+ int shape_id = shape_table_->FindShape(class_id, font_id);
+ // Get ids of some wrong answers.
+ int wrong_id1 = shape_id > 10 ? shape_id - 1 : shape_id + 1;
+ int wrong_id2 = shape_id > 10 ? shape_id - 2 : shape_id + 2;
+ if (num_done_ <= kNumTopNErrs) {
+ // The first kNumTopNErrs are top-n errors.
+ results->push_back(ShapeRating(wrong_id1, 1.0f));
+ } else if (num_done_ <= kNumTop2Errs) {
+ // The next kNumTop2Errs - kNumTopNErrs are top-2 errors.
+ results->push_back(ShapeRating(wrong_id1, 1.0f));
+ results->push_back(ShapeRating(wrong_id2, 0.875f));
+ results->push_back(ShapeRating(shape_id, 0.75f));
+ } else if (num_done_ <= kNumTop1Errs) {
+ // The next kNumTop1Errs - kNumTop2Errs are top-1 errors.
+ results->push_back(ShapeRating(wrong_id1, 1.0f));
+ results->push_back(ShapeRating(shape_id, 0.8f));
+ } else if (num_done_ <= kNumTopTopErrs) {
+ // The next kNumTopTopErrs - kNumTop1Errs are cases where the actual top
+ // is not correct, but do not count as a top-1 error because the rating
+ // is close enough to the top answer.
+ results->push_back(ShapeRating(wrong_id1, 1.0f));
+ results->push_back(ShapeRating(shape_id, 0.99f));
+ } else if (!done_bad_font_ && class_id == false_unichar_id_) {
+ // There is a single character with a bad font.
+ results->push_back(ShapeRating(false_shape_, 1.0f));
+ done_bad_font_ = true;
+ } else {
+ // Everything else is correct.
+ results->push_back(ShapeRating(shape_id, 1.0f));
+ }
+ return results->size();
+ }
+ // Provides access to the ShapeTable that this classifier works with.
+ const ShapeTable* GetShapeTable() const override { return shape_table_; }
+
+ private:
+ // Borrowed pointer to the ShapeTable.
+ ShapeTable* shape_table_;
+ // Unichar_id of a random character that occurs after the first 60 samples.
+ int false_unichar_id_;
+ // Shape index of prepared false answer for false_unichar_id.
+ int false_shape_;
+ // The number of classifications we have processed.
+ int num_done_;
+ // True after the false font has been emitted.
+ bool done_bad_font_;
+};
+
+const double kMin1lDistance = 0.25;
+
+// The fixture for testing Tesseract.
+class MasterTrainerTest : public testing::Test {
+#ifndef DISABLED_LEGACY_ENGINE
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR, name);
+ }
+ std::string TmpNameToPath(const std::string& name) {
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+ }
+
+ MasterTrainerTest() {
+ shape_table_ = nullptr;
+ master_trainer_ = nullptr;
+ }
+ ~MasterTrainerTest() {
+ delete shape_table_;
+ }
+
+ // Initializes the master_trainer_ and shape_table_.
+  // The trainer and shape table are built from the eng.Arial.exp0.tr file via
+  // LoadTrainingData.
+ void LoadMasterTrainer() {
+ FLAGS_output_trainer = TmpNameToPath("tmp_trainer").c_str();
+ FLAGS_F = file::JoinPath(LANGDATA_DIR, "font_properties").c_str();
+ FLAGS_X = TestDataNameToPath("eng.xheights").c_str();
+ FLAGS_U = TestDataNameToPath("eng.unicharset").c_str();
+ std::string tr_file_name(TestDataNameToPath("eng.Arial.exp0.tr"));
+ const char* argv[] = {tr_file_name.c_str()};
+ int argc = 1;
+ STRING file_prefix;
+ delete shape_table_;
+ shape_table_ = nullptr;
+ master_trainer_ =
+ LoadTrainingData(argc, argv, false, &shape_table_, &file_prefix);
+ EXPECT_TRUE(master_trainer_ != nullptr);
+ EXPECT_TRUE(shape_table_ != nullptr);
+ }
+
+  // EXPECTs that the distance between I and l in Arial is 0 and that the
+  // distance from each of them to 1 is significantly greater than 0.
+ void VerifyIl1() {
+ // Find the font id for Arial.
+ int font_id = master_trainer_->GetFontInfoId("Arial");
+ EXPECT_GE(font_id, 0);
+ // Track down the characters we are interested in.
+ int unichar_I = master_trainer_->unicharset().unichar_to_id("I");
+ EXPECT_GT(unichar_I, 0);
+ int unichar_l = master_trainer_->unicharset().unichar_to_id("l");
+ EXPECT_GT(unichar_l, 0);
+ int unichar_1 = master_trainer_->unicharset().unichar_to_id("1");
+ EXPECT_GT(unichar_1, 0);
+ // Now get the shape ids.
+ int shape_I = shape_table_->FindShape(unichar_I, font_id);
+ EXPECT_GE(shape_I, 0);
+ int shape_l = shape_table_->FindShape(unichar_l, font_id);
+ EXPECT_GE(shape_l, 0);
+ int shape_1 = shape_table_->FindShape(unichar_1, font_id);
+ EXPECT_GE(shape_1, 0);
+
+ float dist_I_l =
+ master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l);
+ // No tolerance here. We expect that I and l should match exactly.
+ EXPECT_EQ(0.0f, dist_I_l);
+ float dist_l_I =
+ master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I);
+ // BOTH ways.
+ EXPECT_EQ(0.0f, dist_l_I);
+
+ // l/1 on the other hand should be distinct.
+ float dist_l_1 =
+ master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1);
+ EXPECT_GT(dist_l_1, kMin1lDistance);
+ float dist_1_l =
+ master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l);
+ EXPECT_GT(dist_1_l, kMin1lDistance);
+
+ // So should I/1.
+ float dist_I_1 =
+ master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1);
+ EXPECT_GT(dist_I_1, kMin1lDistance);
+ float dist_1_I =
+ master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I);
+ EXPECT_GT(dist_1_I, kMin1lDistance);
+ }
+
+ // Objects declared here can be used by all tests in the test case for Foo.
+ ShapeTable* shape_table_;
+ std::unique_ptr<MasterTrainer> master_trainer_;
+#endif
+};
+
+// Tests that the MasterTrainer correctly loads its data and reaches the correct
+// conclusion over the distance between Arial I l and 1.
+TEST_F(MasterTrainerTest, Il1Test) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because LoadTrainingData is missing.
+ GTEST_SKIP();
+#else
+ // Initialize the master_trainer_ and load the Arial tr file.
+ LoadMasterTrainer();
+ VerifyIl1();
+#endif
+}
+
+// Tests the ErrorCounter using a MockClassifier to check that it counts
+// error categories correctly.
+TEST_F(MasterTrainerTest, ErrorCounterTest) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because LoadTrainingData is missing.
+ GTEST_SKIP();
+#else
+ // Initialize the master_trainer_ from the saved tmp file.
+ LoadMasterTrainer();
+ // Add the space character to the shape_table_ if not already present to
+ // count junk.
+ if (shape_table_->FindShape(0, -1) < 0) shape_table_->AddShape(0, 0);
+ // Make a mock classifier.
+ auto shape_classifier = std::make_unique<MockClassifier>(shape_table_);
+ // Get the accuracy report.
+ STRING accuracy_report;
+ master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0,
+ false, shape_classifier.get(),
+ &accuracy_report);
+ LOG(INFO) << accuracy_report.c_str();
+ std::string result_string = accuracy_report.c_str();
+ std::vector<std::string> results =
+ absl::StrSplit(result_string, '\t', absl::SkipEmpty());
+ EXPECT_EQ(tesseract::CT_SIZE + 1, results.size());
+ int result_values[tesseract::CT_SIZE];
+ for (int i = 0; i < tesseract::CT_SIZE; ++i) {
+ EXPECT_TRUE(safe_strto32(results[i + 1], &result_values[i]));
+ }
+ // These tests are more-or-less immune to additions to the number of
+ // categories or changes in the training data.
+ int num_samples = master_trainer_->GetSamples()->num_raw_samples();
+ EXPECT_EQ(kNumCorrect, result_values[tesseract::CT_UNICHAR_TOP_OK]);
+ EXPECT_EQ(1, result_values[tesseract::CT_FONT_ATTR_ERR]);
+ EXPECT_EQ(kNumTopTopErrs, result_values[tesseract::CT_UNICHAR_TOPTOP_ERR]);
+ EXPECT_EQ(kNumTop1Errs, result_values[tesseract::CT_UNICHAR_TOP1_ERR]);
+ EXPECT_EQ(kNumTop2Errs, result_values[tesseract::CT_UNICHAR_TOP2_ERR]);
+ EXPECT_EQ(kNumTopNErrs, result_values[tesseract::CT_UNICHAR_TOPN_ERR]);
+ // Each of the TOPTOP errs also counts as a multi-unichar.
+ EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs,
+ result_values[tesseract::CT_OK_MULTI_UNICHAR]);
+ EXPECT_EQ(num_samples - kNumNonReject, result_values[tesseract::CT_REJECT]);
+ EXPECT_EQ(kNumAnswers, result_values[tesseract::CT_NUM_RESULTS]);
+#endif
+}
diff --git a/tesseract/unittest/matrix_test.cc b/tesseract/unittest/matrix_test.cc
new file mode 100644
index 00000000..c900308d
--- /dev/null
+++ b/tesseract/unittest/matrix_test.cc
@@ -0,0 +1,137 @@
+///////////////////////////////////////////////////////////////////////
+// File: matrix_test.cc
+// Author: rays@google.com (Ray Smith)
+//
+// Copyright 2016 Google Inc. All Rights Reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "matrix.h"
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class MatrixTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ std::locale::global(std::locale(""));
+ }
+
+ // Fills src_ with data so it can pretend to be a tensor thus:
+ // dims_=[5, 4, 3, 2]
+ // array_=[0, 1, 2, ....119]
+ // tensor=[[[[0, 1][2, 3][4, 5]]
+ // [[6, 7][8, 9][10, 11]]
+ // [[12, 13][14, 15][16, 17]]
+ // [[18, 19][20, 21][22, 23]]]
+ // [[[24, 25]...
+ MatrixTest() {
+ src_.Resize(1, kInputSize_, 0);
+ for (int i = 0; i < kInputSize_; ++i) {
+ src_.put(0, i, i);
+ }
+ for (int i = 0; i < kNumDims_; ++i) dims_[i] = 5 - i;
+ }
+ // Number of dimensions in src_.
+ static const int kNumDims_ = 4;
+ // Number of elements in src_.
+ static const int kInputSize_ = 120;
+  // Size of each dimension in src_.
+ int dims_[kNumDims_];
+ // Input array filled with [0,kInputSize).
+ GENERIC_2D_ARRAY<int> src_;
+};
+
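+// Note: judging by the expected shapes in the tests below, a call
+// RotatingTranspose(dims, ndims, src, dest) rotates the dimension at index
+// src into position dest, cyclically shifting the dimensions in between.
+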
+// Tests that the RotatingTranspose function does the right thing for various
+// transformations.
+// dims=[5, 4, 3, 2]->[5, 2, 4, 3]
+TEST_F(MatrixTest, RotatingTranspose_3_1) {
+ GENERIC_2D_ARRAY<int> m;
+ src_.RotatingTranspose(dims_, kNumDims_, 3, 1, &m);
+ m.ResizeNoInit(kInputSize_ / 3, 3);
+ // Verify that the result is:
+ // output tensor=[[[[0, 2, 4][6, 8, 10][12, 14, 16][18, 20, 22]]
+ // [[1, 3, 5][7, 9, 11][13, 15, 17][19, 21, 23]]]
+ // [[[24, 26, 28]...
+ EXPECT_EQ(0, m(0, 0));
+ EXPECT_EQ(2, m(0, 1));
+ EXPECT_EQ(4, m(0, 2));
+ EXPECT_EQ(6, m(1, 0));
+ EXPECT_EQ(1, m(4, 0));
+ EXPECT_EQ(24, m(8, 0));
+ EXPECT_EQ(26, m(8, 1));
+ EXPECT_EQ(25, m(12, 0));
+}
+
+// dims=[5, 4, 3, 2]->[3, 5, 4, 2]
+TEST_F(MatrixTest, RotatingTranspose_2_0) {
+ GENERIC_2D_ARRAY<int> m;
+ src_.RotatingTranspose(dims_, kNumDims_, 2, 0, &m);
+ m.ResizeNoInit(kInputSize_ / 2, 2);
+ // Verify that the result is:
+ // output tensor=[[[[0, 1][6, 7][12, 13][18, 19]]
+ // [[24, 25][30, 31][36, 37][42, 43]]
+ // [[48, 49][54, 55][60, 61][66, 67]]
+ // [[72, 73][78, 79][84, 85][90, 91]]
+ // [[96, 97][102, 103][108, 109][114, 115]]]
+ // [[[2,3]...
+ EXPECT_EQ(0, m(0, 0));
+ EXPECT_EQ(1, m(0, 1));
+ EXPECT_EQ(6, m(1, 0));
+ EXPECT_EQ(7, m(1, 1));
+ EXPECT_EQ(24, m(4, 0));
+ EXPECT_EQ(25, m(4, 1));
+ EXPECT_EQ(30, m(5, 0));
+ EXPECT_EQ(2, m(20, 0));
+}
+
+// dims=[5, 4, 3, 2]->[5, 3, 2, 4]
+TEST_F(MatrixTest, RotatingTranspose_1_3) {
+ GENERIC_2D_ARRAY<int> m;
+ src_.RotatingTranspose(dims_, kNumDims_, 1, 3, &m);
+ m.ResizeNoInit(kInputSize_ / 4, 4);
+ // Verify that the result is:
+ // output tensor=[[[[0, 6, 12, 18][1, 7, 13, 19]]
+ // [[2, 8, 14, 20][3, 9, 15, 21]]
+ // [[4, 10, 16, 22][5, 11, 17, 23]]]
+ // [[[24, 30, 36, 42]...
+ EXPECT_EQ(0, m(0, 0));
+ EXPECT_EQ(6, m(0, 1));
+ EXPECT_EQ(1, m(1, 0));
+ EXPECT_EQ(2, m(2, 0));
+ EXPECT_EQ(3, m(3, 0));
+ EXPECT_EQ(4, m(4, 0));
+ EXPECT_EQ(5, m(5, 0));
+ EXPECT_EQ(24, m(6, 0));
+ EXPECT_EQ(30, m(6, 1));
+}
+
+// dims=[5, 4, 3, 2]->[4, 3, 5, 2]
+TEST_F(MatrixTest, RotatingTranspose_0_2) {
+ GENERIC_2D_ARRAY<int> m;
+ src_.RotatingTranspose(dims_, kNumDims_, 0, 2, &m);
+ m.ResizeNoInit(kInputSize_ / 2, 2);
+ // Verify that the result is:
+ // output tensor=[[[[0, 1][24, 25][48, 49][72, 73][96, 97]]
+ // [[2, 3][26, 27][50, 51][74, 75][98, 99]]
+ // [[4, 5][28, 29][52, 53][76, 77][100, 101]]]
+ // [[[6, 7]...
+ EXPECT_EQ(0, m(0, 0));
+ EXPECT_EQ(1, m(0, 1));
+ EXPECT_EQ(24, m(1, 0));
+ EXPECT_EQ(25, m(1, 1));
+ EXPECT_EQ(96, m(4, 0));
+ EXPECT_EQ(97, m(4, 1));
+ EXPECT_EQ(2, m(5, 0));
+ EXPECT_EQ(6, m(15, 0));
+}
+
+} // namespace
diff --git a/tesseract/unittest/networkio_test.cc b/tesseract/unittest/networkio_test.cc
new file mode 100644
index 00000000..3c25f14f
--- /dev/null
+++ b/tesseract/unittest/networkio_test.cc
@@ -0,0 +1,217 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+#include "networkio.h"
+#include "stridemap.h"
+#ifdef INCLUDE_TENSORFLOW
+#include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D
+#endif
+
+namespace tesseract {
+
+class NetworkioTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ std::locale::global(std::locale(""));
+ }
+
+#ifdef INCLUDE_TENSORFLOW
+ // Sets up an Array2d object of the given size, initialized to increasing
+ // values starting with start.
+ std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) {
+ std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize));
+ int value = start;
+ for (int y = 0; y < ysize; ++y) {
+ for (int x = 0; x < xsize; ++x) {
+ (*a)(y, x) = value++;
+ }
+ }
+ return a;
+ }
+ // Sets up a NetworkIO with a batch of 2 "images" of known values.
+ void SetupNetworkIO(NetworkIO* nio) {
+ std::vector<std::unique_ptr<xla::Array2D<int>>> arrays;
+ arrays.push_back(SetupArray(3, 4, 0));
+ arrays.push_back(SetupArray(4, 5, 12));
+ std::vector<std::pair<int, int>> h_w_sizes;
+ for (size_t i = 0; i < arrays.size(); ++i) {
+ h_w_sizes.emplace_back(arrays[i].get()->height(),
+ arrays[i].get()->width());
+ }
+ StrideMap stride_map;
+ stride_map.SetStride(h_w_sizes);
+ nio->ResizeToMap(true, stride_map, 2);
+ // Iterate over the map, setting nio's contents from the arrays.
+ StrideMap::Index index(stride_map);
+ do {
+ int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT),
+ index.index(FD_WIDTH));
+ nio->SetPixel(index.t(), 0, 128 + value, 0.0f, 128.0f);
+ nio->SetPixel(index.t(), 1, 128 - value, 0.0f, 128.0f);
+ } while (index.Increment());
+ }
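+  // Note: the two images above provide 3*4 + 4*5 = 32 valid (y, x) positions,
+  // while both batch entries appear to be padded to the maximum 4x5 size,
+  // giving 2*4*5 = 40 timesteps in total; the pos/next_t checks in the tests
+  // below rely on these counts.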
+#endif
+};
+
+// Tests that the initialization via SetPixel works and the resize correctly
+// fills with zero where image sizes don't match.
+TEST_F(NetworkioTest, InitWithZeroFill) {
+#ifdef INCLUDE_TENSORFLOW
+ NetworkIO nio;
+ nio.Resize2d(true, 32, 2);
+ int width = nio.Width();
+ for (int t = 0; t < width; ++t) {
+ nio.SetPixel(t, 0, 0, 0.0f, 128.0f);
+ nio.SetPixel(t, 1, 0, 0.0f, 128.0f);
+ }
+ // The initialization will wipe out all previously set values.
+ SetupNetworkIO(&nio);
+ nio.ZeroInvalidElements();
+ StrideMap::Index index(nio.stride_map());
+ int next_t = 0;
+ int pos = 0;
+ do {
+ int t = index.t();
+ // The indexed values just increase monotonically.
+ int value = nio.i(t)[0];
+ EXPECT_EQ(value, pos);
+ value = nio.i(t)[1];
+ EXPECT_EQ(value, -pos);
+ // When we skip t values, the data is always 0.
+ while (next_t < t) {
+ EXPECT_EQ(nio.i(next_t)[0], 0);
+ EXPECT_EQ(nio.i(next_t)[1], 0);
+ ++next_t;
+ }
+ ++pos;
+ ++next_t;
+ } while (index.Increment());
+ EXPECT_EQ(pos, 32);
+ EXPECT_EQ(next_t, 40);
+#else
+ LOG(INFO) << "Skip test because of missing xla::Array2D";
+ GTEST_SKIP();
+#endif
+}
+
+// Tests that CopyWithYReversal works.
+TEST_F(NetworkioTest, CopyWithYReversal) {
+#ifdef INCLUDE_TENSORFLOW
+ NetworkIO nio;
+ SetupNetworkIO(&nio);
+ NetworkIO copy;
+ copy.CopyWithYReversal(nio);
+ StrideMap::Index index(copy.stride_map());
+ int next_t = 0;
+ int pos = 0;
+ std::vector<int> expected_values = {
+ 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 27, 28, 29, 30,
+ 31, 22, 23, 24, 25, 26, 17, 18, 19, 20, 21, 12, 13, 14, 15, 16};
+ do {
+ int t = index.t();
+ // The indexed values match the expected values.
+ int value = copy.i(t)[0];
+ EXPECT_EQ(value, expected_values[pos]);
+ value = copy.i(t)[1];
+ EXPECT_EQ(value, -expected_values[pos]);
+ // When we skip t values, the data is always 0.
+ while (next_t < t) {
+ EXPECT_EQ(copy.i(next_t)[0], 0) << "Failure t = " << next_t;
+ EXPECT_EQ(copy.i(next_t)[1], 0) << "Failure t = " << next_t;
+ ++next_t;
+ }
+ ++pos;
+ ++next_t;
+ } while (index.Increment());
+ EXPECT_EQ(pos, 32);
+ EXPECT_EQ(next_t, 40);
+#else
+ LOG(INFO) << "Skip test because of missing xla::Array2D";
+ GTEST_SKIP();
+#endif
+}
+
+// Tests that CopyWithXReversal works.
+TEST_F(NetworkioTest, CopyWithXReversal) {
+#ifdef INCLUDE_TENSORFLOW
+ NetworkIO nio;
+ SetupNetworkIO(&nio);
+ NetworkIO copy;
+ copy.CopyWithXReversal(nio);
+ StrideMap::Index index(copy.stride_map());
+ int next_t = 0;
+ int pos = 0;
+ std::vector<int> expected_values = {
+ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 16, 15, 14, 13,
+ 12, 21, 20, 19, 18, 17, 26, 25, 24, 23, 22, 31, 30, 29, 28, 27};
+ do {
+ int t = index.t();
+ // The indexed values match the expected values.
+ int value = copy.i(t)[0];
+ EXPECT_EQ(value, expected_values[pos]);
+ value = copy.i(t)[1];
+ EXPECT_EQ(value, -expected_values[pos]);
+ // When we skip t values, the data is always 0.
+ while (next_t < t) {
+ EXPECT_EQ(copy.i(next_t)[0], 0) << "Failure t = " << next_t;
+ EXPECT_EQ(copy.i(next_t)[1], 0) << "Failure t = " << next_t;
+ ++next_t;
+ }
+ ++pos;
+ ++next_t;
+ } while (index.Increment());
+ EXPECT_EQ(pos, 32);
+ EXPECT_EQ(next_t, 40);
+#else
+ LOG(INFO) << "Skip test because of missing xla::Array2D";
+ GTEST_SKIP();
+#endif
+}
+
+// Tests that CopyWithXYTranspose works.
+TEST_F(NetworkioTest, CopyWithXYTranspose) {
+#ifdef INCLUDE_TENSORFLOW
+ NetworkIO nio;
+ SetupNetworkIO(&nio);
+ NetworkIO copy;
+ copy.CopyWithXYTranspose(nio);
+ StrideMap::Index index(copy.stride_map());
+ int next_t = 0;
+ int pos = 0;
+ std::vector<int> expected_values = {
+ 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 17, 22, 27,
+ 13, 18, 23, 28, 14, 19, 24, 29, 15, 20, 25, 30, 16, 21, 26, 31};
+ do {
+ int t = index.t();
+ // The indexed values match the expected values.
+ int value = copy.i(t)[0];
+ EXPECT_EQ(value, expected_values[pos]);
+ value = copy.i(t)[1];
+ EXPECT_EQ(value, -expected_values[pos]);
+ // When we skip t values, the data is always 0.
+ while (next_t < t) {
+ EXPECT_EQ(copy.i(next_t)[0], 0);
+ EXPECT_EQ(copy.i(next_t)[1], 0);
+ ++next_t;
+ }
+ ++pos;
+ ++next_t;
+ } while (index.Increment());
+ EXPECT_EQ(pos, 32);
+ EXPECT_EQ(next_t, 40);
+#else
+ LOG(INFO) << "Skip test because of missing xla::Array2D";
+ GTEST_SKIP();
+#endif
+}
+
+} // namespace
diff --git a/tesseract/unittest/normstrngs_test.cc b/tesseract/unittest/normstrngs_test.cc
new file mode 100644
index 00000000..301bbd68
--- /dev/null
+++ b/tesseract/unittest/normstrngs_test.cc
@@ -0,0 +1,422 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "absl/strings/str_format.h" // for absl::StrFormat
+#include "include_gunit.h"
+#include "normstrngs.h"
+#include "normstrngs_test.h"
+#include <tesseract/unichar.h>
+#ifdef INCLUDE_TENSORFLOW
+#include "util/utf8/unilib.h" // for UniLib
+#endif
+
+namespace tesseract {
+
+#if defined(MISSING_CODE)
+static std::string EncodeAsUTF8(const char32 ch32) {
+ UNICHAR uni_ch(ch32);
+ return std::string(uni_ch.utf8(), uni_ch.utf8_len());
+}
+#endif
+
+TEST(NormstrngsTest, BasicText) {
+ const char* kBasicText = "AbCd Ef";
+ std::string result;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
+ GraphemeNorm::kNormalize, kBasicText,
+ &result));
+ EXPECT_STREQ(kBasicText, result.c_str());
+}
+
+TEST(NormstrngsTest, LigatureText) {
+  const char* kTwoByteLigText = "ĳ"; // U+0133 (ij ligature) -> "ij"
+ std::string result;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
+ GraphemeNorm::kNormalize, kTwoByteLigText,
+ &result));
+ EXPECT_STREQ("ij", result.c_str());
+
+  const char* kThreeByteLigText = "ﬁnds"; // U+FB01 (fi ligature) -> "fi"
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
+ GraphemeNorm::kNormalize, kThreeByteLigText,
+ &result));
+ EXPECT_STREQ("finds", result.c_str());
+}
+
+TEST(NormstrngsTest, OcrSpecificNormalization) {
+  const char* kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+0027 (')
+ std::string result;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
+ GraphemeNorm::kNormalize, kSingleQuoteText,
+ &result));
+ EXPECT_STREQ("'Hi", result.c_str());
+
+  const char* kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+0022 (")
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
+ GraphemeNorm::kNormalize, kDoubleQuoteText,
+ &result));
+ EXPECT_STREQ("\"Hi", result.c_str());
+
+  const char* kEmDash = "Hi—"; // U+2014 (—) -> U+002D (-)
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize,
+ GraphemeNorm::kNormalize, kEmDash, &result));
+ EXPECT_STREQ("Hi-", result.c_str());
+ // Without the ocr normalization, these changes are not made.
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, kSingleQuoteText,
+ &result));
+ EXPECT_STREQ(kSingleQuoteText, result.c_str());
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, kDoubleQuoteText,
+ &result));
+ EXPECT_STREQ(kDoubleQuoteText, result.c_str());
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, kEmDash, &result));
+ EXPECT_STREQ(kEmDash, result.c_str());
+}
+
+// Sample text used in tests.
+const char kEngText[] = "the quick brown fox jumps over the lazy dog";
+const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
+const char kKorText[] = "이는 것으로";
+// Hindi words containing illegal vowel sequences.
+const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात",
+ "कहीअे", "पत्रिाका", "छह्णाीस"};
+// Thai illegal sequences.
+const char* kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"};
+
+TEST(NormstrngsTest, DetectsCorrectText) {
+ std::string chars;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, kEngText, &chars));
+ EXPECT_STREQ(kEngText, chars.c_str());
+
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, kHinText, &chars))
+ << "Incorrect text: '" << kHinText << "'";
+ EXPECT_STREQ(kHinText, chars.c_str());
+
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, kKorText, &chars));
+ EXPECT_STREQ(kKorText, chars.c_str());
+}
+
+TEST(NormstrngsTest, DetectsIncorrectText) {
+ for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) {
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize,
+ kBadlyFormedHinWords[i], nullptr))
+ << kBadlyFormedHinWords[i];
+ }
+ for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) {
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize,
+ kBadlyFormedThaiWords[i], nullptr))
+ << kBadlyFormedThaiWords[i];
+ }
+}
+
+TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
+ std::string nonindic = "Here's some latin text.";
+ std::string dest;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, nonindic.c_str(),
+ &dest))
+ << PrintString32WithUnicodes(nonindic);
+ EXPECT_EQ(dest, nonindic);
+}
+
+TEST(NormstrngsTest, NoLonelyJoiners) {
+ std::string str = "x\u200d\u0d06\u0d34\u0d02";
+ std::vector<std::string> glyphs;
+ // Returns true, but the joiner is gone.
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[0], std::string("x"));
+ EXPECT_EQ(glyphs[1], std::string("\u0d06"));
+ EXPECT_EQ(glyphs[2], std::string("\u0d34\u0d02"));
+}
+
+TEST(NormstrngsTest, NoLonelyJoinersPlus) {
+ std::string str = "\u0d2a\u200d+\u0d2a\u0d4b";
+ std::vector<std::string> glyphs;
+ // Returns true, but the joiner is gone.
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[0], std::string("\u0d2a"));
+ EXPECT_EQ(glyphs[1], std::string("+"));
+ EXPECT_EQ(glyphs[2], std::string("\u0d2a\u0d4b"));
+}
+
+TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) {
+ std::string str = "\u200d+\u200c\u200d";
+ // Returns true, but the joiners are gone.
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, std::string("+"));
+ str = "\u200d\u200c\u200d";
+ // Without the plus, the string is invalid.
+ std::string result;
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result))
+ << PrintString32WithUnicodes(result);
+}
+
+TEST(NormstrngsTest, JoinersStayInArabic) {
+ std::string str = "\u0628\u200c\u0628\u200d\u0628";
+ // Returns true, string untouched.
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 5, 5, 2, str);
+}
+
+TEST(NormstrngsTest, DigitOK) {
+ std::string str = "\u0cea"; // Digit 4.
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
+}
+
+TEST(NormstrngsTest, DandaOK) {
+ std::string str = "\u0964"; // Single danda.
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
+ str = "\u0965"; // Double danda.
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str);
+}
+
+TEST(NormstrngsTest, AllScriptsRegtest) {
+ // Tests some valid text in a large number of scripts, some of which were
+ // found to be rejected by an earlier version.
+ const std::vector<std::pair<std::string, std::string>> kScriptText(
+ {{"Arabic",
+ " فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن"
+ "توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة "
+ "مجموعه هیچ اثری در فنون هنر و ادب و ترجمه، تقدیم پیشگاه ارجمند "
+ "سازنده تاریخ نگاه میکرد و به اصطلاح انسان و فطرت انسانی را زیربنای"},
+ {"Armenian",
+ "անտիկ աշխարհի փիլիսոփաների կենսագրությունը, թե′ նրանց ուս-"
+ "պատրաստւում է դալ (բուլամա): Կովկասում կաթից նաև պատ-"
+ "Հոգաբարձութեան յղել այդ անձին յիմարութիւնը հաստա-"
+ "գծերը եւ միջագծերը կը համրուին վարէն վեր:"},
+ {"Bengali",
+ "এসে দাঁড়ায় দাও খানি উঁচিয়ে নিয়ে । ঝরনার স্বচ্ছ জলে প্রতিবিম্বিত "
+ "পাঠিয়ে, গোবিন্দ স্মরণ করে, নির্ভয়ে রওনা হয়েছিল। তাতে সে "
+ "সুলতার। মনে পড়ে বিয়ের সময় বাবা এদের বাড়ি থেকে ঘুরে "
+ "কিন্তু তারপর মাতৃহৃদয় কেমন করে আছে? কী"},
+ {"Cyrillic",
+ "достей, є ще нагороди й почесті, є хай і сумнівна, але слава, "
+ "вып., 96б). Параўн. найсвятший у 1 знач., насвятейший у 1 знач., "
+ "»Правді«, — гітлерівські окупанти винищували нижчі раси, після дру- "
+ "І знов майдан зачорнів од народу. Всередині чоло-"},
+ {"Devanagari",
+ "डा॰ नै हात्तीमाथि चढेर त्यो भएनेर आइपुगे। राजालाई देखी "
+ "बाबतीत लिहिणे ही एक मोठीच जबाबदारी आहे. काकासाहेबांच्या कार्याचा "
+ "प्रबंध, आधोगिक प्रबंध तथा बैंकिंग एवम वाणिज्य आदि विषयों में "
+ "चित्रकृती दिल्या. शंभराहून अधिक देश आज आपापले चित्रपट निर्माण करीत"},
+ {"Greek",
+ "Μέσα ένα τετράδιο είχα στριμώξει το πρώτο "
+ "νον αξίως τού ευαγγελίου τού χριστού πολιτεύεσθε, ίνα "
+ "οὐδεμία ὑπ' αὐτοῦ μνεία γίνεται τῶν οἰκείων χωρίων. "
+ "είτα την φάσιν αυτήν ην ούτος εποιήσατο κατά του Μίκω-"},
+ {"Gujarati",
+ "ઉપહારગૃહે ને નાટ્યસ્થળે આ એ જ તેલ કડકડતું "
+ "શકી. ભાવવધારો અટકાવી નથી શકી અને બેકારીને "
+ "ત્યાં વાંકુથી પાછે આવ્યો, ચોરીનો માલ સોંપવા ! "
+ "કહી. એણે રેશમના કપડામાં વીંટી રાખેલ કુંવરીની છબી"},
+ {"Gurmukhi",
+ "ਯਾਦ ਰਹੇ ਕਿ ‘ਨਫਰਤ ’ ਦਾ ਵਿਸ਼ਾ ਕ੍ਰਾਤੀ ਨਹੀ ਹੈ ਅਤੇ ਕਵੀ ਦੀ ਇਹ "
+ "ਮਹਾਂ ਨੰਦਾ ਕੋਲ ਇਕ ਚੀਜ਼ ਸੀ ਉਹ ਸੀ ਸਚ, ਕੋਰਾ ਸਚ, ਬੇਧਤ੍ਰਕ ਕਹਿੳ "
+ "ਭੂਰਾ ਸਾਨੂੰ ਥੜਾ ਚੰਗਾ ਲਗਦਾ ਸੀ । ਉਸ ਦਾ ਇਕ ਪੈਰ ਜਨਮ ਤੋ "
+ "ਨੂੰ ਇਹ ਅਧਿਕਾਰ ਦਿੱਤਾ ਕਿ ਉਹ ਸਿੱਖ ਵਿਰੋਧ ਦਾ ਸੰਗਠਨ ਕਰੇ ਅਤੇ 3 ਸਤੰਬਰ,"},
+ {"Hangul",
+ "로 들어갔다. 이대통령은 아이젠하워 대통령의 뒷모습을 보면서 "
+ "그것뿐인 줄 아요? 노름도 했다 캅니다. 빌어묵을 놈이 그러 "
+ "의 가장 과학적 태도이며, 우리 역사를 가장 정확하게 학습할 수 있는 "
+ "마르크스 레"
+ "각하는 그는 그들의 식사보장을 위해 때때로 집에"},
+ {"HanS",
+ "大凡世界上的先生可 分 三 种: 第一种只会教书, 只会拿一 "
+ "书像是探宝一样,在茶叶店里我买过西湖龙井﹑黄山毛峰﹑福建的铁观音﹑大红"
+ " "
+ "持 “左” 倾冒险主义的干部,便扣上 “富农 "
+ "笑说:“我听说了,王总工程师也跟我说过了,只是工作忙,谁"},
+ {"HanT",
+ "叁、 銀行資產管理的群組分析模式 "
+ "民國六十三年,申請就讀台灣大學歷史研究所,並從事著述,"
+ "質言之﹐在社會結構中﹐性質﹑特徵﹑地位相類似的一羣人﹐由於 "
+ "董橋,一九四二年生,福建晉江人,國立成功大學外"},
+ {"Hebrew",
+ " אֵ-לִי, אֵ-לִי, כֵּיַצד מְטַפְּסִים בְּקִירוֹת שֶׁל זְכוּכִי"
+ " הראשון חוצה אותי שוב. אני בסיבוב הרביעי, הוא בטח מתחיל את"
+ " ווערטער געהאט, אבער דער עיקר איז ניט דאָס וואָרט, נאָר"
+ " על גחלת היהדות המקורית בעירך, נתת צביון ואופי מיוחד"},
+ {"Japanese",
+ "は異民族とみなされていた。楚の荘王(前613〜前 "
+ "を詳細に吟味する。実際の治療活動の領域は便宜上、(1) 障害者 "
+ "困難性は多角企業の場合原則として部門別に判断されている.). "
+ "☆ご希望の団体には見本をお送りします"},
+ {"Kannada",
+ "ಕೂಡ ಯುದ್ಧ ಮಾಡಿ ಜಯಪಡೆ. ನಂತರ ನಗರದೊಳಕ್ಕೆ ನಡೆ ಇದನ್ನು "
+ "ಅಸಹ್ಯದೃಶ್ಯ ಯಾರಿಗಾದರೂ ನಾಚಿಕೆತರುವಂತಹದಾಗಿದೆ. ಆರೋಗ್ಯ ದೃಷ್ಟಿ "
+ "ಯಾಗಲಿ, ಮೋಹನನಾಗಲಿ ಇಂಥ ಬಿಸಿಲಿನಲ್ಲಿ ಎಂದೂ ಬಹಳ ಹೊತ್ತು "
+ "\"ಇದೆ...ಖಂಡಿತಾ ಇದೆ\" ಅಂದ ಮನಸ್ಸಿನಲ್ಲಿಯೇ ವಂದಿಸುತ್ತಾ,"},
+ {"Khmer",
+ "សិតសក់និងផ្លាស់សម្លៀកបំពាក់ពេលយប់ចេញ។ "
+ "និយាយអំពីនគរនេះ ប្រាប់ដល់លោកទាំងមូលឲ្យដឹងច្បាស់លាស់អំពី "
+ "កន្លះកាថាសម្រាប់ទន្ទេញឲ្យងាយចាំ បោះពុម្ពនៅក្នុងទ្រង់ទ្រាយបច្ចុប្បន្ន "
+ "ឯកសារនេះបានផ្សព្វផ្សាយនៅក្នុងសន្និសីទ"},
+ {"Lao",
+ "ເອີຍ ! ຟັງສຽງຟ້າມັນຮ້ອງຮ່ວນ ມັນດັງໄກໆ ເອີຍ "
+ "ໄດລຽງດູລາວມາດວບຄວາມລາບາກຫລາຍ; "
+ "ບາງໄດ້ ເຈົ້າລອງສູ້ບໍ່ໄດ້ຈຶ່ງຫນີລົງມາວຽງຈັນ. "
+ "ລົບອອກຈາກ 3 ເຫລືອ 1, ຂ້ອຍຂຽນ 1 (1)"},
+ {"Latin",
+ "režisoru, palīdzēja to manu domīgo, kluso Dzejas metru ielikt "
+ "Ešte nedávno sa chcel mladý Novomeský „liečiť” "
+ "tiivisia kysymyksiä, mistä seuraa, että spekula- | don luonteesta "
+ "Grabiel Sanchez, yang bertani selama 120 tahun meninggal"},
+ {"Malayalam",
+ "അമൂർത്തചിത്രമായിരിക്കും. ഛേ! ആ വീട്ടിലേക്ക് അവളൊന്നിച്ച് പോകേണ്ടതാ "
+ "മൃഗങ്ങൾക്ക് എന്തെക്കിലും പറ്റിയാൽ മാത്രം ഞാനതു "
+ "വെലക്ക് വേണമെങ്കിൽ തരാം. എന്തോ തരും? പറ. "
+ "എല്ലാം കഴിഞ്ഞ് സീനിയറിന്റെ അടുത്തു ചെന്ന് കാൽതൊട്ട"},
+ {"Tamil",
+ "பொருத்தமாகப் பாடினாள் நம் ஔவைப் பாட்டி. காவிரி "
+ "உள்ளடக்கி நிற்பது விநோத வார்த்தையின் அஃறிணை "
+ "சூரிய கிரஹண சமயத்தில் குருக்ஷேத்திரம் செல்வது "
+ "காலங்களில் வெளியே போகும்பொழுது, 'ஸார்', 'ஸார்',"},
+ {"Telugu",
+ "1892లో ఆమె 10వ సంవత్సరంలో గుంటూరు తాలూకా వేములాపాడు "
+ "ఫండ్స్ చట్టము'నందు చేయబడెను. తరువాత క్రీ. శ. "
+ "సంచారము చేయును. మీరు ఇప్పుడే కాళకాలయమునకు "
+ "ఎంతటి సరళమైన భాషలో వ్రాశాడో విశదమవుతుంది. పైగా ఆనాటి భాష"},
+ {"Thai",
+ "อ้อ! กับนัง....แม่ยอดพระกลิ่น นั่นเอง ! หรับก็ย่อมจะรู้โดยชัดเจนว่า "
+ "ถ้าตราบใดยังมีเรือปืนอยู่ใกล้ ๆ แล้ว ตราบนั้น "
+ "พระดำรินี้ ที่มีคตีทำกรวยหมากและธูปเทียน "
+ "อันยานมีเรือเปนต้นฃ้ามยาก ฯ เพราะว่าแม่น้ำนั่นมีน้ำใสยิ่ง แม้เพียง"},
+ {"Vietnamese",
+ "vợ đến tai mụ hung thần Xăng-tô- mê-a. Mụ vô cùng "
+ "chiếc xe con gấu chạy qua nhà. Nhưng thỉnh thoảng "
+ "hòa hoãn với người Pháp để cho họ được dựng một ngôi nhà thờ nhỏ bằng "
+ "Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}});
+
+ for (const auto& p : kScriptText) {
+ std::string normalized;
+ EXPECT_TRUE(tesseract::NormalizeUTF8String(
+ tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
+ tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized))
+ << "Script=" << p.first << " text=" << p.second;
+ }
+}
+
+TEST(NormstrngsTest, IsWhitespace) {
+ // U+0020 is whitespace
+ EXPECT_TRUE(IsWhitespace(' '));
+ EXPECT_TRUE(IsWhitespace('\t'));
+ EXPECT_TRUE(IsWhitespace('\r'));
+ EXPECT_TRUE(IsWhitespace('\n'));
+ // U+2000 through U+200A
+ for (char32 ch = 0x2000; ch <= 0x200A; ++ch) {
+ SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
+ EXPECT_TRUE(IsWhitespace(ch));
+ }
+ // U+3000 is whitespace
+ EXPECT_TRUE(IsWhitespace(0x3000));
+ // ZWNBSP is not considered a space.
+ EXPECT_FALSE(IsWhitespace(0xFEFF));
+}
+
+TEST(NormstrngsTest, SpanUTF8Whitespace) {
+ EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\n"));
+ EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\nabc"));
+ EXPECT_EQ(0, SpanUTF8Whitespace("abc \t\r\nabc"));
+ EXPECT_EQ(0, SpanUTF8Whitespace(""));
+}
+
+TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
+ const char kHinText[] = "पिताने विवाह";
+ const char kKorText[] = "이는 것으로 다시 넣을";
+ const char kMixedText[] = "والفكر 123 والصراع abc";
+
+ EXPECT_EQ(0, SpanUTF8NotWhitespace(""));
+ EXPECT_EQ(0, SpanUTF8NotWhitespace(" abc"));
+ EXPECT_EQ(0, SpanUTF8NotWhitespace("\rabc"));
+ EXPECT_EQ(0, SpanUTF8NotWhitespace("\tabc"));
+ EXPECT_EQ(0, SpanUTF8NotWhitespace("\nabc"));
+ EXPECT_EQ(3, SpanUTF8NotWhitespace("abc def"));
+ EXPECT_EQ(18, SpanUTF8NotWhitespace(kHinText));
+ EXPECT_EQ(6, SpanUTF8NotWhitespace(kKorText));
+ EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));
+}
+
+// Test that the method clones the util/utf8/unilib definition of
+// interchange validity.
+TEST(NormstrngsTest, IsInterchangeValid) {
+#ifdef INCLUDE_TENSORFLOW
+ const int32_t kMinUnicodeValue = 33;
+ const int32_t kMaxUnicodeValue = 0x10FFFF;
+ for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
+ SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
+ EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch));
+ }
+#else
+ GTEST_SKIP();
+#endif
+}
+
+// Test that the method clones the util/utf8/unilib definition of
+// 7-bit ASCII interchange validity.
+TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
+#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)
+ const int32_t kMinUnicodeValue = 33;
+ const int32_t kMaxUnicodeValue = 0x10FFFF;
+ for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
+ SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
+ std::string str = EncodeAsUTF8(ch);
+ EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str),
+ IsInterchangeValid7BitAscii(ch));
+ }
+#else
+ // Skipped because of missing UniLib::IsInterchangeValid7BitAscii.
+ GTEST_SKIP();
+#endif
+}
+
+// Test that the method clones the util/utf8/unilib definition of
+// fullwidth-halfwidth conversion.
+TEST(NormstrngsTest, FullwidthToHalfwidth) {
+ // U+FF21 -> U+0041 (Latin capital letter A)
+ EXPECT_EQ('A', FullwidthToHalfwidth(0xFF21));
+ // U+FF05 -> U+0025 (percent sign)
+ EXPECT_EQ('%', FullwidthToHalfwidth(0xFF05));
+ // U+FFE6 -> U+20A9 (won sign)
+ EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));
+
+#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)
+ // Skipped because of missing UniLib::FullwidthToHalfwidth.
+ const int32_t kMinUnicodeValue = 33;
+ const int32_t kMaxUnicodeValue = 0x10FFFF;
+ for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
+ if (!IsValidCodepoint(ch)) continue;
+ SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch));
+ std::string str = EncodeAsUTF8(ch);
+ const std::string expected_half_str =
+ UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true);
+ EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch)));
+ }
+#endif
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/normstrngs_test.h b/tesseract/unittest/normstrngs_test.h
new file mode 100644
index 00000000..3b459348
--- /dev/null
+++ b/tesseract/unittest/normstrngs_test.h
@@ -0,0 +1,84 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
+#define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
+
+#include <sstream> // for std::stringstream
+#include <string>
+#include <vector>
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include <tesseract/unichar.h>
+
+namespace tesseract {
+
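+// Formats a UTF-32 string as a bracketed list of hex codepoints,
+// e.g. "ab" -> "[61][62]", for use in test failure messages.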
+inline std::string CodepointList(const std::vector<char32>& str32) {
+ std::stringstream result;
+ int total_chars = str32.size();
+ result << std::hex;
+ for (int i = 0; i < total_chars; ++i) {
+ result << "[" << str32[i] << "]";
+ }
+ return result.str();
+}
+
+inline std::string PrintString32WithUnicodes(const std::string& str) {
+ std::vector<char32> str32 = UNICHAR::UTF8ToUTF32(str.c_str());
+ return absl::StrCat("\"", str, "\" ", CodepointList(str32));
+}
+
+inline std::string PrintStringVectorWithUnicodes(const std::vector<std::string>& glyphs) {
+ std::string result;
+ for (const auto& s : glyphs) {
+ result += "Glyph:";
+ result += PrintString32WithUnicodes(s) + "\n";
+ }
+ return result;
+}
+
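+// Runs NormalizeCleanAndSegmentUTF8 on str in each GraphemeNormMode and
+// checks that the number of returned pieces matches unicode_count for
+// kIndividualUnicodes, glyph_count for kGlyphSplit, grapheme_count for
+// kCombined and 1 for kSingleString, and that the joined pieces (and the
+// plain NormalizeUTF8String output) always equal target_str.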
+inline void ExpectGraphemeModeResults(const std::string& str, UnicodeNormMode u_mode,
+ int unicode_count, int glyph_count,
+ int grapheme_count,
+ const std::string& target_str) {
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true,
+ str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), unicode_count)
+ << PrintStringVectorWithUnicodes(glyphs);
+ EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
+ GraphemeNormMode::kGlyphSplit, true,
+ str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), glyph_count)
+ << PrintStringVectorWithUnicodes(glyphs);
+ EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
+ GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), grapheme_count)
+ << PrintStringVectorWithUnicodes(glyphs);
+ EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), ""));
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone,
+ GraphemeNormMode::kSingleString,
+ true, str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs);
+ EXPECT_EQ(target_str, glyphs[0]);
+ std::string result;
+ EXPECT_TRUE(NormalizeUTF8String(
+ u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result));
+ EXPECT_EQ(target_str, result);
+}
+
+} // namespace tesseract
+
+#endif // TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_
diff --git a/tesseract/unittest/nthitem_test.cc b/tesseract/unittest/nthitem_test.cc
new file mode 100644
index 00000000..4d08ffae
--- /dev/null
+++ b/tesseract/unittest/nthitem_test.cc
@@ -0,0 +1,120 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "kdpair.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
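+// Test data with negatives, duplicates and extreme values (-32767, 65536)
+// used by the choose_nth_item tests below.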
+int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0, -32767, 6, 7};
+
+// The fixture for testing choose_nth_item on a KDVector.
+class NthItemTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ std::locale::global(std::locale(""));
+ }
+
+ public:
+ virtual ~NthItemTest();
+ // Pushes the test data onto the KDVector.
+ void PushTestData(KDVector* v) {
+ for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
+ IntKDPair pair(test_data[i], i);
+ v->push_back(pair);
+ }
+ }
+};
+
+// Destructor.
+// It is defined here, so the compiler can create a single vtable
+// instead of a weak vtable (fixes compiler warning).
+NthItemTest::~NthItemTest() = default;
+
+// Tests basic results.
+TEST_F(NthItemTest, GeneralTest) {
+ KDVector v;
+ // Push the test data onto the KDVector.
+ PushTestData(&v);
+ // Get the min item.
+ int index = v.choose_nth_item(0);
+ // The result is -32767.
+ EXPECT_EQ(-32767, v[index].key());
+ // Get the max item.
+ index = v.choose_nth_item(v.size() - 1);
+ // The result is 65536.
+ EXPECT_EQ(65536, v[index].key());
+ // Invalid items are silently truncated to valid.
+ // Get the min item.
+ index = v.choose_nth_item(-1);
+ // The result is -32767.
+ EXPECT_EQ(-32767, v[index].key());
+ // Get the max item.
+ index = v.choose_nth_item(v.size());
+ // The result is 65536.
+ EXPECT_EQ(65536, v[index].key());
+}
+
+// Tests results on boring data with lots of duplication.
+TEST_F(NthItemTest, BoringTest) {
+ KDVector v;
+ // Push the test data onto the KDVector.
+ int test_data[] = {8, 8, 8, 8, 8, 7, 7, 7, 7};
+ for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
+ IntKDPair pair(test_data[i], i);
+ v.push_back(pair);
+ }
+  // The 3rd item is 7, but the 4th is 8.
+ int index = v.choose_nth_item(3);
+ // The result is 7.
+ EXPECT_EQ(7, v[index].key());
+ index = v.choose_nth_item(4);
+ // The result is 8.
+ EXPECT_EQ(8, v[index].key());
+ // Get the min item.
+ index = v.choose_nth_item(0);
+ // The result is 7.
+ EXPECT_EQ(7, v[index].key());
+ // Get the max item.
+ index = v.choose_nth_item(v.size() - 1);
+ // The result is 8.
+ EXPECT_EQ(8, v[index].key());
+}
+
+// Tests that a unique median in an odd-size array is found correctly.
+TEST_F(NthItemTest, UniqueTest) {
+ KDVector v;
+ // Push the test data onto the KDVector.
+ PushTestData(&v);
+ // Get the median item.
+ int index = v.choose_nth_item(v.size() / 2);
+  // The result is 6; it started out at index 11.
+ EXPECT_EQ(6, v[index].key());
+ EXPECT_EQ(11, v[index].data());
+}
+
+// Tests that an equal median is found correctly.
+TEST_F(NthItemTest, EqualTest) {
+ KDVector v;
+ // Push the test data onto the KDVector.
+ PushTestData(&v);
+ // Add an extra 8. This makes the median 7.
+ IntKDPair pair(8, 13);
+ v.push_back(pair);
+ // Get the median item.
+ int index = v.choose_nth_item(v.size() / 2);
+  // The result is 7; it started out at index 4 or 12.
+ EXPECT_EQ(7, v[index].key());
+ EXPECT_TRUE(v[index].data() == 4 || v[index].data() == 12);
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/osd_test.cc b/tesseract/unittest/osd_test.cc
new file mode 100644
index 00000000..5100a6f9
--- /dev/null
+++ b/tesseract/unittest/osd_test.cc
@@ -0,0 +1,133 @@
+///////////////////////////////////////////////////////////////////////
+// File: osd_test.cc
+// Description: OSD Tests for Tesseract.
+// Author: ShreeDevi Kumar
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+// based on https://gist.github.com/amitdo/7c7a522004dd79b398340c9595b377e1
+
+// expects clones of tessdata, tessdata_fast and tessdata_best repos
+
+//#include "log.h"
+#include <iostream>
+#include <memory> // std::unique_ptr
+#include <string>
+#include <tesseract/baseapi.h>
+#include "include_gunit.h"
+#include "allheaders.h"
+
+namespace tesseract {
+
+class TestClass : public testing::Test {
+ protected:
+};
+
+#ifndef DISABLED_LEGACY_ENGINE
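+// Runs orientation and script detection on imgname with the "osd" language
+// data from tessdatadir and expects the detected orientation (in degrees)
+// to equal expected_deg.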
+static void OSDTester(int expected_deg, const char* imgname, const char* tessdatadir) {
+ // log.info() << tessdatadir << " for image: " << imgname << std::endl;
+ std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
+ ASSERT_FALSE(api->Init(tessdatadir, "osd"))
+ << "Could not initialize tesseract.";
+ Pix* image = pixRead(imgname);
+ ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
+ api->SetImage(image);
+ int orient_deg;
+ float orient_conf;
+ const char* script_name;
+ float script_conf;
+ bool detected = api->DetectOrientationScript(&orient_deg, &orient_conf,
+ &script_name, &script_conf);
+  ASSERT_TRUE(detected) << "Failed to detect OSD.";
+ printf(
+ "************ Orientation in degrees: %d, Orientation confidence: %.2f\n"
+ " Script: %s, Script confidence: %.2f\n",
+ orient_deg, orient_conf, script_name, script_conf);
+ EXPECT_EQ(expected_deg, orient_deg);
+ api->End();
+ pixDestroy(&image);
+}
+#endif
+
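+// Parameterized over (expected orientation in degrees, image path,
+// tessdata directory).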
+class OSDTest : public TestClass,
+ public ::testing::WithParamInterface<
+ std::tuple<int, const char*, const char*>> {};
+
+TEST_P(OSDTest, MatchOrientationDegrees) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because TessBaseAPI::DetectOrientationScript is missing.
+ GTEST_SKIP();
+#else
+ OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()));
+#endif
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataEngEuroHebrew, OSDTest,
+ ::testing::Combine(::testing::Values(0),
+ ::testing::Values(TESTING_DIR "/phototest.tif",
+ TESTING_DIR "/eurotext.tif",
+ TESTING_DIR "/hebrew.png"),
+ ::testing::Values(TESSDATA_DIR)));
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataBestEngEuroHebrew, OSDTest,
+ ::testing::Combine(::testing::Values(0),
+ ::testing::Values(TESTING_DIR "/phototest.tif",
+ TESTING_DIR "/eurotext.tif",
+ TESTING_DIR "/hebrew.png"),
+ ::testing::Values(TESSDATA_DIR "_best")));
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataFastEngEuroHebrew, OSDTest,
+ ::testing::Combine(::testing::Values(0),
+ ::testing::Values(TESTING_DIR "/phototest.tif",
+ TESTING_DIR "/eurotext.tif",
+ TESTING_DIR "/hebrew.png"),
+ ::testing::Values(TESSDATA_DIR "_fast")));
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataFastRotated90, OSDTest,
+ ::testing::Combine(::testing::Values(90),
+ ::testing::Values(TESTING_DIR
+ "/phototest-rotated-R.png"),
+ ::testing::Values(TESSDATA_DIR "_fast")));
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataFastRotated180, OSDTest,
+ ::testing::Combine(::testing::Values(180),
+ ::testing::Values(TESTING_DIR
+ "/phototest-rotated-180.png"),
+ ::testing::Values(TESSDATA_DIR "_fast")));
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataFastRotated270, OSDTest,
+ ::testing::Combine(::testing::Values(270),
+ ::testing::Values(TESTING_DIR
+ "/phototest-rotated-L.png"),
+ ::testing::Values(TESSDATA_DIR "_fast")));
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataFastDevaRotated270, OSDTest,
+ ::testing::Combine(::testing::Values(270),
+ ::testing::Values(TESTING_DIR
+ "/devatest-rotated-270.png"),
+ ::testing::Values(TESSDATA_DIR "_fast")));
+
+INSTANTIATE_TEST_SUITE_P(
+ TessdataFastDeva, OSDTest,
+ ::testing::Combine(::testing::Values(0),
+ ::testing::Values(TESTING_DIR "/devatest.png"),
+ ::testing::Values(TESSDATA_DIR "_fast")));
+
+} // namespace tesseract
diff --git a/tesseract/unittest/pagesegmode_test.cc b/tesseract/unittest/pagesegmode_test.cc
new file mode 100644
index 00000000..60dcf8da
--- /dev/null
+++ b/tesseract/unittest/pagesegmode_test.cc
@@ -0,0 +1,114 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(_WIN32)
+#include <io.h> // for _access
+#else
+#include <unistd.h> // for access
+#endif
+#include <string>
+#include "allheaders.h"
+#include <tesseract/baseapi.h>
+#include "helpers.h"
+#include "log.h"
+#include "include_gunit.h"
+
+namespace tesseract {
+
+// Replacement for std::filesystem::exists (C++17).
+static bool file_exists(const char* filename) {
+#if defined(_WIN32)
+ return _access(filename, 0) == 0;
+#else
+ return access(filename, 0) == 0;
+#endif
+}
+
+// The fixture for testing Tesseract.
+class PageSegModeTest : public testing::Test {
+ protected:
+ PageSegModeTest() = default;
+ ~PageSegModeTest() {
+ pixDestroy(&src_pix_);
+ }
+
+ void SetUp() override {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+ }
+
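+  // Reads the image from filename and (re)initializes the API for English
+  // with the legacy OEM_TESSERACT_ONLY engine before setting the image.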
+ void SetImage(const char* filename) {
+ pixDestroy(&src_pix_);
+ src_pix_ = pixRead(filename);
+ api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY);
+ api_.SetImage(src_pix_);
+ }
+
+ // Tests that the given rectangle produces exactly the given text in the
+  // given segmentation mode (after chopping off the last 2 newlines).
+ void VerifyRectText(tesseract::PageSegMode mode, const char* str,
+ int left, int top, int width, int height) {
+ api_.SetPageSegMode(mode);
+ api_.SetRectangle(left, top, width, height);
+ char* result = api_.GetUTF8Text();
+ chomp_string(result);
+ chomp_string(result);
+ EXPECT_STREQ(str, result);
+ delete[] result;
+ }
+
+ // Tests that the given rectangle does NOT produce the given text in the
+ // given segmentation mode.
+ void NotRectText(tesseract::PageSegMode mode, const char* str,
+ int left, int top, int width, int height) {
+ api_.SetPageSegMode(mode);
+ api_.SetRectangle(left, top, width, height);
+ char* result = api_.GetUTF8Text();
+ EXPECT_STRNE(str, result);
+ delete[] result;
+ }
+
+ Pix* src_pix_ = nullptr;
+ std::string ocr_text_;
+ tesseract::TessBaseAPI api_;
+};
+
+// Tests the single-word segmentation mode, and that it performs correctly
+// and differently from line and block modes.
+TEST_F(PageSegModeTest, WordTest) {
+ std::string filename = file::JoinPath(TESTING_DIR, "segmodeimg.tif");
+ if (!file_exists(filename.c_str())) {
+ LOG(INFO) << "Skip test because of missing " << filename << '\n';
+ GTEST_SKIP();
+ } else {
+ SetImage(filename.c_str());
+ // Test various rectangles around the inverse page number.
+ VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1419, 264, 69, 34);
+ VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1411, 252, 78, 62);
+ VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1396, 218, 114, 102);
+ // Test a random pair of words as a line
+ VerifyRectText(tesseract::PSM_SINGLE_LINE,
+ "What should", 237, 393, 256, 36);
+ // Test a random pair of words as a word
+ VerifyRectText(tesseract::PSM_SINGLE_WORD,
+ "Whatshould", 237, 393, 256, 36);
+ // Test single block mode.
+ VerifyRectText(tesseract::PSM_SINGLE_BLOCK,
+ "both the\nfrom the", 237, 450, 172, 94);
+ // But doesn't work in line or word mode.
+ NotRectText(tesseract::PSM_SINGLE_LINE,
+ "both the\nfrom the", 237, 450, 172, 94);
+ NotRectText(tesseract::PSM_SINGLE_WORD,
+ "both the\nfrom the", 237, 450, 172, 94);
+ }
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/pango_font_info_test.cc b/tesseract/unittest/pango_font_info_test.cc
new file mode 100644
index 00000000..5d1c7af7
--- /dev/null
+++ b/tesseract/unittest/pango_font_info_test.cc
@@ -0,0 +1,334 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdio>
+#include <string>
+#include <pango/pango.h>
+#include "include_gunit.h"
+#include "commandlineflags.h"
+#include "fileio.h"
+#include "pango_font_info.h"
+#include "absl/strings/str_cat.h" // for absl::StrCat
+#include "gmock/gmock-matchers.h" // for EXPECT_THAT
+#ifdef INCLUDE_TENSORFLOW
+#include "util/utf8/unicodetext.h" // for UnicodeText
+#endif
+
+namespace tesseract {
+
+// Fonts in testdata directory
+const char* kExpectedFontNames[] = {
+ "Arab",
+ "Arial Bold Italic",
+ "DejaVu Sans Ultra-Light",
+ "Lohit Hindi",
+#if PANGO_VERSION <= 12005
+ "Times New Roman",
+#else
+ "Times New Roman,", // Pango v1.36.2 requires a trailing ','
+#endif
+ "UnBatang",
+ "Verdana"
+};
+
+// Sample text used in tests.
+const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
+const char kEngText[] = "the quick brown fox jumps over the lazy dog";
+const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
+const char kKorText[] = "이는 것으로";
+// Hindi words containing illegal vowel sequences.
+const char* kBadlyFormedHinWords[] = {
+#if PANGO_VERSION <= 12005
+ "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
+#endif
+ // Pango v1.36.2 will render the above words even though they are invalid.
+ "प्रंात", nullptr
+};
+
+static PangoFontMap* font_map;
+
+class PangoFontInfoTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!font_map) {
+ font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
+ }
+ pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
+ }
+
+ // Creates a fake fonts.conf file that points to the testdata fonts for
+ // fontconfig to initialize with.
+ static void SetUpTestCase() {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+
+ FLAGS_fonts_dir = TESTING_DIR;
+ FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
+ file::MakeTmpdir();
+ PangoFontInfo::SoftInitFontConfig(); // init early
+ }
+
+ PangoFontInfo font_info_;
+};
+
+TEST_F(PangoFontInfoTest, TestNonDefaultConstructor) {
+ PangoFontInfo font("Arial Bold Italic 12");
+ EXPECT_EQ(12, font.font_size());
+ EXPECT_EQ("Arial", font.family_name());
+}
+
+TEST_F(PangoFontInfoTest, DoesParseFontDescriptionName) {
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Bold Italic 12"));
+ EXPECT_EQ(12, font_info_.font_size());
+ EXPECT_EQ("Arial", font_info_.family_name());
+
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("Verdana 10"));
+ EXPECT_EQ(10, font_info_.font_size());
+ EXPECT_EQ("Verdana", font_info_.family_name());
+
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("DejaVu Sans Ultra-Light"));
+ EXPECT_EQ("DejaVu Sans", font_info_.family_name());
+}
+
+TEST_F(PangoFontInfoTest, DoesParseMissingFonts) {
+  // A font family (Arial) with another face in testdata, but this
+  // particular face is missing.
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
+ EXPECT_EQ(12, font_info_.font_size());
+ EXPECT_EQ("Arial", font_info_.family_name());
+
+ // Font family that doesn't exist in testdata. It will still parse the
+ // description name. But without the file, it will not be able to populate
+ // some font family details, like is_monospace().
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("Georgia 10"));
+ EXPECT_EQ(10, font_info_.font_size());
+ EXPECT_EQ("Georgia", font_info_.family_name());
+}
+
+TEST_F(PangoFontInfoTest, DoesGetSpacingProperties) {
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
+ int x_bearing, x_advance;
+ EXPECT_TRUE(font_info_.GetSpacingProperties("A", &x_bearing, &x_advance));
+ EXPECT_GT(x_advance, 0);
+ EXPECT_TRUE(font_info_.GetSpacingProperties("a", &x_bearing, &x_advance));
+ EXPECT_GT(x_advance, 0);
+}
+
+TEST_F(PangoFontInfoTest, CanRenderString) {
+ font_info_.ParseFontDescriptionName("Verdana 12");
+ EXPECT_TRUE(font_info_.CanRenderString(kEngText, strlen(kEngText)));
+
+ font_info_.ParseFontDescriptionName("UnBatang 12");
+ EXPECT_TRUE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
+
+ font_info_.ParseFontDescriptionName("Lohit Hindi 12");
+ EXPECT_TRUE(font_info_.CanRenderString(kHinText, strlen(kHinText)));
+}
+
+TEST_F(PangoFontInfoTest, CanRenderLigature) {
+ font_info_.ParseFontDescriptionName("Arab 12");
+ const char kArabicLigature[] = "لا";
+ EXPECT_TRUE(
+ font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
+
+ printf("Next word\n");
+ EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
+}
+
+TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) {
+ font_info_.ParseFontDescriptionName("Verdana 12");
+ EXPECT_FALSE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
+}
+
+TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
+ font_info_.ParseFontDescriptionName("Lohit Hindi 12");
+ for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) {
+ EXPECT_FALSE(font_info_.CanRenderString(kBadlyFormedHinWords[i],
+ strlen(kBadlyFormedHinWords[i])))
+ << "Can render " << kBadlyFormedHinWords[i];
+ }
+}
+
+TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
+ font_info_.ParseFontDescriptionName("Verdana 12");
+  // Verdana cannot render the "ff" ligature (U+FB00).
+  std::string word = "oﬀice";
+ EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
+ EXPECT_EQ("oice", word);
+
+ // Don't drop non-letter characters like word joiners.
+ const char* kJoiners[] = {
+ "\u2060", // U+2060 (WJ)
+      "\u200C", // U+200C (ZWNJ)
+      "\u200D" // U+200D (ZWJ)
+ };
+ for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) {
+ word = kJoiners[i];
+ EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
+ EXPECT_STREQ(kJoiners[i], word.c_str());
+ }
+}
+
+// ------------------------ FontUtils ------------------------------------
+
+class FontUtilsTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ file::MakeTmpdir();
+ }
+ // Creates a fake fonts.conf file that points to the testdata fonts for
+ // fontconfig to initialize with.
+ static void SetUpTestCase() {
+ FLAGS_fonts_dir = TESTING_DIR;
+ FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
+ if (!font_map) {
+ font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
+ }
+ pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
+ }
+
+#ifdef INCLUDE_TENSORFLOW
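+  // Counts how often each non-whitespace Unicode codepoint occurs in
+  // utf8_text.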
+ void CountUnicodeChars(const char* utf8_text,
+ std::unordered_map<char32, int64_t>* ch_map) {
+ ch_map->clear();
+ UnicodeText ut;
+ ut.PointToUTF8(utf8_text, strlen(utf8_text));
+ for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
+#if 0
+ if (UnicodeProps::IsWhitespace(*it)) continue;
+#else
+ if (std::isspace(*it)) continue;
+#endif
+ ++(*ch_map)[*it];
+ }
+ }
+#endif
+};
+
+TEST_F(FontUtilsTest, DoesFindAvailableFonts) {
+ EXPECT_TRUE(FontUtils::IsAvailableFont("Arial Bold Italic"));
+ EXPECT_TRUE(FontUtils::IsAvailableFont("Verdana"));
+ EXPECT_TRUE(FontUtils::IsAvailableFont("DejaVu Sans Ultra-Light"));
+
+ // Test that we can support font name convention for Pango v1.30.2 even when
+ // we are running an older version.
+ EXPECT_TRUE(FontUtils::IsAvailableFont("Times New Roman,"));
+}
+
+TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
+ // Only bold italic face is available.
+ EXPECT_FALSE(FontUtils::IsAvailableFont("Arial"));
+ // Don't have a ttf for the Courier family.
+ EXPECT_FALSE(FontUtils::IsAvailableFont("Courier"));
+ // Pango "synthesizes" the italic font from the available Verdana Regular and
+ // includes it in its list, but it is not really loadable.
+ EXPECT_FALSE(FontUtils::IsAvailableFont("Verdana Italic"));
+  // We have "DejaVu Sans Ultra-Light" but not its medium weight counterpart.
+ EXPECT_FALSE(FontUtils::IsAvailableFont("DejaVu Sans"));
+}
+
+TEST_F(FontUtilsTest, DoesListAvailableFonts) {
+ const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts();
+ EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
+ for (auto& font : fonts) {
+ PangoFontInfo font_info;
+ EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
+ }
+}
+
+#ifdef INCLUDE_TENSORFLOW
+TEST_F(FontUtilsTest, DoesFindBestFonts) {
+ std::string fonts_list;
+ std::unordered_map<char32, int64_t> ch_map;
+ CountUnicodeChars(kEngText, &ch_map);
+ EXPECT_EQ(26, ch_map.size()); // 26 letters
+ std::vector<std::pair<const char*, std::vector<bool> > > font_flags;
+ std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
+ EXPECT_TRUE(best_list.size());
+ // All fonts except Lohit Hindi should render English text.
+ EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size());
+
+ CountUnicodeChars(kKorText, &ch_map);
+ best_list = FontUtils::BestFonts(ch_map, &font_flags);
+ EXPECT_TRUE(best_list.size());
+  // Only the UnBatang font family is able to render Korean.
+ EXPECT_EQ(1, font_flags.size());
+ EXPECT_STREQ("UnBatang", font_flags[0].first);
+}
+#endif
+
+TEST_F(FontUtilsTest, DoesSelectFont) {
+ const char* kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr};
+ const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
+ for (int i = 0; kLangText[i] != nullptr; ++i) {
+ SCOPED_TRACE(kLangNames[i]);
+ std::vector<std::string> graphemes;
+ std::string selected_font;
+ EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]),
+ &selected_font, &graphemes));
+ EXPECT_TRUE(selected_font.size());
+ EXPECT_TRUE(graphemes.size());
+ }
+}
+
+TEST_F(FontUtilsTest, DoesFailToSelectFont) {
+ const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
+ std::vector<std::string> graphemes;
+ std::string selected_font;
+ EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText),
+ &selected_font, &graphemes));
+}
+
+#if 0
+// Needs fix. FontUtils::GetAllRenderableCharacters was removed
+// because of deprecated pango_coverage_max.
+TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
+ const int32_t kHindiChar = 0x0905;
+ const int32_t kArabicChar = 0x0623;
+ const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator
+ const int32_t kOghamChar = 0x1680; // Ogham space mark
+ std::vector<bool> unicode_mask;
+ FontUtils::GetAllRenderableCharacters(&unicode_mask);
+ EXPECT_TRUE(unicode_mask['A']);
+ EXPECT_TRUE(unicode_mask['1']);
+ EXPECT_TRUE(unicode_mask[kHindiChar]);
+ EXPECT_TRUE(unicode_mask[kArabicChar]);
+ EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian.
+#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
+ EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham.
+#endif
+ unicode_mask.clear();
+
+ std::vector<std::string> selected_fonts;
+ selected_fonts.push_back("Lohit Hindi");
+ FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
+ EXPECT_TRUE(unicode_mask['1']);
+ EXPECT_TRUE(unicode_mask[kHindiChar]);
+ EXPECT_FALSE(unicode_mask['A']); // Lohit doesn't render English,
+ EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic,
+ EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian,
+ EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham.
+ unicode_mask.clear();
+
+ // Check that none of the included fonts cover the Mongolian or Ogham space
+ // characters.
+ for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
+ SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f]));
+ FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);
+#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
+ EXPECT_FALSE(unicode_mask[kOghamChar]);
+#endif
+ EXPECT_FALSE(unicode_mask[kMongolianChar]);
+ unicode_mask.clear();
+ }
+}
+#endif
+
+} // namespace tesseract
diff --git a/tesseract/unittest/paragraphs_test.cc b/tesseract/unittest/paragraphs_test.cc
new file mode 100644
index 00000000..16134cac
--- /dev/null
+++ b/tesseract/unittest/paragraphs_test.cc
@@ -0,0 +1,705 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string> // for std::string
+
+#include "absl/strings/str_cat.h" // for absl::StrCat
+#include "absl/strings/str_join.h" // for absl::StrJoin
+#include "absl/strings/str_split.h" // for absl::StrSplit
+
+#include "include_gunit.h" // for TEST
+#include "log.h" // for LOG
+
+#include "genericvector.h"
+// ccmain
+#include "paragraphs.h"
+#include "paragraphs_internal.h"
+// ccstruct
+#include "ocrpara.h"
+
+namespace tesseract {
+
+// Functions for making monospace ASCII trial text for the paragraph detector.
+const ParagraphJustification kLeft = JUSTIFICATION_LEFT;
+const ParagraphJustification kCenter = JUSTIFICATION_CENTER;
+const ParagraphJustification kRight = JUSTIFICATION_RIGHT;
+const ParagraphJustification kUnknown = JUSTIFICATION_UNKNOWN;
+
+enum TextModelInputType {
+ PCONT = 0, // Continuation line of a paragraph (default).
+ PSTART = 1, // First line of a paragraph.
+ PNONE = 2, // Not a paragraph line.
+};
+
+struct TextAndModel {
+ const char* ascii;
+ TextModelInputType model_type;
+
+ // fields corresponding to PARA (see ccstruct/ocrpara.h)
+ ParagraphModel model;
+ bool is_very_first_or_continuation;
+ bool is_list_item;
+};
+
+// Imagine that the given text is typewriter ASCII with each character ten
+// pixels wide and twenty pixels high and return an appropriate row_info.
+void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) {
+ const int kCharWidth = 10;
+ const int kLineSpace = 30;
+ info->text = text;
+ info->has_leaders =
+ strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr;
+ info->has_drop_cap = false;
+ info->pix_ldistance = info->pix_rdistance = 0;
+ info->average_interword_space = kCharWidth;
+ info->pix_xheight = kCharWidth;
+ info->lword_text = info->rword_text = "";
+ info->ltr = true;
+
+ std::vector<std::string> words = absl::StrSplit(text, ' ', absl::SkipEmpty());
+ info->num_words = words.size();
+ if (info->num_words < 1) return;
+
+ info->lword_text = words[0].c_str();
+ info->rword_text = words[words.size() - 1].c_str();
+ int lspace = 0;
+ while (lspace < info->text.size() && text[lspace] == ' ') {
+ lspace++;
+ }
+ int rspace = 0;
+ while (rspace < info->text.size() &&
+ text[info->text.size() - rspace - 1] == ' ') {
+ rspace++;
+ }
+
+ int top = -kLineSpace * row_number;
+ int bottom = top - kLineSpace;
+ int row_right = kCharWidth * info->text.size();
+ int lword_width = kCharWidth * info->lword_text.size();
+ int rword_width = kCharWidth * info->rword_text.size();
+ info->pix_ldistance = lspace * kCharWidth;
+ info->pix_rdistance = rspace * kCharWidth;
+ info->lword_box =
+ TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top);
+ info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom,
+ row_right - info->pix_rdistance, top);
+ LeftWordAttributes(
+ nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item,
+ &info->lword_likely_starts_idea, &info->lword_likely_ends_idea);
+ RightWordAttributes(
+ nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item,
+ &info->rword_likely_starts_idea, &info->rword_likely_ends_idea);
+}
+
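+// Converts the first n TextAndModel rows into RowInfos via AsciiToRowInfo.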
+void MakeAsciiRowInfos(const TextAndModel* row_infos, int n,
+ std::vector<RowInfo>* output) {
+ output->clear();
+ RowInfo info;
+ for (int i = 0; i < n; i++) {
+ AsciiToRowInfo(row_infos[i].ascii, i, &info);
+ output->push_back(info);
+ }
+}
+
+// Given n rows of reference ground truth, evaluate whether the n rows
+// of PARA * pointers yield the same paragraph breakpoints.
+void EvaluateParagraphDetection(const TextAndModel* correct, int n,
+ const GenericVector<PARA*>& detector_output) {
+ int incorrect_breaks = 0;
+ int missed_breaks = 0;
+ int poorly_matched_models = 0;
+ int bad_crowns = 0;
+ int bad_list_items = 0;
+ ASSERT_EQ(detector_output.size(), n);
+ for (int i = 1; i < n; i++) {
+ bool has_break = correct[i].model_type != PCONT;
+ bool detected_break = (detector_output[i - 1] != detector_output[i]);
+ if (has_break && !detected_break) missed_breaks++;
+ if (detected_break && !has_break) incorrect_breaks++;
+ if (has_break) {
+ if (correct[i].model_type == PNONE) {
+ if (detector_output[i]->model != nullptr) {
+ poorly_matched_models++;
+ }
+ } else {
+ if (correct[i].model.justification() != kUnknown &&
+ (detector_output[i]->model == nullptr ||
+ !correct[i].model.Comparable(*detector_output[i]->model))) {
+ poorly_matched_models++;
+ }
+ }
+ if (correct[i].is_very_first_or_continuation ^
+ detector_output[i]->is_very_first_or_continuation) {
+ bad_crowns++;
+ }
+ if (correct[i].is_list_item ^ detector_output[i]->is_list_item) {
+ bad_list_items++;
+ }
+ }
+ }
+ EXPECT_EQ(incorrect_breaks, 0);
+ EXPECT_EQ(missed_breaks, 0);
+ EXPECT_EQ(poorly_matched_models, 0);
+ EXPECT_EQ(bad_list_items, 0);
+ EXPECT_EQ(bad_crowns, 0);
+ if (incorrect_breaks || missed_breaks || poorly_matched_models ||
+ bad_list_items || bad_crowns) {
+ std::vector<std::string> dbg_lines;
+ dbg_lines.push_back("# ==========================");
+ dbg_lines.push_back("# Correct paragraph breaks:");
+ dbg_lines.push_back("# ==========================");
+ for (int i = 0; i < n; i++) {
+ if (correct[i].model_type != PCONT) {
+ dbg_lines.push_back(absl::StrCat(
+ correct[i].ascii, " # ", correct[i].model.ToString().c_str(),
+ correct[i].is_very_first_or_continuation ? " crown" : "",
+ correct[i].is_list_item ? " li" : ""));
+ } else {
+ dbg_lines.push_back(correct[i].ascii);
+ }
+ }
+ dbg_lines.push_back("");
+ dbg_lines.push_back("# ==========================");
+ dbg_lines.push_back("# Paragraph detector output:");
+ dbg_lines.push_back("# ==========================");
+ for (int i = 0; i < n; i++) {
+ std::string annotation;
+ if (i == 0 || (detector_output[i - 1] != detector_output[i])) {
+ if (detector_output[i] && detector_output[i]->model) {
+ annotation += absl::StrCat(
+ " # ", detector_output[i]->model->ToString().c_str(),
+ detector_output[i]->is_very_first_or_continuation ? " crown" : "",
+ detector_output[i]->is_list_item ? " li" : "");
+ } else {
+ annotation = " # Unmodeled paragraph.";
+ }
+ }
+ dbg_lines.push_back(absl::StrCat(correct[i].ascii, annotation));
+ }
+    LOG(INFO) << "Discrepancy!\n" << absl::StrJoin(dbg_lines, "\n");
+ }
+}
+
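+// Builds RowInfos from the expected rows, runs DetectParagraphs at debug
+// level 3, and checks the resulting paragraph assignments with
+// EvaluateParagraphDetection.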
+void TestParagraphDetection(const TextAndModel* correct, int num_rows) {
+ std::vector<RowInfo> row_infos;
+ GenericVector<PARA*> row_owners;
+ PARA_LIST paragraphs;
+ std::vector<ParagraphModel*> models;
+
+ MakeAsciiRowInfos(correct, num_rows, &row_infos);
+ int debug_level(3);
+ tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, &paragraphs,
+ &models);
+ EvaluateParagraphDetection(correct, num_rows, row_owners);
+ for (auto* model : models) {
+ delete model;
+ }
+}
+
+TEST(ParagraphsTest, ListItemsIdentified) {
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("iii"));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("A."));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("B."));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("C."));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("1."));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("2."));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("3."));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("1"));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("2"));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("3"));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("[[1]]"));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-1."));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-2"));
+ EXPECT_TRUE(tesseract::AsciiLikelyListItem("(A)(i)"));
+
+ EXPECT_FALSE(tesseract::AsciiLikelyListItem("The"));
+ EXPECT_FALSE(tesseract::AsciiLikelyListItem("first"));
+ EXPECT_FALSE(tesseract::AsciiLikelyListItem("house"));
+ EXPECT_FALSE(tesseract::AsciiLikelyListItem("Oregonian."));
+ EXPECT_FALSE(tesseract::AsciiLikelyListItem("on."));
+}
+
+typedef ParagraphModel PModel;
+
+const TextAndModel kTwoSimpleParagraphs[] = {
+ {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"This paragraph starts at the top", PCONT, PModel(), false, false},
+ {"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
+ {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"which indicates that the first ", PCONT, PModel(), false, false},
+ {"paragraph is not a continuation ", PCONT, PModel(), false, false},
+ {"from a previous page, as it is ", PCONT, PModel(), false, false},
+ {"indented just like this second ", PCONT, PModel(), false, false},
+ {"paragraph. ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestSimpleParagraphDetection) {
+ TestParagraphDetection(kTwoSimpleParagraphs,
+ ABSL_ARRAYSIZE(kTwoSimpleParagraphs));
+}
+
+const TextAndModel kFewCluesWithCrown[] = {
+ {"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0),
+ true, false},
+ {"of the page and takes two lines.", PCONT, PModel(), false, false},
+ {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"which indicates that the first ", PCONT, PModel(), false, false},
+ {"paragraph is a continuation from", PCONT, PModel(), false, false},
+ {"a previous page, as it is ", PCONT, PModel(), false, false},
+ {"indented just like this second ", PCONT, PModel(), false, false},
+ {"paragraph. ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestFewCluesWithCrown) {
+ TestParagraphDetection(kFewCluesWithCrown,
+ ABSL_ARRAYSIZE(kFewCluesWithCrown));
+}
+
+const TextAndModel kCrownedParagraph[] = {
+ {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
+ true, false},
+ {"often not indented as the rest ", PCONT, PModel(), false, false},
+ {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
+ {"less it should be counted as the", PCONT, PModel(), false, false},
+ {"same type of paragraph. ", PCONT, PModel(), false, false},
+ {" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"graphs are both indented two ", PCONT, PModel(), false, false},
+ {"spaces. ", PCONT, PModel(), false, false},
+ {" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestCrownParagraphDetection) {
+ TestParagraphDetection(kCrownedParagraph, ABSL_ARRAYSIZE(kCrownedParagraph));
+}
+
+const TextAndModel kFlushLeftParagraphs[] = {
+ {"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
+ {"flush left paragraphs (those", PCONT, PModel(), false, false},
+ {"with no body indent) are not", PCONT, PModel(), false, false},
+ {"actually crowns. ", PCONT, PModel(), false, false},
+ {"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
+ {"also flush left aligned. Usual-", PCONT, PModel(), false, false},
+ {"ly, these paragraphs are set", PCONT, PModel(), false, false},
+ {"apart vertically by some white-", PCONT, PModel(), false, false},
+ {"space, but you can also detect", PCONT, PModel(), false, false},
+ {"them by observing the big empty", PCONT, PModel(), false, false},
+ {"space at the ends of the para-", PCONT, PModel(), false, false},
+ {"graphs. ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestRealFlushLeftParagraphs) {
+ TestParagraphDetection(kFlushLeftParagraphs,
+ ABSL_ARRAYSIZE(kFlushLeftParagraphs));
+}
+
+const TextAndModel kSingleFullPageContinuation[] = {
+ {"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
+ {"continuation. It flows from", PCONT, PModel(), false, false},
+ {"line to line, using the full", PCONT, PModel(), false, false},
+ {"column width with no clear", PCONT, PModel(), false, false},
+ {"paragraph break, because it", PCONT, PModel(), false, false},
+ {"actually doesn't have one. It", PCONT, PModel(), false, false},
+ {"is the middle of one monster", PCONT, PModel(), false, false},
+ {"paragraph continued from the", PCONT, PModel(), false, false},
+ {"previous page and continuing", PCONT, PModel(), false, false},
+ {"onto the next page. There-", PCONT, PModel(), false, false},
+ {"fore, it ends up getting", PCONT, PModel(), false, false},
+ {"marked as a crown and then", PCONT, PModel(), false, false},
+ {"getting re-marked as any ex-", PCONT, PModel(), false, false},
+ {"isting model. Not great, but", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestSingleFullPageContinuation) {
+ const TextAndModel* correct = kSingleFullPageContinuation;
+ int num_rows = ABSL_ARRAYSIZE(kSingleFullPageContinuation);
+ std::vector<RowInfo> row_infos;
+ GenericVector<PARA*> row_owners;
+ PARA_LIST paragraphs;
+ std::vector<ParagraphModel*> models;
+ models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));
+ MakeAsciiRowInfos(correct, num_rows, &row_infos);
+ tesseract::DetectParagraphs(3, &row_infos, &row_owners, &paragraphs, &models);
+ EvaluateParagraphDetection(correct, num_rows, row_owners);
+ for (auto* model : models) {
+ delete model;
+ }
+}
+
+const TextAndModel kRightAligned[] = {
+ {"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
+ {" uncommon in Left-to-Right", PCONT, PModel(), false, false},
+ {" languages, but they do", PCONT, PModel(), false, false},
+ {" exist.", PCONT, PModel(), false, false},
+ {" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
+ {" horribly tiny paragraphs in", PCONT, PModel(), false, false},
+ {" tables on which we have no", PCONT, PModel(), false, false},
+ {" chance anyways.", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestRightAlignedParagraph) {
+ TestParagraphDetection(kRightAligned, ABSL_ARRAYSIZE(kRightAligned));
+}
+
+const TextAndModel kTinyParagraphs[] = {
+ {" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"obvious paragraph text, you might", PCONT, PModel(), false, false},
+ {"find short exchanges of dialogue ", PCONT, PModel(), false, false},
+ {"between characters. ", PCONT, PModel(), false, false},
+ {" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"mark a new paragraph whenever one", PCONT, PModel(), false, false},
+ {"of the statistics (left, right or", PCONT, PModel(), false, false},
+ {"center) changes from one text-", PCONT, PModel(), false, false},
+ {"line to the next. Such an", PCONT, PModel(), false, false},
+ {"approach would misclassify the", PCONT, PModel(), false, false},
+ {"tiny paragraphs above as a single", PCONT, PModel(), false, false},
+ {"paragraph. ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestTinyParagraphs) {
+ TestParagraphDetection(kTinyParagraphs, ABSL_ARRAYSIZE(kTinyParagraphs));
+}
+
+const TextAndModel kComplexPage1[] = {
+ {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
+ {" Centered Title ", PCONT, PModel(), false, false},
+ {" Paragraph Detection ", PCONT, PModel(), false, false},
+ {" OCR TEAM ", PCONT, PModel(), false, false},
+ {" 10 November 2010 ", PCONT, PModel(), false, false},
+ {" ", PNONE, PModel(), false, false},
+ {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"This paragraph starts at the top", PCONT, PModel(), false, false},
+ {"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
+ {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"which indicates that the first ", PCONT, PModel(), false, false},
+ {"paragraph is not a continuation ", PCONT, PModel(), false, false},
+ {"from a previous page, as it is ", PCONT, PModel(), false, false},
+ {"indented just like this second ", PCONT, PModel(), false, false},
+ {"paragraph. ", PCONT, PModel(), false, false},
+ {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
+ true, false},
+ {" looks like the prior text ", PCONT, PModel(), false, false},
+ {" but it is indented more ", PCONT, PModel(), false, false},
+ {" and is fully justified. ", PCONT, PModel(), false, false},
+ {" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"centered text, block quotes, ", PCONT, PModel(), false, false},
+ {"normal paragraphs, and lists ", PCONT, PModel(), false, false},
+ {"like what follows? ", PCONT, PModel(), false, false},
+ {"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" looking for lines where the ", PCONT, PModel(), false, false},
+ {" first word of the next line ", PCONT, PModel(), false, false},
+ {" would fit on the previous ", PCONT, PModel(), false, false},
+ {" line. ", PCONT, PModel(), false, false},
+ {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" Python and try it out. ", PCONT, PModel(), false, false},
+ {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" mistakes. ", PCONT, PModel(), false, false},
+ {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"you can try to identify source ", PCONT, PModel(), false, false},
+ {"code. Ouch! ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestComplexPage1) {
+ TestParagraphDetection(kComplexPage1, ABSL_ARRAYSIZE(kComplexPage1));
+}
+
+// The same as above, but wider.
+const TextAndModel kComplexPage2[] = {
+ {" Awesome ", PSTART,
+ PModel(kCenter, 0, 0, 0, 0), false, false},
+ {" Centered Title ", PCONT, PModel(), false, false},
+ {" Paragraph Detection ", PCONT, PModel(), false, false},
+ {" OCR TEAM ", PCONT, PModel(), false, false},
+ {" 10 November 2010 ", PCONT, PModel(), false, false},
+ {" ", PNONE, PModel(), false, false},
+ {" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"This paragraph starts at the top of", PCONT, PModel(), false, false},
+ {"the page and takes 3 lines. ", PCONT, PModel(), false, false},
+ {" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"which indicates that the first ", PCONT, PModel(), false, false},
+ {"paragraph is not a continuation ", PCONT, PModel(), false, false},
+ {"from a previous page, as it is in- ", PCONT, PModel(), false, false},
+ {"dented just like this second para- ", PCONT, PModel(), false, false},
+ {"graph. ", PCONT, PModel(), false, false},
+ {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
+ true, false},
+ {" looks like the prior text ", PCONT, PModel(), false, false},
+ {" but it is indented more ", PCONT, PModel(), false, false},
+ {" and is fully justified. ", PCONT, PModel(), false, false},
+ {" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"ed text, block quotes, normal para-", PCONT, PModel(), false, false},
+ {"graphs, and lists like what follow?", PCONT, PModel(), false, false},
+ {"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!!
+ {"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" looking for lines where the ", PCONT, PModel(), false, false},
+ {" first word of the next line ", PCONT, PModel(), false, false},
+ {" would fit on the previous line. ", PCONT, PModel(), false, false},
+ {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" Python and try it out. ", PCONT, PModel(), false, false},
+ {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" mistakes. ", PCONT, PModel(), false, false},
+ {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
+ false, true},
+ {" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"you can try to identify source ", PCONT, PModel(), false, false},
+ {"code. Ouch! ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestComplexPage2) {
+ TestParagraphDetection(kComplexPage2, ABSL_ARRAYSIZE(kComplexPage2));
+}
+
+const TextAndModel kSubtleCrown[] = {
+ {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
+ true, false},
+ {"often not indented as the rest ", PCONT, PModel(), false, false},
+ {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
+ {"less it should be counted as the", PCONT, PModel(), false, false},
+ {"same type of paragraph. ", PCONT, PModel(), false, false},
+ {" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"should suffice. ", PCONT, PModel(), false, false},
+ {" 1235 ", PNONE, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestSubtleCrown) {
+ TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown) - 1);
+}
+
+TEST(ParagraphsTest, TestStrayLineInBlock) {
+ TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown));
+}
+
+const TextAndModel kUnlvRep3AO[] = {
+ {" Defined contribution plans cover employees in Australia, New", PSTART,
+ PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, false},
+ {"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), false, false},
+ {"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, false},
+ {"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false, false},
+ {"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, false},
+ {"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, false},
+ {"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false, false},
+ {" In addition to providing pension benefits, the Company pro- ", PSTART,
+ PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, false},
+ {"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, false},
+ {"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false, false},
+ {"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, false},
+ {"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, false},
+ {"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, false},
+ {"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, false},
+ {" The U.S. plan covering the parent company is the largest plan.",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, false},
+ {"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, false},
+ {"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, false},
+ {"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), false, false},
+ {"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, false},
+ {"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, false},
+ {"credited service. The Company has the ability to change these ", PCONT, PModel(), false, false},
+ {"benefits at any time. ", PCONT, PModel(), false, false},
+ {" Effective October 1993, the Company amended its health ", PSTART,
+ PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false, false},
+ {"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, false},
+ {"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, false},
+ {"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, false},
+ {"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, false},
+ {"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, false},
+ {"for 1994 by approximately $83. ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, TestUnlvInsurance) {
+ TestParagraphDetection(kUnlvRep3AO, ABSL_ARRAYSIZE(kUnlvRep3AO));
+}
+
+// The basic outcome we want for something with a bunch of leader dots is that
+// we group each logical entry as a separate item. Without knowledge of
+// leaders, we would most likely mark the text below as a simple right aligned
+// paragraph or two.
+// This example comes from Volume 9886293, Page 5
+const TextAndModel kTableOfContents[] = {
+ {"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+ {" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
+};
+
+TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
+ TestParagraphDetection(kTableOfContents, ABSL_ARRAYSIZE(kTableOfContents));
+}
+
+const TextAndModel kTextWithSourceCode[] = {
+ {" A typical page of a programming book may contain", PSTART,
+ PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false},
+ {"being described in prose. Such examples should be", PCONT, PModel(), false, false},
+ {"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false},
+ {"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false},
+ {"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false},
+ {"source code would lead to a bad reading experience", PCONT, PModel(), false, false},
+ {"when the text is re-flowed. ", PCONT, PModel(), false, false},
+ {" Let's show this by describing the function fact-", PSTART,
+ PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false},
+ {"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false},
+ {"that the typical C implementation will only work ", PCONT, PModel(), false, false},
+ {"for values less than about 12: ", PCONT, PModel(), false, false},
+ {" ", PNONE, PModel(), false, false},
+ {" # Naive implementation in C ", PCONT, PModel(), false, false},
+ {" int factorial(int n) { ", PCONT, PModel(), false, false},
+ {" if (n < 2) ", PCONT, PModel(), false, false},
+ {" return 1; ", PCONT, PModel(), false, false},
+ {" return n * factorial(n - 1); ", PCONT, PModel(), false, false},
+ {" } ", PCONT, PModel(), false, false},
+ {" ", PCONT, PModel(), false, false},
+ {" The C programming language does not have built- ", PSTART,
+ PModel(kLeft, 0, 20, 0, 0), false, false},
+ {"in support for detecting integer overflow, so this", PCONT, PModel(), false, false},
+ {"naive implementation simply returns random values ", PCONT, PModel(), false, false},
+ {"if even a moderate sized n is provided. ", PCONT, PModel(), false, false},
+};
+
+TEST(ParagraphsTest, NotDistractedBySourceCode) {
+ TestParagraphDetection(kTextWithSourceCode,
+ ABSL_ARRAYSIZE(kTextWithSourceCode));
+}
+
+const TextAndModel kOldManAndSea[] = {
+ {"royal palm which are called guano and in it there was a bed, a",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), false, false},
+ {"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), false, false},
+ {"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), false, false},
+ {"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), false, false},
+ {"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), false, false},
+ {"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), false, false},
+ {"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), false, false},
+ {"shirt. ", PCONT, PModel(), false, false},
+ {" \"What do you have to eat?\" the boy asked. ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"A pot of yellow rice with fish. Do you want some?\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"No. I will eat at home. Do you want me to make the fire?\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"No. I will make it later on. Or I may eat the rice cold.\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"May I take the cast net?\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"Of course.\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" There was no cast net and the boy remembered when they had",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"sold it. But they went through this fiction every day. There was no", PCONT, PModel(), false, false},
+ {"pot of yellow rice and fish and the boy knew this too. "
+ " ", PCONT, PModel(), false, false},
+ {" \"Eighty-five is a lucky number,\" the old man said. \"How",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"would you like to see me bring one in that dressed out over a "
+ "thou-", PCONT, PModel(), false, false},
+ {"sand pounds? "
+ " ", PCONT, PModel(), false, false},
+ {" \"I'll get the cast net and go for sardines. Will you sit in the "
+ "sun",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"in the doorway?\" "
+ " ", PCONT, PModel(), false, false},
+ {" \"Yes. I have yesterday's paper and I will read the baseball.\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" The boy did not know whether yesterday's paper was a fiction",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"too. But the old man brought it out from under the bed. ", PCONT, PModel(), false, false},
+ {" \"Pedrico gave it to me at the bodega,\" he explained. "
+ " ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"I'll be back when I have the sardines. I'll keep yours and mine",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"together on ice and we can share them in the morning. When I", PCONT, PModel(), false, false},
+ {"come back you can tell me about the baseball.\" ", PCONT, PModel(), false, false},
+ {" \"The Yankees cannot lose.\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"But I fear the Indians of Cleveland.\" ",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {" \"Have faith in the Yankees my son. Think of the great Di-",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"Maggio.\" ", PCONT, PModel(), false, false},
+ {" \"I fear both the Tigers of Detroit and the Indians of Cleve-",
+ PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
+ {"land.\" ", PCONT, PModel(), false, false}
+};
+
+TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
+ TestParagraphDetection(kOldManAndSea, ABSL_ARRAYSIZE(kOldManAndSea));
+}
+
+const TextAndModel kNewZealandIndex[] = {
+ {"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {" 138 ", PCONT, PModel(), false, false},
+ {"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {" 145 ", PCONT, PModel(), false, false},
+ {"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {" 85 ", PCONT, PModel(), false, false},
+ {"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
+ {"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}
+};
+
+TEST(ParagraphsTest, IndexPageTest) {
+ TestParagraphDetection(kNewZealandIndex, ABSL_ARRAYSIZE(kNewZealandIndex));
+}
+
+// TODO(eger): Add some right-to-left examples, and fix the algorithm as needed.
+
+} // namespace
diff --git a/tesseract/unittest/params_model_test.cc b/tesseract/unittest/params_model_test.cc
new file mode 100644
index 00000000..8627ab8e
--- /dev/null
+++ b/tesseract/unittest/params_model_test.cc
@@ -0,0 +1,75 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string> // std::string
+#include <vector>
+
+#include "include_gunit.h"
+#include "params_model.h"
+#include "serialis.h" // TFile
+#include "tprintf.h" // tprintf
+
+namespace tesseract {
+
+// Test some basic I/O of params model files (automated learning of language
+// model weights).
+#ifndef DISABLED_LEGACY_ENGINE
+static bool LoadFromFile(tesseract::ParamsModel& model, const char* lang, const char* full_path) {
+ tesseract::TFile fp;
+ if (!fp.Open(full_path, nullptr)) {
+ tprintf("Error opening file %s\n", full_path);
+ return false;
+ }
+ return model.LoadFromFp(lang, &fp);
+}
+#endif
+
+class ParamsModelTest : public testing::Test {
+#ifndef DISABLED_LEGACY_ENGINE
+ protected:
+ void SetUp() override {
+ std::locale::global(std::locale(""));
+ }
+
+ std::string TestDataNameToPath(const std::string& name) const {
+ return file::JoinPath(TESTDATA_DIR, name);
+ }
+ std::string OutputNameToPath(const std::string& name) const {
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+ }
+ // Test that we are able to load a params model, save it, reload it,
+ // and verify that the re-serialized version is the same as the original.
+ void TestParamsModelRoundTrip(const std::string& params_model_filename) const {
+ tesseract::ParamsModel orig_model;
+ tesseract::ParamsModel duplicate_model;
+ file::MakeTmpdir();
+ std::string orig_file = TestDataNameToPath(params_model_filename);
+ std::string out_file = OutputNameToPath(params_model_filename);
+
+ EXPECT_TRUE(LoadFromFile(orig_model, "eng", orig_file.c_str()));
+ EXPECT_TRUE(orig_model.SaveToFile(out_file.c_str()));
+
+ EXPECT_TRUE(LoadFromFile(duplicate_model, "eng", out_file.c_str()));
+ EXPECT_TRUE(orig_model.Equivalent(duplicate_model));
+ }
+#endif
+};
+
+TEST_F(ParamsModelTest, TestEngParamsModelIO) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because ParamsModel::LoadFromFp is missing.
+ GTEST_SKIP();
+#else
+ TestParamsModelRoundTrip("eng.params_model");
+#endif
+}
+
+} // namespace
diff --git a/tesseract/unittest/progress_test.cc b/tesseract/unittest/progress_test.cc
new file mode 100644
index 00000000..dbe30269
--- /dev/null
+++ b/tesseract/unittest/progress_test.cc
@@ -0,0 +1,165 @@
+///////////////////////////////////////////////////////////////////////
+// File: progress_test.cc
+// Description: Progress reporting API Test for Tesseract.
+// Author: Jaroslaw Kubik
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+// expects clone of tessdata_fast repo in ../../tessdata_fast
+
+#include "include_gunit.h"
+
+#include <tesseract/baseapi.h>
+#include <tesseract/ocrclass.h>
+
+#include "allheaders.h"
+#include "gmock/gmock.h"
+
+#include <fstream>
+#include <iostream>
+#include <locale>
+#include <memory> // std::unique_ptr
+#include <string>
+
+#include <time.h>
+
+namespace tesseract {
+
+class QuickTest : public testing::Test {
+ protected:
+ virtual void SetUp() { start_time_ = time(nullptr); }
+ virtual void TearDown() {
+ const time_t end_time = time(nullptr);
+ EXPECT_TRUE(end_time - start_time_ <= 25)
+ << "The test took too long - "
+ << ::testing::PrintToString(end_time - start_time_);
+ }
+ time_t start_time_;
+};
+
+class ClassicMockProgressSink {
+ public:
+ MOCK_METHOD1(classicProgress, bool(int));
+ MOCK_METHOD1(cancel, bool(int));
+
+ ETEXT_DESC monitor;
+
+ ClassicMockProgressSink() {
+ monitor.progress_callback = [](int progress, int, int, int, int) -> bool {
+ return instance->classicProgress(progress);
+ };
+ monitor.cancel = [](void* ths, int words) -> bool {
+ return ((ClassicMockProgressSink*)ths)->cancel(words);
+ };
+ monitor.cancel_this = this;
+ instance = this;
+ }
+
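+  // The classic progress callback takes no user-data pointer, so the lambda
+  // above has to reach the mock through this static instance pointer.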
+ static ClassicMockProgressSink* instance;
+};
+
+ClassicMockProgressSink* ClassicMockProgressSink::instance = nullptr;
+
+class NewMockProgressSink : public ClassicMockProgressSink {
+ public:
+ MOCK_METHOD1(progress, bool(int));
+
+ NewMockProgressSink() {
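+    // progress_callback2 receives the ETEXT_DESC itself, so the mock is
+    // recovered from cancel_this instead of the static instance.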
+ monitor.progress_callback2 = [](ETEXT_DESC* ths, int, int, int,
+ int) -> bool {
+ return ((NewMockProgressSink*)ths->cancel_this)->progress(ths->progress);
+ };
+ }
+};
+
+void ClassicProgressTester(const char* imgname, const char* tessdatadir,
+ const char* lang) {
+ using ::testing::_;
+ using ::testing::AllOf;
+ using ::testing::AtLeast;
+ using ::testing::DoAll;
+ using ::testing::Gt;
+ using ::testing::Le;
+ using ::testing::Return;
+ using ::testing::SaveArg;
+
+ std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
+ ASSERT_FALSE(api->Init(tessdatadir, lang))
+ << "Could not initialize tesseract.";
+ Pix* image = pixRead(imgname);
+ ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
+ api->SetImage(image);
+
+ ClassicMockProgressSink progressSink;
+
+ int currentProgress = -1;
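+  // Gt<int&> stores a reference to currentProgress, so combined with SaveArg
+  // each callback must report strictly more progress than the previous one.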
+ EXPECT_CALL(progressSink,
+ classicProgress(AllOf(Gt<int&>(currentProgress), Le(100))))
+ .Times(AtLeast(5))
+ .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false)));
+ EXPECT_CALL(progressSink, cancel(_))
+ .Times(AtLeast(5))
+ .WillRepeatedly(Return(false));
+
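+  // Recognize() returns 0 on success; the cancel callback always answers
+  // false, so recognition runs to completion.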
+ EXPECT_EQ(api->Recognize(&progressSink.monitor), false);
+ EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%";
+
+ api->End();
+ pixDestroy(&image);
+}
+
+void NewProgressTester(const char* imgname, const char* tessdatadir,
+ const char* lang) {
+ using ::testing::_;
+ using ::testing::AllOf;
+ using ::testing::AtLeast;
+ using ::testing::DoAll;
+ using ::testing::Gt;
+ using ::testing::Le;
+ using ::testing::Return;
+ using ::testing::SaveArg;
+
+ std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI());
+ ASSERT_FALSE(api->Init(tessdatadir, lang))
+ << "Could not initialize tesseract.";
+ Pix* image = pixRead(imgname);
+ ASSERT_TRUE(image != nullptr) << "Failed to read test image.";
+ api->SetImage(image);
+
+ NewMockProgressSink progressSink;
+
+ int currentProgress = -1;
+ EXPECT_CALL(progressSink, classicProgress(_)).Times(0);
+ EXPECT_CALL(progressSink, progress(AllOf(Gt<int&>(currentProgress), Le(100))))
+ .Times(AtLeast(5))
+ .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false)));
+ EXPECT_CALL(progressSink, cancel(_))
+ .Times(AtLeast(5))
+ .WillRepeatedly(Return(false));
+
+ EXPECT_EQ(api->Recognize(&progressSink.monitor), false);
+ EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%";
+
+ api->End();
+ pixDestroy(&image);
+}
+
+TEST(QuickTest, ClassicProgressReporting) {
+ ClassicProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast",
+ "eng");
+}
+
+TEST(QuickTest, NewProgressReporting) {
+ NewProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", "eng");
+}
+
+} // namespace
diff --git a/tesseract/unittest/qrsequence_test.cc b/tesseract/unittest/qrsequence_test.cc
new file mode 100644
index 00000000..783228d8
--- /dev/null
+++ b/tesseract/unittest/qrsequence_test.cc
@@ -0,0 +1,69 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include <algorithm>
+#include <vector>
+
+#include "cycletimer.h"
+#include "include_gunit.h"
+#include "log.h"
+#include "qrsequence.h"
+
+namespace tesseract {
+
+class TestableQRSequenceGenerator : public QRSequenceGenerator {
+ public:
+ explicit TestableQRSequenceGenerator(const int& N) : QRSequenceGenerator(N) {}
+ // Overriding scope for testing
+ using QRSequenceGenerator::GetBinaryReversedInteger;
+};
+
+// Verifies binary inversion for a small range.
+TEST(QRSequenceGenerator, GetBinaryReversedInteger) {
+ const int kRangeSize = 8;
+ TestableQRSequenceGenerator generator(kRangeSize);
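+  // For a range of 8 the generator reverses the low 3 bits,
+  // e.g. 3 (011b) maps to 6 (110b).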
+ int reversed_vals[kRangeSize] = {0, 4, 2, 6, 1, 5, 3, 7};
+ for (int i = 0; i < kRangeSize; ++i)
+ EXPECT_EQ(reversed_vals[i], generator.GetBinaryReversedInteger(i));
+}
+
+// Trivial test fixture for a parameterized test.
+class QRSequenceGeneratorTest : public ::testing::TestWithParam<int> {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+};
+
+TEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) {
+ const int kRangeSize = GetParam();
+ TestableQRSequenceGenerator generator(kRangeSize);
+ std::vector<int> vals(kRangeSize);
+ CycleTimer timer;
+ timer.Restart();
+ for (int i = 0; i < kRangeSize; ++i) vals[i] = generator.GetVal();
+ LOG(INFO) << kRangeSize << "-length sequence took " << timer.GetInMs() << "ms";
+ // Sort the numbers to verify that we've covered the range without repetition.
+ std::sort(vals.begin(), vals.end());
+ for (int i = 0; i < kRangeSize; ++i) {
+ EXPECT_EQ(i, vals[i]);
+ if (i != vals[i]) {
+ LOG(INFO) << "Aborting remaining comparisons";
+ break;
+ }
+ }
+}
+
+// Run a parameterized test using the following range sizes.
+INSTANTIATE_TEST_SUITE_P(RangeTest, QRSequenceGeneratorTest,
+ ::testing::Values(2, 7, 8, 9, 16, 1e2, 1e4, 1e6));
+} // namespace
diff --git a/tesseract/unittest/recodebeam_test.cc b/tesseract/unittest/recodebeam_test.cc
new file mode 100644
index 00000000..6e9bc4e3
--- /dev/null
+++ b/tesseract/unittest/recodebeam_test.cc
@@ -0,0 +1,483 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include_gunit.h"
+#include "log.h" // for LOG
+
+#include "genericvector.h"
+#include "recodebeam.h"
+#include "matrix.h"
+#include "pageres.h"
+#include "ratngs.h"
+#include "unicharcompress.h"
+#include "normstrngs.h"
+#include "unicharset_training_utils.h"
+
+#include "helpers.h"
+
+#include "absl/strings/str_format.h" // for absl::StrFormat
+
+namespace tesseract {
+
+// Number of characters to test beam search with.
+const int kNumChars = 100;
+// Amount of extra random data to pad with after.
+const int kPadding = 64;
+// Dictionary test data.
+// The top choice is: "Gef s wordsright.".
+// The desired phrase is "Gets words right.".
+// There is a competing dictionary phrase: "Get swords right.".
+// ... due to the following errors from the network:
+// f stronger than t in "Get".
+// weak space between Gef and s and between s and words.
+// weak space between words and right.
+const char* kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d",
+ "s", "", "r", "i", "g", "h", "t", ".", nullptr};
+const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65,
+ 0.89, 0.99, 0.99, 0.99, 0.99, 0.95,
+ 0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
+const char* kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h",
+ "S", " ", "t", "I", "9", "b", "f", ",", nullptr};
+const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25,
+ 0.10, 0.01, 0.01, 0.01, 0.01, 0.05,
+ 0.01, 0.09, 0.09, 0.09, 0.05, 0.25};
+
+const char* kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
+const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
+const char* kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
+const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
+
+const char* kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
+const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
+const char* kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
+const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
+
+class RecodeBeamTest : public ::testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ RecodeBeamTest() : lstm_dict_(&ccutil_) {}
+ ~RecodeBeamTest() { lstm_dict_.End(); }
+
+ // Loads and compresses the given unicharset.
+ void LoadUnicharset(const std::string& unicharset_name) {
+ std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR,
+ "radical-stroke.txt");
+ std::string unicharset_file =
+ file::JoinPath(TESTDATA_DIR, unicharset_name);
+ std::string radical_data;
+ CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
+ file::Defaults()));
+ CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
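+    // Pick the unichar that acts as the CTC null: UNICHAR_BROKEN if the set
+    // has special codes, otherwise a new id one past the end of the set.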
+ unichar_null_char_ = ccutil_.unicharset.has_special_codes()
+ ? UNICHAR_BROKEN
+ : ccutil_.unicharset.size();
+ STRING radical_str(radical_data.c_str());
+ EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_,
+ &radical_str));
+ RecodedCharID code;
+ recoder_.EncodeUnichar(unichar_null_char_, &code);
+ encoded_null_char_ = code(0);
+ // Space should encode as itself.
+ recoder_.EncodeUnichar(UNICHAR_SPACE, &code);
+ EXPECT_EQ(UNICHAR_SPACE, code(0));
+ std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
+ STRING encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
+ std::string encoding_str(&encoding[0], encoding.size());
+ CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
+    LOG(INFO) << "Wrote encoding to: " << output_name << "\n";
+ }
+ // Loads the dictionary.
+ void LoadDict(const std::string& lang) {
+ std::string traineddata_name = lang + ".traineddata";
+ std::string traineddata_file =
+ file::JoinPath(TESTDATA_DIR, traineddata_name);
+ lstm_dict_.SetupForLoad(nullptr);
+ tesseract::TessdataManager mgr;
+ mgr.Init(traineddata_file.c_str());
+ lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
+ lstm_dict_.FinishLoad();
+ }
+
+  // Expects the appropriate results from the compressed ccutil_.unicharset.
+ void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output,
+ const GenericVector<int>& transcription) {
+ // Get the utf8 string of the transcription.
+ std::string truth_utf8;
+ for (int i = 0; i < transcription.size(); ++i) {
+ truth_utf8 += ccutil_.unicharset.id_to_unichar(transcription[i]);
+ }
+ PointerVector<WERD_RES> words;
+ ExpectCorrect(output, truth_utf8, nullptr, &words);
+ }
+ void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output,
+ const std::string& truth_utf8, Dict* dict,
+ PointerVector<WERD_RES>* words) {
+ RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
+ beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
+ // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
+ // beam_search.DebugBeams(ccutil_.unicharset);
+ std::vector<int> labels, xcoords;
+ beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
+ LOG(INFO) << "Labels size = " << labels.size() << " coords "
+ << xcoords.size() << "\n";
+ // Now decode using recoder_.
+ std::string decoded;
+ int end = 1;
+ for (int start = 0; start < labels.size(); start = end) {
+ RecodedCharID code;
+ int index = start;
+ int uni_id = INVALID_UNICHAR_ID;
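+      // Accumulate labels into a code sequence until it decodes to a valid
+      // unichar and the next label could legally start a new code.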
+ do {
+ code.Set(code.length(), labels[index++]);
+ uni_id = recoder_.DecodeUnichar(code);
+ } while (index < labels.size() &&
+ code.length() < RecodedCharID::kMaxCodeLen &&
+ (uni_id == INVALID_UNICHAR_ID ||
+ !recoder_.IsValidFirstCode(labels[index])));
+ EXPECT_NE(INVALID_UNICHAR_ID, uni_id)
+ << "index=" << index << "/" << labels.size();
+ // To the extent of truth_utf8, we expect decoded to match, but if
+ // transcription is shorter, that is OK too, as we may just be testing
+ // that we get a valid sequence when padded with random data.
+ if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size())
+ decoded += ccutil_.unicharset.id_to_unichar(uni_id);
+ end = index;
+ }
+ EXPECT_EQ(truth_utf8, decoded);
+
+ // Check that ExtractBestPathAsUnicharIds does the same thing.
+ std::vector<int> unichar_ids;
+ std::vector<float> certainties, ratings;
+ beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset,
+ &unichar_ids, &certainties,
+ &ratings, &xcoords);
+ std::string u_decoded;
+ float total_rating = 0.0f;
+ for (int u = 0; u < unichar_ids.size(); ++u) {
+ // To the extent of truth_utf8, we expect decoded to match, but if
+ // transcription is shorter, that is OK too, as we may just be testing
+ // that we get a valid sequence when padded with random data.
+ if (u_decoded.size() < truth_utf8.size()) {
+ const char* str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
+ total_rating += ratings[u];
+ LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u,
+ unichar_ids[u], str, certainties[u],
+ ratings[u], total_rating, xcoords[u]) << "\n";
+ if (str[0] == ' ') total_rating = 0.0f;
+ u_decoded += str;
+ }
+ }
+ EXPECT_EQ(truth_utf8, u_decoded);
+
+ // Check that ExtractBestPathAsWords does the same thing.
+ TBOX line_box(0, 0, 100, 10);
+ for (int i = 0; i < 2; ++i) {
+ beam_search.ExtractBestPathAsWords(line_box, 1.0f, false,
+ &ccutil_.unicharset, words);
+ std::string w_decoded;
+ for (int w = 0; w < words->size(); ++w) {
+ const WERD_RES* word = (*words)[w];
+ if (w_decoded.size() < truth_utf8.size()) {
+ if (!w_decoded.empty() && word->word->space()) w_decoded += " ";
+ w_decoded += word->best_choice->unichar_string().c_str();
+ }
+ LOG(INFO) << absl::StrFormat("Word:%d = %s, c=%g, r=%g, perm=%d", w,
+ word->best_choice->unichar_string().c_str(),
+ word->best_choice->certainty(),
+ word->best_choice->rating(),
+ word->best_choice->permuter()) << "\n";
+ }
+ std::string w_trunc(w_decoded.data(), truth_utf8.size());
+ if (truth_utf8 != w_trunc) {
+ tesseract::NormalizeUTF8String(
+ tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize,
+ tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
+ w_trunc.assign(w_decoded.data(), truth_utf8.size());
+ }
+ EXPECT_EQ(truth_utf8, w_trunc);
+ }
+ }
+ // Generates easy encoding of the given unichar_ids, and pads with at least
+ // padding of random data.
+ GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(
+ const GenericVector<int>& unichar_ids, int padding) {
+ int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
+ int num_codes = recoder_.code_range();
+ GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
+ // Fill with random data.
+ TRand random;
+ for (int t = 0; t < width; ++t) {
+ for (int i = 0; i < num_codes; ++i)
+ outputs(t, i) = random.UnsignedRand(0.25);
+ }
+ int t = 0;
+ for (int i = 0; i < unichar_ids.size(); ++i) {
+ RecodedCharID code;
+ int len = recoder_.EncodeUnichar(unichar_ids[i], &code);
+ EXPECT_NE(0, len);
+ for (int j = 0; j < len; ++j) {
+ // Make the desired answer a clear winner.
+ if (j > 0 && code(j) == code(j - 1)) {
+ // We will collapse adjacent equal codes so put a null in between.
+ outputs(t++, encoded_null_char_) = 1.0f;
+ }
+ outputs(t++, code(j)) = 1.0f;
+ }
+ // Put a 0 as a null char in between.
+ outputs(t++, encoded_null_char_) = 1.0f;
+ }
+ // Normalize the probs.
+ for (int t = 0; t < width; ++t) {
+ double sum = 0.0;
+ for (int i = 0; i < num_codes; ++i) sum += outputs(t, i);
+ for (int i = 0; i < num_codes; ++i) outputs(t, i) /= sum;
+ }
+
+ return outputs;
+ }
+ // Encodes a utf8 string (character) as unichar_id, then recodes, and sets
+ // the score for the appropriate sequence of codes, returning the ending t.
+ int EncodeUTF8(const char* utf8_str, float score, int start_t, TRand* random,
+ GENERIC_2D_ARRAY<float>* outputs) {
+ int t = start_t;
+ std::vector<int> unichar_ids;
+ EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids,
+ nullptr, nullptr));
+ if (unichar_ids.empty() || utf8_str[0] == '\0') {
+ unichar_ids.clear();
+ unichar_ids.push_back(unichar_null_char_);
+ }
+ int num_ids = unichar_ids.size();
+ for (int u = 0; u < num_ids; ++u) {
+ RecodedCharID code;
+ int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
+ EXPECT_NE(0, len);
+ for (int i = 0; i < len; ++i) {
+ // Apply the desired score.
+ (*outputs)(t++, code(i)) = score;
+ if (random != nullptr &&
+ t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
+ int dups = static_cast<int>(random->UnsignedRand(3.0));
+ for (int d = 0; d < dups; ++d) {
+ // Duplicate the desired score.
+ (*outputs)(t++, code(i)) = score;
+ }
+ }
+ }
+ if (random != nullptr &&
+ t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
+ int dups = static_cast<int>(random->UnsignedRand(3.0));
+ for (int d = 0; d < dups; ++d) {
+ // Add a random number of nulls as well.
+ (*outputs)(t++, encoded_null_char_) = score;
+ }
+ }
+ }
+ return t;
+ }
+ // Generates an encoding of the given 4 arrays as synthetic network scores.
+  // Uses scores1 for chars1 and scores2 for chars2, and everything else gets
+ // the leftovers shared out equally. Note that empty string encodes as the
+ // null_char_.
+ GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char* chars1[],
+ const float scores1[],
+ const char* chars2[],
+ const float scores2[],
+ TRand* random) {
+ int width = 0;
+ while (chars1[width] != nullptr) ++width;
+ int padding = width * RecodedCharID::kMaxCodeLen;
+ int num_codes = recoder_.code_range();
+ GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
+ int t = 0;
+ for (int i = 0; i < width; ++i) {
+ // In case there is overlap in the codes between 1st and 2nd choice, it
+ // is better to encode the 2nd choice first.
+ int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
+ int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
+ // Advance t to the max end, setting everything else to the leftovers.
+ int max_t = std::max(end_t1, end_t2);
+ while (t < max_t) {
+ double total_score = 0.0;
+ for (int j = 0; j < num_codes; ++j) total_score += outputs(t, j);
+ double null_remainder = (1.0 - total_score) / 2.0;
+ double remainder = null_remainder / (num_codes - 2);
+ if (outputs(t, encoded_null_char_) < null_remainder) {
+ outputs(t, encoded_null_char_) += null_remainder;
+ } else {
+ remainder += remainder;
+ }
+ for (int j = 0; j < num_codes; ++j) {
+ if (outputs(t, j) == 0.0f) outputs(t, j) = remainder;
+ }
+ ++t;
+ }
+ }
+ // Fill the rest with null chars.
+ while (t < width + padding) {
+ outputs(t++, encoded_null_char_) = 1.0f;
+ }
+ return outputs;
+ }
+ UnicharCompress recoder_;
+ int unichar_null_char_ = 0;
+ int encoded_null_char_ = 0;
+ CCUtil ccutil_;
+ Dict lstm_dict_;
+};
+
+TEST_F(RecodeBeamTest, DoesChinese) {
+ LOG(INFO) << "Testing chi_tra" << "\n";
+ LoadUnicharset("chi_tra.unicharset");
+  // Correctly reproduce the first kNumChars characters from easy output.
+ GenericVector<int> transcription;
+ for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
+ transcription.push_back(i);
+ GENERIC_2D_ARRAY<float> outputs =
+ GenerateRandomPaddedOutputs(transcription, kPadding);
+ ExpectCorrect(outputs, transcription);
+ LOG(INFO) << "Testing chi_sim" << "\n";
+ LoadUnicharset("chi_sim.unicharset");
+  // Correctly reproduce the first kNumChars characters from easy output.
+ transcription.clear();
+ for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
+ transcription.push_back(i);
+ outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
+ ExpectCorrect(outputs, transcription);
+}
+
+TEST_F(RecodeBeamTest, DoesJapanese) {
+ LOG(INFO) << "Testing jpn" << "\n";
+ LoadUnicharset("jpn.unicharset");
+  // Correctly reproduce the first kNumChars characters from easy output.
+ GenericVector<int> transcription;
+ for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
+ transcription.push_back(i);
+ GENERIC_2D_ARRAY<float> outputs =
+ GenerateRandomPaddedOutputs(transcription, kPadding);
+ ExpectCorrect(outputs, transcription);
+}
+
+TEST_F(RecodeBeamTest, DoesKorean) {
+ LOG(INFO) << "Testing kor" << "\n";
+ LoadUnicharset("kor.unicharset");
+  // Correctly reproduce the first kNumChars characters from easy output.
+ GenericVector<int> transcription;
+ for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
+ transcription.push_back(i);
+ GENERIC_2D_ARRAY<float> outputs =
+ GenerateRandomPaddedOutputs(transcription, kPadding);
+ ExpectCorrect(outputs, transcription);
+}
+
+TEST_F(RecodeBeamTest, DoesKannada) {
+ LOG(INFO) << "Testing kan" << "\n";
+ LoadUnicharset("kan.unicharset");
+  // Correctly reproduce the first kNumChars characters from easy output.
+ GenericVector<int> transcription;
+ for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
+ transcription.push_back(i);
+ GENERIC_2D_ARRAY<float> outputs =
+ GenerateRandomPaddedOutputs(transcription, kPadding);
+ ExpectCorrect(outputs, transcription);
+}
+
+TEST_F(RecodeBeamTest, DoesMarathi) {
+ LOG(INFO) << "Testing mar" << "\n";
+ LoadUnicharset("mar.unicharset");
+  // Correctly reproduce the first kNumChars characters from easy output.
+ GenericVector<int> transcription;
+ for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
+ transcription.push_back(i);
+ GENERIC_2D_ARRAY<float> outputs =
+ GenerateRandomPaddedOutputs(transcription, kPadding);
+ ExpectCorrect(outputs, transcription);
+}
+
+TEST_F(RecodeBeamTest, DoesEnglish) {
+ LOG(INFO) << "Testing eng" << "\n";
+ LoadUnicharset("eng.unicharset");
+  // Correctly reproduce the first kNumChars characters from easy output.
+ GenericVector<int> transcription;
+ for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i)
+ transcription.push_back(i);
+ GENERIC_2D_ARRAY<float> outputs =
+ GenerateRandomPaddedOutputs(transcription, kPadding);
+ ExpectCorrect(outputs, transcription);
+}
+
+TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
+ LOG(INFO) << "Testing eng dictionary" << "\n";
+ LoadUnicharset("eng_beam.unicharset");
+ GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
+ kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);
+ std::string default_str;
+ for (int i = 0; kGWRTops[i] != nullptr; ++i) default_str += kGWRTops[i];
+ PointerVector<WERD_RES> words;
+ ExpectCorrect(outputs, default_str, nullptr, &words);
+ // Now try again with the dictionary.
+ LoadDict("eng_beam");
+ ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words);
+}
+
+TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
+ LOG(INFO) << "Testing zh_hans dictionary" << "\n";
+ LoadUnicharset("zh_hans.unicharset");
+ GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
+ kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
+ PointerVector<WERD_RES> words;
+ ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words);
+ // Each is an individual word, with permuter = top choice.
+ EXPECT_EQ(7, words.size());
+ for (int w = 0; w < words.size(); ++w) {
+ EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter());
+ }
+ // Now try again with the dictionary.
+ LoadDict("zh_hans");
+ ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words);
+ // Number of words expected.
+ const int kNumWords = 5;
+ // Content of the words.
+ const char* kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"};
+ // Permuters of the words.
+ const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM,
+ TOP_CHOICE_PERM, TOP_CHOICE_PERM,
+ SYSTEM_DAWG_PERM};
+ EXPECT_EQ(kNumWords, words.size());
+ for (int w = 0; w < kNumWords && w < words.size(); ++w) {
+ EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());
+ EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter());
+ }
+}
+
+// Tests that a recoder built with decomposed unicode allows true ctc
+// arbitrary duplicates and inserted nulls inside the multicode sequence.
+TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
+ LOG(INFO) << "Testing duplicates in multi-code sequences" << "\n";
+ LoadUnicharset("vie.d.unicharset");
+ tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);
+ TRand random;
+ GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs(
+ kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
+ PointerVector<WERD_RES> words;
+ std::string truth_str;
+ tesseract::NormalizeUTF8String(
+ tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
+ tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str);
+ ExpectCorrect(outputs, truth_str, nullptr, &words);
+}
+
+} // namespace
diff --git a/tesseract/unittest/rect_test.cc b/tesseract/unittest/rect_test.cc
new file mode 100644
index 00000000..5d9d439f
--- /dev/null
+++ b/tesseract/unittest/rect_test.cc
@@ -0,0 +1,176 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "rect.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class TBOXTest : public testing::Test {
+ public:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ void TearDown() {}
+};
+
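+// Note: TBOX takes (left, bottom, right, top) in Tesseract's bottom-up
+// coordinate system, so TBOX(10, 10, 20, 20) is a 10x10 box with its
+// bottom-left corner at (10, 10).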
+TEST_F(TBOXTest, OverlapInside) {
+ TBOX a(10, 10, 20, 20);
+ TBOX b(11, 11, 12, 12);
+
+ EXPECT_TRUE(a.overlap(b));
+ EXPECT_TRUE(b.overlap(a));
+ EXPECT_DOUBLE_EQ(0.01, a.overlap_fraction(b));
+ EXPECT_DOUBLE_EQ(1.0, b.overlap_fraction(a));
+}
+
+TEST_F(TBOXTest, OverlapBoolCorners) {
+ TBOX mid(10, 10, 30, 30);
+ TBOX bottom_left(5, 5, 15, 15);
+ TBOX top_left(5, 25, 15, 35);
+ // other corners covered by symmetry
+
+ EXPECT_TRUE(mid.overlap(bottom_left));
+ EXPECT_TRUE(bottom_left.overlap(mid));
+ EXPECT_TRUE(mid.overlap(top_left));
+ EXPECT_TRUE(top_left.overlap(mid));
+}
+
+TEST_F(TBOXTest, OverlapFractionCorners) {
+ TBOX mid(10, 10, 30, 30);
+ TBOX bottom_left(5, 5, 15, 15);
+ TBOX top_left(5, 25, 15, 35);
+ // other corners covered by symmetry
+
+ EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0),
+ mid.overlap_fraction(bottom_left));
+ EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0),
+ bottom_left.overlap_fraction(mid));
+ EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(top_left));
+ EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), top_left.overlap_fraction(mid));
+}
+
+TEST_F(TBOXTest, OverlapBoolSides) {
+ TBOX mid(10, 10, 30, 30);
+ TBOX left(5, 15, 15, 25);
+ TBOX bottom(15, 5, 25, 15);
+ // other sides covered by symmetry
+
+ EXPECT_TRUE(mid.overlap(left));
+ EXPECT_TRUE(left.overlap(mid));
+ EXPECT_TRUE(mid.overlap(bottom));
+ EXPECT_TRUE(bottom.overlap(mid));
+}
+
+TEST_F(TBOXTest, OverlapFractionSides) {
+ TBOX mid(10, 10, 30, 30);
+ TBOX left(5, 15, 15, 25);
+ TBOX bottom(15, 5, 25, 15);
+ // other sides covered by symmetry
+
+ EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(left));
+ EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), left.overlap_fraction(mid));
+ EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(bottom));
+ EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), bottom.overlap_fraction(mid));
+}
+
+TEST_F(TBOXTest, OverlapBoolSpan) {
+ TBOX mid(10, 10, 30, 30);
+ TBOX vertical(15, 5, 25, 35);
+ TBOX horizontal(5, 15, 35, 25);
+ // other sides covered by symmetry in other test cases
+
+ EXPECT_TRUE(mid.overlap(vertical));
+ EXPECT_TRUE(vertical.overlap(mid));
+ EXPECT_TRUE(mid.overlap(horizontal));
+ EXPECT_TRUE(horizontal.overlap(mid));
+}
+
+TEST_F(TBOXTest, OverlapFractionSpan) {
+ TBOX mid(10, 10, 30, 30);
+ TBOX vertical(15, 5, 25, 35);
+ TBOX horizontal(5, 15, 35, 25);
+ // other sides covered by symmetry in other test cases
+
+ EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0),
+ mid.overlap_fraction(vertical));
+ EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0),
+ vertical.overlap_fraction(mid));
+ EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0),
+ mid.overlap_fraction(horizontal));
+ EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0),
+ horizontal.overlap_fraction(mid));
+}
+
+// TODO(nbeato): pretty much all cases
+TEST_F(TBOXTest, OverlapOutsideTests) {
+ TBOX mid(10, 10, 30, 30);
+ TBOX left(0, 15, 5, 25);
+
+ EXPECT_FALSE(mid.overlap(left));
+ EXPECT_FALSE(left.overlap(mid));
+ EXPECT_DOUBLE_EQ(0.0, mid.overlap_fraction(left));
+ EXPECT_DOUBLE_EQ(0.0, left.overlap_fraction(mid));
+}
+
+TEST_F(TBOXTest, OverlapXFraction) {
+ TBOX a(10, 10, 20, 20);
+ TBOX b(12, 100, 26, 200);
+ TBOX c(0, 0, 100, 100);
+ TBOX d(0, 0, 1, 1);
+
+ EXPECT_DOUBLE_EQ(8.0 / 10.0, a.x_overlap_fraction(b));
+ EXPECT_DOUBLE_EQ(8.0 / 14.0, b.x_overlap_fraction(a));
+ EXPECT_DOUBLE_EQ(1.0, a.x_overlap_fraction(c));
+ EXPECT_DOUBLE_EQ(10.0 / 100.0, c.x_overlap_fraction(a));
+ EXPECT_DOUBLE_EQ(0.0, a.x_overlap_fraction(d));
+ EXPECT_DOUBLE_EQ(0.0, d.x_overlap_fraction(a));
+}
+
+TEST_F(TBOXTest, OverlapYFraction) {
+ TBOX a(10, 10, 20, 20);
+ TBOX b(100, 12, 200, 26);
+ TBOX c(0, 0, 100, 100);
+ TBOX d(0, 0, 1, 1);
+
+ EXPECT_DOUBLE_EQ(8.0 / 10.0, a.y_overlap_fraction(b));
+ EXPECT_DOUBLE_EQ(8.0 / 14.0, b.y_overlap_fraction(a));
+ EXPECT_DOUBLE_EQ(1.0, a.y_overlap_fraction(c));
+ EXPECT_DOUBLE_EQ(10.0 / 100.0, c.y_overlap_fraction(a));
+ EXPECT_DOUBLE_EQ(0.0, a.y_overlap_fraction(d));
+ EXPECT_DOUBLE_EQ(0.0, d.y_overlap_fraction(a));
+}
+
+TEST_F(TBOXTest, OverlapXFractionZeroSize) {
+ TBOX zero(10, 10, 10, 10);
+ TBOX big(0, 0, 100, 100);
+ TBOX small(0, 0, 1, 1);
+
+ EXPECT_DOUBLE_EQ(1.0, zero.x_overlap_fraction(big));
+ EXPECT_DOUBLE_EQ(0.0, big.x_overlap_fraction(zero));
+ EXPECT_DOUBLE_EQ(0.0, zero.x_overlap_fraction(small));
+ EXPECT_DOUBLE_EQ(0.0, small.x_overlap_fraction(zero));
+}
+
+TEST_F(TBOXTest, OverlapYFractionZeroSize) {
+ TBOX zero(10, 10, 10, 10);
+ TBOX big(0, 0, 100, 100);
+ TBOX small(0, 0, 1, 1);
+
+ EXPECT_DOUBLE_EQ(1.0, zero.y_overlap_fraction(big));
+ EXPECT_DOUBLE_EQ(0.0, big.y_overlap_fraction(zero));
+ EXPECT_DOUBLE_EQ(0.0, zero.y_overlap_fraction(small));
+ EXPECT_DOUBLE_EQ(0.0, small.y_overlap_fraction(zero));
+}
+
+} // namespace
diff --git a/tesseract/unittest/resultiterator_test.cc b/tesseract/unittest/resultiterator_test.cc
new file mode 100644
index 00000000..50e18949
--- /dev/null
+++ b/tesseract/unittest/resultiterator_test.cc
@@ -0,0 +1,612 @@
+
+#include <tesseract/resultiterator.h>
+#include <string>
+#include "allheaders.h"
+#include <tesseract/baseapi.h>
+#include "genericvector.h"
+#include "scrollview.h"
+
+#include "include_gunit.h"
+#include "log.h" // for LOG
+#include "absl/strings/str_format.h" // for absl::StrFormat
+
+namespace tesseract {
+
+// DEFINE_string(tess_config, "", "config file for tesseract");
+// DEFINE_bool(visual_test, false, "Runs a visual test using scrollview");
+
+// Helper functions for converting to STL vectors
+template <typename T>
+void ToVector(const GenericVector<T>& from, std::vector<T>* to) {
+ to->clear();
+ for (int i = 0; i < from.size(); i++) to->push_back(from[i]);
+}
+
+template <typename T>
+void ToVector(const std::vector<T>& from, std::vector<T>* to) {
+ to->clear();
+ for (int i = 0; i < from.size(); i++) to->push_back(from[i]);
+}
+
+// The fixture for testing Tesseract.
+class ResultIteratorTest : public testing::Test {
+ protected:
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR , name);
+ }
+ std::string TessdataPath() {
+ return file::JoinPath(TESSDATA_DIR, "");
+ }
+ std::string OutputNameToPath(const std::string& name) {
+ file::MakeTmpdir();
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+ }
+
+ ResultIteratorTest() { src_pix_ = nullptr; }
+ ~ResultIteratorTest() {}
+
+ void SetImage(const char* filename) {
+ src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
+ api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
+// if (!FLAGS_tess_config.empty())
+// api_.ReadConfigFile(FLAGS_tess_config.c_str());
+ api_.SetPageSegMode(tesseract::PSM_AUTO);
+ api_.SetImage(src_pix_);
+ pixDestroy(&src_pix_);
+ src_pix_ = api_.GetInputImage();
+ }
+
+ // Rebuilds the image using the binary images at the given level, and
+ // EXPECTs that the number of pixels in the xor of the rebuilt image with
+ // the original is at most max_diff.
+ void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator* it) {
+ it->Begin();
+ int width = pixGetWidth(src_pix_);
+ int height = pixGetHeight(src_pix_);
+ int depth = pixGetDepth(src_pix_);
+ Pix* pix = pixCreate(width, height, depth);
+ EXPECT_TRUE(depth == 1 || depth == 8);
+ if (depth == 8) pixSetAll(pix);
+ do {
+ int left, top, right, bottom;
+ PageIteratorLevel im_level = level;
+ // If the return is false, it is a non-text block so get the block image.
+ if (!it->BoundingBox(level, &left, &top, &right, &bottom)) {
+ im_level = tesseract::RIL_BLOCK;
+ EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom));
+ }
+ LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right
+ << ", B:" << bottom << "]" << "\n";
+ Pix* block_pix;
+ if (depth == 1) {
+ block_pix = it->GetBinaryImage(im_level);
+ pixRasterop(pix, left, top, right - left, bottom - top,
+ PIX_SRC ^ PIX_DST, block_pix, 0, 0);
+ } else {
+ block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top);
+ pixRasterop(pix, left, top, pixGetWidth(block_pix),
+ pixGetHeight(block_pix), PIX_SRC & PIX_DST, block_pix, 0,
+ 0);
+ }
+ CHECK(block_pix != nullptr);
+ pixDestroy(&block_pix);
+ } while (it->Next(level));
+// if (base::GetFlag(FLAGS_v) >= 1)
+// pixWrite(OutputNameToPath("rebuilt.png").c_str(), pix, IFF_PNG);
+ pixRasterop(pix, 0, 0, width, height, PIX_SRC ^ PIX_DST, src_pix_, 0, 0);
+ if (depth == 8) {
+ Pix* binary_pix = pixThresholdToBinary(pix, 128);
+ pixDestroy(&pix);
+ pixInvert(binary_pix, binary_pix);
+ pix = binary_pix;
+ }
+// if (base::GetFlag(FLAGS_v) >= 1)
+// pixWrite(OutputNameToPath("rebuiltxor.png").c_str(), pix, IFF_PNG);
+ l_int32 pixcount;
+ pixCountPixels(pix, &pixcount, nullptr);
+ if (pixcount > max_diff) {
+ std::string outfile = OutputNameToPath("failedxor.png");
+ LOG(INFO) << "outfile = " << outfile << "\n";
+ pixWrite(outfile.c_str(), pix, IFF_PNG);
+ }
+ pixDestroy(&pix);
+ LOG(INFO) << absl::StrFormat("At level %d: pix diff = %d\n", level, pixcount);
+ EXPECT_LE(pixcount, max_diff);
+// if (base::GetFlag(FLAGS_v) > 1) CHECK_LE(pixcount, max_diff);
+ }
+
+ // Rebuilds the text from the iterator strings at the given level, and
+ // EXPECTs that the rebuild string exactly matches the truth string.
+ void VerifyIteratorText(const std::string& truth, PageIteratorLevel level,
+ ResultIterator* it) {
+ LOG(INFO) << "Text Test Level " << level << "\n";
+ it->Begin();
+ std::string result;
+ do {
+ char* text = it->GetUTF8Text(level);
+ result += text;
+ delete[] text;
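+      // Append a space after each word, a newline at the end of each line,
+      // and an extra newline between paragraphs within the same block.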
+ if ((level == tesseract::RIL_WORD || level == tesseract::RIL_SYMBOL) &&
+ it->IsAtFinalElement(tesseract::RIL_WORD, level)) {
+ if (it->IsAtFinalElement(tesseract::RIL_TEXTLINE, level)) {
+ result += '\n';
+ } else {
+ result += ' ';
+ }
+ if (it->IsAtFinalElement(tesseract::RIL_PARA, level) &&
+ !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level)))
+ result += '\n';
+ }
+ } while (it->Next(level));
+ EXPECT_STREQ(truth.c_str(), result.c_str())
+ << "Rebuild failed at Text Level " << level;
+ }
+
+ void VerifyRebuilds(int block_limit, int para_limit, int line_limit,
+ int word_limit, int symbol_limit, PageIterator* it) {
+ VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it);
+ VerifyRebuild(para_limit, tesseract::RIL_PARA, it);
+ VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it);
+ VerifyRebuild(word_limit, tesseract::RIL_WORD, it);
+ VerifyRebuild(symbol_limit, tesseract::RIL_SYMBOL, it);
+ }
+
+ void VerifyAllText(const std::string& truth, ResultIterator* it) {
+ VerifyIteratorText(truth, tesseract::RIL_BLOCK, it);
+ VerifyIteratorText(truth, tesseract::RIL_PARA, it);
+ VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it);
+ VerifyIteratorText(truth, tesseract::RIL_WORD, it);
+ VerifyIteratorText(truth, tesseract::RIL_SYMBOL, it);
+ }
+
+ // Verifies that ResultIterator::CalculateTextlineOrder() produces the right
+ // results given an array of word directions (word_dirs[num_words]), an
+ // expected output reading order
+ // (expected_reading_order[num_reading_order_entries]) and a given reading
+ // context (ltr or rtl).
+ void ExpectTextlineReadingOrder(bool in_ltr_context,
+ const StrongScriptDirection* word_dirs,
+ int num_words, int* expected_reading_order,
+ int num_reading_order_entries) const {
+ std::vector<StrongScriptDirection> gv_word_dirs;
+ for (int i = 0; i < num_words; i++) {
+ gv_word_dirs.push_back(word_dirs[i]);
+ }
+
+ std::vector<int> output;
+ ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs,
+ &output);
+ // STL vector can be used with EXPECT_EQ, so convert...
+ std::vector<int> correct_order(
+ expected_reading_order,
+ expected_reading_order + num_reading_order_entries);
+ std::vector<int> calculated_order;
+ ToVector(output, &calculated_order);
+ EXPECT_EQ(correct_order, calculated_order);
+ }
+
+ // Verify that ResultIterator::CalculateTextlineOrder() produces sane output
+ // for a given array of word_dirs[num_words] in ltr or rtl context.
+ // Sane means that the output contains some permutation of the indices
+ // 0..[num_words - 1] interspersed optionally with negative (marker) values.
+ void VerifySaneTextlineOrder(bool in_ltr_context,
+ const StrongScriptDirection* word_dirs,
+ int num_words) const {
+ std::vector<StrongScriptDirection> gv_word_dirs;
+ for (int i = 0; i < num_words; i++) {
+ gv_word_dirs.push_back(word_dirs[i]);
+ }
+
+ std::vector<int> output;
+ ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs,
+ &output);
+ ASSERT_GE(output.size(), num_words);
+ std::vector<int> output_copy(output);
+ std::sort(output_copy.begin(), output_copy.end());
+ bool sane = true;
+ int j = 0;
+ while (j < output_copy.size() && output_copy[j] < 0) j++;
+ for (int i = 0; i < num_words; i++, j++) {
+ if (output_copy[j] != i) {
+ sane = false;
+ break;
+ }
+ }
+ if (j != output_copy.size()) {
+ sane = false;
+ }
+ if (!sane) {
+ std::vector<int> output_copy2, empty;
+ ToVector(output, &output_copy2);
+ EXPECT_EQ(output_copy2, empty)
+ << " permutation of 0.." << num_words - 1 << " not found in "
+ << (in_ltr_context ? "ltr" : "rtl") << " context.";
+ }
+ }
+
+  // Objects declared here can be used by all tests in the test case.
+ Pix* src_pix_; // Borrowed from api_. Do not destroy.
+ std::string ocr_text_;
+ tesseract::TessBaseAPI api_;
+};
+
+// Tests layout analysis output (and scrollview) on the UNLV page
+// 8087_054.3G.tif (Dubrovnik), but only if --visual_test is true.
+//
+//TEST_F(ResultIteratorTest, VisualTest) {
+// if (!FLAGS_visual_test) return;
+// const char* kIms[] = {"8087_054.3G.tif", "8071_093.3B.tif", nullptr};
+// for (int i = 0; kIms[i] != nullptr; ++i) {
+// SetImage(kIms[i]);
+// // Just run layout analysis.
+// PageIterator* it = api_.AnalyseLayout();
+// EXPECT_FALSE(it == nullptr);
+// // Make a scrollview window for the display.
+// int width = pixGetWidth(src_pix_);
+// int height = pixGetHeight(src_pix_);
+// ScrollView* win =
+// new ScrollView(kIms[i], 100, 100, width / 2, height / 2, width, height);
+// win->Image(src_pix_, 0, 0);
+// it->Begin();
+// ScrollView::Color color = ScrollView::RED;
+// win->Brush(ScrollView::NONE);
+// do {
+// Pta* pts = it->BlockPolygon();
+// if (pts != nullptr) {
+// win->Pen(color);
+// int num_pts = ptaGetCount(pts);
+// l_float32 x, y;
+// ptaGetPt(pts, num_pts - 1, &x, &y);
+// win->SetCursor(static_cast<int>(x), static_cast<int>(y));
+// for (int p = 0; p < num_pts; ++p) {
+// ptaGetPt(pts, p, &x, &y);
+// win->DrawTo(static_cast<int>(x), static_cast<int>(y));
+// }
+// }
+// ptaDestroy(&pts);
+// } while (it->Next(tesseract::RIL_BLOCK));
+// win->Update();
+// delete win->AwaitEvent(SVET_DESTROY);
+// delete win;
+// delete it;
+// }
+//}
+
+// Tests that Tesseract gets exactly the right answer on phototest.
+TEST_F(ResultIteratorTest, EasyTest) {
+ SetImage("phototest.tif");
+ // Just run layout analysis.
+ PageIterator* p_it = api_.AnalyseLayout();
+ EXPECT_FALSE(p_it == nullptr);
+ // Check iterator position.
+ EXPECT_TRUE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));
+ // This should be a single block.
+ EXPECT_FALSE(p_it->Next(tesseract::RIL_BLOCK));
+ EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK));
+
+ // The images should rebuild almost perfectly.
+ LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)" << "\n";
+ VerifyRebuilds(10, 10, 0, 0, 0, p_it);
+ delete p_it;
+
+ char* result = api_.GetUTF8Text();
+ ocr_text_ = result;
+ delete[] result;
+ ResultIterator* r_it = api_.GetIterator();
+ // The images should rebuild almost perfectly.
+ LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" << "\n";
+ VerifyRebuilds(8, 8, 0, 0, 40, r_it);
+ // Test the text.
+ LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" << "\n";
+ VerifyAllText(ocr_text_, r_it);
+
+ // The images should rebuild almost perfectly.
+ LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" << "\n";
+ VerifyRebuilds(8, 8, 0, 0, 40, r_it);
+
+ r_it->Begin();
+ // Test baseline of the first line.
+ int x1, y1, x2, y2;
+ r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);
+ LOG(INFO) << absl::StrFormat("Baseline (%d,%d)->(%d,%d)", x1, y1, x2, y2) << "\n";
+ // Make sure we have a decent vector.
+ EXPECT_GE(x2, x1 + 400);
+ // The point 200,116 should be very close to the baseline.
+ // (x3,y3) is the vector from (x1,y1) to (200,116)
+ int x3 = 200 - x1;
+ int y3 = 116 - y1;
+ x2 -= x1;
+ y2 -= y1;
+  // The cross product (x2,y2)x(x3,y3) should be small.
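+  // Since (x2,y2) is now the baseline vector, |product| / |(x2,y2)| is the
+  // perpendicular distance of (200,116) from the baseline, so requiring
+  // |product| <= x2 bounds that distance to about one pixel.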
+ int product = x2 * y3 - x3 * y2;
+ EXPECT_LE(abs(product), x2);
+
+ // Test font attributes for each word.
+ do {
+ bool bold, italic, underlined, monospace, serif, smallcaps;
+ int pointsize, font_id;
+ const char* font =
+ r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
+ &serif, &smallcaps, &pointsize, &font_id);
+ float confidence = r_it->Confidence(tesseract::RIL_WORD);
+ EXPECT_GE(confidence, 80.0f);
+ char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
+ LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g",
+ word_str, font, font_id, pointsize, confidence) << "\n";
+ delete[] word_str;
+ EXPECT_FALSE(bold);
+ EXPECT_FALSE(italic);
+ EXPECT_FALSE(underlined);
+ EXPECT_FALSE(monospace);
+ EXPECT_FALSE(serif);
+ // The text is about 31 pixels high. Above we say the source is 200 ppi,
+ // which translates to:
+ // 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts
+ EXPECT_GE(pointsize, 11.16 - 1.50);
+ EXPECT_LE(pointsize, 11.16 + 1.50);
+ } while (r_it->Next(tesseract::RIL_WORD));
+ delete r_it;
+}
+
+// Tests image rebuild on the UNLV page 8087_054.3B.tif (Dubrovnik).
+TEST_F(ResultIteratorTest, ComplexTest) {
+ SetImage("8087_054.3B.tif");
+ // Just run layout analysis.
+ PageIterator* it = api_.AnalyseLayout();
+ EXPECT_FALSE(it == nullptr);
+ // The images should rebuild almost perfectly.
+ VerifyRebuilds(2073, 2073, 2080, 2081, 2090, it);
+ delete it;
+}
+
+// Tests image rebuild on the UNLV page 8087_054.3G.tif (Dubrovnik).
+TEST_F(ResultIteratorTest, GreyTest) {
+ SetImage("8087_054.3G.tif");
+ // Just run layout analysis.
+ PageIterator* it = api_.AnalyseLayout();
+ EXPECT_FALSE(it == nullptr);
+ // The images should rebuild almost perfectly.
+ VerifyRebuilds(600, 600, 600, 600, 600, it);
+ delete it;
+}
+
+// Tests that Tesseract gets smallcaps and dropcaps.
+TEST_F(ResultIteratorTest, SmallCapDropCapTest) {
+ SetImage("8071_093.3B.tif");
+ char* result = api_.GetUTF8Text();
+ delete[] result;
+ ResultIterator* r_it = api_.GetIterator();
+ // Iterate over the words.
+ int found_dropcaps = 0;
+ int found_smallcaps = 0;
+ int false_positives = 0;
+ do {
+ bool bold, italic, underlined, monospace, serif, smallcaps;
+ int pointsize, font_id;
+ r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
+ &smallcaps, &pointsize, &font_id);
+ char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
+ if (word_str != nullptr) {
+ LOG(INFO) << absl::StrFormat("Word %s is %s", word_str,
+ smallcaps ? "SMALLCAPS" : "Normal") << "\n";
+ if (r_it->SymbolIsDropcap()) {
+ ++found_dropcaps;
+ }
+ if (strcmp(word_str, "SHE") == 0 || strcmp(word_str, "MOPED") == 0 ||
+ strcmp(word_str, "RALPH") == 0 ||
+ strcmp(word_str, "KINNEY") == 0 || // Not working yet.
+ strcmp(word_str, "BENNETT") == 0) {
+ EXPECT_TRUE(smallcaps) << word_str;
+ ++found_smallcaps;
+ } else {
+ if (smallcaps) ++false_positives;
+ }
+ // No symbol other than the first of any word should be dropcap.
+ ResultIterator s_it(*r_it);
+ while (s_it.Next(tesseract::RIL_SYMBOL) &&
+ !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) {
+ if (s_it.SymbolIsDropcap()) {
+ char* sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
+ LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str,
+ word_str);
+ delete[] sym_str;
+ }
+ EXPECT_FALSE(s_it.SymbolIsDropcap());
+ }
+ delete[] word_str;
+ }
+ } while (r_it->Next(tesseract::RIL_WORD));
+ delete r_it;
+ EXPECT_EQ(1, found_dropcaps);
+ EXPECT_GE(4, found_smallcaps);
+ EXPECT_LE(false_positives, 3);
+}
+
+#if 0
+// TODO(rays) uncomment on the next change to layout analysis.
+// CL 22736106 breaks it, but it is fixed in the change when
+// the textline finders start to collapse.
+
+// Tests that Tesseract gets subscript and superscript.
+// TODO(rays) This test is a bit feeble, due to bad textline finding on this
+// image, so beef up the test a bit when we get less false positive subs.
+TEST_F(ResultIteratorTest, SubSuperTest) {
+ SetImage("0146_281.3B.tif");
+ char* result = api_.GetUTF8Text();
+ delete [] result;
+ ResultIterator* r_it = api_.GetIterator();
+ // Iterate over the symbols.
+ // Accuracy isn't great, so just count up and expect a decent count of
+ // positives and negatives.
+ const char kAllowedSupers[] = "O0123456789-";
+ int found_subs = 0;
+ int found_supers = 0;
+ int found_normal = 0;
+ do {
+ if (r_it->SymbolIsSubscript()) {
+ ++found_subs;
+ } else if (r_it->SymbolIsSuperscript()) {
+ result = r_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+ if (strchr(kAllowedSupers, result[0]) == nullptr) {
+ char* word = r_it->GetUTF8Text(tesseract::RIL_WORD);
+ LOG(ERROR) << absl::StrFormat("Char %s in word %s is unexpected super!",
+ result, word);
+ delete [] word;
+ EXPECT_TRUE(strchr(kAllowedSupers, result[0]) != nullptr);
+ }
+ delete [] result;
+ ++found_supers;
+ } else {
+ ++found_normal;
+ }
+ } while (r_it->Next(tesseract::RIL_SYMBOL));
+ delete r_it;
+ LOG(INFO) << absl::StrFormat("Subs = %d, supers= %d, normal = %d",
+ found_subs, found_supers, found_normal) << "\n";
+ EXPECT_GE(found_subs, 25);
+ EXPECT_GE(found_supers, 25);
+ EXPECT_GE(found_normal, 1350);
+}
+#endif
+
+static const StrongScriptDirection dL = DIR_LEFT_TO_RIGHT;
+static const StrongScriptDirection dR = DIR_RIGHT_TO_LEFT;
+static const StrongScriptDirection dN = DIR_NEUTRAL;
+
+// Test that a sequence of words that could be interpreted to start from
+// the left side left-to-right or from the right side right-to-left is
+// interpreted appropriately in different contexts.
+TEST_F(ResultIteratorTest, DualStartTextlineOrderTest) {
+ const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR};
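+  // ResultIterator::kMinorRunStart / kMinorRunEnd are expected to bracket the
+  // run whose direction is in the minority for the line: the LTR words 0..3
+  // within the RTL reading order, and the RTL words (read 7, 6, 5) within the
+  // LTR reading order.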
+ int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart,
+ 0, 1, 2, 3, ResultIterator::kMinorRunEnd};
+ int reading_order_ltr_context[] = {0, 1,
+ 2, 3,
+ 4, ResultIterator::kMinorRunStart,
+ 7, 6,
+ 5, ResultIterator::kMinorRunEnd};
+
+ ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs),
+ reading_order_ltr_context,
+ ABSL_ARRAYSIZE(reading_order_ltr_context));
+ ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs),
+ reading_order_rtl_context,
+ ABSL_ARRAYSIZE(reading_order_rtl_context));
+}
+
+// Tests that clearly left-direction text (with no right-to-left indications)
+// comes out strictly left to right no matter the context.
+TEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) {
+ const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dN, dL, dL};
+ // The order here is just left to right, nothing fancy.
+ int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7};
+ // In the strange event that this shows up in an RTL paragraph, nonetheless
+ // just presume the whole thing is an LTR line.
+ int reading_order_rtl_context[] = {
+ ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7,
+ ResultIterator::kMinorRunEnd};
+
+ ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs),
+ reading_order_ltr_context,
+ ABSL_ARRAYSIZE(reading_order_ltr_context));
+ ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs),
+ reading_order_rtl_context,
+ ABSL_ARRAYSIZE(reading_order_rtl_context));
+}
+
+// Test that right-direction text comes out strictly right-to-left in
+// a right-to-left context.
+TEST_F(ResultIteratorTest, RightwardTextlineOrderTest) {
+ const StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR};
+ // The order here is just right-to-left, nothing fancy.
+ int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0};
+ ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs),
+ reading_order_rtl_context,
+ ABSL_ARRAYSIZE(reading_order_rtl_context));
+}
+
+TEST_F(ResultIteratorTest, TextlineOrderSanityCheck) {
+ // Iterate through all 7-word sequences and make sure that the output
+ // contains each of the indices 0..6 exactly once.
+ const int kNumWords(7);
+ const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations
+ StrongScriptDirection word_dirs[kNumWords];
+ for (int i = 0; i < kNumCombos; i++) {
+ // generate the next combination.
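+    // Each pair of bits in i selects one of the four StrongScriptDirection
+    // values (presumably DIR_NEUTRAL, DIR_LEFT_TO_RIGHT, DIR_RIGHT_TO_LEFT
+    // and DIR_MIX), so all 4^7 direction sequences get exercised.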
+ int tmp = i;
+ for (int j = 0; j < kNumWords; j++) {
+ word_dirs[j] = static_cast<StrongScriptDirection>(tmp % 4);
+ tmp = tmp / 4;
+ }
+ VerifySaneTextlineOrder(true, word_dirs, kNumWords);
+ VerifySaneTextlineOrder(false, word_dirs, kNumWords);
+ }
+}
+
+// TODO: Missing image
+TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) {
+ SetImage("5318c4b679264.jpg");
+ char* result = api_.GetUTF8Text();
+ delete[] result;
+ ResultIterator* r_it = api_.GetIterator();
+ // Iterate over the words.
+ do {
+ char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
+ if (word_str != nullptr) {
+ LOG(INFO) << absl::StrFormat("Word %s:", word_str) << "\n";
+ ResultIterator s_it = *r_it;
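+      // Copy the word iterator so the symbols of this word can be stepped
+      // through without disturbing the outer word loop.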
+ do {
+ tesseract::ChoiceIterator c_it(s_it);
+ do {
+ const char* char_str = c_it.GetUTF8Text();
+ if (char_str == nullptr)
+ LOG(INFO) << "Null char choice" << "\n";
+ else
+ LOG(INFO) << "Char choice " << char_str << "\n";
+ CHECK(char_str != nullptr);
+ } while (c_it.Next());
+ } while (
+ !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
+ s_it.Next(tesseract::RIL_SYMBOL));
+ delete[] word_str;
+ }
+ } while (r_it->Next(tesseract::RIL_WORD));
+ delete r_it;
+}
+
+// TODO: Missing image
+TEST_F(ResultIteratorTest, NonNullConfidencesTest) {
+// SetImage("line6.tiff");
+ SetImage("trainingitalline.tif");
+ api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK);
+ // Force recognition so we can used the result iterator.
+ // We don't care about the return from GetUTF8Text.
+ char* result = api_.GetUTF8Text();
+ delete[] result;
+ ResultIterator* r_it = api_.GetIterator();
+ // Iterate over the words.
+ do {
+ char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
+ if (word_str != nullptr) {
+ EXPECT_FALSE(r_it->Empty(tesseract::RIL_WORD));
+ EXPECT_FALSE(r_it->Empty(tesseract::RIL_SYMBOL));
+ ResultIterator s_it = *r_it;
+ do {
+ const char* char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL);
+ CHECK(char_str != nullptr);
+ float confidence = s_it.Confidence(tesseract::RIL_SYMBOL);
+ LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str,
+ confidence);
+ delete[] char_str;
+ } while (
+ !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) &&
+ s_it.Next(tesseract::RIL_SYMBOL));
+ delete[] word_str;
+ } else {
+ LOG(INFO) << "Empty word found" << "\n";
+ }
+ } while (r_it->Next(tesseract::RIL_WORD));
+ delete r_it;
+}
+
+} // namespace
diff --git a/tesseract/unittest/scanutils_test.cc b/tesseract/unittest/scanutils_test.cc
new file mode 100644
index 00000000..e6917fce
--- /dev/null
+++ b/tesseract/unittest/scanutils_test.cc
@@ -0,0 +1,114 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream> // for cout
+
+#include "include_gunit.h"
+#include "scanutils.h"
+
+namespace tesseract {
+
+class ScanutilsTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ }
+};
+
+TEST_F(ScanutilsTest, DoesScanf) {
+  // This test verifies that tfscanf scans the same way as stdio fscanf.
+ // There are probably a gazillion more test cases that could be added, but
+ // these brought the tesseract and unittest test results in line.
+ std::string filename = file::JoinPath(TESTDATA_DIR, "scanftest.txt");
+ FILE* fp1 = fopen(filename.c_str(), "r");
+ if (fp1 == nullptr) {
+ std::cout << "Failed to open file " << filename << '\n';
+ GTEST_SKIP();
+ }
+ FILE* fp2 = fopen(filename.c_str(), "r");
+ if (fp2 == nullptr) {
+ std::cout << "Failed to open file " << filename << '\n';
+ fclose(fp1);
+ GTEST_SKIP();
+ }
+ // The file contains this:
+ // 42.5 17 0.001000 -0.001000
+ // 0 1 123 -123 0x100
+ // abcdefghijklmnopqrstuvwxyz
+ // abcdefghijklmnopqrstuvwxyz
+ // MF 25 6.25e-2 0.5e5 -1e+4
+ // 42 MF 25 6.25e-2 0.5
+ // 24
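+  // The reads below consume those lines in order: the four floats, the five
+  // ints (the last via %i to accept hex 0x100), the first alphabet line as a
+  // whole, the second alphabet line in 20 + 6 character pieces, the "MF" line
+  // as a word plus four floats, the "42 MF ..." line with %* suppression, and
+  // finally the lone 24.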
+ const int kNumFloats = 4;
+ float f1[kNumFloats], f2[kNumFloats];
+ int r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]);
+ int r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]);
+ EXPECT_EQ(r1, kNumFloats);
+ EXPECT_EQ(r2, kNumFloats);
+ if (r1 == r2) {
+ for (int i = 0; i < r1; ++i) {
+ EXPECT_FLOAT_EQ(f1[i], f2[i]);
+ }
+ }
+ const int kNumInts = 5;
+ int i1[kNumInts], i2[kNumInts];
+ r1 = fscanf(fp1, "%d %d %d %d %i", &i1[0], &i1[1], &i1[2], &i1[3], &i1[4]);
+ r2 = tfscanf(fp2, "%d %d %d %d %i", &i2[0], &i2[1], &i2[2], &i2[3], &i2[4]);
+ EXPECT_EQ(r1, kNumInts);
+ EXPECT_EQ(r2, kNumInts);
+ if (r1 == r2) {
+ for (int i = 0; i < kNumInts; ++i) {
+ EXPECT_EQ(i1[i], i2[i]);
+ }
+ }
+ const int kStrLen = 1024;
+ char s1[kStrLen];
+ char s2[kStrLen];
+ r1 = fscanf(fp1, "%1023s", s1);
+ r2 = tfscanf(fp2, "%1023s", s2);
+ EXPECT_EQ(r1, r2);
+ EXPECT_STREQ(s1, s2);
+ EXPECT_EQ(26, strlen(s2));
+ r1 = fscanf(fp1, "%20s", s1);
+ r2 = tfscanf(fp2, "%20s", s2);
+ EXPECT_EQ(r1, r2);
+ EXPECT_STREQ(s1, s2);
+ EXPECT_EQ(20, strlen(s2));
+ // Now read the rest of the alphabet.
+ r1 = fscanf(fp1, "%1023s", s1);
+ r2 = tfscanf(fp2, "%1023s", s2);
+ EXPECT_EQ(r1, r2);
+ EXPECT_STREQ(s1, s2);
+ EXPECT_EQ(6, strlen(s2));
+ r1 = fscanf(fp1, "%1023s", s1);
+ r2 = tfscanf(fp2, "%1023s", s2);
+ EXPECT_EQ(r1, r2);
+ EXPECT_STREQ(s1, s2);
+ EXPECT_EQ(2, strlen(s2));
+ r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]);
+ r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]);
+ EXPECT_EQ(r1, r2);
+ for (int i = 0; i < kNumFloats; ++i) EXPECT_FLOAT_EQ(f1[i], f2[i]);
+ // Test the * for field suppression.
+ r1 = fscanf(fp1, "%d %*s %*d %*f %*f", &i1[0]);
+ r2 = tfscanf(fp2, "%d %*s %*d %*f %*f", &i2[0]);
+ EXPECT_EQ(r1, r2);
+ EXPECT_EQ(i1[0], i2[0]);
+ // We should still see the next value and no phantoms.
+ r1 = fscanf(fp1, "%d %1023s", &i1[0], s1);
+ r2 = tfscanf(fp2, "%d %1023s", &i2[0], s2);
+ EXPECT_EQ(r1, r2);
+ EXPECT_EQ(1, r2);
+ EXPECT_EQ(i1[0], i2[0]);
+ fclose(fp2);
+ fclose(fp1);
+}
+
+} // namespace
diff --git a/tesseract/unittest/shapetable_test.cc b/tesseract/unittest/shapetable_test.cc
new file mode 100644
index 00000000..285ed833
--- /dev/null
+++ b/tesseract/unittest/shapetable_test.cc
@@ -0,0 +1,182 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+
+#include "absl/strings/str_format.h" // for absl::StrFormat
+
+#include "include_gunit.h"
+
+#include "serialis.h"
+#include "shapetable.h"
+#include "unicharset.h"
+
+namespace tesseract {
+
+#ifndef DISABLED_LEGACY_ENGINE
+
+static std::string TmpNameToPath(const std::string& name) {
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+}
+
+// Sets up a simple shape with some unichars.
+static void Setup352(int font_id, Shape* shape) {
+ shape->AddToShape(3, font_id);
+ shape->AddToShape(5, font_id);
+ shape->AddToShape(2, font_id);
+}
+
+// Verifies some properties of the 352 shape.
+static void Expect352(int font_id, const Shape& shape) {
+ EXPECT_EQ(3, shape.size());
+ EXPECT_TRUE(shape.ContainsUnichar(2));
+ EXPECT_TRUE(shape.ContainsUnichar(3));
+ EXPECT_TRUE(shape.ContainsUnichar(5));
+ EXPECT_FALSE(shape.ContainsUnichar(1));
+ EXPECT_TRUE(shape.ContainsUnicharAndFont(2, font_id));
+ EXPECT_FALSE(shape.ContainsUnicharAndFont(2, font_id - 1));
+ EXPECT_FALSE(shape.ContainsUnicharAndFont(font_id, 2));
+ // It should be a subset of itself.
+ EXPECT_TRUE(shape.IsSubsetOf(shape));
+}
+
+#endif
+
+// The fixture for testing Shape.
+class ShapeTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+};
+
+// Tests that a Shape works as expected for all the basic functions.
+TEST_F(ShapeTest, BasicTest) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because Shape is missing.
+ GTEST_SKIP();
+#else
+ Shape shape1;
+ EXPECT_EQ(0, shape1.size());
+ Setup352(101, &shape1);
+ Expect352(101, shape1);
+ // It should still work after file I/O.
+ std::string filename = TmpNameToPath("shapefile");
+ FILE* fp = fopen(filename.c_str(), "wb");
+ ASSERT_TRUE(fp != nullptr);
+ EXPECT_TRUE(shape1.Serialize(fp));
+ fclose(fp);
+ TFile tfp;
+ EXPECT_TRUE(tfp.Open(filename.c_str(), nullptr));
+ Shape shape2;
+ EXPECT_TRUE(shape2.DeSerialize(&tfp));
+ Expect352(101, shape2);
+ // They should be subsets of each other.
+ EXPECT_TRUE(shape1.IsSubsetOf(shape2));
+ EXPECT_TRUE(shape2.IsSubsetOf(shape1));
+ // They should be equal unichars.
+ EXPECT_TRUE(shape1.IsEqualUnichars(&shape2));
+ // and still pass afterwards.
+ Expect352(101, shape1);
+ Expect352(101, shape2);
+#endif
+}
+
+// Tests AddShape separately, as it takes quite a bit of work.
+TEST_F(ShapeTest, AddShapeTest) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because Shape is missing.
+ GTEST_SKIP();
+#else
+ Shape shape1;
+ Setup352(101, &shape1);
+ Expect352(101, shape1);
+ // Now setup a different shape with different content.
+ Shape shape2;
+ shape2.AddToShape(3, 101); // Duplicates shape1.
+ shape2.AddToShape(5, 110); // Different font to shape1.
+ shape2.AddToShape(7, 101); // Different unichar to shape1.
+ // They should NOT be subsets of each other.
+ EXPECT_FALSE(shape1.IsSubsetOf(shape2));
+ EXPECT_FALSE(shape2.IsSubsetOf(shape1));
+ // Now add shape2 to shape1.
+ shape1.AddShape(shape2);
+ // Test subsets again.
+ EXPECT_FALSE(shape1.IsSubsetOf(shape2));
+ EXPECT_TRUE(shape2.IsSubsetOf(shape1));
+ EXPECT_EQ(4, shape1.size());
+ EXPECT_FALSE(shape1.ContainsUnichar(1));
+ EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 101));
+ EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 110));
+ EXPECT_FALSE(shape1.ContainsUnicharAndFont(3, 110));
+ EXPECT_FALSE(shape1.ContainsUnicharAndFont(7, 110));
+ EXPECT_FALSE(shape1.IsEqualUnichars(&shape2));
+#endif
+}
+
+// The fixture for testing Shape.
+class ShapeTableTest : public testing::Test {};
+
+// Tests that a Shape works as expected for all the basic functions.
+TEST_F(ShapeTableTest, FullTest) {
+#ifdef DISABLED_LEGACY_ENGINE
+ // Skip test because Shape is missing.
+ GTEST_SKIP();
+#else
+ Shape shape1;
+ Setup352(101, &shape1);
+ // Build a shape table with the same data, but in separate shapes.
+ UNICHARSET unicharset;
+ unicharset.unichar_insert(" ");
+ for (int i = 1; i <= 10; ++i) {
+ std::string class_str = absl::StrFormat("class%d", i);
+ unicharset.unichar_insert(class_str.c_str());
+ }
+ ShapeTable st(unicharset);
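+  // AddShape(unichar_id, font_id) appears to return the index of the shape it
+  // creates, hence the expected 0, 1, 2 below.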
+ EXPECT_EQ(0, st.AddShape(3, 101));
+ EXPECT_EQ(1, st.AddShape(5, 101));
+ EXPECT_EQ(2, st.AddShape(2, 101));
+ EXPECT_EQ(3, st.NumShapes());
+ Expect352(101, shape1);
+ EXPECT_EQ(3, st.AddShape(shape1));
+ for (int i = 0; i < 3; ++i) {
+ EXPECT_FALSE(st.MutableShape(i)->IsEqualUnichars(&shape1));
+ }
+ EXPECT_TRUE(st.MutableShape(3)->IsEqualUnichars(&shape1));
+ EXPECT_TRUE(st.AnyMultipleUnichars());
+ st.DeleteShape(3);
+ EXPECT_FALSE(st.AnyMultipleUnichars());
+
+ // Now merge to make a single shape like shape1.
+ EXPECT_EQ(1, st.MasterUnicharCount(0));
+ st.MergeShapes(0, 1);
+ EXPECT_EQ(3, st.MergedUnicharCount(1, 2));
+ st.MergeShapes(1, 2);
+ for (int i = 0; i < 3; ++i) {
+ EXPECT_EQ(3, st.MasterUnicharCount(i));
+ // Master font count is the sum of all the font counts in the shape, not
+ // the actual number of different fonts in the shape.
+ EXPECT_EQ(3, st.MasterFontCount(i));
+ }
+ EXPECT_EQ(0, st.MasterDestinationIndex(1));
+ EXPECT_EQ(0, st.MasterDestinationIndex(2));
+ ShapeTable st2;
+ st2.AppendMasterShapes(st, nullptr);
+ EXPECT_EQ(1, st.NumMasterShapes());
+ EXPECT_EQ(1, st2.NumShapes());
+ EXPECT_TRUE(st2.MutableShape(0)->IsEqualUnichars(&shape1));
+ EXPECT_TRUE(st2.AnyMultipleUnichars());
+#endif
+}
+
+} // namespace
diff --git a/tesseract/unittest/stats_test.cc b/tesseract/unittest/stats_test.cc
new file mode 100644
index 00000000..58c3483d
--- /dev/null
+++ b/tesseract/unittest/stats_test.cc
@@ -0,0 +1,59 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "genericvector.h"
+#include "kdpair.h"
+#include "statistc.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
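+// Bucket counts for values 0..15; they sum to 37, with the largest pile of 12
+// at bucket 2, which is what BasicStats below expects.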
+const int kTestData[] = {2, 0, 12, 1, 1, 2, 10, 1, 0, 0, 0, 2, 0, 4, 1, 1};
+
+class STATSTest : public testing::Test {
+ public:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ stats_.set_range(0, 16);
+ for (size_t i = 0; i < ARRAYSIZE(kTestData); ++i)
+ stats_.add(i, kTestData[i]);
+ }
+
+ void TearDown() {}
+
+ STATS stats_;
+};
+
+// Tests some basic numbers from the stats_.
+TEST_F(STATSTest, BasicStats) {
+ EXPECT_EQ(37, stats_.get_total());
+ EXPECT_EQ(2, stats_.mode());
+ EXPECT_EQ(12, stats_.pile_count(2));
+}
+
+// Tests the top_n_modes function.
+TEST_F(STATSTest, TopNModes) {
+ GenericVector<tesseract::KDPairInc<float, int> > modes;
+ int num_modes = stats_.top_n_modes(3, &modes);
+ EXPECT_EQ(3, num_modes);
+  // Mode 0 is 12 1 1 = 14 total count with a mean of 2 3/14.
+ EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key());
+ EXPECT_EQ(14, modes[0].data());
+ // Mode 1 is 2 10 1 = 13 total count with a mean of 5 12/13.
+ EXPECT_FLOAT_EQ(5.0f + 12.0f / 13, modes[1].key());
+ EXPECT_EQ(13, modes[1].data());
+ // Mode 2 is 4 1 1 = 6 total count with a mean of 13.5.
+ EXPECT_FLOAT_EQ(13.5f, modes[2].key());
+ EXPECT_EQ(6, modes[2].data());
+}
+
+} // namespace.
diff --git a/tesseract/unittest/stridemap_test.cc b/tesseract/unittest/stridemap_test.cc
new file mode 100644
index 00000000..fa1ef234
--- /dev/null
+++ b/tesseract/unittest/stridemap_test.cc
@@ -0,0 +1,219 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef INCLUDE_TENSORFLOW
+#include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D
+#else
+#include <array> // std::array
+#endif
+#include "include_gunit.h"
+#include "stridemap.h"
+
+namespace tesseract {
+
+#if !defined(INCLUDE_TENSORFLOW) && 0
+namespace xla {
+
+template <typename T>
+class Array2D : public std::vector<T> {
+ public:
+ Array2D() : std::vector<T>(std::vector<int64_t>{0, 0}) {}
+
+ Array2D(const int64_t n1, const int64_t n2)
+ : std::vector<T>(std::vector<int64_t>{n1, n2}) {}
+
+ Array2D(const int64_t n1, const int64_t n2, const T value)
+ : std::vector<T>({n1, n2}, value) {}
+};
+}
+#endif
+
+class StridemapTest : public ::testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+#ifdef INCLUDE_TENSORFLOW
+ // Sets up an Array2d object of the given size, initialized to increasing
+ // values starting with start.
+ std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) {
+ std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize));
+ int value = start;
+ for (int y = 0; y < ysize; ++y) {
+ for (int x = 0; x < xsize; ++x) {
+        (*a)(y, x) = value++;
+ }
+ }
+ return a;
+ }
+#endif
+};
+
+TEST_F(StridemapTest, Indexing) {
+ // This test verifies that with a batch of arrays of different sizes, the
+  // iteration indexes each of them in turn without going out of bounds.
+#ifdef INCLUDE_TENSORFLOW
+ std::vector<std::unique_ptr<xla::Array2D<int>>> arrays;
+ arrays.push_back(SetupArray(3, 4, 0));
+ arrays.push_back(SetupArray(4, 5, 12));
+ arrays.push_back(SetupArray(4, 4, 32));
+ arrays.push_back(SetupArray(3, 5, 48));
+ std::vector<std::pair<int, int>> h_w_sizes;
+ for (size_t i = 0; i < arrays.size(); ++i) {
+ h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width());
+ }
+ StrideMap stride_map;
+ stride_map.SetStride(h_w_sizes);
+ StrideMap::Index index(stride_map);
+ int pos = 0;
+ do {
+ EXPECT_GE(index.t(), pos);
+ EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
+ index.index(FD_WIDTH)),
+ pos);
+ EXPECT_EQ(index.IsLast(FD_BATCH),
+ index.index(FD_BATCH) == arrays.size() - 1);
+ EXPECT_EQ(
+ index.IsLast(FD_HEIGHT),
+ index.index(FD_HEIGHT) == arrays[index.index(FD_BATCH)]->height() - 1);
+ EXPECT_EQ(
+ index.IsLast(FD_WIDTH),
+ index.index(FD_WIDTH) == arrays[index.index(FD_BATCH)]->width() - 1);
+ EXPECT_TRUE(index.IsValid());
+ ++pos;
+ } while (index.Increment());
+ LOG(INFO) << "pos=" << pos;
+ index.InitToLast();
+ do {
+ --pos;
+ EXPECT_GE(index.t(), pos);
+ EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
+ index.index(FD_WIDTH)),
+ pos);
+ StrideMap::Index copy(index);
+ // Since a change in batch index changes the height and width, it isn't
+ // necessarily true that the position is still valid, even when changing
+ // to another valid batch index.
+ if (index.IsLast(FD_BATCH)) {
+ EXPECT_FALSE(copy.AddOffset(1, FD_BATCH));
+ }
+ copy = index;
+ EXPECT_EQ(index.IsLast(FD_HEIGHT), !copy.AddOffset(1, FD_HEIGHT));
+ copy = index;
+ EXPECT_EQ(index.IsLast(FD_WIDTH), !copy.AddOffset(1, FD_WIDTH));
+ copy = index;
+ if (index.index(FD_BATCH) == 0) {
+ EXPECT_FALSE(copy.AddOffset(-1, FD_BATCH));
+ }
+ copy = index;
+ EXPECT_EQ(index.index(FD_HEIGHT) == 0, !copy.AddOffset(-1, FD_HEIGHT));
+ copy = index;
+ EXPECT_EQ(index.index(FD_WIDTH) == 0, !copy.AddOffset(-1, FD_WIDTH));
+ copy = index;
+ EXPECT_FALSE(copy.AddOffset(10, FD_WIDTH));
+ copy = index;
+ EXPECT_FALSE(copy.AddOffset(-10, FD_HEIGHT));
+ EXPECT_TRUE(index.IsValid());
+ } while (index.Decrement());
+#else
+ LOG(INFO) << "Skip test because of missing xla::Array2D";
+ GTEST_SKIP();
+#endif
+}
+
+TEST_F(StridemapTest, Scaling) {
+ // This test verifies that with a batch of arrays of different sizes, the
+ // scaling/reduction functions work as expected.
+#ifdef INCLUDE_TENSORFLOW
+ std::vector<std::unique_ptr<xla::Array2D<int>>> arrays;
+ arrays.push_back(SetupArray(3, 4, 0)); // 0-11
+ arrays.push_back(SetupArray(4, 5, 12)); // 12-31
+ arrays.push_back(SetupArray(4, 4, 32)); // 32-47
+ arrays.push_back(SetupArray(3, 5, 48)); // 48-62
+ std::vector<std::pair<int, int>> h_w_sizes;
+ for (size_t i = 0; i < arrays.size(); ++i) {
+ h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width());
+ }
+ StrideMap stride_map;
+ stride_map.SetStride(h_w_sizes);
+
+ // Scale x by 2, keeping y the same.
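+  // Judging from the expected values, scaling x by 2 halves each array's
+  // width (by integer division) and the scaled map covers only the first
+  // new-width columns of every row.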
+ std::vector<int> values_x2 = {0, 1, 4, 5, 8, 9, 12, 13, 17, 18,
+ 22, 23, 27, 28, 32, 33, 36, 37, 40, 41,
+ 44, 45, 48, 49, 53, 54, 58, 59};
+ StrideMap test_map(stride_map);
+ test_map.ScaleXY(2, 1);
+ StrideMap::Index index(test_map);
+ int pos = 0;
+ do {
+ int expected_value = values_x2[pos++];
+ EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
+ index.index(FD_WIDTH)),
+ expected_value);
+ } while (index.Increment());
+ EXPECT_EQ(pos, values_x2.size());
+
+ test_map = stride_map;
+ // Scale y by 2, keeping x the same.
+ std::vector<int> values_y2 = {0, 1, 2, 3, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 32, 33, 34, 35,
+ 36, 37, 38, 39, 48, 49, 50, 51, 52};
+ test_map.ScaleXY(1, 2);
+ index.InitToFirst();
+ pos = 0;
+ do {
+ int expected_value = values_y2[pos++];
+ EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
+ index.index(FD_WIDTH)),
+ expected_value);
+ } while (index.Increment());
+ EXPECT_EQ(pos, values_y2.size());
+
+ test_map = stride_map;
+ // Scale x and y by 2.
+ std::vector<int> values_xy2 = {0, 1, 12, 13, 17, 18, 32, 33, 36, 37, 48, 49};
+ test_map.ScaleXY(2, 2);
+ index.InitToFirst();
+ pos = 0;
+ do {
+ int expected_value = values_xy2[pos++];
+ EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
+ index.index(FD_WIDTH)),
+ expected_value);
+ } while (index.Increment());
+ EXPECT_EQ(pos, values_xy2.size());
+
+ test_map = stride_map;
+ // Reduce Width to 1.
+ std::vector<int> values_x_to_1 = {0, 4, 8, 12, 17, 22, 27,
+ 32, 36, 40, 44, 48, 53, 58};
+ test_map.ReduceWidthTo1();
+ index.InitToFirst();
+ pos = 0;
+ do {
+ int expected_value = values_x_to_1[pos++];
+ EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT),
+ index.index(FD_WIDTH)),
+ expected_value);
+ } while (index.Increment());
+ EXPECT_EQ(pos, values_x_to_1.size());
+#else
+ LOG(INFO) << "Skip test because of missing xla::Array2D";
+ GTEST_SKIP();
+#endif
+}
+
+} // namespace
diff --git a/tesseract/unittest/stringrenderer_test.cc b/tesseract/unittest/stringrenderer_test.cc
new file mode 100644
index 00000000..8cba6e4f
--- /dev/null
+++ b/tesseract/unittest/stringrenderer_test.cc
@@ -0,0 +1,564 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+
+#include "boxchar.h"
+#include "boxread.h"
+#include "commandlineflags.h"
+#include "stringrenderer.h"
+#include "strngs.h"
+
+#include "absl/strings/str_split.h" // for absl::StrSplit
+#include "allheaders.h"
+
+#include <memory>
+#include <string>
+
+BOOL_PARAM_FLAG(display, false, "Display image for inspection");
+
+namespace tesseract {
+
+const char kEngText[] = "the quick brown fox jumps over the lazy dog";
+const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
+
+const char kKorText[] = "이는 것으로 다시 넣을 1234 수는 있지만 선택의 의미는";
+const char kArabicText[] =
+ "والفكر والصراع ، بالتأمل والفهم والتحليل ، "
+ "بالعلم والفن ، وأخيرا بالضحك أوبالبكاء ، ";
+const char kMixedText[] = "والفكر 123 والصراع abc";
+
+const char kEngNonLigatureText[] = "fidelity";
+// Same as kEngNonLigatureText, but with "fi" replaced with its ligature.
+const char kEngLigatureText[] = "ﬁdelity";
+
+static PangoFontMap* font_map;
+
+class StringRendererTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!font_map) {
+ font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
+ }
+ pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
+ }
+
+ static void SetUpTestCase() {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+
+ l_chooseDisplayProg(L_DISPLAY_WITH_XZGV);
+ FLAGS_fonts_dir = TESTING_DIR;
+ FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
+ file::MakeTmpdir();
+ PangoFontInfo::SoftInitFontConfig(); // init early
+ }
+
+ void DisplayClusterBoxes(Pix* pix) {
+ if (!FLAGS_display) return;
+ const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes();
+ Boxa* boxes = boxaCreate(0);
+ for (const auto& boxchar : boxchars) {
+ if (boxchar->box())
+ boxaAddBox(boxes, const_cast<Box*>(boxchar->box()), L_CLONE);
+ }
+ Pix* box_pix = pixDrawBoxaRandom(pix, boxes, 1);
+ boxaDestroy(&boxes);
+ pixDisplay(box_pix, 0, 0);
+ pixDestroy(&box_pix);
+ }
+ std::unique_ptr<StringRenderer> renderer_;
+};
+
+TEST_F(StringRendererTest, DoesRenderToImage) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ Pix* pix = nullptr;
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+
+ renderer_.reset(new StringRenderer("UnBatang 10", 600, 600));
+ EXPECT_EQ(strlen(kKorText),
+ renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+
+ renderer_.reset(new StringRenderer("Lohit Hindi 10", 600, 600));
+ EXPECT_EQ(strlen(kHinText),
+ renderer_->RenderToImage(kHinText, strlen(kHinText), &pix));
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+
+ // RTL text
+ renderer_.reset(new StringRenderer("Arab 10", 600, 600));
+ EXPECT_EQ(strlen(kArabicText),
+ renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+
+  // Mixed direction Arabic + English text
+ renderer_.reset(new StringRenderer("Arab 10", 600, 600));
+ EXPECT_EQ(strlen(kMixedText),
+ renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+}
+
+TEST_F(StringRendererTest, DoesRenderToImageWithUnderline) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ // Underline all words but NOT intervening spaces.
+ renderer_->set_underline_start_prob(1.0);
+ renderer_->set_underline_continuation_prob(0);
+ Pix* pix = nullptr;
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+ renderer_->ClearBoxes();
+
+ // Underline all words AND intervening spaces.
+ renderer_->set_underline_start_prob(1.0);
+ renderer_->set_underline_continuation_prob(1.0);
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+ renderer_->ClearBoxes();
+
+ // Underline words and intervening spaces with 0.5 prob.
+ renderer_->set_underline_start_prob(0.5);
+ renderer_->set_underline_continuation_prob(0.5);
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+}
+
+TEST_F(StringRendererTest, DoesHandleNewlineCharacters) {
+ const char kRawText[] = "\n\n\n A \nB \nC \n\n\n";
+ const char kStrippedText[] = " A B C "; // text with newline chars removed
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ Pix* pix = nullptr;
+ EXPECT_EQ(strlen(kRawText),
+ renderer_->RenderToImage(kRawText, strlen(kRawText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes();
+ // 3 characters + 4 spaces => 7 boxes
+ EXPECT_EQ(7, boxchars.size());
+ if (boxchars.size() == 7) {
+ // Verify the text content of the boxchars
+ for (size_t i = 0; i < boxchars.size(); ++i) {
+ EXPECT_EQ(std::string(1, kStrippedText[i]), boxchars[i]->ch());
+ }
+ }
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+}
+
+TEST_F(StringRendererTest, DoesRenderLigatures) {
+ renderer_.reset(new StringRenderer("Arab 12", 600, 250));
+ const char kArabicLigature[] = "لا";
+
+ Pix* pix = nullptr;
+ EXPECT_EQ(
+ strlen(kArabicLigature),
+ renderer_->RenderToImage(kArabicLigature, strlen(kArabicLigature), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ const std::vector<BoxChar*>& boxes = renderer_->GetBoxes();
+ EXPECT_EQ(1, boxes.size());
+ EXPECT_TRUE(boxes[0]->box() != nullptr);
+ EXPECT_STREQ(kArabicLigature, boxes[0]->ch().c_str());
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+
+ renderer_.reset(new StringRenderer("Arab 12", 600, 250));
+ const char kArabicMixedText[] = "والفكر والصراع 1234,\nوالفكر لا والصراع";
+ renderer_->RenderToImage(kArabicMixedText, strlen(kArabicMixedText), &pix);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+}
+
+static int FindBoxCharXCoord(const std::vector<BoxChar*>& boxchars,
+ const std::string& ch) {
+ for (const auto& boxchar : boxchars) {
+ if (boxchar->ch() == ch) return boxchar->box()->x;
+ }
+ return INT_MAX;
+}
+
+TEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) {
+ renderer_.reset(new StringRenderer("Arab 10", 600, 600));
+ Pix* pix = nullptr;
+ // Arabic letters should be in decreasing x-coordinates
+ const char kArabicWord[] = "\u0644\u0627\u0641\u0643\u0631";
+ const std::string kRevWord = "\u0631\u0643\u0641\u0627\u0644";
+ renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix);
+ std::string boxes_str = renderer_->GetBoxesStr();
+ // Decode to get the box text strings.
+ EXPECT_FALSE(boxes_str.empty());
+ std::vector<STRING> texts;
+ EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts,
+ nullptr, nullptr));
+ std::string ltr_str;
+ for (size_t i = 0; i < texts.size(); ++i) {
+ ltr_str += texts[i].c_str();
+ }
+ // The string should come out perfectly reversed, despite there being a
+ // ligature.
+ EXPECT_EQ(ltr_str, kRevWord);
+ // Just to prove there was a ligature, the number of texts is less than the
+ // number of unicodes.
+ EXPECT_LT(texts.size(), 5);
+ pixDestroy(&pix);
+}
+
+TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) {
+ renderer_.reset(new StringRenderer("Arab 10", 600, 600));
+ Pix* pix = nullptr;
+ // Arabic letters should be in decreasing x-coordinates
+ const char kArabicWord[] = "والفكر";
+ renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes();
+ for (size_t i = 1; i < boxchars.size(); ++i) {
+ EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x)
+ << boxchars[i - 1]->ch();
+ }
+ pixDestroy(&pix);
+
+ // English letters should be in increasing x-coordinates
+ const char kEnglishWord[] = "Google";
+ renderer_->ClearBoxes();
+ renderer_->RenderToImage(kEnglishWord, strlen(kEnglishWord), &pix);
+ EXPECT_EQ(boxchars.size(), strlen(kEnglishWord));
+ for (size_t i = 1; i < boxchars.size(); ++i) {
+ EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x)
+ << boxchars[i - 1]->ch();
+ }
+ pixDestroy(&pix);
+
+ // Mixed text should satisfy both.
+ renderer_->ClearBoxes();
+ renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix);
+ EXPECT_LT(FindBoxCharXCoord(boxchars, "a"), FindBoxCharXCoord(boxchars, "b"));
+ EXPECT_LT(FindBoxCharXCoord(boxchars, "1"), FindBoxCharXCoord(boxchars, "2"));
+ EXPECT_GT(FindBoxCharXCoord(boxchars, "و"), FindBoxCharXCoord(boxchars, "ر"));
+ pixDestroy(&pix);
+}
+
+TEST_F(StringRendererTest, DoesRenderVerticalText) {
+ Pix* pix = nullptr;
+ renderer_.reset(new StringRenderer("UnBatang 10", 600, 600));
+ renderer_->set_vertical_text(true);
+ EXPECT_EQ(strlen(kKorText),
+ renderer_->RenderToImage(kKorText, strlen(kKorText), &pix));
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ DisplayClusterBoxes(pix);
+ pixDestroy(&pix);
+}
+
+// Checks that we preserve charboxes across RenderToImage calls, with
+// appropriate page numbers.
+TEST_F(StringRendererTest, DoesKeepAllImageBoxes) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ Pix* pix = nullptr;
+ int num_boxes_per_page = 0;
+ const int kNumTrials = 2;
+ for (int i = 0; i < kNumTrials; ++i) {
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ EXPECT_TRUE(pix != nullptr);
+ pixDestroy(&pix);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ if (!num_boxes_per_page) {
+ num_boxes_per_page = renderer_->GetBoxes().size();
+ } else {
+ EXPECT_EQ((i + 1) * num_boxes_per_page, renderer_->GetBoxes().size());
+ }
+ for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page;
+ ++j) {
+ EXPECT_EQ(i, renderer_->GetBoxes()[j]->page());
+ }
+ }
+}
+
+TEST_F(StringRendererTest, DoesClearBoxes) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ Pix* pix = nullptr;
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ pixDestroy(&pix);
+ EXPECT_GT(renderer_->GetBoxes().size(), 0);
+ const int num_boxes_per_page = renderer_->GetBoxes().size();
+
+ renderer_->ClearBoxes();
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ pixDestroy(&pix);
+ EXPECT_EQ(num_boxes_per_page, renderer_->GetBoxes().size());
+}
+
+TEST_F(StringRendererTest, DoesLigatureTextForRendering) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ renderer_->set_add_ligatures(true);
+ Pix* pix = nullptr;
+ EXPECT_EQ(strlen(kEngNonLigatureText),
+ renderer_->RenderToImage(kEngNonLigatureText,
+ strlen(kEngNonLigatureText), &pix));
+ pixDestroy(&pix);
+ // There should be one less box than letters due to the 'fi' ligature.
+ EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size());
+ // The output box text should be ligatured.
+ EXPECT_STREQ("fi", renderer_->GetBoxes()[0]->ch().c_str());
+}
+
+TEST_F(StringRendererTest, DoesRetainInputLigatureForRendering) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ Pix* pix = nullptr;
+ EXPECT_EQ(strlen(kEngLigatureText),
+ renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText),
+ &pix));
+ pixDestroy(&pix);
+ // There should be one less box than letters due to the 'fi' ligature.
+ EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size());
+ // The output box text should be ligatured.
+ EXPECT_STREQ("\uFB01", renderer_->GetBoxes()[0]->ch().c_str());
+}
+
+TEST_F(StringRendererTest, DoesStripUnrenderableWords) {
+  // Verdana should only be able to render the English letters and numbers in
+ // the mixed text.
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ std::string text(kMixedText);
+ EXPECT_GT(renderer_->StripUnrenderableWords(&text), 0);
+ EXPECT_EQ(" 123 abc", text);
+}
+
+TEST_F(StringRendererTest, DoesRenderWordBoxes) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ renderer_->set_output_word_boxes(true);
+ Pix* pix = nullptr;
+ EXPECT_EQ(strlen(kEngText),
+ renderer_->RenderToImage(kEngText, strlen(kEngText), &pix));
+ pixDestroy(&pix);
+ // Verify #boxchars = #words + #spaces
+ std::vector<std::string> words =
+ absl::StrSplit(kEngText, ' ', absl::SkipEmpty());
+ const int kNumSpaces = words.size() - 1;
+ const int kExpectedNumBoxes = words.size() + kNumSpaces;
+ const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes();
+ EXPECT_EQ(kExpectedNumBoxes, boxchars.size());
+ // Verify content of words and spaces
+ for (size_t i = 0; i < boxchars.size(); i += 2) {
+ EXPECT_EQ(words[i / 2], boxchars[i]->ch());
+ if (i < boxchars.size() - 1) {
+ EXPECT_EQ(" ", boxchars[i + 1]->ch());
+ EXPECT_TRUE(boxchars[i + 1]->box() == nullptr);
+ }
+ }
+}
+
+TEST_F(StringRendererTest, DoesRenderWordBoxesFromMultiLineText) {
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600));
+ renderer_->set_output_word_boxes(true);
+ Pix* pix = nullptr;
+ const char kMultlineText[] = "the quick brown fox\njumps over the lazy dog";
+ EXPECT_EQ(strlen(kMultlineText),
+ renderer_->RenderToImage(kMultlineText, strlen(kEngText), &pix));
+ pixDestroy(&pix);
+ // Verify #boxchars = #words + #spaces + #newlines
+ std::vector<std::string> words =
+ absl::StrSplit(kMultlineText, absl::ByAnyChar(" \n"), absl::SkipEmpty());
+ const int kNumSeparators = words.size() - 1;
+ const int kExpectedNumBoxes = words.size() + kNumSeparators;
+ const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes();
+ EXPECT_EQ(kExpectedNumBoxes, boxchars.size());
+ // Verify content of words and spaces
+ for (size_t i = 0; i < boxchars.size(); i += 2) {
+ EXPECT_EQ(words[i / 2], boxchars[i]->ch());
+ if (i + 1 < boxchars.size()) {
+ EXPECT_EQ(" ", boxchars[i + 1]->ch());
+ EXPECT_TRUE(boxchars[i + 1]->box() == nullptr);
+ }
+ }
+}
+
+TEST_F(StringRendererTest, DoesRenderAllFontsToImage) {
+ renderer_.reset(new StringRenderer("Verdana 10", 1200, 1200));
+ size_t offset = 0;
+ std::string font_used;
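+  // RenderAllFontsToImage presumably renders what it can of the remaining
+  // text with each successive font and returns the number of bytes consumed,
+  // so this loop should eventually cover the whole string.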
+ do {
+ Pix* pix = nullptr;
+ font_used.clear();
+ offset += renderer_->RenderAllFontsToImage(
+ 1.0, kEngText + offset, strlen(kEngText + offset), &font_used, &pix);
+ if (offset < strlen(kEngText)) {
+ EXPECT_TRUE(pix != nullptr);
+ EXPECT_STRNE("", font_used.c_str());
+ }
+ if (FLAGS_display) pixDisplay(pix, 0, 0);
+ pixDestroy(&pix);
+ } while (offset < strlen(kEngText));
+}
+
+TEST_F(StringRendererTest, DoesNotRenderWordJoiner) {
+ renderer_.reset(new StringRenderer("Verdana 10", 500, 200));
+ const std::string word = "A- -B C-D A BC";
+ const std::string joined_word = StringRenderer::InsertWordJoiners(word);
+ Pix* pix = nullptr;
+ renderer_->RenderToImage(joined_word.c_str(), joined_word.length(), &pix);
+ pixDestroy(&pix);
+ const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes();
+ const std::string kWordJoinerUTF8 = "\u2060";
+ ASSERT_EQ(word.length(), boxchars.size());
+ for (size_t i = 0; i < boxchars.size(); ++i) {
+ EXPECT_NE(kWordJoinerUTF8, boxchars[i]->ch());
+ EXPECT_EQ(word.substr(i, 1), boxchars[i]->ch());
+ }
+}
+
+TEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) {
+ renderer_.reset(new StringRenderer("Verdana 10", 500, 200));
+ renderer_->set_drop_uncovered_chars(true);
+ const std::string kWord = "office";
+ const std::string kCleanWord = "oice";
+ Pix* pix = nullptr;
+ EXPECT_FALSE(
+ renderer_->font().CanRenderString(kWord.c_str(), kWord.length()));
+ EXPECT_FALSE(renderer_->font().CoversUTF8Text(kWord.c_str(), kWord.length()));
+ int offset = renderer_->RenderToImage(kWord.c_str(), kWord.length(), &pix);
+ pixDestroy(&pix);
+ const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes();
+ EXPECT_EQ(kWord.length(), offset);
+ ASSERT_EQ(kCleanWord.length(), boxchars.size());
+ for (size_t i = 0; i < boxchars.size(); ++i) {
+ EXPECT_EQ(kCleanWord.substr(i, 1), boxchars[i]->ch());
+ }
+}
+
+// ------------ StringRenderer::ConvertBasicLatinToFullwidthLatin() ------------
+
+TEST(ConvertBasicLatinToFullwidthLatinTest, DoesConvertBasicLatin) {
+ const std::string kHalfAlpha = "ABCD";
+  const std::string kFullAlpha = "ＡＢＣＤ";
+ EXPECT_EQ(kFullAlpha,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha));
+
+ const std::string kHalfDigit = "0123";
+  const std::string kFullDigit = "０１２３";
+ EXPECT_EQ(kFullDigit,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit));
+
+ const std::string kHalfSym = "()[]:;!?";
+  const std::string kFullSym = "（）［］：；！？";
+ EXPECT_EQ(kFullSym,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym));
+}
+
+TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertFullwidthLatin) {
+  const std::string kFullAlpha = "ＡＢＣＤ";
+ EXPECT_EQ(kFullAlpha,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha));
+
+  const std::string kFullDigit = "０１２３";
+ EXPECT_EQ(kFullDigit,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit));
+
+  const std::string kFullSym = "（）［］：；！？";
+ EXPECT_EQ(kFullSym,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym));
+}
+
+TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertNonLatin) {
+  const std::string kHalfKana = "ｱｲｳｴｵ";
+  const std::string kFullKana = "アイウエオ";
+ EXPECT_EQ(kHalfKana,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana));
+ EXPECT_EQ(kFullKana,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana));
+}
+
+TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) {
+ const std::string kHalfSpace = " ";
+  const std::string kFullSpace = "　";
+ EXPECT_EQ(kHalfSpace,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace));
+ EXPECT_EQ(kFullSpace,
+ StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace));
+}
+
+// ------------ StringRenderer::ConvertFullwidthLatinToBasicLatin() ------------
+
+TEST(ConvertFullwidthLatinToBasicLatinTest, DoesConvertFullwidthLatin) {
+ const std::string kHalfAlpha = "ABCD";
+  const std::string kFullAlpha = "ＡＢＣＤ";
+ EXPECT_EQ(kHalfAlpha,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha));
+
+ const std::string kHalfDigit = "0123";
+  const std::string kFullDigit = "０１２３";
+ EXPECT_EQ(kHalfDigit,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit));
+
+ const std::string kHalfSym = "()[]:;!?";
+  const std::string kFullSym = "（）［］：；！？";
+ EXPECT_EQ(kHalfSym,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym));
+}
+
+TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertBasicLatin) {
+ const std::string kHalfAlpha = "ABCD";
+ EXPECT_EQ(kHalfAlpha,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha));
+
+ const std::string kHalfDigit = "0123";
+ EXPECT_EQ(kHalfDigit,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit));
+
+ const std::string kHalfSym = "()[]:;!?";
+ EXPECT_EQ(kHalfSym,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym));
+}
+
+TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertNonLatin) {
+  const std::string kHalfKana = "ｱｲｳｴｵ";
+  const std::string kFullKana = "アイウエオ";
+ EXPECT_EQ(kHalfKana,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana));
+ EXPECT_EQ(kFullKana,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana));
+}
+
+TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertSpace) {
+ const std::string kHalfSpace = " ";
+  const std::string kFullSpace = "　";
+ EXPECT_EQ(kHalfSpace,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace));
+ EXPECT_EQ(kFullSpace,
+ StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace));
+}
+} // namespace
diff --git a/tesseract/unittest/syntaxnet/base.h b/tesseract/unittest/syntaxnet/base.h
new file mode 100644
index 00000000..5dabbbda
--- /dev/null
+++ b/tesseract/unittest/syntaxnet/base.h
@@ -0,0 +1,61 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SYNTAXNET_BASE_H_
+#define SYNTAXNET_BASE_H_
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "google/protobuf/util/message_differencer.h"
+
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/default/integral_types.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+
+
+using tensorflow::int8;
+using tensorflow::int16;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::uint8;
+using tensorflow::uint16;
+using tensorflow::uint64;
+using tensorflow::uint32;
+using tensorflow::protobuf::TextFormat;
+using tensorflow::mutex_lock;
+using tensorflow::mutex;
+using std::map;
+using std::pair;
+using std::vector;
+using std::unordered_map;
+using std::unordered_set;
+typedef signed int char32;
+
+using tensorflow::StringPiece;
+using std::string;
+
+#endif // SYNTAXNET_BASE_H_
diff --git a/tesseract/unittest/tablefind_test.cc b/tesseract/unittest/tablefind_test.cc
new file mode 100644
index 00000000..df6d511c
--- /dev/null
+++ b/tesseract/unittest/tablefind_test.cc
@@ -0,0 +1,261 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "colpartition.h"
+#include "colpartitiongrid.h"
+#include "tablefind.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class TestableTableFinder : public tesseract::TableFinder {
+ public:
+ using TableFinder::GapInXProjection;
+ using TableFinder::HasLeaderAdjacent;
+ using TableFinder::InsertLeaderPartition;
+ using TableFinder::InsertTextPartition;
+ using TableFinder::set_global_median_blob_width;
+ using TableFinder::set_global_median_ledding;
+ using TableFinder::set_global_median_xheight;
+ using TableFinder::SplitAndInsertFragmentedTextPartition;
+
+ void ExpectPartition(const TBOX& box) {
+ tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_);
+ gsearch.SetUniqueMode(true);
+ gsearch.StartFullSearch();
+ ColPartition* part = nullptr;
+ bool found = false;
+ while ((part = gsearch.NextFullSearch()) != nullptr) {
+ if (part->bounding_box().left() == box.left() &&
+ part->bounding_box().bottom() == box.bottom() &&
+ part->bounding_box().right() == box.right() &&
+ part->bounding_box().top() == box.top()) {
+ found = true;
+ }
+ }
+ EXPECT_TRUE(found);
+ }
+ void ExpectPartitionCount(int expected_count) {
+ tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_);
+ gsearch.SetUniqueMode(true);
+ gsearch.StartFullSearch();
+ ColPartition* part = nullptr;
+ int count = 0;
+ while ((part = gsearch.NextFullSearch()) != nullptr) {
+ ++count;
+ }
+ EXPECT_EQ(expected_count, count);
+ }
+};
+
+class TableFinderTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ free_boxes_it_.set_to_list(&free_boxes_);
+ finder_.reset(new TestableTableFinder());
+ finder_->Init(1, ICOORD(0, 0), ICOORD(500, 500));
+ // gap finding
+ finder_->set_global_median_xheight(5);
+ finder_->set_global_median_blob_width(5);
+ }
+
+ void TearDown() {
+ if (partition_.get() != nullptr) partition_->DeleteBoxes();
+ DeletePartitionListBoxes();
+ finder_.reset(nullptr);
+ }
+
+ void MakePartition(int x_min, int y_min, int x_max, int y_max) {
+ MakePartition(x_min, y_min, x_max, y_max, 0, 0);
+ }
+
+ void MakePartition(int x_min, int y_min, int x_max, int y_max,
+ int first_column, int last_column) {
+ if (partition_.get() != nullptr) partition_->DeleteBoxes();
+ TBOX box;
+ box.set_to_given_coords(x_min, y_min, x_max, y_max);
+ partition_.reset(
+ ColPartition::FakePartition(box, PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE));
+ partition_->set_first_column(first_column);
+ partition_->set_last_column(last_column);
+ }
+
+ void InsertTextPartition(ColPartition* part) {
+ finder_->InsertTextPartition(part);
+ free_boxes_it_.add_after_then_move(part);
+ }
+
+ void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max) {
+ InsertLeaderPartition(x_min, y_min, x_max, y_max, 0, 0);
+ }
+
+ void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max,
+ int first_column, int last_column) {
+ TBOX box;
+ box.set_to_given_coords(x_min, y_min, x_max, y_max);
+ ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT,
+ BRT_UNKNOWN, BTFT_LEADER);
+ part->set_first_column(first_column);
+ part->set_last_column(last_column);
+ finder_->InsertLeaderPartition(part);
+ free_boxes_it_.add_after_then_move(part);
+ }
+
+ void DeletePartitionListBoxes() {
+ for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list();
+ free_boxes_it_.forward()) {
+ ColPartition* part = free_boxes_it_.data();
+ part->DeleteBoxes();
+ }
+ }
+
+ std::unique_ptr<TestableTableFinder> finder_;
+ std::unique_ptr<ColPartition> partition_;
+
+ private:
+ tesseract::ColPartition_CLIST free_boxes_;
+ tesseract::ColPartition_C_IT free_boxes_it_;
+};
+
+TEST_F(TableFinderTest, GapInXProjectionNoGap) {
+ int data[100];
+ for (int i = 0; i < 100; ++i) data[i] = 10;
+ EXPECT_FALSE(finder_->GapInXProjection(data, 100));
+}
+
+TEST_F(TableFinderTest, GapInXProjectionEdgeGap) {
+ int data[100];
+ for (int i = 0; i < 10; ++i) data[i] = 2;
+ for (int i = 10; i < 90; ++i) data[i] = 10;
+ for (int i = 90; i < 100; ++i) data[i] = 2;
+ EXPECT_FALSE(finder_->GapInXProjection(data, 100));
+}
+
+TEST_F(TableFinderTest, GapInXProjectionExists) {
+ int data[100];
+ for (int i = 0; i < 10; ++i) data[i] = 10;
+ for (int i = 10; i < 90; ++i) data[i] = 2;
+ for (int i = 90; i < 100; ++i) data[i] = 10;
+ EXPECT_TRUE(finder_->GapInXProjection(data, 100));
+}
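+
+// A sketch of the idea the three GapInXProjection cases above exercise. This
+// is an illustration only, not TableFinder::GapInXProjection itself, and the
+// threshold parameter is a made-up stand-in for whatever the real heuristic
+// derives from the median blob width. A "gap" is a run of low bins bounded by
+// high bins on both sides, so low counts touching either edge of the
+// projection (the EdgeGap case) do not qualify.
+static bool HasInteriorGap(const int* data, int size, int threshold) {
+  bool seen_high = false;  // A high bin has been seen to the left.
+  int low_run = 0;         // Length of the current run of low bins.
+  for (int i = 0; i < size; ++i) {
+    if (data[i] >= threshold) {
+      if (seen_high && low_run > 0) return true;  // The low run was interior.
+      seen_high = true;
+      low_run = 0;
+    } else if (seen_high) {
+      ++low_run;
+    }
+  }
+  return false;  // Any trailing low run touches the right edge.
+}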
+
+TEST_F(TableFinderTest, HasLeaderAdjacentOverlapping) {
+ InsertLeaderPartition(90, 0, 150, 5);
+ MakePartition(0, 0, 100, 10);
+ EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(0, 25, 100, 40);
+ EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(145, 0, 200, 20);
+ EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(40, 0, 50, 4);
+ EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));
+}
+
+TEST_F(TableFinderTest, HasLeaderAdjacentNoOverlap) {
+ InsertLeaderPartition(90, 10, 150, 15);
+ MakePartition(0, 10, 85, 20);
+ EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(0, 25, 100, 40);
+ EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(0, 0, 100, 10);
+ EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));
+ // TODO(nbeato): is this a useful metric? case fails
+ // MakePartition(160, 0, 200, 15); // leader is primarily above it
+ // EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));
+}
+
+TEST_F(TableFinderTest, HasLeaderAdjacentPreservesColumns) {
+ InsertLeaderPartition(90, 0, 150, 5, 1, 2);
+ MakePartition(0, 0, 85, 10, 0, 0);
+ EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(0, 0, 100, 10, 0, 1);
+ EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(0, 0, 200, 10, 0, 5);
+ EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_));
+ MakePartition(155, 0, 200, 10, 5, 5);
+ EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_));
+}
+
+// TODO(nbeato): Only testing a splitting case. Add more...
+// Also test non-split cases.
+TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicPass) {
+ finder_->set_global_median_blob_width(3);
+ finder_->set_global_median_xheight(10);
+
+ TBOX part_box(10, 5, 100, 15);
+ ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
+ all->set_type(PT_FLOWING_TEXT);
+ all->set_blob_type(BRT_TEXT);
+ all->set_flow(BTFT_CHAIN);
+ all->set_left_margin(10);
+ all->set_right_margin(100);
+ TBOX blob_box = part_box;
+ for (int i = 10; i <= 20; i += 5) {
+ blob_box.set_left(i + 1);
+ blob_box.set_right(i + 4);
+ all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));
+ }
+ for (int i = 35; i <= 55; i += 5) {
+ blob_box.set_left(i + 1);
+ blob_box.set_right(i + 4);
+ all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));
+ }
+ for (int i = 80; i <= 95; i += 5) {
+ blob_box.set_left(i + 1);
+ blob_box.set_right(i + 4);
+ all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));
+ }
+ // TODO(nbeato): Ray's newer code...
+ // all->ClaimBoxes();
+ all->ComputeLimits();  // Make sure the median info is set.
+ InsertTextPartition(all);  // Registered so its blobs are freed in TearDown.
+ ColPartition* fragment_me = all->CopyButDontOwnBlobs();
+
+ finder_->SplitAndInsertFragmentedTextPartition(fragment_me);
+ finder_->ExpectPartition(TBOX(11, 5, 24, 15));
+ finder_->ExpectPartition(TBOX(36, 5, 59, 15));
+ finder_->ExpectPartition(TBOX(81, 5, 99, 15));
+ finder_->ExpectPartitionCount(3);
+}
+
+TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicFail) {
+ finder_->set_global_median_blob_width(3);
+ finder_->set_global_median_xheight(10);
+
+ TBOX part_box(10, 5, 100, 15);
+ ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
+ all->set_type(PT_FLOWING_TEXT);
+ all->set_blob_type(BRT_TEXT);
+ all->set_flow(BTFT_CHAIN);
+ all->set_left_margin(10);
+ all->set_right_margin(100);
+ TBOX blob_box = part_box;
+ for (int i = 10; i <= 95; i += 5) {
+ blob_box.set_left(i + 1);
+ blob_box.set_right(i + 4);
+ all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box)));
+ }
+ // TODO(nbeato): Ray's newer code...
+ // all->ClaimBoxes();
+ all->ComputeLimits();  // Make sure the median info is set.
+ InsertTextPartition(all);  // Registered so its blobs are freed in TearDown.
+ ColPartition* fragment_me = all->CopyButDontOwnBlobs();
+
+ finder_->SplitAndInsertFragmentedTextPartition(fragment_me);
+ finder_->ExpectPartition(TBOX(11, 5, 99, 15));
+ finder_->ExpectPartitionCount(1);
+}
+
+} // namespace
diff --git a/tesseract/unittest/tablerecog_test.cc b/tesseract/unittest/tablerecog_test.cc
new file mode 100644
index 00000000..3dfb32c5
--- /dev/null
+++ b/tesseract/unittest/tablerecog_test.cc
@@ -0,0 +1,316 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "colpartition.h"
+#include "colpartitiongrid.h"
+#include "tablerecog.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class TestableTableRecognizer : public tesseract::TableRecognizer {
+ public:
+ using TableRecognizer::FindLinesBoundingBox;
+ using TableRecognizer::HasSignificantLines;
+ using TableRecognizer::RecognizeLinedTable;
+ using TableRecognizer::RecognizeTable;
+ using TableRecognizer::RecognizeWhitespacedTable;
+};
+
+class TestableStructuredTable : public tesseract::StructuredTable {
+ public:
+ using StructuredTable::CountHorizontalIntersections;
+ using StructuredTable::CountVerticalIntersections;
+ using StructuredTable::FindLinedStructure;
+ using StructuredTable::FindWhitespacedColumns;
+ using StructuredTable::FindWhitespacedStructure;
+ using StructuredTable::VerifyLinedTableCells;
+
+ void InjectCellY(int y) {
+ cell_y_.push_back(y);
+ cell_y_.sort();
+ }
+ void InjectCellX(int x) {
+ cell_x_.push_back(x);
+ cell_x_.sort();
+ }
+
+ void ExpectCellX(int x_min, int second, int add, int almost_done, int x_max) {
+ ASSERT_EQ(0, (almost_done - second) % add);
+ EXPECT_EQ(3 + (almost_done - second) / add, cell_x_.size());
+ EXPECT_EQ(x_min, cell_x_.get(0));
+ EXPECT_EQ(x_max, cell_x_.get(cell_x_.size() - 1));
+ for (int i = 1; i < cell_x_.size() - 1; ++i) {
+ EXPECT_EQ(second + add * (i - 1), cell_x_.get(i));
+ }
+ }
+
+ void ExpectSortedX() {
+ EXPECT_GT(cell_x_.size(), 0);
+ for (int i = 1; i < cell_x_.size(); ++i) {
+ EXPECT_LT(cell_x_.get(i - 1), cell_x_.get(i));
+ }
+ }
+};
+
+class SharedTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ ICOORD bleft(0, 0);
+ ICOORD tright(1000, 1000);
+ text_grid_.reset(new ColPartitionGrid(5, bleft, tright));
+ line_grid_.reset(new ColPartitionGrid(5, bleft, tright));
+ }
+
+ void TearDown() {
+ tesseract::ColPartition_IT memory(&allocated_parts_);
+ for (memory.mark_cycle_pt(); !memory.cycled_list(); memory.forward()) {
+ memory.data()->DeleteBoxes();
+ }
+ }
+
+ void InsertPartitions() {
+ for (int row = 0; row < 800; row += 20)
+ for (int col = 0; col < 500; col += 25)
+ InsertPartition(col + 1, row + 1, col + 24, row + 19);
+ }
+
+ void InsertPartition(int left, int bottom, int right, int top) {
+ TBOX box(left, bottom, right, top);
+ ColPartition* part =
+ ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
+ part->set_median_width(3);
+ part->set_median_height(3);
+ text_grid_->InsertBBox(true, true, part);
+
+ tesseract::ColPartition_IT add_it(&allocated_parts_);
+ add_it.add_after_stay_put(part);
+ }
+
+ void InsertLines() {
+ line_box_.set_to_given_coords(
+ 100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(),
+ 450 + line_grid_->gridsize(), 50 + line_grid_->gridsize());
+ for (int i = 10; i <= 50; i += 10) InsertHorizontalLine(100, 450, i);
+ for (int i = 100; i <= 450; i += 50) InsertVerticalLine(i, 10, 50);
+
+ for (int i = 100; i <= 200; i += 20) InsertHorizontalLine(0, 100, i);
+ }
+
+ void InsertHorizontalLine(int left, int right, int y) {
+ TBOX box(left, y - line_grid_->gridsize(), right,
+ y + line_grid_->gridsize());
+ ColPartition* part =
+ ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE);
+ line_grid_->InsertBBox(true, true, part);
+
+ tesseract::ColPartition_IT add_it(&allocated_parts_);
+ add_it.add_after_stay_put(part);
+ }
+ void InsertVerticalLine(int x, int bottom, int top) {
+ TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(),
+ top);
+ ColPartition* part =
+ ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE);
+ line_grid_->InsertBBox(true, true, part);
+
+ tesseract::ColPartition_IT add_it(&allocated_parts_);
+ add_it.add_after_stay_put(part);
+ }
+
+ void InsertCellsInLines() {
+ for (int y = 10; y <= 50; y += 10)
+ for (int x = 100; x <= 450; x += 50)
+ InsertPartition(x + 1, y + 1, x + 49, y + 9);
+ }
+
+ TBOX line_box_;
+ std::unique_ptr<ColPartitionGrid> text_grid_;
+ std::unique_ptr<ColPartitionGrid> line_grid_;
+ ColPartition_LIST allocated_parts_;
+};
+
+class TableRecognizerTest : public SharedTest {
+ protected:
+ void SetUp() {
+ SharedTest::SetUp();
+ recognizer_.reset(new TestableTableRecognizer());
+ recognizer_->Init();
+ recognizer_->set_text_grid(text_grid_.get());
+ recognizer_->set_line_grid(line_grid_.get());
+ }
+
+ std::unique_ptr<TestableTableRecognizer> recognizer_;
+};
+
+class StructuredTableTest : public SharedTest {
+ protected:
+ void SetUp() {
+ SharedTest::SetUp();
+ table_.reset(new TestableStructuredTable());
+ table_->Init();
+ table_->set_text_grid(text_grid_.get());
+ table_->set_line_grid(line_grid_.get());
+ }
+
+ std::unique_ptr<TestableStructuredTable> table_;
+};
+
+TEST_F(TableRecognizerTest, HasSignificantLinesBasicPass) {
+ InsertLines();
+ TBOX smaller_guess(120, 15, 370, 45);
+ TBOX larger_guess(90, 5, 490, 70);
+ EXPECT_TRUE(recognizer_->HasSignificantLines(line_box_));
+ EXPECT_TRUE(recognizer_->HasSignificantLines(larger_guess));
+ EXPECT_TRUE(recognizer_->HasSignificantLines(smaller_guess));
+}
+
+TEST_F(TableRecognizerTest, HasSignificantLinesBasicFail) {
+ InsertLines();
+ TBOX box(370, 35, 500, 45);
+ EXPECT_FALSE(recognizer_->HasSignificantLines(box));
+}
+
+TEST_F(TableRecognizerTest, HasSignificantLinesHorizontalOnlyFails) {
+ InsertLines();
+ TBOX box(0, 100, 200, 200);
+ EXPECT_FALSE(recognizer_->HasSignificantLines(box));
+}
+
+TEST_F(TableRecognizerTest, FindLinesBoundingBoxBasic) {
+ InsertLines();
+ TBOX box(0, 0, 200, 50);
+ bool result = recognizer_->FindLinesBoundingBox(&box);
+ EXPECT_TRUE(result);
+ EXPECT_EQ(line_box_.left(), box.left());
+ EXPECT_EQ(line_box_.right(), box.right());
+ EXPECT_EQ(line_box_.bottom(), box.bottom());
+ EXPECT_EQ(line_box_.top(), box.top());
+}
+
+TEST_F(TableRecognizerTest, RecognizeLinedTableBasic) {
+ InsertLines();
+ TBOX guess(120, 15, 370, 45);
+ tesseract::StructuredTable table;
+ table.set_text_grid(text_grid_.get());
+ table.set_line_grid(line_grid_.get());
+
+ EXPECT_TRUE(recognizer_->RecognizeLinedTable(guess, &table));
+ EXPECT_EQ(line_box_.bottom(), table.bounding_box().bottom());
+ EXPECT_EQ(line_box_.top(), table.bounding_box().top());
+ EXPECT_EQ(line_box_.left(), table.bounding_box().left());
+ EXPECT_EQ(line_box_.right(), table.bounding_box().right());
+ EXPECT_EQ(line_box_.area(), table.bounding_box().area());
+ EXPECT_EQ(7, table.column_count());
+ EXPECT_EQ(4, table.row_count());
+ EXPECT_EQ(28, table.cell_count());
+ EXPECT_TRUE(table.is_lined());
+}
+
+TEST_F(TableRecognizerTest, RecognizeWhitespacedTableBasic) {
+ InsertPartitions();
+ TBOX guess(0, 0, 500, 800);
+
+ tesseract::StructuredTable table;
+ table.set_text_grid(text_grid_.get());
+ table.set_line_grid(line_grid_.get());
+ EXPECT_TRUE(recognizer_->RecognizeWhitespacedTable(guess, &table));
+ EXPECT_EQ(1, table.bounding_box().bottom());
+ EXPECT_EQ(799, table.bounding_box().top());
+ EXPECT_EQ(1, table.bounding_box().left());
+ EXPECT_EQ(499, table.bounding_box().right());
+ EXPECT_EQ(798 * 498, table.bounding_box().area());
+ EXPECT_EQ(500 / 25, table.column_count());
+ EXPECT_EQ(800 / 20, table.row_count());
+ EXPECT_EQ(500 * 800 / 20 / 25, table.cell_count());
+ EXPECT_FALSE(table.is_lined());
+}
+
+TEST_F(StructuredTableTest, CountVerticalIntersectionsAll) {
+ table_->set_bounding_box(TBOX(0, 0, 1000, 1000));
+ InsertPartition(0, 0, 100, 10);
+ InsertPartition(1, 12, 43, 21);
+ EXPECT_EQ(2, table_->CountVerticalIntersections(4));
+ EXPECT_EQ(2, table_->CountVerticalIntersections(20));
+ EXPECT_EQ(2, table_->CountVerticalIntersections(40));
+ EXPECT_EQ(1, table_->CountVerticalIntersections(50));
+ EXPECT_EQ(1, table_->CountVerticalIntersections(60));
+ EXPECT_EQ(1, table_->CountVerticalIntersections(80));
+ EXPECT_EQ(1, table_->CountVerticalIntersections(95));
+ EXPECT_EQ(0, table_->CountVerticalIntersections(104));
+ EXPECT_EQ(0, table_->CountVerticalIntersections(150));
+}
+
+TEST_F(StructuredTableTest, CountHorizontalIntersectionsAll) {
+ table_->set_bounding_box(TBOX(0, 0, 1000, 1000));
+ InsertPartition(0, 3, 100, 10);
+ InsertPartition(110, 5, 200, 16);
+
+ EXPECT_EQ(0, table_->CountHorizontalIntersections(0));
+ EXPECT_EQ(1, table_->CountHorizontalIntersections(4));
+ EXPECT_EQ(2, table_->CountHorizontalIntersections(8));
+ EXPECT_EQ(1, table_->CountHorizontalIntersections(12));
+ EXPECT_EQ(0, table_->CountHorizontalIntersections(20));
+}
+
+TEST_F(StructuredTableTest, VerifyLinedTableBasicPass) {
+ for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y);
+ for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x);
+ InsertLines();
+ InsertCellsInLines();
+ table_->set_bounding_box(line_box_);
+ EXPECT_TRUE(table_->VerifyLinedTableCells());
+}
+
+TEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) {
+ for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y);
+ for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x);
+ InsertLines();
+ InsertCellsInLines();
+ InsertPartition(101, 11, 299, 19);
+ table_->set_bounding_box(line_box_);
+ EXPECT_FALSE(table_->VerifyLinedTableCells());
+}
+
+TEST_F(StructuredTableTest, VerifyLinedTableVerticalFail) {
+ for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y);
+ for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x);
+ InsertLines();
+ InsertCellsInLines();
+ InsertPartition(151, 21, 199, 39);
+ table_->set_bounding_box(line_box_);
+ EXPECT_FALSE(table_->VerifyLinedTableCells());
+}
+
+TEST_F(StructuredTableTest, FindWhitespacedColumnsBasic) {
+ InsertPartitions();
+ TBOX guess(0, 0, 500, 800);
+ table_->set_bounding_box(guess);
+ table_->FindWhitespacedColumns();
+ table_->ExpectCellX(1, 25, 25, 475, 499);
+}
+
+TEST_F(StructuredTableTest, FindWhitespacedColumnsSorted) {
+ InsertPartitions();
+ TBOX guess(0, 0, 500, 800);
+ table_->set_bounding_box(guess);
+ table_->FindWhitespacedColumns();
+ table_->ExpectSortedX();
+}
+
+// TODO(nbeato): check failure cases
+// TODO(nbeato): check Recognize processes correctly on trivial real examples.
+
+} // namespace
diff --git a/tesseract/unittest/tabvector_test.cc b/tesseract/unittest/tabvector_test.cc
new file mode 100644
index 00000000..dab0ace8
--- /dev/null
+++ b/tesseract/unittest/tabvector_test.cc
@@ -0,0 +1,130 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+
+#include "tabvector.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class TabVectorTest : public testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ vector_.reset();
+ }
+
+ void TearDown() {}
+
+ void MakeSimpleTabVector(int x1, int y1, int x2, int y2) {
+ vector_.reset(new TabVector());
+ vector_->set_startpt(ICOORD(x1, y1));
+ vector_->set_endpt(ICOORD(x2, y2));
+ }
+
+ std::unique_ptr<TabVector> vector_;
+};
+
+TEST_F(TabVectorTest, SetStartEndPointsMatch) {
+ vector_.reset(new TabVector());
+ ICOORD start(51, 65);
+ ICOORD end(7568, 234);
+ // Test coordinates individually to avoid adding an ostream operator
+ // explicitly to the ICOORD class (Droid doesn't support it).
+ vector_->set_startpt(start);
+ EXPECT_EQ(start.x(), vector_->startpt().x());
+ EXPECT_EQ(start.y(), vector_->startpt().y());
+ vector_->set_endpt(end);
+ EXPECT_EQ(end.x(), vector_->endpt().x());
+ EXPECT_EQ(end.y(), vector_->endpt().y());
+}
+
+TEST_F(TabVectorTest, XAtY45DegreeSlopeInRangeExact) {
+ MakeSimpleTabVector(0, 0, 100, 100);
+ for (int y = 0; y <= 100; ++y) {
+ int x = vector_->XAtY(y);
+ EXPECT_EQ(y, x);
+ }
+}
+
+TEST_F(TabVectorTest, XAtYVerticalInRangeExact) {
+ const int x = 120; // Arbitrary choice
+ MakeSimpleTabVector(x, 0, x, 100);
+ for (int y = 0; y <= 100; ++y) {
+ int result_x = vector_->XAtY(y);
+ EXPECT_EQ(x, result_x);
+ }
+}
+
+TEST_F(TabVectorTest, XAtYHorizontal) {
+ const int y = 76; // arbitrary
+ MakeSimpleTabVector(0, y, 100, y);
+ EXPECT_EQ(0, vector_->XAtY(y));
+ // TODO(nbeato): What's the failure condition?
+ // Undefined behavior: this arguably should not pass, but the expectation is
+ // kept until the intended result for a horizontal vector is decided.
+ EXPECT_EQ(0, vector_->XAtY(10));
+}
+
+TEST_F(TabVectorTest, XAtYRoundingSimple) {
+ MakeSimpleTabVector(0, 0, 2, 10000);
+ int x = vector_->XAtY(1);
+ EXPECT_EQ(0, x);
+ x = vector_->XAtY(4999);
+ EXPECT_EQ(0, x);
+ x = vector_->XAtY(5001);
+ EXPECT_EQ(1, x);
+ x = vector_->XAtY(9999);
+ EXPECT_EQ(1, x);
+}
+
+TEST_F(TabVectorTest, XAtYLargeNumbers) {
+ // Assume a document is 800 DPI,
+ // the width of a page is 10 inches across (8000 pixels), and
+ // the height of the page is 15 inches (12000 pixels).
+ MakeSimpleTabVector(7804, 504, 7968, 11768); // Arbitrary for vertical line
+ int x = vector_->XAtY(6136); // test mid point
+ EXPECT_EQ(7886, x);
+}
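+
+// The expected value above is plain linear interpolation between the two
+// endpoints. The helper below is a sketch of that arithmetic only, not
+// TabVector::XAtY itself, which must also cope with the horizontal case
+// tested above (y2 == y1 would make this division undefined):
+//   InterpolateX(7804, 504, 7968, 11768, 6136)
+//       == 7804 + 5632 * 164 / 11264 == 7804 + 82 == 7886.
+static int InterpolateX(int x1, int y1, int x2, int y2, int y) {
+  return x1 + (y - y1) * (x2 - x1) / (y2 - y1);
+}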
+
+TEST_F(TabVectorTest, XAtYHorizontalInRangeExact) {
+ const int y = 120; // Arbitrary choice
+ MakeSimpleTabVector(50, y, 150, y);
+
+ int x = vector_->XAtY(y);
+ EXPECT_EQ(50, x);
+}
+
+TEST_F(TabVectorTest, VOverlapInRangeSimple) {
+ MakeSimpleTabVector(0, 0, 100, 100);
+ int overlap = vector_->VOverlap(90, 10);
+ EXPECT_EQ(80, overlap);
+ overlap = vector_->VOverlap(100, 0);
+ EXPECT_EQ(100, overlap);
+}
+
+TEST_F(TabVectorTest, VOverlapOutOfRange) {
+ MakeSimpleTabVector(0, 10, 100, 90);
+ int overlap = vector_->VOverlap(100, 0);
+ EXPECT_EQ(80, overlap);
+}
+
+TEST_F(TabVectorTest, XYFlip) {
+ MakeSimpleTabVector(1, 2, 3, 4);
+ vector_->XYFlip();
+ EXPECT_EQ(2, vector_->startpt().x());
+ EXPECT_EQ(1, vector_->startpt().y());
+ EXPECT_EQ(4, vector_->endpt().x());
+ EXPECT_EQ(3, vector_->endpt().y());
+}
+
+} // namespace
diff --git a/tesseract/unittest/tatweel_test.cc b/tesseract/unittest/tatweel_test.cc
new file mode 100644
index 00000000..4bd8b337
--- /dev/null
+++ b/tesseract/unittest/tatweel_test.cc
@@ -0,0 +1,114 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(_WIN32)
+#include <io.h> // for _access
+#else
+#include <unistd.h> // for access
+#endif
+
+#include "include_gunit.h"
+#include "dawg.h"
+#include "trie.h"
+#include "unicharset.h"
+#ifdef INCLUDE_TENSORFLOW
+#include "util/utf8/unicodetext.h" // for UnicodeText
+#endif
+
+namespace tesseract {
+
+// Replacement for std::filesystem::exists (C++-17)
+static bool file_exists(const char* filename) {
+#if defined(_WIN32)
+ return _access(filename, 0) == 0;
+#else
+ return access(filename, 0) == 0;
+#endif
+}
+
+class TatweelTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+ }
+
+ TatweelTest() {
+#ifdef INCLUDE_TENSORFLOW
+ std::string filename = TestDataNameToPath("ara.wordlist");
+ if (file_exists(filename.c_str())) {
+ std::string wordlist(u8"\u0640");
+ CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
+ // Put all the unicodes in the unicharset_.
+ UnicodeText text;
+ text.PointToUTF8(wordlist.data(), wordlist.size());
+ int num_tatweel = 0;
+ for (auto it = text.begin(); it != text.end(); ++it) {
+ std::string utf8 = it.get_utf8_string();
+ if (utf8.find(u8"\u0640") != std::string::npos) ++num_tatweel;
+ unicharset_.unichar_insert(utf8.c_str());
+ }
+ LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
+ EXPECT_GT(num_tatweel, 0);
+ }
+#endif
+ }
+
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTDATA_DIR, name);
+ }
+ UNICHARSET unicharset_;
+};
+
+TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
+ // This test verifies that the unicharset ignores the Tatweel character.
+ for (int i = 0; i < unicharset_.size(); ++i) {
+ const char* utf8 = unicharset_.id_to_unichar(i);
+ EXPECT_EQ(strstr(utf8, u8"\u0640"), nullptr);
+ }
+}
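+
+// Illustrative sketch only, not Tesseract's implementation: "ignoring"
+// tatweel amounts to stripping every occurrence of U+0640 from the UTF-8
+// input before it reaches the unicharset or the dictionary.
+static std::string StripTatweel(const std::string& utf8) {
+  const std::string kTatweel = u8"\u0640";  // Encoded as 0xD9 0x80 in UTF-8.
+  std::string result = utf8;
+  for (size_t pos = result.find(kTatweel); pos != std::string::npos;
+       pos = result.find(kTatweel, pos)) {
+    result.erase(pos, kTatweel.size());
+  }
+  return result;
+}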
+
+TEST_F(TatweelTest, DictIgnoresTatweel) {
+ // This test verifies that the dictionary ignores the Tatweel character.
+ tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM,
+ unicharset_.size(), 0);
+ std::string filename = TestDataNameToPath("ara.wordlist");
+ if (!file_exists(filename.c_str())) {
+ LOG(INFO) << "Skip test because of missing " << filename;
+ GTEST_SKIP();
+ } else {
+ EXPECT_TRUE(trie.read_and_add_word_list(
+ filename.c_str(), unicharset_,
+ tesseract::Trie::RRP_REVERSE_IF_HAS_RTL));
+ EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));
+ }
+}
+
+TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
+ // This test verifies that a load of an existing unicharset keeps any
+ // existing tatweel for backwards compatibility.
+ std::string filename = TestDataNameToPath("ara.unicharset");
+ if (!file_exists(filename.c_str())) {
+ LOG(INFO) << "Skip test because of missing " << filename;
+ GTEST_SKIP();
+ } else {
+ EXPECT_TRUE(unicharset_.load_from_file(filename.c_str()));
+ int num_tatweel = 0;
+ for (int i = 0; i < unicharset_.size(); ++i) {
+ const char* utf8 = unicharset_.id_to_unichar(i);
+ if (strstr(utf8, u8"\u0640") != nullptr) ++num_tatweel;
+ }
+ LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;
+ EXPECT_EQ(num_tatweel, 4);
+ }
+}
+
+} // namespace
diff --git a/tesseract/unittest/tesseract_leaksanitizer.supp b/tesseract/unittest/tesseract_leaksanitizer.supp
new file mode 100644
index 00000000..6cc39999
--- /dev/null
+++ b/tesseract/unittest/tesseract_leaksanitizer.supp
@@ -0,0 +1,12 @@
+# Suppress memory leaks.
+# Use with LSAN_OPTIONS=suppressions=tesseract_leaksanitizer.supp
+leak:FcLangSetCreate
+leak:FcPatternObjectAddWithBinding
+leak:FcPatternObjectInsertElt
+leak:FcValueListAppend
+leak:FcValueListDuplicate
+leak:FcValueListPrepend
+leak:IA__FcLangSetCreate
+leak:IA__FcValueSave
+leak:libfontconfig.so
+leak:libfreetype.so
diff --git a/tesseract/unittest/textlineprojection_test.cc b/tesseract/unittest/textlineprojection_test.cc
new file mode 100644
index 00000000..f8423615
--- /dev/null
+++ b/tesseract/unittest/textlineprojection_test.cc
@@ -0,0 +1,262 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <allheaders.h>
+#include <string> // for std::string
+
+#include "absl/strings/str_format.h" // for absl::StrFormat
+#include "include_gunit.h"
+
+#include <tesseract/baseapi.h>
+#include "colfind.h"
+#include "log.h" // for LOG
+#include "mutableiterator.h"
+#include <tesseract/osdetect.h>
+#include "pageres.h"
+#include "tesseractclass.h"
+#include "textlineprojection.h"
+
+namespace tesseract {
+
+// Minimum score for a STRONG_CHAIN textline.
+// NOTE: Keep in sync with textlineprojection.cc.
+const int kMinStrongTextValue = 6;
+
+// The fixture for testing Tesseract.
+class TextlineProjectionTest : public testing::Test {
+ protected:
+ std::string OutputNameToPath(const std::string& name) {
+ file::MakeTmpdir();
+ return file::JoinPath(FLAGS_test_tmpdir, name);
+ }
+
+ TextlineProjectionTest() {
+ src_pix_ = nullptr;
+ bin_pix_ = nullptr;
+ tesseract_ = nullptr;
+ finder_ = nullptr;
+ denorm_ = nullptr;
+ projection_ = nullptr;
+ }
+ virtual ~TextlineProjectionTest() {
+ pixDestroy(&src_pix_);
+ pixDestroy(&bin_pix_);
+ delete finder_;
+ delete tesseract_;
+ }
+
+ void SetImage(const char* filename) {
+ pixDestroy(&src_pix_);
+ src_pix_ = pixRead(file::JoinPath(TESTING_DIR, filename).c_str());
+ api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY);
+ api_.SetPageSegMode(tesseract::PSM_AUTO_OSD);
+ api_.SetImage(src_pix_);
+ }
+
+ // Hacked-together helper that sets up projection_ and denorm_ by preparing
+ // for auto page layout, building a ColumnFinder, running it, and using
+ // accessors to get at the internal denorm and projection.
+ // If the coordinates have been rotated, the denorm should match
+ // correctly and transform coordinates back to the projection.
+ // We throw away all the blocks, blobs etc, and test the projection with
+ // the resultiterator from a separate BaseAPI run.
+ void SetupProjection() {
+ tesseract::TessdataManager mgr;
+ Tesseract* osd_tess = new Tesseract;
+ OSResults osr;
+ EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, nullptr, "osd",
+ tesseract::OEM_TESSERACT_ONLY, nullptr, 0,
+ nullptr, nullptr, false, &mgr),
+ 0);
+ tesseract_ = new Tesseract;
+ EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, nullptr, "eng",
+ tesseract::OEM_TESSERACT_ONLY, nullptr, 0,
+ nullptr, nullptr, false, &mgr),
+ 0);
+ bin_pix_ = api_.GetThresholdedImage();
+ *tesseract_->mutable_pix_binary() = pixClone(bin_pix_);
+ osd_tess->set_source_resolution(api_.tesseract()->source_resolution());
+ tesseract_->set_source_resolution(api_.tesseract()->source_resolution());
+ int width = pixGetWidth(bin_pix_);
+ int height = pixGetHeight(bin_pix_);
+ // First make a single block covering the whole image.
+ BLOCK* block = new BLOCK("", true, 0, 0, 0, 0, width, height);
+ block->set_right_to_left(false);
+ BLOCK_LIST src_blocks;
+ BLOCK_IT block_it(&src_blocks);
+ block_it.add_to_end(block);
+ Pix* photomask_pix = nullptr;
+ // The blocks made by the ColumnFinder. Moved to blocks before return.
+ BLOCK_LIST found_blocks;
+ TO_BLOCK_LIST temp_blocks;
+ finder_ = tesseract_->SetupPageSegAndDetectOrientation(
+ tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess, &osr, &temp_blocks,
+ &photomask_pix, nullptr);
+ TO_BLOCK_IT to_block_it(&temp_blocks);
+ TO_BLOCK* to_block = to_block_it.data();
+ denorm_ = finder_->denorm();
+ TO_BLOCK_LIST to_blocks;
+ BLOBNBOX_LIST diacritic_blobs;
+ EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block,
+ photomask_pix, nullptr, nullptr, nullptr,
+ &found_blocks, &diacritic_blobs, &to_blocks),
+ 0);
+ projection_ = finder_->projection();
+ pixDestroy(&photomask_pix);
+ delete osd_tess;
+ }
+
+ // Helper evaluates the given box, expecting the result to be >= target_value
+ // when greater_or_equal is true (and < target_value otherwise), and prints
+ // diagnostics if not.
+ void EvaluateBox(const TBOX& box, bool greater_or_equal, int target_value,
+ const char* text, const char* message) {
+ int value = projection_->EvaluateBox(box, denorm_, false);
+ if (greater_or_equal != (value > target_value)) {
+ LOG(INFO) << absl::StrFormat(
+ "EvaluateBox too %s:%d vs %d for %s word '%s' at:",
+ greater_or_equal ? "low" : "high", value, target_value, message,
+ text);
+ box.print();
+ value = projection_->EvaluateBox(box, denorm_, true);
+ } else {
+ LOG(INFO) << absl::StrFormat("EvaluateBox OK(%d) for %s word '%s'",
+ value, message, text);
+ }
+ if (greater_or_equal) {
+ EXPECT_GE(value, target_value);
+ } else {
+ EXPECT_LT(value, target_value);
+ }
+ }
+
+ // Helper evaluates the DistanceOfBoxFromBox function by expecting that
+ // box should be nearer to true_box than false_box.
+ void EvaluateDistance(const TBOX& box, const TBOX& true_box,
+ const TBOX& false_box, const char* text,
+ const char* message) {
+ int true_dist =
+ projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false);
+ int false_dist =
+ projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false);
+ if (false_dist <= true_dist) {
+ LOG(INFO) << absl::StrFormat(
+ "Distance wrong:%d vs %d for %s word '%s' at:",
+ false_dist, true_dist, message, text);
+ true_box.print();
+ projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, true);
+ projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, true);
+ } else {
+ LOG(INFO) << absl::StrFormat("Distance OK(%d vs %d) for %s word '%s'",
+ false_dist, true_dist, message, text);
+ }
+ }
+
+ // Tests the projection on the word boxes of the given image.
+ // line_height is the cap + descender size of the text.
+ void VerifyBoxes(const char* imagefile, int line_height) {
+ SetImage(imagefile);
+ api_.Recognize(nullptr);
+ SetupProjection();
+ MutableIterator* it = api_.GetMutableIterator();
+ do {
+ char* text = it->GetUTF8Text(tesseract::RIL_WORD);
+ const PAGE_RES_IT* pr_it = it->PageResIt();
+ WERD_RES* word = pr_it->word();
+ // The word_box refers to the internal, possibly rotated, coords.
+ TBOX word_box = word->word->bounding_box();
+ bool small_word = word_box.height() * 1.5 < line_height;
+ bool tall_word = word_box.height() * 1.125 > line_height;
+ // We pad small and tall words differently because ascenders and
+ // descenders affect the position and size of the upper/lower boxes.
+ int padding;
+ if (small_word) {
+ padding = word_box.height();
+ } else if (tall_word) {
+ padding = word_box.height() / 3;
+ } else {
+ padding = word_box.height() / 2;
+ }
+ // Test that the word box gets a good score.
+ EvaluateBox(word_box, true, kMinStrongTextValue, text, "Real Word");
+
+ // Now test a displaced box, both above and below the word.
+ TBOX upper_box(word_box);
+ upper_box.set_bottom(word_box.top());
+ upper_box.set_top(word_box.top() + padding);
+ EvaluateBox(upper_box, false, kMinStrongTextValue, text, "Upper Word");
+ EvaluateBox(upper_box, true, -1, text, "Upper Word not vertical");
+ TBOX lower_box = word_box;
+ lower_box.set_top(word_box.bottom());
+ lower_box.set_bottom(word_box.bottom() - padding);
+ if (tall_word) lower_box.move(ICOORD(0, padding / 2));
+ EvaluateBox(lower_box, false, kMinStrongTextValue, text, "Lower Word");
+ EvaluateBox(lower_box, true, -1, text, "Lower Word not vertical");
+
+ // Since some words have no text below and some words have no text above
+ // check that at least one of the boxes satisfies BoxOutOfTextline.
+ bool upper_or_lower_out_of_textline =
+ projection_->BoxOutOfHTextline(upper_box, denorm_, false) ||
+ projection_->BoxOutOfHTextline(lower_box, denorm_, false);
+ if (!upper_or_lower_out_of_textline) {
+ projection_->BoxOutOfHTextline(upper_box, denorm_, true);
+ projection_->BoxOutOfHTextline(lower_box, denorm_, true);
+ }
+ EXPECT_TRUE(upper_or_lower_out_of_textline);
+
+ // Now test DistanceOfBoxFromBox by faking a challenger word, and asking
+ // that each pad box be nearer to its true textline than the
+ // challenger. Due to the tight spacing of latin text, getting
+ // the right position and size of these test boxes is quite fiddly.
+ padding = line_height / 4;
+ upper_box.set_top(upper_box.bottom() + padding);
+ TBOX target_box(word_box);
+ if (!small_word) {
+ upper_box.move(ICOORD(0, -padding * 3 / 2));
+ }
+ target_box.set_top(upper_box.bottom());
+ TBOX upper_challenger(upper_box);
+ upper_challenger.set_bottom(upper_box.top());
+ upper_challenger.set_top(upper_box.top() + word_box.height());
+ EvaluateDistance(upper_box, target_box, upper_challenger, text,
+ "Upper Word");
+ if (tall_word) lower_box.move(ICOORD(0, padding / 2));
+ lower_box.set_bottom(lower_box.top() - padding);
+ target_box = word_box;
+ target_box.set_bottom(lower_box.top());
+ TBOX lower_challenger(lower_box);
+ lower_challenger.set_top(lower_box.bottom());
+ lower_challenger.set_bottom(lower_box.bottom() - word_box.height());
+ EvaluateDistance(lower_box, target_box, lower_challenger, text,
+ "Lower Word");
+
+ delete[] text;
+ } while (it->Next(tesseract::RIL_WORD));
+ delete it;
+ }
+
+ Pix* src_pix_;
+ Pix* bin_pix_;
+ BLOCK_LIST blocks_;
+ std::string ocr_text_;
+ tesseract::TessBaseAPI api_;
+ Tesseract* tesseract_;
+ ColumnFinder* finder_;
+ const DENORM* denorm_;
+ const TextlineProjection* projection_;
+};
+
+// Tests all word boxes on an unrotated image.
+TEST_F(TextlineProjectionTest, Unrotated) { VerifyBoxes("phototest.tif", 31); }
+
+// Tests all word boxes on a rotated image.
+TEST_F(TextlineProjectionTest, Rotated) { VerifyBoxes("phototestrot.tif", 31); }
+
+} // namespace
diff --git a/tesseract/unittest/tfile_test.cc b/tesseract/unittest/tfile_test.cc
new file mode 100644
index 00000000..166405ff
--- /dev/null
+++ b/tesseract/unittest/tfile_test.cc
@@ -0,0 +1,179 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "genericvector.h"
+#include "serialis.h"
+
+#include "include_gunit.h"
+
+namespace tesseract {
+
+// Tests TFile and GenericVector serialization by serializing and
+// writing/reading back.
+
+class TfileTest : public ::testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ }
+
+ TfileTest() {}
+
+ // Some data to serialize.
+ class MathData {
+ public:
+ MathData() : num_squares_(0), num_triangles_(0) {}
+ void Setup() {
+ // Setup some data.
+ for (int s = 0; s < 42; ++s) squares_.push_back(s * s);
+ num_squares_ = squares_.size();
+ for (int t = 0; t < 52; ++t) triangles_.push_back(t * (t + 1) / 2);
+ num_triangles_ = triangles_.size();
+ }
+ void ExpectEq(const MathData& other) {
+ // Check the data.
+ EXPECT_EQ(num_squares_, other.num_squares_);
+ for (int s = 0; s < squares_.size(); ++s)
+ EXPECT_EQ(squares_[s], other.squares_[s]);
+ EXPECT_EQ(num_triangles_, other.num_triangles_);
+ for (int s = 0; s < triangles_.size(); ++s)
+ EXPECT_EQ(triangles_[s], other.triangles_[s]);
+ }
+ bool Serialize(TFile* fp) {
+ if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false;
+ if (!squares_.Serialize(fp)) return false;
+ if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1)
+ return false;
+ if (!triangles_.Serialize(fp)) return false;
+ return true;
+ }
+ bool DeSerialize(TFile* fp) {
+ if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1)
+ return false;
+ if (!squares_.DeSerialize(fp)) return false;
+ if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1)
+ return false;
+ if (!triangles_.DeSerialize(fp)) return false;
+ return true;
+ }
+ bool SerializeBigEndian(TFile* fp) {
+ ReverseN(&num_squares_, sizeof(num_squares_));
+ if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false;
+ // Write an additional reversed size before the vector, which will get
+ // used as its size on reading.
+ if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false;
+ for (int i = 0; i < squares_.size(); ++i)
+ ReverseN(&squares_[i], sizeof(squares_[i]));
+ if (!squares_.Serialize(fp)) return false;
+ ReverseN(&num_triangles_, sizeof(num_triangles_));
+ if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1)
+ return false;
+ if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1)
+ return false;
+ for (int i = 0; i < triangles_.size(); ++i)
+ ReverseN(&triangles_[i], sizeof(triangles_[i]));
+ return triangles_.Serialize(fp);
+ }
+ bool DeSerializeBigEndian(TFile* fp) {
+ if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1)
+ return false;
+ if (!squares_.DeSerialize(fp)) return false;
+ // The first element is the size that was written, so we will delete it
+ // and read the last element separately.
+ int last_element;
+ if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1)
+ return false;
+ squares_.remove(0);
+ squares_.push_back(last_element);
+ if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1)
+ return false;
+ if (!triangles_.DeSerialize(fp)) return false;
+ if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1)
+ return false;
+ triangles_.remove(0);
+ triangles_.push_back(last_element);
+ return true;
+ }
+
+ private:
+ GenericVector<int> squares_;
+ int num_squares_;
+ GenericVector<int> triangles_;
+ int num_triangles_;
+ };
+};
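+
+// A condensed sketch of the write/read round trip that the tests below
+// perform: serialize into an in-memory buffer with OpenWrite(), then read it
+// back with Open(). It assumes only the TFile and GenericVector calls already
+// used in this file and is not meant as the canonical usage.
+static bool RoundTripVector(GenericVector<int>* src, GenericVector<int>* dst) {
+  std::vector<char> buffer;
+  TFile writer;
+  writer.OpenWrite(&buffer);
+  if (!src->Serialize(&writer)) return false;
+  TFile reader;
+  if (!reader.Open(&buffer[0], buffer.size())) return false;
+  return dst->DeSerialize(&reader);
+}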
+
+TEST_F(TfileTest, Serialize) {
+ // This test verifies that Tfile can serialize a class.
+ MathData m1;
+ m1.Setup();
+ std::vector<char> data;
+ TFile fpw;
+ fpw.OpenWrite(&data);
+ EXPECT_TRUE(m1.Serialize(&fpw));
+ TFile fpr;
+ EXPECT_TRUE(fpr.Open(&data[0], data.size()));
+ MathData m2;
+ EXPECT_TRUE(m2.DeSerialize(&fpr));
+ m1.ExpectEq(m2);
+ MathData m3;
+ EXPECT_FALSE(m3.DeSerialize(&fpr));
+ fpr.Rewind();
+ EXPECT_TRUE(m3.DeSerialize(&fpr));
+ m1.ExpectEq(m3);
+}
+
+TEST_F(TfileTest, FGets) {
+ // This test verifies that Tfile can interleave FGets with binary data.
+ MathData m1;
+ std::string line_str = "This is a textline with a newline\n";
+ m1.Setup();
+ std::vector<char> data;
+ TFile fpw;
+ fpw.OpenWrite(&data);
+ EXPECT_TRUE(m1.Serialize(&fpw));
+ EXPECT_EQ(1, fpw.FWrite(line_str.data(), line_str.size(), 1));
+ EXPECT_TRUE(m1.Serialize(&fpw));
+ // Now get back the 2 copies of m1 with the line in between.
+ TFile fpr;
+ EXPECT_TRUE(fpr.Open(&data[0], data.size()));
+ MathData m2;
+ EXPECT_TRUE(m2.DeSerialize(&fpr));
+ m1.ExpectEq(m2);
+ const int kBufsize = 1024;
+ char buffer[kBufsize + 1];
+ EXPECT_EQ(buffer, fpr.FGets(buffer, kBufsize));
+ EXPECT_STREQ(line_str.c_str(), buffer);
+ MathData m3;
+ EXPECT_TRUE(m3.DeSerialize(&fpr));
+ m1.ExpectEq(m3);
+}
+
+TEST_F(TfileTest, BigEndian) {
+ // This test verifies that Tfile can auto-reverse big-endian data.
+ MathData m1;
+ m1.Setup();
+ std::vector<char> data;
+ TFile fpw;
+ fpw.OpenWrite(&data);
+ EXPECT_TRUE(m1.SerializeBigEndian(&fpw));
+ TFile fpr;
+ EXPECT_TRUE(fpr.Open(&data[0], data.size()));
+ fpr.set_swap(true);
+ MathData m2;
+ EXPECT_TRUE(m2.DeSerializeBigEndian(&fpr));
+ // That serialize was destructive, so test against a fresh MathData.
+ MathData m3;
+ m3.Setup();
+ m3.ExpectEq(m2);
+}
+
+} // namespace
diff --git a/tesseract/unittest/third_party/utf/rune.c b/tesseract/unittest/third_party/utf/rune.c
new file mode 100644
index 00000000..3d860570
--- /dev/null
+++ b/tesseract/unittest/third_party/utf/rune.c
@@ -0,0 +1,357 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ * Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+#include <stdarg.h>
+#include <string.h>
+#include "third_party/utf/utf.h"
+#include "third_party/utf/utfdef.h"
+
+enum
+{
+ Bit1 = 7,
+ Bitx = 6,
+ Bit2 = 5,
+ Bit3 = 4,
+ Bit4 = 3,
+ Bit5 = 2,
+
+ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
+ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
+ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
+ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
+ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
+
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1,
+ /* 0001 1111 1111 1111 1111 1111 */
+
+ Maskx = (1<<Bitx)-1, /* 0011 1111 */
+ Testx = Maskx ^ 0xFF, /* 1100 0000 */
+
+ Bad = Runeerror,
+};
+
+/*
+ * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
+ * This is a slower but "safe" version of the old chartorune
+ * that works on strings that are not necessarily null-terminated.
+ *
+ * If you know for sure that your string is null-terminated,
+ * chartorune will be a bit faster.
+ *
+ * It is guaranteed not to access any bytes beyond the first
+ * "length" bytes of the incoming pointer.  This is to avoid
+ * possible access violations. If the string appears to be
+ * well-formed but incomplete (i.e., to get the whole Rune
+ * we'd need to read past str+length) then we'll set the Rune
+ * to Bad and return 0.
+ *
+ * Note that if we have decoding problems for other
+ * reasons, we return 1 instead of 0.
+ */
+int
+charntorune(Rune *rune, const char *str, int length)
+{
+ int c, c1, c2, c3;
+ long l;
+
+ /* When we're not allowed to read anything */
+ if(length <= 0) {
+ goto badlen;
+ }
+
+ /*
+ * one character sequence (7-bit value)
+ * 00000-0007F => T1
+ */
+ c = *(uchar*)str;
+ if(c < Tx) {
+ *rune = c;
+ return 1;
+ }
+
+ // If we can't read more than one character we must stop
+ if(length <= 1) {
+ goto badlen;
+ }
+
+ /*
+ * two character sequence (11-bit value)
+ * 0080-07FF => T2 Tx
+ */
+ c1 = *(uchar*)(str+1) ^ Tx;
+ if(c1 & Testx)
+ goto bad;
+ if(c < T3) {
+ if(c < T2)
+ goto bad;
+ l = ((c << Bitx) | c1) & Rune2;
+ if(l <= Rune1)
+ goto bad;
+ *rune = l;
+ return 2;
+ }
+
+ // If we can't read more than two characters we must stop
+ if(length <= 2) {
+ goto badlen;
+ }
+
+ /*
+ * three character sequence (16-bit value)
+ * 0800-FFFF => T3 Tx Tx
+ */
+ c2 = *(uchar*)(str+2) ^ Tx;
+ if(c2 & Testx)
+ goto bad;
+ if(c < T4) {
+ l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+ if(l <= Rune2)
+ goto bad;
+ *rune = l;
+ return 3;
+ }
+
+ if (length <= 3)
+ goto badlen;
+
+ /*
+ * four character sequence (21-bit value)
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ c3 = *(uchar*)(str+3) ^ Tx;
+ if (c3 & Testx)
+ goto bad;
+ if (c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if (l <= Rune3)
+ goto bad;
+ if (l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+
+ // Support for 5-byte or longer UTF-8 would go here, but
+ // since we don't have that, we'll just fall through to bad.
+
+ /*
+ * bad decoding
+ */
+bad:
+ *rune = Bad;
+ return 1;
+badlen:
+ *rune = Bad;
+ return 0;
+
+}
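+
+/*
+ * Usage sketch added for illustration; not part of the original library.
+ * Counts the runes in a buffer that is not null-terminated, stopping at an
+ * incomplete trailing sequence. Malformed input decodes to Runeerror and
+ * consumes one byte, so the loop always makes progress.
+ */
+static int
+countrunes(const char *str, int length)
+{
+	Rune r;
+	int n, count;
+
+	count = 0;
+	while(length > 0) {
+		n = charntorune(&r, str, length);
+		if(n == 0)	/* incomplete sequence at end of buffer */
+			break;
+		count++;
+		str += n;
+		length -= n;
+	}
+	return count;
+}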
+
+
+/*
+ * This is the older "unsafe" version, which works fine on
+ * null-terminated strings.
+ */
+int
+chartorune(Rune *rune, const char *str)
+{
+ int c, c1, c2, c3;
+ long l;
+
+ /*
+ * one character sequence
+ * 00000-0007F => T1
+ */
+ c = *(uchar*)str;
+ if(c < Tx) {
+ *rune = c;
+ return 1;
+ }
+
+ /*
+ * two character sequence
+ * 0080-07FF => T2 Tx
+ */
+ c1 = *(uchar*)(str+1) ^ Tx;
+ if(c1 & Testx)
+ goto bad;
+ if(c < T3) {
+ if(c < T2)
+ goto bad;
+ l = ((c << Bitx) | c1) & Rune2;
+ if(l <= Rune1)
+ goto bad;
+ *rune = l;
+ return 2;
+ }
+
+ /*
+ * three character sequence
+ * 0800-FFFF => T3 Tx Tx
+ */
+ c2 = *(uchar*)(str+2) ^ Tx;
+ if(c2 & Testx)
+ goto bad;
+ if(c < T4) {
+ l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+ if(l <= Rune2)
+ goto bad;
+ *rune = l;
+ return 3;
+ }
+
+ /*
+ * four character sequence (21-bit value)
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ c3 = *(uchar*)(str+3) ^ Tx;
+ if (c3 & Testx)
+ goto bad;
+ if (c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if (l <= Rune3)
+ goto bad;
+ if (l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+
+ /*
+ * Support for 5-byte or longer UTF-8 would go here, but
+ * since we don't have that, we'll just fall through to bad.
+ */
+
+ /*
+ * bad decoding
+ */
+bad:
+ *rune = Bad;
+ return 1;
+}
+
+int
+isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
+ *consumed = charntorune(rune, str, length);
+ return *rune != Runeerror || *consumed == 3;
+}
+
+int
+runetochar(char *str, const Rune *rune)
+{
+ /* Runes are signed, so convert to unsigned for range check. */
+ unsigned long c;
+
+ /*
+ * one character sequence
+ * 00000-0007F => 00-7F
+ */
+ c = *rune;
+ if(c <= Rune1) {
+ str[0] = c;
+ return 1;
+ }
+
+ /*
+ * two character sequence
+ * 0080-07FF => T2 Tx
+ */
+ if(c <= Rune2) {
+ str[0] = T2 | (c >> 1*Bitx);
+ str[1] = Tx | (c & Maskx);
+ return 2;
+ }
+
+ /*
+ * If the Rune is out of range, convert it to the error rune.
+ * Do this test here because the error rune encodes to three bytes.
+ * Doing it earlier would duplicate work, since an out of range
+ * Rune wouldn't have fit in one or two bytes.
+ */
+ if (c > Runemax)
+ c = Runeerror;
+
+ /*
+ * three character sequence
+ * 0800-FFFF => T3 Tx Tx
+ */
+ if (c <= Rune3) {
+ str[0] = T3 | (c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four character sequence (21-bit value)
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | (c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
+}
+
+int
+runelen(Rune rune)
+{
+ char str[10];
+
+ return runetochar(str, &rune);
+}
+
+int
+runenlen(const Rune *r, int nrune)
+{
+ int nb;
+ ulong c; /* Rune is signed, so use unsigned for range check. */
+
+ nb = 0;
+ while(nrune--) {
+ c = *r++;
+ if (c <= Rune1)
+ nb++;
+ else if (c <= Rune2)
+ nb += 2;
+ else if (c <= Rune3)
+ nb += 3;
+ else if (c <= Runemax)
+ nb += 4;
+ else
+ nb += 3; /* Runeerror = 0xFFFD, see runetochar */
+ }
+ return nb;
+}
+
+int
+fullrune(const char *str, int n)
+{
+ if (n > 0) {
+ int c = *(uchar*)str;
+ if (c < Tx)
+ return 1;
+ if (n > 1) {
+ if (c < T3)
+ return 1;
+ if (n > 2) {
+ if (c < T4 || n > 3)
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
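+
+/*
+ * Usage sketch added for illustration; not part of the original library.
+ * This is the byte-at-a-time decoding pattern that fullrune is intended
+ * for (see utf.h): buffer bytes until a full sequence has arrived, then
+ * decode it with chartorune. On malformed input a partial byte may stay
+ * buffered and is reported with the next call.
+ */
+static int
+feedbyte(char c, Rune *r)
+{
+	static char buf[UTFmax];
+	static int n;
+	int len;
+
+	buf[n++] = c;
+	if(!fullrune(buf, n))
+		return 0;	/* need more bytes */
+	len = chartorune(r, buf);	/* len is 1 for malformed input */
+	n -= len;
+	memmove(buf, buf+len, n);
+	return 1;	/* *r holds the decoded rune, or Runeerror */
+}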
diff --git a/tesseract/unittest/third_party/utf/utf.h b/tesseract/unittest/third_party/utf/utf.h
new file mode 100644
index 00000000..06982e58
--- /dev/null
+++ b/tesseract/unittest/third_party/utf/utf.h
@@ -0,0 +1,246 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ * Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+#ifndef _UTFH_
+#define _UTFH_ 1
+
+#include <stdint.h>
+
+typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
+
+enum
+{
+ UTFmax = 4, /* maximum bytes per rune */
+ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
+ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
+ Runeerror = 0xFFFD, /* decoding error in UTF */
+ Runemax = 0x10FFFF, /* maximum rune value */
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * rune routines
+ */
+
+/*
+ * These routines were written by Rob Pike and Ken Thompson
+ * and first appeared in Plan 9.
+ * SEE ALSO
+ * utf (7)
+ * tcs (1)
+*/
+
+// runetochar copies (encodes) one rune, pointed to by r, to at most
+// UTFmax bytes starting at s and returns the number of bytes generated.
+
+int runetochar(char* s, const Rune* r);
+
+
+// chartorune copies (decodes) at most UTFmax bytes starting at s to
+// one rune, pointed to by r, and returns the number of bytes consumed.
+// If the input is not exactly in UTF format, chartorune will set *r
+// to Runeerror and return 1.
+//
+// Note: There is no special case for a "null-terminated" string. A
+// string whose first byte has the value 0 is the UTF8 encoding of the
+// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal
+// anywhere else in a UTF sequence.
+
+int chartorune(Rune* r, const char* s);
+
+
+// charntorune is like chartorune, except that it will access at most
+// n bytes of s. If the UTF sequence is incomplete within n bytes,
+// charntorune will set *r to Runeerror and return 0. If it is complete
+// but not in UTF format, it will set *r to Runeerror and return 1.
+//
+// Added 2004-09-24 by Wei-Hwa Huang
+
+int charntorune(Rune* r, const char* s, int n);
+
+// isvalidcharntorune(str, n, r, consumed)
+// is a convenience function that calls "*consumed = charntorune(r, str, n)"
+// and returns an int (logically boolean) indicating whether the first
+// n bytes of str was a valid and complete UTF sequence.
+
+int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
+
+// runelen returns the number of bytes required to convert r into UTF.
+
+int runelen(Rune r);
+
+
+// runenlen returns the number of bytes required to convert the n
+// runes pointed to by r into UTF.
+
+int runenlen(const Rune* r, int n);
+
+
+// fullrune returns 1 if the string s of length n is long enough to be
+// decoded by chartorune, and 0 otherwise. This does not guarantee
+// that the string contains a legal UTF encoding. This routine is used
+// by programs that obtain input one byte at a time and need to know
+// when a full rune has arrived.
+
+int fullrune(const char* s, int n);
+
+// The following routines are analogous to the corresponding string
+// routines with "utf" substituted for "str", and "rune" substituted
+// for "chr".
+
+// utflen returns the number of runes that are represented by the UTF
+// string s. (cf. strlen)
+
+int utflen(const char* s);
+
+
+// utfnlen returns the number of complete runes that are represented
+// by the first n bytes of the UTF string s. If the last few bytes of
+// the string contain an incompletely coded rune, utfnlen will not
+// count them; in this way, it differs from utflen, which includes
+// every byte of the string. (cf. strnlen)
+
+int utfnlen(const char* s, long n);
+
+
+// utfrune returns a pointer to the first occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string. The NULL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strchr)
+
+const char* utfrune(const char* s, Rune r);
+
+
+// utfrrune returns a pointer to the last occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string. The NULL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strrchr)
+
+const char* utfrrune(const char* s, Rune r);
+
+
+// utfutf returns a pointer to the first occurrence of the UTF string
+// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
+// null string, utfutf returns s1. (cf. strstr)
+
+const char* utfutf(const char* s1, const char* s2);
+
+
+// utfecpy copies UTF sequences until a null sequence has been copied,
+// but writes no sequences beyond es1. If any sequences are copied,
+// s1 is terminated by a null sequence, and a pointer to that sequence
+// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
+
+char* utfecpy(char *s1, char *es1, const char *s2);
+
+
+
+// These functions are rune-string analogues of the corresponding
+// functions in strcat (3).
+//
+// These routines first appeared in Plan 9.
+// SEE ALSO
+// memmove (3)
+// rune (3)
+// strcat (2)
+//
+// BUGS: The outcome of overlapping moves varies among implementations.
+
+Rune* runestrcat(Rune* s1, const Rune* s2);
+Rune* runestrncat(Rune* s1, const Rune* s2, long n);
+
+const Rune* runestrchr(const Rune* s, Rune c);
+
+int runestrcmp(const Rune* s1, const Rune* s2);
+int runestrncmp(const Rune* s1, const Rune* s2, long n);
+
+Rune* runestrcpy(Rune* s1, const Rune* s2);
+Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
+Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
+
+Rune* runestrdup(const Rune* s);
+
+const Rune* runestrrchr(const Rune* s, Rune c);
+long runestrlen(const Rune* s);
+const Rune* runestrstr(const Rune* s1, const Rune* s2);
+
+
+
+// The following routines test types and modify cases for Unicode
+// characters. Unicode defines some characters as letters and
+// specifies three cases: upper, lower, and title. Mappings among the
+// cases are also defined, although they are not exhaustive: some
+// upper case letters have no lower case mapping, and so on. Unicode
+// also defines several character properties, a subset of which are
+// checked by these routines. These routines are based on Unicode
+// version 3.0.0.
+//
+// NOTE: The routines are implemented in C, so the boolean functions
+// (e.g., isupperrune) return 0 for false and 1 for true.
+//
+//
+// toupperrune, tolowerrune, and totitlerune are the Unicode case
+// mappings. These routines return the character unchanged if it has
+// no defined mapping.
+
+Rune toupperrune(Rune r);
+Rune tolowerrune(Rune r);
+Rune totitlerune(Rune r);
+
+
+// isupperrune tests for upper case characters, including Unicode
+// upper case letters and targets of the toupper mapping. islowerrune
+// and istitlerune are defined analogously.
+
+int isupperrune(Rune r);
+int islowerrune(Rune r);
+int istitlerune(Rune r);
+
+
+// isalpharune tests for Unicode letters; this includes ideographs in
+// addition to alphabetic characters.
+
+int isalpharune(Rune r);
+
+
+// isdigitrune tests for digits. Non-digit numbers, such as Roman
+// numerals, are not included.
+
+int isdigitrune(Rune r);
+
+
+// isideographicrune tests for ideographic characters and numbers, as
+// defined by the Unicode standard.
+
+int isideographicrune(Rune r);
+
+
+// isspacerune tests for whitespace characters, including "C" locale
+// whitespace, Unicode defined whitespace, and the "zero-width
+// non-break space" character.
+
+int isspacerune(Rune r);
+
+
+// (The comments in this file were copied from the manpage files rune.3,
+// isalpharune.3, and runestrcat.3. Some formatting changes were also made
+// to conform to Google style. /JRM 11/11/05)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
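
A minimal round-trip sketch of the core entry points declared above, assuming only this header: runetochar writes at most UTFmax bytes, runelen predicts that length, chartorune reverses the conversion, and utflen counts runes in a NUL-terminated buffer. U+2A714 is the same 4-byte codepoint used by unichar_test.cc below.

#include <assert.h>
#include "third_party/utf/utf.h"

int main() {
  Rune in = 0x2A714;                     // A 4-byte codepoint.
  char buf[UTFmax + 1] = {0};            // Keep a trailing NUL for utflen.
  int nbytes = runetochar(buf, &in);     // Encode one rune to UTF-8.
  assert(nbytes == runelen(in));         // runelen predicts the same length.
  Rune out;
  int consumed = chartorune(&out, buf);  // Decode it back.
  assert(consumed == nbytes && out == in);
  assert(utflen(buf) == 1);              // Exactly one rune in the buffer.
  return 0;
}
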
diff --git a/tesseract/unittest/third_party/utf/utfdef.h b/tesseract/unittest/third_party/utf/utfdef.h
new file mode 100644
index 00000000..4b58ae87
--- /dev/null
+++ b/tesseract/unittest/third_party/utf/utfdef.h
@@ -0,0 +1,14 @@
+#define uchar _utfuchar
+#define ushort _utfushort
+#define uint _utfuint
+#define ulong _utfulong
+#define vlong _utfvlong
+#define uvlong _utfuvlong
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+#define nil ((void*)0)
diff --git a/tesseract/unittest/unichar_test.cc b/tesseract/unittest/unichar_test.cc
new file mode 100644
index 00000000..54394436
--- /dev/null
+++ b/tesseract/unittest/unichar_test.cc
@@ -0,0 +1,43 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+#include "gmock/gmock.h" // for testing::ElementsAreArray
+#include <tesseract/unichar.h>
+
+namespace tesseract {
+
+TEST(UnicharTest, Conversion) {
+  // This test verifies that UNICHAR::UTF8ToUTF32 and UNICHAR::UTF32ToUTF8
+  // show the required conversion properties.
+ // Test for round-trip utf8-32-8 for 1, 2, 3 and 4 byte codes.
+ const char* kUTF8Src = "a\u05d0\u0ca4\U0002a714";
+ const std::vector<char32> kUTF32Src = {'a', 0x5d0, 0xca4, 0x2a714};
+ // Check for round-trip conversion.
+ std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src);
+ EXPECT_THAT(utf32, testing::ElementsAreArray(kUTF32Src));
+ std::string utf8 = UNICHAR::UTF32ToUTF8(utf32);
+ EXPECT_STREQ(kUTF8Src, utf8.c_str());
+}
+
+TEST(UnicharTest, InvalidText) {
+  // This test verifies that UNICHAR correctly deals with invalid text.
+ const char* kInvalidUTF8 = "a b\200d string";
+ const std::vector<char32> kInvalidUTF32 = {'a', ' ', 0x200000, 'x'};
+ // Invalid utf8 produces an empty vector.
+ std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8);
+ EXPECT_TRUE(utf32.empty());
+ // Invalid utf32 produces an empty string.
+ std::string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32);
+ EXPECT_TRUE(utf8.empty());
+}
+
+} // namespace
diff --git a/tesseract/unittest/unicharcompress_test.cc b/tesseract/unittest/unicharcompress_test.cc
new file mode 100644
index 00000000..1777930e
--- /dev/null
+++ b/tesseract/unittest/unicharcompress_test.cc
@@ -0,0 +1,257 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "allheaders.h"
+
+#include "include_gunit.h"
+#include "log.h" // for LOG
+#include "serialis.h"
+#include "tprintf.h"
+#include "unicharcompress.h"
+
+namespace tesseract {
+
+class UnicharcompressTest : public ::testing::Test {
+ protected:
+ void SetUp() {
+ std::locale::global(std::locale(""));
+ file::MakeTmpdir();
+ }
+
+ // Loads and compresses the given unicharset.
+ void LoadUnicharset(const std::string& unicharset_name) {
+ std::string radical_stroke_file =
+ file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
+ std::string unicharset_file =
+ file::JoinPath(TESTDATA_DIR, unicharset_name);
+ std::string radical_data;
+ CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
+ file::Defaults()));
+ CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
+ STRING radical_str(radical_data.c_str());
+ null_char_ =
+ unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size();
+ compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);
+ // Get the encoding of the null char.
+ RecodedCharID code;
+ compressed_.EncodeUnichar(null_char_, &code);
+ encoded_null_char_ = code(0);
+ std::string output_name = file::JoinPath(
+ FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
+ STRING encoding = compressed_.GetEncodingAsString(unicharset_);
+ std::string encoding_str(&encoding[0], encoding.size());
+ CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
+ LOG(INFO) << "Wrote encoding to:" << output_name;
+ }
+ // Serializes and de-serializes compressed_ over itself.
+ void SerializeAndUndo() {
+ std::vector<char> data;
+ TFile wfp;
+ wfp.OpenWrite(&data);
+ EXPECT_TRUE(compressed_.Serialize(&wfp));
+ TFile rfp;
+ rfp.Open(&data[0], data.size());
+ EXPECT_TRUE(compressed_.DeSerialize(&rfp));
+ }
+ // Returns true if the lang is in CJK.
+ bool IsCJKLang(const std::string& lang) {
+ return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" ||
+ lang == "jpn";
+ }
+ // Returns true if the lang is Indic.
+ bool IsIndicLang(const std::string& lang) {
+ return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
+ lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
+ lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
+ lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" ||
+ lang == "tel";
+ }
+
+ // Expects the appropriate results from the compressed_ unicharset_.
+ void ExpectCorrect(const std::string& lang) {
+ // Count the number of times each code is used in each element of
+ // RecodedCharID.
+ RecodedCharID zeros;
+ for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
+ int code_range = compressed_.code_range();
+ std::vector<RecodedCharID> times_seen(code_range, zeros);
+ for (int u = 0; u <= unicharset_.size(); ++u) {
+ if (u != UNICHAR_SPACE && u != null_char_ &&
+ (u == unicharset_.size() || (unicharset_.has_special_codes() &&
+ u < SPECIAL_UNICHAR_CODES_COUNT))) {
+ continue; // Not used so not encoded.
+ }
+ RecodedCharID code;
+ int len = compressed_.EncodeUnichar(u, &code);
+ // Check round-trip encoding.
+ int unichar_id;
+ GenericVector<UNICHAR_ID> normed_ids;
+ if (u == null_char_ || u == unicharset_.size()) {
+ unichar_id = null_char_;
+ } else {
+ unichar_id = u;
+ }
+ EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
+ // Check that the codes are valid.
+ for (int i = 0; i < len; ++i) {
+ int code_val = code(i);
+ EXPECT_GE(code_val, 0);
+ EXPECT_LT(code_val, code_range);
+ times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
+ }
+ }
+ // Check that each code is used in at least one position.
+ for (int c = 0; c < code_range; ++c) {
+ int num_used = 0;
+ for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
+ if (times_seen[c](i) != 0) ++num_used;
+ }
+ EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
+ }
+ // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
+ // and create valid codes.
+ RecodedCharID code;
+ CheckCodeExtensions(code, times_seen);
+    // Finally, check that all of the above was achieved with a codebook
+    // smaller than 10% of the original unicharset for Chinese, Korean, and
+    // Indic scripts, 20% for Japanese, and no bigger than the original
+    // unicharset for all other languages.
+ if (IsCJKLang(lang) || IsIndicLang(lang)) {
+ EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
+ } else {
+ EXPECT_LE(code_range, unicharset_.size() + 1);
+ }
+ LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to "
+ << code_range;
+ }
+ // Checks for extensions of the current code that either finish a code, or
+ // extend it and checks those extensions recursively.
+ void CheckCodeExtensions(const RecodedCharID& code,
+ const std::vector<RecodedCharID>& times_seen) {
+ RecodedCharID extended = code;
+ int length = code.length();
+ const GenericVector<int>* final_codes = compressed_.GetFinalCodes(code);
+ if (final_codes != nullptr) {
+ for (int i = 0; i < final_codes->size(); ++i) {
+ int ending = (*final_codes)[i];
+ EXPECT_GT(times_seen[ending](length), 0);
+ extended.Set(length, ending);
+ int unichar_id = compressed_.DecodeUnichar(extended);
+ EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
+ }
+ }
+ const GenericVector<int>* next_codes = compressed_.GetNextCodes(code);
+ if (next_codes != nullptr) {
+ for (int i = 0; i < next_codes->size(); ++i) {
+ int extension = (*next_codes)[i];
+ EXPECT_GT(times_seen[extension](length), 0);
+ extended.Set(length, extension);
+ CheckCodeExtensions(extended, times_seen);
+ }
+ }
+ }
+
+ UnicharCompress compressed_;
+ UNICHARSET unicharset_;
+ int null_char_;
+ // The encoding of the null_char_.
+ int encoded_null_char_;
+};
+
+TEST_F(UnicharcompressTest, DoesChinese) {
+ LOG(INFO) << "Testing chi_tra";
+ LoadUnicharset("chi_tra.unicharset");
+ ExpectCorrect("chi_tra");
+ LOG(INFO) << "Testing chi_sim";
+ LoadUnicharset("chi_sim.unicharset");
+ ExpectCorrect("chi_sim");
+}
+
+TEST_F(UnicharcompressTest, DoesJapanese) {
+ LOG(INFO) << "Testing jpn";
+ LoadUnicharset("jpn.unicharset");
+ ExpectCorrect("jpn");
+}
+
+TEST_F(UnicharcompressTest, DoesKorean) {
+ LOG(INFO) << "Testing kor";
+ LoadUnicharset("kor.unicharset");
+ ExpectCorrect("kor");
+}
+
+TEST_F(UnicharcompressTest, DoesKannada) {
+ LOG(INFO) << "Testing kan";
+ LoadUnicharset("kan.unicharset");
+ ExpectCorrect("kan");
+ SerializeAndUndo();
+ ExpectCorrect("kan");
+}
+
+TEST_F(UnicharcompressTest, DoesMarathi) {
+ LOG(INFO) << "Testing mar";
+ LoadUnicharset("mar.unicharset");
+ ExpectCorrect("mar");
+}
+
+TEST_F(UnicharcompressTest, DoesEnglish) {
+ LOG(INFO) << "Testing eng";
+ LoadUnicharset("eng.unicharset");
+ ExpectCorrect("eng");
+}
+
+// Tests that a unicharset that contains double-letter ligatures (e.g. ff) has
+// no null char in the encoding at all.
+TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
+ LOG(INFO) << "Testing por with ligatures";
+ LoadUnicharset("por.unicharset");
+ ExpectCorrect("por");
+  // Check that no unichar-id, even one encoded with multiple codes, has the
+  // encoded_null_char_ anywhere in its code sequence.
+ for (int u = 0; u <= unicharset_.size(); ++u) {
+ RecodedCharID code;
+ int len = compressed_.EncodeUnichar(u, &code);
+ if (len > 1) {
+      // There should not be any null char in the code.
+ for (int i = 0; i < len; ++i) {
+ EXPECT_NE(encoded_null_char_, code(i));
+ }
+ }
+ }
+}
+
+// Tests that GetEncodingAsString returns the right result for a trivial
+// unicharset.
+TEST_F(UnicharcompressTest, GetEncodingAsString) {
+ LoadUnicharset("trivial.unicharset");
+ ExpectCorrect("trivial");
+ STRING encoding = compressed_.GetEncodingAsString(unicharset_);
+ std::string encoding_str(&encoding[0], encoding.length());
+ std::vector<std::string> lines =
+ absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
+ EXPECT_EQ(5, lines.size());
+ // The first line is always space.
+ EXPECT_EQ("0\t ", lines[0]);
+ // Next we have i.
+ EXPECT_EQ("1\ti", lines[1]);
+ // Next we have f.
+ EXPECT_EQ("2\tf", lines[2]);
+  // Next we have the fi ligature, encoded as f then i (codes 2,1). There are
+  // no nulls in it, as there are no repeated-letter ligatures in this
+  // unicharset, unlike por.unicharset above.
+ EXPECT_EQ("2,1\tfi", lines[3]);
+ // Finally the null character.
+ EXPECT_EQ("3\t<nul>", lines[4]);
+}
+
+} // namespace tesseract
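
The dump format checked by the last test is one line per encoded unichar: the comma-separated code sequence, a tab, and the unichar's text (with <nul> for the null character), e.g. "2,1\tfi". A small parsing sketch using the same absl helpers the test already includes; ParsedCode and ParseEncodingLine are illustrative names, not tesseract API:

#include <string>
#include <vector>
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"

struct ParsedCode {        // Hypothetical helper type, for illustration only.
  std::vector<int> codes;  // e.g. {2, 1}
  std::string text;        // e.g. "fi"
};

static ParsedCode ParseEncodingLine(const std::string& line) {
  ParsedCode result;
  // Split "codes<TAB>text", then split the code sequence on commas.
  std::vector<std::string> fields = absl::StrSplit(line, '\t');
  std::vector<std::string> codes = absl::StrSplit(fields[0], ',');
  for (const std::string& code : codes) {
    int value = 0;
    absl::SimpleAtoi(code, &value);
    result.codes.push_back(value);
  }
  if (fields.size() > 1) result.text = fields[1];
  return result;
}
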
diff --git a/tesseract/unittest/unicharset_test.cc b/tesseract/unittest/unicharset_test.cc
new file mode 100644
index 00000000..401a34c1
--- /dev/null
+++ b/tesseract/unittest/unicharset_test.cc
@@ -0,0 +1,161 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include "log.h" // for LOG
+#include "unicharset.h"
+#include "gmock/gmock.h" // for testing::ElementsAreArray
+#include "include_gunit.h"
+
+using testing::ElementsAreArray;
+
+namespace tesseract {
+
+class UnicharsetTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ std::locale::global(std::locale(""));
+ }
+};
+
+TEST(UnicharsetTest, Basics) {
+ // This test verifies basic insertion, unichar_to_id, and encode.
+ UNICHARSET u;
+ u.unichar_insert("a");
+ EXPECT_EQ(u.size(), 4);
+ u.unichar_insert("f");
+ EXPECT_EQ(u.size(), 5);
+ u.unichar_insert("i");
+ EXPECT_EQ(u.size(), 6);
+  // The fi ligature is NOT added, because after cleanup it can be encoded as
+  // f then i.
+ u.unichar_insert("\ufb01");
+ EXPECT_EQ(u.size(), 6);
+ u.unichar_insert("e");
+ EXPECT_EQ(u.size(), 7);
+ u.unichar_insert("n");
+ EXPECT_EQ(u.size(), 8);
+ EXPECT_EQ(u.unichar_to_id("f"), 4);
+ EXPECT_EQ(u.unichar_to_id("i"), 5);
+ // The fi ligature has no valid id.
+ EXPECT_EQ(u.unichar_to_id("\ufb01"), INVALID_UNICHAR_ID);
+ // The fi pair has no valid id.
+ EXPECT_EQ(u.unichar_to_id("fi"), INVALID_UNICHAR_ID);
+ std::vector<int> labels;
+ EXPECT_TRUE(u.encode_string("affine", true, &labels, nullptr, nullptr));
+ std::vector<int> v(&labels[0], &labels[0] + labels.size());
+ EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
+ // With the fi ligature encoding fails without a pre-cleanup.
+ std::string lig_str = "af\ufb01ne";
+ EXPECT_FALSE(
+ u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
+ lig_str = u.CleanupString(lig_str.c_str());
+ EXPECT_TRUE(
+ u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
+ v = std::vector<int>(&labels[0], &labels[0] + labels.size());
+ EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
+}
+
+TEST(UnicharsetTest, Multibyte) {
+ // This test verifies basic insertion, unichar_to_id, and encode.
+  // The difference from Basics above is that now we are testing multi-byte
+  // unicodes instead of single-byte ones.
+ UNICHARSET u;
+ // Insert some Arabic letters.
+ u.unichar_insert("\u0627");
+ EXPECT_EQ(u.size(), 4);
+ u.unichar_insert("\u062c");
+ EXPECT_EQ(u.size(), 5);
+ u.unichar_insert("\u062f");
+ EXPECT_EQ(u.size(), 6);
+ u.unichar_insert("\ufb01"); // fi ligature is added as fi pair.
+ EXPECT_EQ(u.size(), 7);
+ u.unichar_insert("\u062b");
+ EXPECT_EQ(u.size(), 8);
+ u.unichar_insert("\u0635");
+ EXPECT_EQ(u.size(), 9);
+ EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
+ EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
+  // The first two bytes of this string are \u0627, which matches id 3.
+ EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
+ EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
+ // Individual f and i are not present, but they are there as a pair.
+ EXPECT_EQ(u.unichar_to_id("f"), INVALID_UNICHAR_ID);
+ EXPECT_EQ(u.unichar_to_id("i"), INVALID_UNICHAR_ID);
+ EXPECT_EQ(u.unichar_to_id("fi"), 6);
+ // The fi ligature is findable.
+ EXPECT_EQ(u.unichar_to_id("\ufb01"), 6);
+ std::vector<int> labels;
+ EXPECT_TRUE(u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true,
+ &labels, nullptr, nullptr));
+ std::vector<int> v(&labels[0], &labels[0] + labels.size());
+ EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7}));
+ // With the fi ligature the fi is picked out.
+ std::vector<char> lengths;
+ int encoded_length;
+ std::string src_str = "\u0627\u062c\ufb01\u0635\u062b";
+ // src_str has to be pre-cleaned for lengths to be correct.
+ std::string cleaned = u.CleanupString(src_str.c_str());
+ EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths,
+ &encoded_length));
+ EXPECT_EQ(encoded_length, cleaned.size());
+ std::string len_str(&lengths[0], lengths.size());
+ EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002");
+ v = std::vector<int>(&labels[0], &labels[0] + labels.size());
+ EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7}));
+}
+
+TEST(UnicharsetTest, MultibyteBigrams) {
+  // This test verifies insertion, unichar_to_id, and serialization for
+  // multi-codepoint bigram unichars, in contrast to the single-codepoint
+  // multi-byte unichars tested in Multibyte above.
+ UNICHARSET u;
+  // Insert some Kannada letters.
+ u.unichar_insert("\u0c9c");
+ EXPECT_EQ(u.size(), 4);
+ u.unichar_insert("\u0cad");
+ EXPECT_EQ(u.size(), 5);
+ u.unichar_insert("\u0ccd\u0c9c");
+ EXPECT_EQ(u.size(), 6);
+ u.unichar_insert("\u0ccd");
+ EXPECT_EQ(u.size(), 7);
+ // By default the encodable bigram is NOT added.
+ u.unichar_insert("\u0ccd\u0cad");
+ EXPECT_EQ(u.size(), 7);
+ // It is added if we force it to be.
+ u.unichar_insert("\u0ccd\u0cad", OldUncleanUnichars::kTrue);
+ EXPECT_EQ(u.size(), 8);
+ std::vector<char> data;
+ tesseract::TFile fp;
+ fp.OpenWrite(&data);
+ u.save_to_file(&fp);
+ fp.Open(&data[0], data.size());
+ UNICHARSET v;
+ v.load_from_file(&fp, false);
+ EXPECT_EQ(v.unichar_to_id("\u0c9c"), 3);
+ EXPECT_EQ(v.unichar_to_id("\u0cad"), 4);
+ EXPECT_EQ(v.unichar_to_id("\u0ccd\u0c9c"), 5);
+ EXPECT_EQ(v.unichar_to_id("\u0ccd"), 6);
+ EXPECT_EQ(v.unichar_to_id("\u0ccd\u0cad"), 7);
+}
+
+TEST(UnicharsetTest, OldStyle) {
+ // This test verifies an old unicharset that contains fi/fl ligatures loads
+ // and keeps all the entries.
+ std::string filename =
+ file::JoinPath(TESTDATA_DIR, "eng.unicharset");
+ UNICHARSET u;
+ LOG(INFO) << "Filename=" << filename;
+ EXPECT_TRUE(u.load_from_file(filename.c_str()));
+ EXPECT_EQ(u.size(), 111);
+}
+
+} // namespace
diff --git a/tesseract/unittest/util/utf8/unicodetext.cc b/tesseract/unittest/util/utf8/unicodetext.cc
new file mode 100644
index 00000000..1a884dd1
--- /dev/null
+++ b/tesseract/unittest/util/utf8/unicodetext.cc
@@ -0,0 +1,507 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/utf8/unicodetext.h"
+
+#include <string.h> // for memcpy, NULL, memcmp, etc
+#include <algorithm> // for max
+
+//#include "base/logging.h" // for operator<<, CHECK, etc
+//#include "base/stringprintf.h" // for StringPrintf, StringAppendF
+//#include "strings/stringpiece.h" // for StringPiece, etc
+
+#include "third_party/utf/utf.h" // for isvalidcharntorune, etc
+#include "util/utf8/unilib.h" // for IsInterchangeValid, etc
+#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
+
+static int CodepointDistance(const char* start, const char* end) {
+ int n = 0;
+ // Increment n on every non-trail-byte.
+ for (const char* p = start; p < end; ++p) {
+ n += (*reinterpret_cast<const signed char*>(p) >= -0x40);
+ }
+ return n;
+}
+
+static int CodepointCount(const char* utf8, int len) {
+ return CodepointDistance(utf8, utf8 + len);
+}
+
+UnicodeText::const_iterator::difference_type
+distance(const UnicodeText::const_iterator& first,
+ const UnicodeText::const_iterator& last) {
+ return CodepointDistance(first.it_, last.it_);
+}
+
+// ---------- Utility ----------
+
+static int ConvertToInterchangeValid(char* start, int len) {
+ // This routine is called only when we've discovered that a UTF-8 buffer
+ // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
+ // was not interchange valid. This indicates a bug in the caller, and
+ // a LOG(WARNING) is done in that case.
+ // This is similar to CoerceToInterchangeValid, but it replaces each
+ // structurally valid byte with a space, and each non-interchange
+ // character with a space, even when that character requires more
+ // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
+ // structurally valid UTF8, but U+FDD0 is not an interchange-valid
+ // code point. The result should contain one space, not three.
+ //
+ // Since the conversion never needs to write more data than it
+ // reads, it is safe to change the buffer in place. It returns the
+ // number of bytes written.
+ char* const in = start;
+ char* out = start;
+ char* const end = start + len;
+ while (start < end) {
+ int good = UniLib::SpanInterchangeValid(start, end - start);
+ if (good > 0) {
+ if (out != start) {
+ memmove(out, start, good);
+ }
+ out += good;
+ start += good;
+ if (start == end) {
+ break;
+ }
+ }
+ // Is the current string invalid UTF8 or just non-interchange UTF8?
+ char32 rune;
+ int n;
+ if (isvalidcharntorune(start, end - start, &rune, &n)) {
+ // structurally valid UTF8, but not interchange valid
+ start += n; // Skip over the whole character.
+ } else { // bad UTF8
+ start += 1; // Skip over just one byte
+ }
+ *out++ = ' ';
+ }
+ return out - in;
+}
+
+
+// *************** Data representation **********
+
+// Note: the copy constructor is undefined.
+
+// After reserve(), resize(), or clear(), we're an owner, not an alias.
+
+void UnicodeText::Repr::reserve(int new_capacity) {
+ // If there's already enough capacity, and we're an owner, do nothing.
+ if (capacity_ >= new_capacity && ours_) return;
+
+ // Otherwise, allocate a new buffer.
+ capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
+ char* new_data = new char[capacity_];
+
+ // If there is an old buffer, copy it into the new buffer.
+ if (data_) {
+ memcpy(new_data, data_, size_);
+ if (ours_) delete[] data_; // If we owned the old buffer, free it.
+ }
+ data_ = new_data;
+ ours_ = true; // We own the new buffer.
+ // size_ is unchanged.
+}
+
+void UnicodeText::Repr::resize(int new_size) {
+ if (new_size == 0) {
+ clear();
+ } else {
+ if (!ours_ || new_size > capacity_) reserve(new_size);
+ // Clear the memory in the expanded part.
+ if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
+ size_ = new_size;
+ ours_ = true;
+ }
+}
+
+// This implementation of clear() deallocates the buffer if we're an owner.
+// That's not strictly necessary; we could just set size_ to 0.
+void UnicodeText::Repr::clear() {
+ if (ours_) delete[] data_;
+ data_ = nullptr;
+ size_ = capacity_ = 0;
+ ours_ = true;
+}
+
+void UnicodeText::Repr::Copy(const char* data, int size) {
+ resize(size);
+ memcpy(data_, data, size);
+}
+
+void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
+ if (data == data_) return; // We already own this memory. (Weird case.)
+ if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
+ data_ = data;
+ size_ = size;
+ capacity_ = capacity;
+ ours_ = true;
+}
+
+void UnicodeText::Repr::PointTo(const char* data, int size) {
+ if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
+ data_ = const_cast<char*>(data);
+ size_ = size;
+ capacity_ = size;
+ ours_ = false;
+}
+
+void UnicodeText::Repr::append(const char* bytes, int byte_length) {
+ reserve(size_ + byte_length);
+ memcpy(data_ + size_, bytes, byte_length);
+ size_ += byte_length;
+}
+
+string UnicodeText::Repr::DebugString() const {
+ return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}",
+ this,
+ data_, size_, capacity_,
+ ours_ ? "Owned" : "Alias");
+}
+
+
+
+// *************** UnicodeText ******************
+
+// ----- Constructors -----
+
+// Default constructor
+UnicodeText::UnicodeText() {
+}
+
+// Copy constructor
+UnicodeText::UnicodeText(const UnicodeText& src) {
+ Copy(src);
+}
+
+// Substring constructor
+UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
+ const UnicodeText::const_iterator& last) {
+ CHECK(first <= last) << " Incompatible iterators";
+ repr_.append(first.it_, last.it_ - first.it_);
+}
+
+string UnicodeText::UTF8Substring(const const_iterator& first,
+ const const_iterator& last) {
+ CHECK(first <= last) << " Incompatible iterators";
+ return string(first.it_, last.it_ - first.it_);
+}
+
+
+// ----- Copy -----
+
+UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
+ if (this != &src) {
+ Copy(src);
+ }
+ return *this;
+}
+
+UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
+ repr_.Copy(src.repr_.data_, src.repr_.size_);
+ return *this;
+}
+
+UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
+ repr_.Copy(buffer, byte_length);
+  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
+ LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+ repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+ }
+ return *this;
+}
+
+UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
+ int byte_length) {
+ repr_.Copy(buffer, byte_length);
+ return *this;
+}
+
+// ----- TakeOwnershipOf -----
+
+UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
+ int byte_length,
+ int byte_capacity) {
+ repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
+  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
+ LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+ repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+ }
+ return *this;
+}
+
+UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
+ int byte_length,
+ int byte_capacity) {
+ repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
+ return *this;
+}
+
+// ----- PointTo -----
+
+UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
+  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
+ repr_.PointTo(buffer, byte_length);
+ } else {
+ LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+ repr_.Copy(buffer, byte_length);
+ repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+ }
+ return *this;
+}
+
+UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
+ int byte_length) {
+ repr_.PointTo(buffer, byte_length);
+ return *this;
+}
+
+UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
+ repr_.PointTo(src.repr_.data_, src.repr_.size_);
+ return *this;
+}
+
+UnicodeText& UnicodeText::PointTo(const const_iterator &first,
+ const const_iterator &last) {
+ CHECK(first <= last) << " Incompatible iterators";
+ repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
+ return *this;
+}
+
+// ----- Append -----
+
+UnicodeText& UnicodeText::append(const UnicodeText& u) {
+ repr_.append(u.repr_.data_, u.repr_.size_);
+ return *this;
+}
+
+UnicodeText& UnicodeText::append(const const_iterator& first,
+ const const_iterator& last) {
+ CHECK(first <= last) << " Incompatible iterators";
+ repr_.append(first.it_, last.it_ - first.it_);
+ return *this;
+}
+
+UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
+ repr_.append(utf8, len);
+ return *this;
+}
+
+// ----- substring searching -----
+
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
+ const_iterator start_pos) const {
+ CHECK_GE(start_pos.utf8_data(), utf8_data());
+ CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
+ return UnsafeFind(look, start_pos);
+}
+
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
+ return UnsafeFind(look, begin());
+}
+
+UnicodeText::const_iterator UnicodeText::UnsafeFind(
+ const UnicodeText& look, const_iterator start_pos) const {
+ // Due to the magic of the UTF8 encoding, searching for a sequence of
+ // letters is equivalent to substring search.
+ StringPiece searching(utf8_data(), utf8_length());
+ StringPiece look_piece(look.utf8_data(), look.utf8_length());
+ LOG(FATAL) << "Not implemented";
+ //StringPiece::size_type found =
+ // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
+ StringPiece::size_type found = StringPiece::npos;
+ if (found == StringPiece::npos) return end();
+ return const_iterator(utf8_data() + found);
+}
+
+bool UnicodeText::HasReplacementChar() const {
+ // Equivalent to:
+ // UnicodeText replacement_char;
+ // replacement_char.push_back(0xFFFD);
+ // return find(replacement_char) != end();
+ StringPiece searching(utf8_data(), utf8_length());
+ StringPiece looking_for("\xEF\xBF\xBD", 3);
+ LOG(FATAL) << "Not implemented";
+ //return searching.find(looking_for) != StringPiece::npos;
+ return false;
+}
+
+// ----- other methods -----
+
+// Clear operator
+void UnicodeText::clear() {
+ repr_.clear();
+}
+
+// Destructor
+UnicodeText::~UnicodeText() {}
+
+
+void UnicodeText::push_back(char32 c) {
+ if (UniLib::IsValidCodepoint(c)) {
+ char buf[UTFmax];
+ int len = runetochar(buf, &c);
+ if (UniLib::IsInterchangeValid(buf, len)) {
+ repr_.append(buf, len);
+ } else {
+ LOG(WARNING) << "Unicode value 0x" << std::hex << c
+ << " is not valid for interchange";
+ repr_.append(" ", 1);
+ }
+ } else {
+ LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
+ repr_.append(" ", 1);
+ }
+}
+
+int UnicodeText::size() const {
+ return CodepointCount(repr_.data_, repr_.size_);
+}
+
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
+ if (&lhs == &rhs) return true;
+ if (lhs.repr_.size_ != rhs.repr_.size_) return false;
+ return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
+}
+
+string UnicodeText::DebugString() const {
+ return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
+ this,
+ size(),
+ repr_.DebugString().c_str());
+}
+
+
+// ******************* UnicodeText::const_iterator *********************
+
+// The implementation of const_iterator would be nicer if it
+// inherited from boost::iterator_facade
+// (http://boost.org/libs/iterator/doc/iterator_facade.html).
+
+UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
+
+UnicodeText::const_iterator::const_iterator(const const_iterator& other)
+ : it_(other.it_) {
+}
+
+UnicodeText::const_iterator&
+UnicodeText::const_iterator::operator=(const const_iterator& other) {
+ if (&other != this)
+ it_ = other.it_;
+ return *this;
+}
+
+UnicodeText::const_iterator UnicodeText::begin() const {
+ return const_iterator(repr_.data_);
+}
+
+UnicodeText::const_iterator UnicodeText::end() const {
+ return const_iterator(repr_.data_ + repr_.size_);
+}
+
+bool operator<(const UnicodeText::const_iterator& lhs,
+ const UnicodeText::const_iterator& rhs) {
+ return lhs.it_ < rhs.it_;
+}
+
+char32 UnicodeText::const_iterator::operator*() const {
+ // (We could call chartorune here, but that does some
+ // error-checking, and we're guaranteed that our data is valid
+ // UTF-8. Also, we expect this routine to be called very often. So
+ // for speed, we do the calculation ourselves.)
+
+ // Convert from UTF-8
+ unsigned char byte1 = it_[0];
+ if (byte1 < 0x80)
+ return byte1;
+
+ unsigned char byte2 = it_[1];
+ if (byte1 < 0xE0)
+ return ((byte1 & 0x1F) << 6)
+ | (byte2 & 0x3F);
+
+ unsigned char byte3 = it_[2];
+ if (byte1 < 0xF0)
+ return ((byte1 & 0x0F) << 12)
+ | ((byte2 & 0x3F) << 6)
+ | (byte3 & 0x3F);
+
+ unsigned char byte4 = it_[3];
+ return ((byte1 & 0x07) << 18)
+ | ((byte2 & 0x3F) << 12)
+ | ((byte3 & 0x3F) << 6)
+ | (byte4 & 0x3F);
+}
+
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
+ it_ += UniLib::OneCharLen(it_);
+ return *this;
+}
+
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
+ while (UniLib::IsTrailByte(*--it_));
+ return *this;
+}
+
+int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
+ utf8_output[0] = it_[0]; if ((it_[0] & 0xff) < 0x80) return 1;
+ utf8_output[1] = it_[1]; if ((it_[0] & 0xff) < 0xE0) return 2;
+ utf8_output[2] = it_[2]; if ((it_[0] & 0xff) < 0xF0) return 3;
+ utf8_output[3] = it_[3];
+ return 4;
+}
+
+string UnicodeText::const_iterator::get_utf8_string() const {
+ return string(utf8_data(), utf8_length());
+}
+
+int UnicodeText::const_iterator::utf8_length() const {
+ if ((it_[0] & 0xff) < 0x80) {
+ return 1;
+ } else if ((it_[0] & 0xff) < 0xE0) {
+ return 2;
+ } else if ((it_[0] & 0xff) < 0xF0) {
+ return 3;
+ } else {
+ return 4;
+ }
+}
+
+UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
+ CHECK(p != nullptr);
+ const char* start = utf8_data();
+ int len = utf8_length();
+ const char* end = start + len;
+ CHECK(p >= start);
+ CHECK(p <= end);
+ CHECK(p == end || !UniLib::IsTrailByte(*p));
+ return const_iterator(p);
+}
+
+string UnicodeText::const_iterator::DebugString() const {
+ return tensorflow::strings::Printf("{iter %p}", it_);
+}
+
+
+// *************************** Utilities *************************
+
+string CodepointString(const UnicodeText& t) {
+ string s;
+ UnicodeText::const_iterator it = t.begin(), end = t.end();
+ while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++);
+ return s;
+}
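
CodepointDistance at the top of this file relies on a property of UTF-8: trail bytes, and only trail bytes, fall in 0x80..0xBF, so comparing each byte as a signed char against -0x40 counts exactly one byte per codepoint. The same trick in isolation, as a standalone sketch independent of UnicodeText:

#include <assert.h>
#include <string.h>

// Counts UTF-8 codepoints by counting ASCII/lead bytes and skipping trail
// bytes (0x80..0xBF, whose signed values are below -0x40).
static int CountCodepoints(const char* utf8) {
  int n = 0;
  for (size_t i = 0; i < strlen(utf8); ++i) {
    n += (static_cast<signed char>(utf8[i]) >= -0x40);
  }
  return n;
}

int main() {
  // "a" (1 byte) + U+05D0 (2 bytes) + U+0CA4 (3 bytes) = 3 codepoints.
  assert(CountCodepoints("a\xD7\x90\xE0\xB2\xA4") == 3);
  return 0;
}
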
diff --git a/tesseract/unittest/util/utf8/unicodetext.h b/tesseract/unittest/util/utf8/unicodetext.h
new file mode 100644
index 00000000..4e25d3ee
--- /dev/null
+++ b/tesseract/unittest/util/utf8/unicodetext.h
@@ -0,0 +1,477 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
+#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
+
+#include <stddef.h> // for NULL, ptrdiff_t
+#include <iterator> // for bidirectional_iterator_tag, etc
+#include <string> // for string
+#include <utility> // for pair
+
+#include "syntaxnet/base.h"
+
+// ***************************** UnicodeText **************************
+//
+// A UnicodeText object is a container for a sequence of Unicode
+// codepoint values. It has default, copy, and assignment constructors.
+// Data can be appended to it from another UnicodeText, from
+// iterators, or from a single codepoint.
+//
+// The internal representation of the text is UTF-8. Since UTF-8 is a
+// variable-width format, UnicodeText does not provide random access
+// to the text, and changes to the text are permitted only at the end.
+//
+// The UnicodeText class defines a const_iterator. The dereferencing
+// operator (*) returns a codepoint (char32). The iterator is a
+// bidirectional, read-only iterator. It becomes invalid if the text
+// is changed.
+//
+// There are methods for appending and retrieving UTF-8 data directly.
+// The 'utf8_data' method returns a const char* that contains the
+// UTF-8-encoded version of the text; 'utf8_length' returns the number
+// of bytes in the UTF-8 data. An iterator's 'get' method stores up to
+// 4 bytes of UTF-8 data in a char array and returns the number of
+// bytes that it stored.
+//
+// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
+// 0x10FFFF], but UnicodeText has the additional restriction that it
+// can contain only those characters that are valid for interchange on
+// the Web. This excludes all of the control codes except for carriage
+// return, line feed, and horizontal tab. It also excludes
+// non-characters, but codepoints that are in the Private Use regions
+// are allowed, as are codepoints that are unassigned. (See the
+// Unicode reference for details.) The function UniLib::IsInterchangeValid
+// can be used as a test for this property.
+//
+// UnicodeTexts are safe. Every method that constructs or modifies a
+// UnicodeText tests for interchange-validity, and will substitute a
+// space for the invalid data. Such cases are reported via
+// LOG(WARNING).
+//
+// MEMORY MANAGEMENT: copy, take ownership, or point to
+//
+// A UnicodeText is either an "owner", meaning that it owns the memory
+// for the data buffer and will free it when the UnicodeText is
+// destroyed, or it is an "alias", meaning that it does not.
+//
+// There are three methods for storing UTF-8 data in a UnicodeText:
+//
+// CopyUTF8(buffer, len) copies buffer.
+//
+// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
+//
+// PointToUTF8(buffer, size) creates an alias pointing to buffer.
+//
+// All three methods perform a validity check on the buffer. There are
+// private, "unsafe" versions of these functions that bypass the
+// validity check. They are used internally and by friend-functions
+// that are handling UTF-8 data that has already been validated.
+//
+// The purpose of an alias is to avoid making an unnecessary copy of a
+// UTF-8 buffer while still providing access to the Unicode values
+// within that text through iterators or the fast scanners that are
+// based on UTF-8 state tables. The lifetime of an alias must not
+// exceed the lifetime of the buffer from which it was constructed.
+//
+// The semantics of an alias might be described as "copy on write or
+// repair." The source data is never modified. If push_back() or
+// append() is called on an alias, a copy of the data will be created,
+// and the UnicodeText will become an owner. If clear() is called on
+// an alias, it becomes an (empty) owner.
+//
+// The copy constructor and the assignment operator produce an owner.
+// That is, after direct initialization ("UnicodeText x(y);") or copy
+// initialization ("UnicodeText x = y;") x will be an owner, even if y
+// was an alias. The assignment operator ("x = y;") also produces an
+// owner unless x and y are the same object and y is an alias.
+//
+// Aliases should be used with care. If the source from which an alias
+// was created is freed, or if the contents are changed, while the
+// alias is still in use, fatal errors could result. But it can be
+// quite useful to have a UnicodeText "window" through which to see a
+// UTF-8 buffer without having to pay the price of making a copy.
+//
+// UTILITIES
+//
+// The interfaces in util/utf8/public/textutils.h provide higher-level
+// utilities for dealing with UnicodeTexts, including routines for
+// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
+// strings, creating strings from UnicodeTexts, normalizing text for
+// efficient matching or display, and others.
+
+class UnicodeText {
+ public:
+ class const_iterator;
+
+ typedef char32 value_type;
+
+ // Constructors. These always produce owners.
+ UnicodeText(); // Create an empty text.
+ UnicodeText(const UnicodeText& src); // copy constructor
+ // Construct a substring (copies the data).
+ UnicodeText(const const_iterator& first, const const_iterator& last);
+
+ // Assignment operator. This copies the data and produces an owner
+ // unless this == &src, e.g., "x = x;", which is a no-op.
+ UnicodeText& operator=(const UnicodeText& src);
+
+ // x.Copy(y) copies the data from y into x.
+ UnicodeText& Copy(const UnicodeText& src);
+ inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
+
+ // x.PointTo(y) changes x so that it points to y's data.
+ // It does not copy y or take ownership of y's data.
+ UnicodeText& PointTo(const UnicodeText& src);
+ UnicodeText& PointTo(const const_iterator& first,
+ const const_iterator& last);
+
+ ~UnicodeText();
+
+ void clear(); // Clear text.
+ bool empty() const { return repr_.size_ == 0; } // Test if text is empty.
+
+ // Add a codepoint to the end of the text.
+ // If the codepoint is not interchange-valid, add a space instead
+ // and log a warning.
+ void push_back(char32 codepoint);
+
+ // Generic appending operation.
+ // iterator_traits<ForwardIterator>::value_type must be implicitly
+ // convertible to char32. Typical uses of this method might include:
+ // char32 chars[] = {0x1, 0x2, ...};
+ // vector<char32> more_chars = ...;
+ // utext.append(chars, chars+arraysize(chars));
+ // utext.append(more_chars.begin(), more_chars.end());
+ template<typename ForwardIterator>
+ UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
+ while (first != last) { push_back(*first++); }
+ return *this;
+ }
+
+ // A specialization of the generic append() method.
+ UnicodeText& append(const const_iterator& first, const const_iterator& last);
+
+ // An optimization of append(source.begin(), source.end()).
+ UnicodeText& append(const UnicodeText& source);
+
+ int size() const; // the number of Unicode characters (codepoints)
+
+ friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
+ friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
+
+ class const_iterator {
+ typedef const_iterator CI;
+ public:
+ typedef std::bidirectional_iterator_tag iterator_category;
+ typedef char32 value_type;
+ typedef ptrdiff_t difference_type;
+ typedef void pointer; // (Not needed.)
+ typedef const char32 reference; // (Needed for const_reverse_iterator)
+
+ // Iterators are default-constructible.
+ const_iterator();
+
+ // It's safe to make multiple passes over a UnicodeText.
+ const_iterator(const const_iterator& other);
+ const_iterator& operator=(const const_iterator& other);
+
+ char32 operator*() const; // Dereference
+
+ const_iterator& operator++(); // Advance (++iter)
+ const_iterator operator++(int) { // (iter++)
+ const_iterator result(*this);
+ ++*this;
+ return result;
+ }
+
+ const_iterator& operator--(); // Retreat (--iter)
+ const_iterator operator--(int) { // (iter--)
+ const_iterator result(*this);
+ --*this;
+ return result;
+ }
+
+ // We love relational operators.
+ friend bool operator==(const CI& lhs, const CI& rhs) {
+ return lhs.it_ == rhs.it_; }
+ friend bool operator!=(const CI& lhs, const CI& rhs) {
+ return !(lhs == rhs); }
+ friend bool operator<(const CI& lhs, const CI& rhs);
+ friend bool operator>(const CI& lhs, const CI& rhs) {
+ return rhs < lhs; }
+ friend bool operator<=(const CI& lhs, const CI& rhs) {
+ return !(rhs < lhs); }
+ friend bool operator>=(const CI& lhs, const CI& rhs) {
+ return !(lhs < rhs); }
+
+ friend difference_type distance(const CI& first, const CI& last);
+
+ // UTF-8-specific methods
+ // Store the UTF-8 encoding of the current codepoint into buf,
+ // which must be at least 4 bytes long. Return the number of
+ // bytes written.
+ int get_utf8(char* buf) const;
+ // Return the UTF-8 character that the iterator points to.
+ string get_utf8_string() const;
+ // Return the byte length of the UTF-8 character the iterator points to.
+ int utf8_length() const;
+ // Return the iterator's pointer into the UTF-8 data.
+ const char* utf8_data() const { return it_; }
+
+ string DebugString() const;
+
+ private:
+ friend class UnicodeText;
+ friend class UnicodeTextUtils;
+ friend class UTF8StateTableProperty;
+ explicit const_iterator(const char* it) : it_(it) {}
+
+ const char* it_;
+ };
+
+ const_iterator begin() const;
+ const_iterator end() const;
+
+ class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
+ public:
+ explicit const_reverse_iterator(const_iterator it) :
+ std::reverse_iterator<const_iterator>(it) {}
+ const char* utf8_data() const {
+ const_iterator tmp_it = base();
+ return (--tmp_it).utf8_data();
+ }
+ int get_utf8(char* buf) const {
+ const_iterator tmp_it = base();
+ return (--tmp_it).get_utf8(buf);
+ }
+ string get_utf8_string() const {
+ const_iterator tmp_it = base();
+ return (--tmp_it).get_utf8_string();
+ }
+ int utf8_length() const {
+ const_iterator tmp_it = base();
+ return (--tmp_it).utf8_length();
+ }
+ };
+ const_reverse_iterator rbegin() const {
+ return const_reverse_iterator(end());
+ }
+ const_reverse_iterator rend() const {
+ return const_reverse_iterator(begin());
+ }
+
+ // Substring searching. Returns the beginning of the first
+ // occurrence of "look", or end() if not found.
+ const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
+ // Equivalent to find(look, begin())
+ const_iterator find(const UnicodeText& look) const;
+
+ // Returns whether this contains the character U+FFFD. This can
+ // occur, for example, if the input to Encodings::Decode() had byte
+ // sequences that were invalid in the source encoding.
+ bool HasReplacementChar() const;
+
+ // UTF-8-specific methods
+ //
+ // Return the data, length, and capacity of UTF-8-encoded version of
+ // the text. Length and capacity are measured in bytes.
+ const char* utf8_data() const { return repr_.data_; }
+ int utf8_length() const { return repr_.size_; }
+ int utf8_capacity() const { return repr_.capacity_; }
+
+ // Return the UTF-8 data as a string.
+ static string UTF8Substring(const const_iterator& first,
+ const const_iterator& last);
+
+ // There are three methods for initializing a UnicodeText from UTF-8
+ // data. They vary in details of memory management. In all cases,
+ // the data is tested for interchange-validity. If it is not
+ // interchange-valid, a LOG(WARNING) is issued, and each
+ // structurally invalid byte and each interchange-invalid codepoint
+ // is replaced with a space.
+
+ // x.CopyUTF8(buf, len) copies buf into x.
+ UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
+
+ // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
+ // buf. buf is not copied.
+ UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
+ int byte_length,
+ int byte_capacity);
+
+ // x.PointToUTF8(buf,len) changes x so that it points to buf
+ // ("becomes an alias"). It does not take ownership or copy buf.
+ // If the buffer is not valid, this has the same effect as
+ // CopyUTF8(utf8_buffer, byte_length).
+ UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
+
+ // Occasionally it is necessary to use functions that operate on the
+ // pointer returned by utf8_data(). MakeIterator(p) provides a way
+ // to get back to the UnicodeText level. It uses CHECK to ensure
+ // that p is a pointer within this object's UTF-8 data, and that it
+ // points to the beginning of a character.
+ const_iterator MakeIterator(const char* p) const;
+
+ string DebugString() const;
+
+ private:
+ friend class const_iterator;
+ friend class UnicodeTextUtils;
+
+ class Repr { // A byte-string.
+ public:
+ char* data_;
+ int size_;
+ int capacity_;
+ bool ours_; // Do we own data_?
+
+ Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
+ ~Repr() { if (ours_) delete[] data_; }
+
+ void clear();
+ void reserve(int capacity);
+ void resize(int size);
+
+ void append(const char* bytes, int byte_length);
+ void Copy(const char* data, int size);
+ void TakeOwnershipOf(char* data, int size, int capacity);
+ void PointTo(const char* data, int size);
+
+ string DebugString() const;
+
+ private:
+ Repr& operator=(const Repr&);
+ Repr(const Repr& other);
+ };
+
+ Repr repr_;
+
+ // UTF-8-specific private methods.
+ // These routines do not perform a validity check when compiled
+ // in opt mode.
+ // It is an error to call these methods with UTF-8 data that
+ // is not interchange-valid.
+ //
+ UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
+ UnicodeText& UnsafeTakeOwnershipOfUTF8(
+ char* utf8_buffer, int byte_length, int byte_capacity);
+ UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
+ UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
+ const_iterator UnsafeFind(const UnicodeText& look,
+ const_iterator start_pos) const;
+};
+
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
+
+inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
+ return !(lhs == rhs);
+}
+
+// UnicodeTextRange is a pair of iterators, useful for specifying text
+// segments. If the iterators are ==, the segment is empty.
+typedef pair<UnicodeText::const_iterator,
+ UnicodeText::const_iterator> UnicodeTextRange;
+
+inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
+ return r.first == r.second;
+}
+
+
+// *************************** Utilities *************************
+
+// A factory function for creating a UnicodeText from a buffer of
+// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
+// is an "owner.")
+//
+// Each byte that is structurally invalid will be replaced with a
+// space. Each codepoint that is interchange-invalid will also be
+// replaced with a space, even if the codepoint was represented with a
+// multibyte sequence in the UTF-8 data.
+//
+inline UnicodeText MakeUnicodeTextAcceptingOwnership(
+ char* utf8_buffer, int byte_length, int byte_capacity) {
+ return UnicodeText().TakeOwnershipOfUTF8(
+ utf8_buffer, byte_length, byte_capacity);
+}
+
+// A factory function for creating a UnicodeText from a buffer of
+// UTF-8 data. The new UnicodeText does not take ownership of the
+// buffer. (It is an "alias.")
+//
+inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
+ const char* utf8_buffer, int byte_length) {
+ return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
+}
+
+// Create a UnicodeText from a UTF-8 string or buffer.
+//
+// If do_copy is true, then a copy of the string is made. The copy is
+// owned by the resulting UnicodeText object and will be freed when
+// the object is destroyed. This UnicodeText object is referred to
+// as an "owner."
+//
+// If do_copy is false, then no copy is made. The resulting
+// UnicodeText object does NOT take ownership of the string; in this
+// case, the lifetime of the UnicodeText object must not exceed the
+// lifetime of the string. This UnicodeText object is referred to as
+// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
+//
+// If the input string does not contain valid UTF-8, then a copy is
+// made (as if do_copy were true) and coerced to valid UTF-8 by
+// replacing each invalid byte with a space.
+//
+inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
+ bool do_copy) {
+ UnicodeText t;
+ if (do_copy) {
+ t.CopyUTF8(utf8_buf, len);
+ } else {
+ t.PointToUTF8(utf8_buf, len);
+ }
+ return t;
+}
+
+inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
+ return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
+}
+
+inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
+ return UTF8ToUnicodeText(utf8_buf, len, true);
+}
+inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
+ return UTF8ToUnicodeText(utf8_string, true);
+}
+
+// Return a string containing the UTF-8 encoded version of all the
+// Unicode characters in t.
+inline string UnicodeTextToUTF8(const UnicodeText& t) {
+ return string(t.utf8_data(), t.utf8_length());
+}
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
+
+// For debugging. Return a string of integers, written in uppercase
+// hex (%X), corresponding to the codepoints within the text. Each
+// integer is followed by a space. E.g., "61 62 6A 3005 ".
+string CodepointString(const UnicodeText& t);
+
+#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_
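
A usage sketch of the owner/alias distinction documented above, assuming only the functions declared in this header: UTF8ToUnicodeText with do_copy=true produces an owner, do_copy=false produces an alias that must not outlive its backing string, and push_back on an alias triggers the documented copy-on-write promotion to owner. OwnerAliasSketch is an illustrative function, not part of the library.

#include <string>
#include "util/utf8/unicodetext.h"

void OwnerAliasSketch() {
  const std::string backing = "na\xC3\xAFve";  // "naive" with U+00EF; 6 bytes.

  // Owner: copies the bytes; safe even if `backing` goes away first.
  UnicodeText owner = UTF8ToUnicodeText(backing, /*do_copy=*/true);

  // Alias: points into `backing`; must not outlive it.
  UnicodeText alias = UTF8ToUnicodeText(backing, /*do_copy=*/false);

  // Iteration yields codepoints (char32), not bytes: 5 here, not 6.
  int codepoints = 0;
  for (UnicodeText::const_iterator it = owner.begin(); it != owner.end(); ++it)
    ++codepoints;

  // Appending to an alias makes a private copy first; `alias` is now an owner.
  alias.push_back('!');

  // Round-trip back to UTF-8 bytes.
  std::string round_trip = UnicodeTextToUTF8(owner);
  (void)codepoints;
  (void)round_trip;
}
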
diff --git a/tesseract/unittest/util/utf8/unilib.cc b/tesseract/unittest/util/utf8/unilib.cc
new file mode 100644
index 00000000..c00759ae
--- /dev/null
+++ b/tesseract/unittest/util/utf8/unilib.cc
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+
+#include "util/utf8/unilib.h"
+
+#include "syntaxnet/base.h"
+#include "third_party/utf/utf.h"
+
+namespace UniLib {
+
+// Codepoints not allowed for interchange are:
+// C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020),
+// Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A),
+// Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D)
+// Delete (DEL, U+007F) and C1 controls: U+0080 to U+009F
+// Surrogates: U+D800 to U+DFFF
+// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
+bool IsInterchangeValid(char32 c) {
+ return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
+ (c >= 0x7F && c <= 0x9F) ||
+ (c >= 0xD800 && c <= 0xDFFF) ||
+ (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE);
+}
+
+int SpanInterchangeValid(const char* begin, int byte_length) {
+ char32 rune;
+ const char* p = begin;
+ const char* end = begin + byte_length;
+ while (p < end) {
+ int bytes_consumed = charntorune(&rune, p, end - p);
+ // We want to accept Runeerror == U+FFFD as a valid char, but it is also
+ // used by charntorune to indicate a decoding error. Luckily, the real
+ // codepoint encodes as 3 bytes, while errors return bytes_consumed <= 1.
+ if ((rune == Runeerror && bytes_consumed <= 1) ||
+ !IsInterchangeValid(rune)) {
+ break; // Found an interchange-invalid sequence; end the span here.
+ }
+ p += bytes_consumed;
+ }
+ return p - begin;
+}
+
+} // namespace UniLib
diff --git a/tesseract/unittest/util/utf8/unilib.h b/tesseract/unittest/util/utf8/unilib.h
new file mode 100644
index 00000000..e99895a2
--- /dev/null
+++ b/tesseract/unittest/util/utf8/unilib.h
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Routines to do manipulation of Unicode characters or text
+//
+// The StructurallyValid routines accept buffers of arbitrary bytes.
+// For CoerceToStructurallyValid(), the input buffer and output buffers may
+// point to exactly the same memory.
+//
+// In all other cases, the UTF-8 string must be structurally valid and
+// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
+// Debug builds take a fatal error for invalid UTF-8 input.
+// The input and output buffers may not overlap at all.
+//
+// The char32 routines are here only for convenience; they convert to UTF-8
+// internally and use the UTF-8 routines.
+
+#ifndef UTIL_UTF8_UNILIB_H__
+#define UTIL_UTF8_UNILIB_H__
+
+#include <string>
+#include "syntaxnet/base.h"
+
+// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,
+// but they are defined in unilib_utf8_utils.h.
+//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
+
+namespace UniLib {
+
+// Returns the length in bytes of the prefix of src that is all
+// interchange valid UTF-8
+int SpanInterchangeValid(const char* src, int byte_length);
+inline int SpanInterchangeValid(const std::string& src) {
+ return SpanInterchangeValid(src.data(), src.size());
+}
+
+// Returns true if the source is all interchange valid UTF-8
+// "Interchange valid" is a stronger than structurally valid --
+// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
+bool IsInterchangeValid(char32 codepoint);
+inline bool IsInterchangeValid(const char* src, int byte_length) {
+ return (byte_length == SpanInterchangeValid(src, byte_length));
+}
+inline bool IsInterchangeValid(const std::string& src) {
+ return IsInterchangeValid(src.data(), src.size());
+}
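+
+// Usage sketch (illustrative only; 'raw' is a hypothetical input buffer):
+//
+//   std::string raw = ReadBytesFromSomewhere();  // hypothetical source
+//   if (!UniLib::IsInterchangeValid(raw)) {
+//     // Keep only the leading interchange-valid prefix.
+//     raw.resize(UniLib::SpanInterchangeValid(raw));
+//   }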
+
+} // namespace UniLib
+
+#endif // UTIL_UTF8_UNILIB_H__
diff --git a/tesseract/unittest/util/utf8/unilib_utf8_utils.h b/tesseract/unittest/util/utf8/unilib_utf8_utils.h
new file mode 100644
index 00000000..a9c10166
--- /dev/null
+++ b/tesseract/unittest/util/utf8/unilib_utf8_utils.h
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
+#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
+
+// These definitions are self-contained and have no dependencies.
+// They are also exported from unilib.h for legacy reasons.
+
+#include "syntaxnet/base.h"
+#include "third_party/utf/utf.h"
+
+namespace UniLib {
+
+// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF]
+// (i.e., is not a surrogate codepoint). See also
+// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
+inline bool IsValidCodepoint(char32 c) {
+ return (static_cast<uint32>(c) < 0xD800)
+ || (c >= 0xE000 && c <= 0x10FFFF);
+}
+
+// Returns true if 'str' is the start of a structurally valid UTF-8
+// sequence and is not a surrogate codepoint. Returns false if str.empty()
+// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
+// accesses the first n bytes of str, where n is UniLib::OneCharLen(str[0]).
+inline bool IsUTF8ValidCodepoint(StringPiece str) {
+ char32 c;
+ int consumed;
+ // It's OK if str.length() > consumed.
+ return !str.empty()
+ && isvalidcharntorune(str.data(), str.size(), &c, &consumed)
+ && IsValidCodepoint(c);
+}
+
+// Returns the length (number of bytes) of the Unicode code point
+// starting at src, based on inspecting just that one byte. This
+// requires that src point to a well-formed UTF-8 string; the result
+// is undefined otherwise.
+inline int OneCharLen(const char* src) {
+ return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
+}
+
+// Returns true if this byte is a trailing UTF-8 byte (10xx xxxx)
+inline bool IsTrailByte(char x) {
+ // return (x & 0xC0) == 0x80;
+ // Since trail bytes are always in [0x80, 0xBF], we can optimize:
+ return static_cast<signed char>(x) < -0x40;
+}
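+
+// Illustrative sketch (not part of the original header): stepping through a
+// well-formed UTF-8 buffer one code point at a time with OneCharLen.
+// IsTrailByte appears only as a sanity check on each lead byte.
+//
+//   inline int CountCodepoints(const char* src, int byte_length) {
+//     int count = 0;
+//     for (int i = 0; i < byte_length; i += OneCharLen(src + i)) {
+//       // In well-formed UTF-8 a lead byte is never a trail byte.
+//       // assert(!IsTrailByte(src[i]));
+//       ++count;
+//     }
+//     return count;
+//   }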
+
+} // namespace UniLib
+
+#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
diff --git a/tesseract/unittest/validate_grapheme_test.cc b/tesseract/unittest/validate_grapheme_test.cc
new file mode 100644
index 00000000..54e2f490
--- /dev/null
+++ b/tesseract/unittest/validate_grapheme_test.cc
@@ -0,0 +1,179 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+#include "normstrngs.h"
+#include "normstrngs_test.h"
+
+namespace tesseract {
+
+TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
+ std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ // It made 3 graphemes.
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f"));
+ EXPECT_EQ(glyphs[1], std::string("\u0c15"));
+ EXPECT_EQ(glyphs[2], std::string("\u0c0e"));
+}
+
+TEST(ValidateGraphemeTest, SingleConsonantOK) {
+ std::string str = "\u0cb9"; // HA
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 1);
+ EXPECT_EQ(glyphs[0], str);
+}
+
+TEST(ValidateGraphemeTest, SimpleCV) {
+ std::string str = "\u0cb9\u0cbf"; // HA I
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 1);
+ EXPECT_EQ(glyphs[0], str);
+}
+
+TEST(ValidateGraphemeTest, SubscriptConjunct) {
+ std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 1);
+ EXPECT_EQ(glyphs[0], str);
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95"));
+}
+
+TEST(ValidateGraphemeTest, HalfFormJoiner) {
+ std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 1);
+ EXPECT_EQ(glyphs[0], str);
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);
+ EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d"));
+}
+
+TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
+ std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 1);
+ EXPECT_EQ(glyphs[0], str);
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d"));
+}
+
+TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
+ std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 1);
+ EXPECT_EQ(glyphs[0], str);
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d"));
+ // Malayalam only, so not allowed in Telugu.
+ str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta
+ EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+}
+
+TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {
+ std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 2);
+ EXPECT_EQ(glyphs[1], std::string("\u0d24"));
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c"));
+}
+
+TEST(ValidateGraphemeTest, ThaiGraphemes) {
+ // This is a single grapheme unless in glyph split mode
+ std::string str = "\u0e14\u0e38\u0e4a";
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 1);
+ EXPECT_EQ(glyphs[0], str);
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[0], std::string("\u0e14"));
+}
+
+TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {
+ std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
+ std::vector<std::string> glyphs;
+ // Returns true, but the joiner is gone.
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(glyphs.size(), 5);
+ EXPECT_EQ(glyphs[0], std::string("'"));
+ EXPECT_EQ(glyphs[1], std::string("\u0d24"));
+ EXPECT_EQ(glyphs[2], std::string("\u0d23"));
+ EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c"));
+ EXPECT_EQ(glyphs[4], std::string("'"));
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/validate_indic_test.cc b/tesseract/unittest/validate_indic_test.cc
new file mode 100644
index 00000000..d317198b
--- /dev/null
+++ b/tesseract/unittest/validate_indic_test.cc
@@ -0,0 +1,231 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+#include "normstrngs.h"
+#include "normstrngs_test.h"
+
+namespace tesseract {
+
+// Though the Unicode example for Telugu in section 12.7:
+// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
+// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to
+// suppress a conjunct that would otherwise occur. If a consonant is followed
+// by a virama and then by a non-Indic character, OpenType will presume that
+// the user simply meant to suppress the inherent vowel of the consonant
+// and render it as the consonant with an explicit virama, the same as if
+// a ZWNJ had followed. Since this is confusing to an OCR engine, the
+// normalizer always puts a terminating ZWNJ on the end if not present,
+// and accepts the string as valid.
+TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {
+ std::string str = "\u0c15\u0c4d"; // KA - virama
+ std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);
+ // Same result if we started with the normalized string.
+ ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1,
+ target_str);
+}
+
+// Only one dependent vowel is allowed.
+TEST(ValidateIndicTest, OnlyOneDependentVowel) {
+ std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU
+ std::string dest;
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &dest))
+ << PrintString32WithUnicodes(str);
+}
+
+// [c26][c4d][c01]
+// A consonant (DA) followed by the virama followed by a bindu
+// Syllable modifiers [c01][c02][c03] all modify the pronunciation of
+// the vowel in a syllable, as does the virama [c4d]. You can only
+// have one of these on a syllable.
+//
+// References:
+// http://www.omniglot.com/writing/telugu.htm
+TEST(ValidateIndicTest, OnlyOneVowelModifier) {
+ std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu
+ std::string result;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+ // It made 1 grapheme of 4 chars, by terminating the explicit virama.
+ EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result);
+
+ str = "\u0995\u0983\u0981"; // KA visarga candrabindu
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+
+ // Exception: Malayalam allows multiple anusvara.
+ str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+ EXPECT_EQ(str, result);
+}
+
+// [c28][c02][c3f]
+// A consonant (NA) followed by the Anusvara/sunna and another matra (I).
+// The anusvara [c02] is a pronunciation directive
+// for a whole syllable and only appears at the end of the syllable
+// References:
+// + Unicode v9, 12.1 "Modifier Mark Rules R10,"
+// and the Microsoft page
+// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
+TEST(ValidateIndicTest, VowelModifierMustBeLast) {
+ std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I
+ std::string dest;
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &dest))
+ << PrintString32WithUnicodes(str);
+ // Swap c02/c3f and all is ok.
+ str = "\u0c28\u0c3f\u0c02"; // NA I Sunna
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(), &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(dest, str);
+}
+
+// [c05][c47]
+// A Vowel (A) followed by a combining vowel/matra (EE).
+// In Telugu, matras are only put on consonants, not independent
+// vowels.
+// References:
+// + Unicode v9, 12.1:
+// Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
+// + http://varamozhi.sourceforge.net/iscii91.pdf
+TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {
+ std::string str = "\u0c05\u0c47"; // A EE
+ std::string dest;
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &dest))
+ << PrintString32WithUnicodes(str);
+ str = "\u0c1e\u0c3e"; // NYA AA
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(), &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(dest, str);
+}
+
+// Sub-graphemes are allowed if GraphemeNorm is turned off.
+TEST(ValidateIndicTest, SubGraphemes) {
+ std::string str = "\u0d3e"; // AA
+ std::string dest;
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNone, str.c_str(), &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(dest, str);
+}
+
+TEST(ValidateIndicTest, Nukta) {
+ std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9"));
+ // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.
+ std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA
+ ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str);
+}
+
+// Sinhala has some of its own specific rules. See www.macciato.com/sinhala
+TEST(ValidateIndicTest, SinhalaRakaransaya) {
+ std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna
+ std::string dest;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(), &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(dest, str);
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 2);
+ EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb"));
+ // Can be followed by a dependent vowel.
+ str += "\u0dd9"; // E
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(), &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(dest, str);
+}
+
+TEST(ValidateIndicTest, SinhalaYansaya) {
+ std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna
+ std::string dest;
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(), &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(dest, str);
+ // Can be followed by a dependent vowel.
+ str += "\u0ddd"; // OO
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(), &dest))
+ << PrintString32WithUnicodes(str);
+ EXPECT_EQ(dest, str);
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba"));
+}
+
+TEST(ValidateIndicTest, SinhalaRepaya) {
+ std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 2);
+ EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8"));
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 3);
+ EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
+}
+
+TEST(ValidateIndicTest, SinhalaSpecials) {
+ // Sinhala has some exceptions from the usual rules.
+ std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";
+ std::vector<std::string> glyphs;
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs);
+ EXPECT_EQ(glyphs[0], std::string("\u0dc0"));
+ EXPECT_EQ(glyphs[1], std::string("\u0d9c"));
+ EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb"));
+ EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d"));
+ EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d"));
+ str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs));
+ EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs);
+ EXPECT_EQ(glyphs[0], std::string("\u0dc3"));
+ EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
+ EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d"));
+ EXPECT_EQ(glyphs[3], std::string("\u0dcf"));
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/validate_khmer_test.cc b/tesseract/unittest/validate_khmer_test.cc
new file mode 100644
index 00000000..74b87e61
--- /dev/null
+++ b/tesseract/unittest/validate_khmer_test.cc
@@ -0,0 +1,50 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+#include "normstrngs.h"
+#include "normstrngs_test.h"
+
+namespace tesseract {
+
+// Test some random Khmer words.
+TEST(ValidateKhmerTest, GoodKhmerWords) {
+ std::string str = "ព័ត៏មានប្លែកៗ";
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 13, 12, 7, str);
+ str = "ទំនុកច្រៀង";
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 10, 9, 5, str);
+ str = "កាលីហ្វូញ៉ា";
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 4, str);
+ str = "ចាប់ពីផ្លូវ";
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 5, str);
+}
+
+// Test some random Khmer words with dotted circles.
+TEST(ValidateKhmerTest, BadKhmerWords) {
+ std::string result;
+ // Multiple dependent vowels not allowed
+ std::string str = "\u1796\u17b6\u17b7";
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+ // Multiple shifters not allowed
+ str = "\u1798\u17c9\u17ca";
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+ // Multiple signs not allowed
+ str = "\u1780\u17b6\u17cb\u17cd";
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/validate_myanmar_test.cc b/tesseract/unittest/validate_myanmar_test.cc
new file mode 100644
index 00000000..262e04b6
--- /dev/null
+++ b/tesseract/unittest/validate_myanmar_test.cc
@@ -0,0 +1,54 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include_gunit.h"
+#include "normstrngs.h"
+#include "normstrngs_test.h"
+
+namespace tesseract {
+
+// Test some random Myanmar words.
+TEST(ValidateMyanmarTest, GoodMyanmarWords) {
+ std::string str = "လျှာကသိသည် "; // No viramas in this one.
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 11, 5, str);
+ str = "တုန္လႈပ္မႈ ";
+ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 9, 4, str);
+}
+
+// Test some random Myanmar words with dotted circles.
+TEST(ValidateMyanmarTest, BadMyanmarWords) {
+ std::string str = "က်န္းမာေရး";
+ std::vector<std::string> glyphs;
+ EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
+ str.c_str(), &glyphs));
+ std::string result;
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+ // It works if the grapheme normalization is turned off.
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNone, str.c_str(), &result));
+ EXPECT_EQ(str, result);
+ str = "ခုႏွစ္";
+ EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
+ UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
+ true, str.c_str(), &glyphs));
+ EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNormalize, str.c_str(),
+ &result));
+ // It works if the grapheme normalization is turned off.
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
+ GraphemeNorm::kNone, str.c_str(), &result));
+ EXPECT_EQ(str, result);
+}
+
+} // namespace tesseract
diff --git a/tesseract/unittest/validator_test.cc b/tesseract/unittest/validator_test.cc
new file mode 100644
index 00000000..84cb42af
--- /dev/null
+++ b/tesseract/unittest/validator_test.cc
@@ -0,0 +1,76 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "validator.h"
+
+#include "gmock/gmock.h" // for testing::ElementsAreArray
+#include "include_gunit.h"
+
+namespace tesseract {
+
+class TestableValidator : public Validator {
+ public:
+ static ViramaScript TestableMostFrequentViramaScript(
+ const std::vector<char32>& utf32) {
+ return MostFrequentViramaScript(utf32);
+ }
+};
+
+// The majority of Validator is tested by the script-specific tests of its
+// subclasses, but the MostFrequentViramaScript function is worth a unittest.
+TEST(ValidatorTest, MostFrequentViramaScript) {
+ // The most frequent virama script should come out correct, despite
+ // distractions from other scripts.
+ EXPECT_EQ(ViramaScript::kTelugu,
+ TestableValidator::TestableMostFrequentViramaScript({0xc05}));
+ // It is still Telugu surrounded by Latin.
+ EXPECT_EQ(ViramaScript::kTelugu,
+ TestableValidator::TestableMostFrequentViramaScript(
+ {'a', 0xc05, 'b', 'c'}));
+ // But it is no longer Telugu when surrounded by Devanagari.
+ EXPECT_EQ(ViramaScript::kDevanagari,
+ TestableValidator::TestableMostFrequentViramaScript(
+ {0x905, 0xc05, 0x906, 0x907}));
+ EXPECT_EQ(ViramaScript::kKannada,
+ TestableValidator::TestableMostFrequentViramaScript(
+ {0xc85, 0xc05, 0xc86, 0xc87}));
+ EXPECT_EQ(ViramaScript::kBengali,
+ TestableValidator::TestableMostFrequentViramaScript(
+ {0x985, 0xc05, 0x986, 0x987}));
+ // Danda and double Danda don't count as Devanagari, as they are common
+ // to many Indic scripts.
+ EXPECT_EQ(ViramaScript::kTelugu,
+ TestableValidator::TestableMostFrequentViramaScript(
+ {0x964, 0xc05, 0x965, 0x965}));
+}
+
+// ValidateCleanAndSegment doesn't modify the input by much, but its
+// transformation should be idempotent: applying it a second time should not
+// change the result.
+TEST(ValidatorTest, Idempotency) {
+ std::vector<char32> str1(
+ {0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c});
+ std::vector<char32> str2(
+ {0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''});
+ std::vector<std::vector<char32>> result1, result2, result3, result4;
+ EXPECT_TRUE(Validator::ValidateCleanAndSegment(
+ GraphemeNormMode::kSingleString, true, str1, &result1));
+ EXPECT_TRUE(Validator::ValidateCleanAndSegment(
+ GraphemeNormMode::kSingleString, true, result1[0], &result2));
+ EXPECT_EQ(result1.size(), result2.size());
+ EXPECT_THAT(result2[0], testing::ElementsAreArray(result1[0]));
+ EXPECT_TRUE(Validator::ValidateCleanAndSegment(
+ GraphemeNormMode::kSingleString, true, str2, &result3));
+ EXPECT_TRUE(Validator::ValidateCleanAndSegment(
+ GraphemeNormMode::kSingleString, true, result3[0], &result4));
+ EXPECT_EQ(result3.size(), result4.size());
+ EXPECT_THAT(result4[0], testing::ElementsAreArray(result3[0]));
+}
+
+} // namespace tesseract