Diffstat (limited to 'tesseract/unittest')
82 files changed, 14481 insertions, 0 deletions
diff --git a/tesseract/unittest/README.md b/tesseract/unittest/README.md new file mode 100644 index 00000000..bf4f83fe --- /dev/null +++ b/tesseract/unittest/README.md @@ -0,0 +1,88 @@ +# Unit Testing for Tesseract + + +## Requirements + +### Files and structure +``` + +├── langdata_lstm +│ ├── common.punc +│ ├── common.unicharambigs +│ ├── desired_bigrams.txt +│ ├── eng +│ │ ├── desired_characters +│ │ ├── eng.config +│ │ ├── eng.numbers +│ │ ├── eng.punc +│ │ ├── eng.singles_text +│ │ ├── eng.training_text +│ │ ├── eng.unicharambigs +│ │ ├── eng.wordlist +│ │ └── okfonts.txt +│ ├── extended +│ │ └── extended.config +│ ├── extendedhin +│ │ └── extendedhin.config +│ ├── font_properties +│ ├── forbidden_characters_default +│ ├── hin +│ │ ├── hin.config +│ │ ├── hin.numbers +│ │ ├── hin.punc +│ │ └── hin.wordlist +│ ├── kan +│ │ └── kan.config +│ ├── kor +│ │ └── kor.config +│ ├── osd +│ │ └── osd.unicharset +│ └── radical-stroke.txt +├── tessdata +│ ├── ara.traineddata +│ ├── chi_tra.traineddata +│ ├── eng.traineddata +│ ├── heb.traineddata +│ ├── hin.traineddata +│ ├── jpn.traineddata +│ ├── kmr.traineddata +│ ├── osd.traineddata +│ └── vie.traineddata +├── tessdata_best +│ ├── eng.traineddata +│ ├── fra.traineddata +│ ├── kmr.traineddata +│ └── osd.traineddata +├── tessdata_fast +│ ├── eng.traineddata +│ ├── kmr.traineddata +│ ├── osd.traineddata +│ └── script +│ └── Latin.traineddata +└── tesseract + ├── abseil + ... + ├── test + ├── unittest + └── VERSION +``` + +### Fonts + +* Microsoft fonts: arialbi.ttf, times.ttf, verdana.ttf - [installation guide](https://www.makeuseof.com/tag/how-to-install-microsoft-core-fonts-in-ubuntu-linux/) +* [ae_Arab.ttf](https://www.wfonts.com/download/data/2014/12/03/ae-arab/ae-arab.zip) +* dejavu-fonts: [DejaVuSans-ExtraLight.ttf](https://dejavu-fonts.github.io/Download.html) +* [Lohit-Hindi.ttf](https://raw.githubusercontent.com/pratul/packageofpractices/master/assets/fonts/Lohit-Hindi.ttf) +* [UnBatang.ttf](https://raw.githubusercontent.com/byrongibson/fonts/master/backup/truetype.original/unfonts-core/UnBatang.ttf) + + +## Run tests + +To run the tests, do the following in the tesseract folder: + +``` +autoreconf -fiv +git submodule update --init +export TESSDATA_PREFIX=/prefix/to/path/to/tessdata +make check +```
diff --git a/tesseract/unittest/apiexample_test.cc b/tesseract/unittest/apiexample_test.cc new file mode 100644 index 00000000..5a721fa3 --- /dev/null +++ b/tesseract/unittest/apiexample_test.cc @@ -0,0 +1,119 @@ +/////////////////////////////////////////////////////////////////////// +// File: apiexample_test.cc +// Description: API test for Tesseract using test fixtures and parameters. +// Tests for Devanagari, Latin and Arabic scripts are disabled by default. +// Disabled tests can be run when required by using the +// --gtest_also_run_disabled_tests argument. +// ./unittest/apiexample_test --gtest_also_run_disabled_tests +// +// Author: ShreeDevi Kumar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+/////////////////////////////////////////////////////////////////////// + +// expects clone of tessdata_fast repo in ../../tessdata_fast + +//#include "log.h" +#include <time.h> +#include <fstream> +#include <iostream> +#include <locale> +#include <memory> // std::unique_ptr +#include <string> +#include <tesseract/baseapi.h> +#include "include_gunit.h" +#include "allheaders.h" + +namespace tesseract { + +class QuickTest : public testing::Test { + protected: + virtual void SetUp() { start_time_ = time(nullptr); } + virtual void TearDown() { +#ifndef NDEBUG + // Debug builds can be very slow, so allow 4 min for OCR of a test image. + // apitest_example including disabled tests takes about 18 min on ARMv7. + const time_t MAX_SECONDS_FOR_TEST = 240; +#else + // Release builds typically need less than 10 s for OCR of a test image, + // apitest_example including disabled tests takes about 90 s on ARMv7. + const time_t MAX_SECONDS_FOR_TEST = 55; +#endif + const time_t end_time = time(nullptr); + EXPECT_TRUE(end_time - start_time_ <= MAX_SECONDS_FOR_TEST) + << "The test took too long - " + << ::testing::PrintToString(end_time - start_time_); + } + time_t start_time_; +}; + +void OCRTester(const char* imgname, const char* groundtruth, + const char* tessdatadir, const char* lang) { + // log.info() << tessdatadir << " for language: " << lang << std::endl; + char* outText; + std::locale loc("C"); // You can also use "" for the default system locale + std::ifstream file(groundtruth); + file.imbue(loc); // Use it for file input + std::string gtText((std::istreambuf_iterator<char>(file)), + std::istreambuf_iterator<char>()); + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + outText = api->GetUTF8Text(); + EXPECT_EQ(gtText, outText) + << "Phototest.tif OCR does not match ground truth for " + << ::testing::PrintToString(lang); + api->End(); + delete[] outText; + pixDestroy(&image); +} + +class MatchGroundTruth : public QuickTest, + public ::testing::WithParamInterface<const char*> {}; + +TEST_P(MatchGroundTruth, FastPhototestOCR) { + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", + TESSDATA_DIR "_fast", GetParam()); +} + +TEST_P(MatchGroundTruth, BestPhototestOCR) { + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", + TESSDATA_DIR "_best", GetParam()); +} + +TEST_P(MatchGroundTruth, TessPhototestOCR) { + OCRTester(TESTING_DIR "/phototest.tif", TESTING_DIR "/phototest.txt", + TESSDATA_DIR, GetParam()); +} + +INSTANTIATE_TEST_SUITE_P(Eng, MatchGroundTruth, ::testing::Values("eng")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Latin, MatchGroundTruth, + ::testing::Values("script/Latin")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Deva, MatchGroundTruth, + ::testing::Values("script/Devanagari")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Arabic, MatchGroundTruth, + ::testing::Values("script/Arabic")); + +class EuroText : public QuickTest {}; + +TEST_F(EuroText, FastLatinOCR) { + OCRTester(TESTING_DIR "/eurotext.tif", TESTING_DIR "/eurotext.txt", + TESSDATA_DIR "_fast", "script/Latin"); +} + +// script/Latin for eurotext.tif does not match groundtruth +// for tessdata & tessdata_best. +// so do not test these here. 
+ +} // namespace diff --git a/tesseract/unittest/applybox_test.cc b/tesseract/unittest/applybox_test.cc new file mode 100644 index 00000000..055172d7 --- /dev/null +++ b/tesseract/unittest/applybox_test.cc @@ -0,0 +1,128 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "boxread.h" +#include "rect.h" +#include <tesseract/resultiterator.h> + +#include "include_gunit.h" + +namespace tesseract { + +const char* kTruthTextWords = "To simple burn running of goods lately.\n"; +const char* kTruthTextLine = "Tosimpleburnrunningofgoodslately.\n"; + +// The fixture for testing Tesseract. +class ApplyBoxTest : public testing::Test { + protected: + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); + } + std::string TessdataPath() { return TESSDATA_DIR; } + + ApplyBoxTest() { src_pix_ = nullptr; } + ~ApplyBoxTest() { pixDestroy(&src_pix_); } + + bool SetImage(const char* filename) { + bool found = false; + pixDestroy(&src_pix_); + src_pix_ = pixRead(TestDataNameToPath(filename).c_str()); + if (api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) { + api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK); + api_.SetImage(src_pix_); + api_.SetVariable("tessedit_make_boxes_from_boxes", "1"); + api_.SetInputName(TestDataNameToPath(filename).c_str()); + found = true; + } + return found; + } + + // Runs ApplyBoxes (via setting the appropriate variables and Recognize) + // and checks that the output ocr text matches the truth_str, and that + // the boxes match the given box file well enough. + // If line_mode is true, ApplyBoxes is run in line segmentation mode, + // otherwise the input box file is assumed to have character-level boxes. + void VerifyBoxesAndText(const char* imagefile, const char* truth_str, + const char* target_box_file, bool line_mode) { + if (!SetImage(imagefile)) { + // eng.traineddata not found or other problem during Init. + GTEST_SKIP(); + return; + } + if (line_mode) + api_.SetVariable("tessedit_resegment_from_line_boxes", "1"); + else + api_.SetVariable("tessedit_resegment_from_boxes", "1"); + api_.Recognize(nullptr); + char* ocr_text = api_.GetUTF8Text(); + EXPECT_STREQ(truth_str, ocr_text); + delete[] ocr_text; + // Test the boxes by reading the target box file in parallel with the + // bounding boxes in the ocr output. 
+ std::string box_filename = TestDataNameToPath(target_box_file); + FILE* box_file = OpenBoxFile(box_filename.c_str()); + ASSERT_TRUE(box_file != nullptr); + int height = pixGetHeight(src_pix_); + ResultIterator* it = api_.GetIterator(); + do { + int left, top, right, bottom; + EXPECT_TRUE( + it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom)); + TBOX ocr_box(ICOORD(left, height - bottom), ICOORD(right, height - top)); + int line_number = 0; + TBOX truth_box; + STRING box_text; + EXPECT_TRUE( + ReadNextBox(0, &line_number, box_file, &box_text, &truth_box)); + // Testing for major overlap is a bit weak, but if they all + // major overlap successfully, then it has to be fairly close. + EXPECT_TRUE(ocr_box.major_overlap(truth_box)); + // Also check that the symbol text matches the box text. + char* symbol_text = it->GetUTF8Text(tesseract::RIL_SYMBOL); + EXPECT_STREQ(box_text.c_str(), symbol_text); + delete[] symbol_text; + } while (it->Next(tesseract::RIL_SYMBOL)); + delete it; + } + + Pix* src_pix_; + std::string ocr_text_; + tesseract::TessBaseAPI api_; +}; + +// Tests character-level applyboxes on normal Times New Roman. +TEST_F(ApplyBoxTest, TimesCharLevel) { + VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box", + false); +} + +// Tests character-level applyboxes on italic Times New Roman. +TEST_F(ApplyBoxTest, ItalicCharLevel) { + VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box", + false); +} + +// Tests line-level applyboxes on normal Times New Roman. +TEST_F(ApplyBoxTest, TimesLineLevel) { + VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine, + "trainingtimes.box", true); +} + +// Tests line-level applyboxes on italic Times New Roman. +TEST_F(ApplyBoxTest, ItalLineLevel) { + VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box", + true); +} + +} // namespace diff --git a/tesseract/unittest/baseapi_test.cc b/tesseract/unittest/baseapi_test.cc new file mode 100644 index 00000000..285172e3 --- /dev/null +++ b/tesseract/unittest/baseapi_test.cc @@ -0,0 +1,402 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "include_gunit.h" + +#include "cycletimer.h" // for CycleTimer +#include "log.h" // for LOG +#include "ocrblock.h" // for class BLOCK +#include "pageres.h" + +#include <tesseract/baseapi.h> + +#include "allheaders.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" +#include "gmock/gmock-matchers.h" + +#include <memory> +#include <regex> +#include <string> +#include <vector> + +namespace tesseract { + +using ::testing::ContainsRegex; +using ::testing::HasSubstr; + +static const char* langs[] = {"eng", "vie", "hin", "ara", nullptr}; +static const char* image_files[] = {"HelloGoogle.tif", "viet.tif", "raaj.tif", + "arabic.tif", nullptr}; +static const char* gt_text[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", + "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", + "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", + nullptr}; + +class FriendlyTessBaseAPI : public tesseract::TessBaseAPI { + FRIEND_TEST(TesseractTest, LSTMGeometryTest); +}; + +std::string GetCleanedTextResult(tesseract::TessBaseAPI* tess, Pix* pix) { + tess->SetImage(pix); + char* result = tess->GetUTF8Text(); + std::string ocr_result = result; + delete[] result; + absl::StripAsciiWhitespace(&ocr_result); + return ocr_result; +} + +// The fixture for testing Tesseract. +class TesseractTest : public testing::Test { + protected: + static std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); + } + static std::string TessdataPath() { + return TESSDATA_DIR; + } +}; + +// Tests that Tesseract gets exactly the right answer on phototest. +TEST_F(TesseractTest, BasicTesseractTest) { + tesseract::TessBaseAPI api; + std::string truth_text; + std::string ocr_text; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) { + Pix* src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str()); + CHECK(src_pix); + ocr_text = GetCleanedTextResult(&api, src_pix); + CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"), + &truth_text, file::Defaults())); + absl::StripAsciiWhitespace(&truth_text); + EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str()); + pixDestroy(&src_pix); + } else { + // eng.traineddata not found. + GTEST_SKIP(); + } +} + +// Test that api.GetComponentImages() will return a set of images for +// paragraphs even if text recognition was not run. +TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) { + tesseract::TessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) { + api.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK); + api.SetVariable("paragraph_debug_level", "3"); +#if 0 // TODO: b622.png is missing + Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + Boxa* para_boxes = + api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr); + EXPECT_TRUE(para_boxes != nullptr); + Boxa* block_boxes = + api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr); + EXPECT_TRUE(block_boxes != nullptr); + // TODO(eger): Get paragraphs out of this page pre-text. + EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes)); + boxaDestroy(&block_boxes); + boxaDestroy(¶_boxes); + pixDestroy(&src_pix); +#endif + } else { + // eng.traineddata not found. + GTEST_SKIP(); + } +} + +// We should get hOCR output and not seg fault, even if the api caller doesn't +// call SetInputName(). 
+TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) { + tesseract::TessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + char* result = api.GetHOCRText(0); + EXPECT_TRUE(result != nullptr); + EXPECT_THAT(result, HasSubstr("Hello")); + EXPECT_THAT(result, HasSubstr("<div class='ocr_page'")); + delete[] result; + pixDestroy(&src_pix); +} + +// hOCR output should contain baseline info for upright textlines. +TEST_F(TesseractTest, HOCRContainsBaseline) { + tesseract::TessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + Pix* src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str()); + CHECK(src_pix); + api.SetInputName("HelloGoogle.tif"); + api.SetImage(src_pix); + char* result = api.GetHOCRText(0); + EXPECT_TRUE(result != nullptr); + EXPECT_THAT(result, HasSubstr("Hello")); + EXPECT_TRUE(std::regex_search(result, std::regex{ "<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+" })); + + delete[] result; + pixDestroy(&src_pix); +} + +// Tests that Tesseract gets exactly the right answer on some page numbers. +TEST_F(TesseractTest, AdaptToWordStrTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because TessBaseAPI::AdaptToWordStr is missing. + GTEST_SKIP(); +#else + static const char* kTrainingPages[] = { + "136.tif", "256.tif", "410.tif", "432.tif", "540.tif", + "692.tif", "779.tif", "793.tif", "808.tif", "815.tif", + "12.tif", "12.tif", nullptr}; + static const char* kTrainingText[] = { + "1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0", "6 9 2", "7 7 9", + "7 9 3", "8 0 8", "8 1 5", "1 2", "1 2", nullptr}; + static const char* kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr}; + static const char* kTestText[] = {"324", "433", "12", nullptr}; + tesseract::TessBaseAPI api; + std::string truth_text; + std::string ocr_text; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + api.SetVariable("matcher_sufficient_examples_for_prototyping", "1"); + api.SetVariable("classify_class_pruner_threshold", "220"); + // Train on the training text. + for (int i = 0; kTrainingPages[i] != nullptr; ++i) { + std::string image_file = TestDataNameToPath(kTrainingPages[i]); + Pix* src_pix = pixRead(image_file.c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + EXPECT_TRUE( + api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i])) + << "Failed to adapt to text \"" << kTrainingText[i] << "\" on image " + << image_file; + pixDestroy(&src_pix); + } + // Test the test text. + api.SetVariable("tess_bn_matching", "1"); + api.SetPageSegMode(tesseract::PSM_SINGLE_WORD); + for (int i = 0; kTestPages[i] != nullptr; ++i) { + Pix* src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str()); + CHECK(src_pix); + ocr_text = GetCleanedTextResult(&api, src_pix); + absl::StripAsciiWhitespace(&truth_text); + EXPECT_STREQ(kTestText[i], ocr_text.c_str()); + pixDestroy(&src_pix); + } +#endif +} + +// Tests that LSTM gets exactly the right answer on phototest. 
+TEST_F(TesseractTest, BasicLSTMTest) { + tesseract::TessBaseAPI api; + std::string truth_text; + std::string ocr_text; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + Pix* src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str()); + CHECK(src_pix); + ocr_text = GetCleanedTextResult(&api, src_pix); + CHECK_OK(file::GetContents(TestDataNameToPath("phototest.gold.txt"), + &truth_text, file::Defaults())); + absl::StripAsciiWhitespace(&truth_text); + EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str()); + pixDestroy(&src_pix); +} + +// Test that LSTM's character bounding boxes are properly converted to +// Tesseract structures. Note that we can't guarantee that LSTM's +// character boxes fall completely within Tesseract's word box because +// the baseline denormalization/normalization transforms may introduce +// errors due to float/int conversions (e.g., see OUTLINE::move() in +// ccstruct/poutline.h) Instead, we do a loose check. +TEST_F(TesseractTest, LSTMGeometryTest) { + Pix* src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str()); + FriendlyTessBaseAPI api; + if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) { + // eng.traineddata not found. + GTEST_SKIP(); + return; + } + api.SetImage(src_pix); + ASSERT_EQ(api.Recognize(nullptr), 0); + + const PAGE_RES* page_res = api.GetPageRes(); + PAGE_RES_IT page_res_it(const_cast<PAGE_RES*>(page_res)); + page_res_it.restart_page(); + BLOCK* block = page_res_it.block()->block; + CHECK(block); + + // extract word and character boxes for each word + for (page_res_it.restart_page(); page_res_it.word() != nullptr; + page_res_it.forward()) { + WERD_RES* word = page_res_it.word(); + CHECK(word); + CHECK(word->best_choice); + CHECK_GT(word->best_choice->length(), 0); + CHECK(word->word); + CHECK(word->box_word); + // tesseract's word box + TBOX tess_blob_box; + tess_blob_box = word->word->bounding_box(); + tess_blob_box.rotate(block->re_rotation()); + // verify that each of LSTM's character boxes lies close to within + // tesseract's word box + for (int i = 0; i < word->box_word->length(); ++i) { + TBOX lstm_blob_box = word->box_word->BlobBox(i); + // LSTM character box should not spill out of tesseract word box + // by more than a few pixels in any direction + EXPECT_LT(tess_blob_box.left() - lstm_blob_box.left(), 5); + EXPECT_LT(lstm_blob_box.right() - tess_blob_box.right(), 5); + EXPECT_LT(tess_blob_box.bottom() - lstm_blob_box.bottom(), 5); + EXPECT_LT(lstm_blob_box.top() - tess_blob_box.top(), 5); + } + } + pixDestroy(&src_pix); +} + +TEST_F(TesseractTest, InitConfigOnlyTest) { + // Languages for testing initialization. + const char* langs[] = {"eng", "chi_tra", "jpn", "vie"}; + std::unique_ptr<tesseract::TessBaseAPI> api; + CycleTimer timer; + for (size_t i = 0; i < ARRAYSIZE(langs); ++i) { + api.reset(new tesseract::TessBaseAPI); + timer.Restart(); + EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], + tesseract::OEM_TESSERACT_ONLY)); + timer.Stop(); + LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() + << "ms in regular init"; + } + // Init variables to set for config-only initialization. 
+ std::vector<std::string> vars_vec, vars_values; + vars_vec.push_back("tessedit_init_config_only"); + vars_values.push_back("1"); + LOG(INFO) << "Switching to config only initialization:"; + for (size_t i = 0; i < ARRAYSIZE(langs); ++i) { + api.reset(new tesseract::TessBaseAPI); + timer.Restart(); + EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], + tesseract::OEM_TESSERACT_ONLY, nullptr, 0, &vars_vec, + &vars_values, false)); + timer.Stop(); + LOG(INFO) << "Lang " << langs[i] << " took " << timer.GetInMs() + << "ms in config-only init"; + } +} + +// Tests if two instances of Tesseract/LSTM can co-exist in the same thread. +// NOTE: This is not an exhaustive test and current support for multiple +// instances in Tesseract is fragile. This test is intended largely as a means +// of detecting and guarding against the existing support being possibly broken +// by future CLs. TessBaseAPI instances are initialized using the default +// OEM_DEFAULT mode. +TEST(TesseractInstanceTest, TestMultipleTessInstances) { + int num_langs = 0; + while (langs[num_langs] != nullptr) ++num_langs; + + const std::string kTessdataPath = TESSDATA_DIR; + + // Preload images and verify that OCR is correct on them individually. + std::vector<Pix*> pix(num_langs); + for (int i = 0; i < num_langs; ++i) { + SCOPED_TRACE(absl::StrCat("Single instance test with lang = ", langs[i])); + std::string path = file::JoinPath(TESTING_DIR, image_files[i]); + pix[i] = pixRead(path.c_str()); + QCHECK(pix[i] != nullptr) << "Could not read " << path; + + tesseract::TessBaseAPI tess; + EXPECT_EQ(0, tess.Init(kTessdataPath.c_str(), langs[i])); + std::string ocr_result = GetCleanedTextResult(&tess, pix[i]); + EXPECT_STREQ(gt_text[i], ocr_result.c_str()); + } + + // Process the images in all pairwise combinations of associated languages. + std::string ocr_result[2]; + for (int i = 0; i < num_langs; ++i) { + for (int j = i + 1; j < num_langs; ++j) { + tesseract::TessBaseAPI tess1, tess2; + tess1.Init(kTessdataPath.c_str(), langs[i]); + tess2.Init(kTessdataPath.c_str(), langs[j]); + + ocr_result[0] = GetCleanedTextResult(&tess1, pix[i]); + ocr_result[1] = GetCleanedTextResult(&tess2, pix[j]); + + EXPECT_FALSE(strcmp(gt_text[i], ocr_result[0].c_str()) || + strcmp(gt_text[j], ocr_result[1].c_str())) + << "OCR failed on language pair " << langs[i] << "-" << langs[j]; + } + } + + for (int i = 0; i < num_langs; ++i) pixDestroy(&pix[i]); +} + +// Tests whether Tesseract parameters are correctly set for the two instances. +TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) { + std::string illegal_name = "an_illegal_name"; + std::string langs[2] = {"eng", "hin"}; + std::string int_param_name = "tessedit_pageseg_mode"; + int int_param[2] = {1, 2}; + std::string int_param_str[2] = {"1", "2"}; + std::string bool_param_name = "tessedit_ambigs_training"; + bool bool_param[2] = {false, true}; + std::string bool_param_str[2] = {"F", "T"}; + std::string str_param_name = "tessedit_char_blacklist"; + std::string str_param[2] = {"abc", "def"}; + std::string double_param_name = "segment_penalty_dict_frequent_word"; + std::string double_param_str[2] = {"0.01", "2"}; + double double_param[2] = {0.01, 2}; + + const std::string kTessdataPath = TESSDATA_DIR; + + tesseract::TessBaseAPI tess1, tess2; + for (int i = 0; i < 2; ++i) { + tesseract::TessBaseAPI* api = (i == 0) ? 
&tess1 : &tess2; + api->Init(kTessdataPath.c_str(), langs[i].c_str()); + api->SetVariable(illegal_name.c_str(), "none"); + api->SetVariable(int_param_name.c_str(), int_param_str[i].c_str()); + api->SetVariable(bool_param_name.c_str(), bool_param_str[i].c_str()); + api->SetVariable(str_param_name.c_str(), str_param[i].c_str()); + api->SetVariable(double_param_name.c_str(), double_param_str[i].c_str()); + } + for (int i = 0; i < 2; ++i) { + tesseract::TessBaseAPI* api = (i == 0) ? &tess1 : &tess2; + EXPECT_FALSE(api->GetStringVariable(illegal_name.c_str())); + int intvar; + EXPECT_TRUE(api->GetIntVariable(int_param_name.c_str(), &intvar)); + EXPECT_EQ(int_param[i], intvar); + bool boolvar; + EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar)); + EXPECT_EQ(bool_param[i], boolvar); + EXPECT_STREQ(str_param[i].c_str(), + api->GetStringVariable(str_param_name.c_str())); + double doublevar; + EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar)); + EXPECT_EQ(double_param[i], doublevar); + } +} + +} // namespace diff --git a/tesseract/unittest/baseapi_thread_test.cc b/tesseract/unittest/baseapi_thread_test.cc new file mode 100644 index 00000000..3608a748 --- /dev/null +++ b/tesseract/unittest/baseapi_thread_test.cc @@ -0,0 +1,229 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Unit test to run Tesseract instances in parallel threads and verify +// the OCR result. + +// Note that success of running this test as-is does NOT verify +// thread-safety. For that, you need to run this binary under TSAN using the +// associated baseapi_thread_test_with_tsan.sh script. +// +// The tests are partitioned by instance to allow running Tesseract/Cube/both +// and by stage to run initialization/recognition/both. See flag descriptions +// for details. + +#include <functional> +#include <memory> +#include <string> +#ifdef INCLUDE_TENSORFLOW +#include <tensorflow/core/lib/core/threadpool.h> +#endif +#include "absl/strings/ascii.h" // for absl::StripAsciiWhitespace +#include "allheaders.h" +#include "include_gunit.h" +#include <tesseract/baseapi.h> +#include "commandlineflags.h" +#include "log.h" + +// Run with Tesseract instances. +BOOL_PARAM_FLAG(test_tesseract, true, "Test tesseract instances"); +// Run with Cube instances. +// Note that with TSAN, Cube typically takes much longer to test. Ignoring +// std::string operations using the associated tess_tsan.ignore file when +// testing Cube significantly reduces testing time. +BOOL_PARAM_FLAG(test_cube, true, "Test Cube instances"); + +// When used with TSAN, having more repetitions can help in finding hidden +// thread-safety violations at the expense of increased testing time. +INT_PARAM_FLAG(reps, 1, "Num of parallel test repetitions to run."); + +INT_PARAM_FLAG(max_concurrent_instances, 0, + "Maximum number of instances to run in parallel at any given " + "instant. 
The number of concurrent instances cannot exceed " + "reps * number_of_langs_tested, which is also the default value."); + +namespace tesseract { + +static const char* kTessLangs[] = {"eng", "vie", nullptr}; +static const char* kTessImages[] = {"HelloGoogle.tif", "viet.tif", nullptr}; +static const char* kTessTruthText[] = {"Hello Google", "\x74\x69\xe1\xba\xbf\x6e\x67", + nullptr}; + +static const char* kCubeLangs[] = {"hin", "ara", nullptr}; +static const char* kCubeImages[] = {"raaj.tif", "arabic.tif", nullptr}; +static const char* kCubeTruthText[] = { + "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c", + "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", nullptr}; + +class BaseapiThreadTest : public ::testing::Test { + protected: + static void SetUpTestCase() { + CHECK(FLAGS_test_tesseract || FLAGS_test_cube) + << "Need to test at least one of Tesseract/Cube!"; + // Form a list of langs/gt_text/image_files we will work with. + std::vector<std::string> image_files; + if (FLAGS_test_tesseract) { + int i = 0; + while (kTessLangs[i] && kTessTruthText[i] && kTessImages[i]) { + langs_.push_back(kTessLangs[i]); + gt_text_.push_back(kTessTruthText[i]); + image_files.push_back(kTessImages[i]); + ++i; + } + LOG(INFO) << "Testing Tesseract on " << i << " languages."; + } + if (FLAGS_test_cube) { + int i = 0; + while (kCubeLangs[i] && kCubeTruthText[i] && kCubeImages[i]) { + langs_.push_back(kCubeLangs[i]); + gt_text_.push_back(kCubeTruthText[i]); + image_files.push_back(kCubeImages[i]); + ++i; + } + LOG(INFO) << "Testing Cube on " << i << " languages."; + } + num_langs_ = langs_.size(); + + // Pre-load the images into an array. We will be making multiple copies of + // an image here if FLAGS_reps > 1 and that is intentional. In this test, we + // wish to not make any assumptions about the thread-safety of Pix objects, + // and so entirely disallow concurrent access of a Pix instance. + const int n = num_langs_ * FLAGS_reps; + for (int i = 0; i < n; ++i) { + std::string path = TESTING_DIR "/" + image_files[i % num_langs_]; + Pix* new_pix = pixRead(path.c_str()); + QCHECK(new_pix != nullptr) << "Could not read " << path; + pix_.push_back(new_pix); + } + +#ifdef INCLUDE_TENSORFLOW + pool_size_ = (FLAGS_max_concurrent_instances < 1) + ? num_langs_ * FLAGS_reps + : FLAGS_max_concurrent_instances; +#endif + } + + static void TearDownTestCase() { + for (auto& pix : pix_) { + pixDestroy(&pix); + } + } + +#ifdef INCLUDE_TENSORFLOW + void ResetPool() { + pool_.reset(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(), "tessthread", pool_size_)); + } + + void WaitForPoolWorkers() { pool_.reset(nullptr); } + + std::unique_ptr<tensorflow::thread::ThreadPool> pool_; + static int pool_size_; +#endif + static std::vector<Pix*> pix_; + static std::vector<std::string> langs_; + static std::vector<std::string> gt_text_; + static int num_langs_; +}; + +// static member variable declarations. 
+#ifdef INCLUDE_TENSORFLOW +int BaseapiThreadTest::pool_size_; +#endif +std::vector<Pix*> BaseapiThreadTest::pix_; +std::vector<std::string> BaseapiThreadTest::langs_; +std::vector<std::string> BaseapiThreadTest::gt_text_; +int BaseapiThreadTest::num_langs_; + +static void InitTessInstance(TessBaseAPI* tess, const std::string& lang) { + CHECK(tess != nullptr); + EXPECT_EQ(0, tess->Init(TESSDATA_DIR, lang.c_str())); +} + +static void GetCleanedText(TessBaseAPI* tess, Pix* pix, std::string* ocr_text) { + tess->SetImage(pix); + char* result = tess->GetUTF8Text(); + *ocr_text = result; + delete[] result; + absl::StripAsciiWhitespace(ocr_text); +} + +static void VerifyTextResult(TessBaseAPI* tess, Pix* pix, const std::string& lang, + const std::string& expected_text) { + TessBaseAPI* tess_local = nullptr; + if (tess) { + tess_local = tess; + } else { + tess_local = new TessBaseAPI; + InitTessInstance(tess_local, lang); + } + std::string ocr_text; + GetCleanedText(tess_local, pix, &ocr_text); + EXPECT_STREQ(expected_text.c_str(), ocr_text.c_str()); + if (tess_local != tess) delete tess_local; +} + +// Check that Tesseract/Cube produce the correct results in single-threaded +// operation. If not, it is pointless to run the real multi-threaded tests. +TEST_F(BaseapiThreadTest, TestBasicSanity) { + for (int i = 0; i < num_langs_; ++i) { + TessBaseAPI tess; + InitTessInstance(&tess, langs_[i]); + std::string ocr_text; + GetCleanedText(&tess, pix_[i], &ocr_text); + CHECK(strcmp(gt_text_[i].c_str(), ocr_text.c_str()) == 0) + << "Failed with lang = " << langs_[i]; + } +} + +// Test concurrent instance initialization. +TEST_F(BaseapiThreadTest, TestInit) { +#ifdef INCLUDE_TENSORFLOW + const int n = num_langs_ * FLAGS_reps; + ResetPool(); + std::vector<TessBaseAPI> tess(n); + for (int i = 0; i < n; ++i) { + pool_->Schedule(std::bind(InitTessInstance, &tess[i], langs_[i % num_langs_])); + } + WaitForPoolWorkers(); +#endif +} + +// Test concurrent recognition. +TEST_F(BaseapiThreadTest, TestRecognition) { +#ifdef INCLUDE_TENSORFLOW + const int n = num_langs_ * FLAGS_reps; + std::vector<TessBaseAPI> tess(n); + // Initialize api instances in a single thread. + for (int i = 0; i < n; ++i) { + InitTessInstance(&tess[i], langs_[i % num_langs_]); + } + + ResetPool(); + for (int i = 0; i < n; ++i) { + pool_->Schedule(std::bind(VerifyTextResult, &tess[i], pix_[i], + langs_[i % num_langs_], gt_text_[i % num_langs_])); + } + WaitForPoolWorkers(); +#endif +} + +TEST_F(BaseapiThreadTest, TestAll) { +#ifdef INCLUDE_TENSORFLOW + const int n = num_langs_ * FLAGS_reps; + ResetPool(); + for (int i = 0; i < n; ++i) { + pool_->Schedule(std::bind(VerifyTextResult, nullptr, pix_[i], + langs_[i % num_langs_], gt_text_[i % num_langs_])); + } + WaitForPoolWorkers(); +#endif +} +} // namespace diff --git a/tesseract/unittest/bitvector_test.cc b/tesseract/unittest/bitvector_test.cc new file mode 100644 index 00000000..9be718a0 --- /dev/null +++ b/tesseract/unittest/bitvector_test.cc @@ -0,0 +1,166 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cmath> +#include <cstdio> +#include <string> + +#include "bitvector.h" + +#include "include_gunit.h" + +const int kPrimeLimit = 1000; + +namespace tesseract { + +class BitVectorTest : public testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + public: + std::string OutputNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes. + void ComputePrimes(BitVector* map) { + map->Init(kPrimeLimit + 1); + TestAll(*map, false); + map->SetBit(2); + // Set all the odds to true. + for (int i = 3; i <= kPrimeLimit; i += 2) map->SetValue(i, true); + int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit)); + for (int f = 3; f <= factor_limit; f += 2) { + if (map->At(f)) { + for (int m = 2; m * f <= kPrimeLimit; ++m) map->ResetBit(f * m); + } + } + } + + void TestPrimes(const BitVector& map) { + // Now all primes in the vector are true, and all others false. + // According to Wikipedia, there are 168 primes under 1000, the last + // of which is 997. + int total_primes = 0; + for (int i = 0; i <= kPrimeLimit; ++i) { + if (map[i]) ++total_primes; + } + EXPECT_EQ(168, total_primes); + EXPECT_TRUE(map[997]); + EXPECT_FALSE(map[998]); + EXPECT_FALSE(map[999]); + } + // Test that all bits in the vector have the given value. + void TestAll(const BitVector& map, bool value) { + for (int i = 0; i < map.size(); ++i) { + EXPECT_EQ(value, map[i]); + } + } + + // Sets up a BitVector with bit patterns for byte values in + // [start_byte, end_byte) positioned every spacing bytes (for spacing >= 1) + // with spacing-1 zero bytes in between the pattern bytes. + void SetBitPattern(int start_byte, int end_byte, int spacing, BitVector* bv) { + bv->Init((end_byte - start_byte) * 8 * spacing); + for (int byte_value = start_byte; byte_value < end_byte; ++byte_value) { + for (int bit = 0; bit < 8; ++bit) { + if (byte_value & (1 << bit)) + bv->SetBit((byte_value - start_byte) * 8 * spacing + bit); + } + } + } + + // Expects that every return from NextSetBit is really set and that all others + // are really not set. Checks the return from NumSetBits also. + void ExpectCorrectBits(const BitVector& bv) { + int bit_index = -1; + int prev_bit_index = -1; + int num_bits_tested = 0; + while ((bit_index = bv.NextSetBit(bit_index)) >= 0) { + EXPECT_LT(bit_index, bv.size()); + // All bits in between must be 0. + for (int i = prev_bit_index + 1; i < bit_index; ++i) { + EXPECT_EQ(0, bv[i]) << "i = " << i << " prev = " << prev_bit_index; + } + // This bit must be 1. + EXPECT_EQ(1, bv[bit_index]) << "Bit index = " << bit_index; + ++num_bits_tested; + prev_bit_index = bit_index; + } + // Check the bits between the last and the end. + for (int i = prev_bit_index + 1; i < bv.size(); ++i) { + EXPECT_EQ(0, bv[i]); + } + EXPECT_EQ(num_bits_tested, bv.NumSetBits()); + } +}; + +// Tests the sieve of Eratosthenes as a way of testing set/reset and I/O. +TEST_F(BitVectorTest, Primes) { + BitVector map; + ComputePrimes(&map); + TestPrimes(map); + // It still works if we use the copy constructor. + BitVector map2(map); + TestPrimes(map2); + // Or if we assign it. + BitVector map3; + map3 = map; + TestPrimes(map3); + // Test file i/o too. 
+ std::string filename = OutputNameToPath("primesbitvector"); + FILE* fp = fopen(filename.c_str(), "wb"); + ASSERT_TRUE(fp != nullptr); + EXPECT_TRUE(map.Serialize(fp)); + fclose(fp); + fp = fopen(filename.c_str(), "rb"); + ASSERT_TRUE(fp != nullptr); + BitVector read_map; + EXPECT_TRUE(read_map.DeSerialize(false, fp)); + fclose(fp); + TestPrimes(read_map); +} + +// Tests the many-to-one setup feature. +TEST_F(BitVectorTest, SetAll) { + // Test the default constructor and set/resetall. + BitVector map(42); + TestAll(map, false); + map.SetAllTrue(); + TestAll(map, true); + map.SetAllFalse(); + TestAll(map, false); +} + +// Tests the values in the tables offset_table_, next_table_, hamming_table_ +// by setting all possible byte patterns and verifying that the NextSetBit and +// NumSetBits functions return the correct values. +TEST_F(BitVectorTest, TestNextSetBit) { + BitVector bv; + for (int spacing = 1; spacing <= 5; ++spacing) { + SetBitPattern(0, 256, spacing, &bv); + ExpectCorrectBits(bv); + } +} + +// Tests the values in hamming_table_ more thoroughly by setting single byte +// patterns for each byte individually. +TEST_F(BitVectorTest, TestNumSetBits) { + BitVector bv; + for (int byte = 0; byte < 256; ++byte) { + SetBitPattern(byte, byte + 1, 1, &bv); + ExpectCorrectBits(bv); + } +} + +} // namespace. diff --git a/tesseract/unittest/capiexample_c_test.c b/tesseract/unittest/capiexample_c_test.c new file mode 100644 index 00000000..5917f0c4 --- /dev/null +++ b/tesseract/unittest/capiexample_c_test.c @@ -0,0 +1,21 @@ +/////////////////////////////////////////////////////////////////////// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Verifies that C is able to include capi header. +#include <tesseract/capi.h> + +// Verifies that the libtesseract library has C API symbols. +int main() +{ + printf("%s\n", TessVersion()); + return 0; +} diff --git a/tesseract/unittest/capiexample_test.cc b/tesseract/unittest/capiexample_test.cc new file mode 100644 index 00000000..3c843056 --- /dev/null +++ b/tesseract/unittest/capiexample_test.cc @@ -0,0 +1,19 @@ +/////////////////////////////////////////////////////////////////////// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Verifies that C++ is able to include capi header. +#include <tesseract/capi.h> + +#include <gtest/gtest.h> + +// Verifies that the libtesseract library has C API symbols. 
+TEST(C, VersionTest) { TessVersion(); } diff --git a/tesseract/unittest/cleanapi_test.cc b/tesseract/unittest/cleanapi_test.cc new file mode 100644 index 00000000..4d284af0 --- /dev/null +++ b/tesseract/unittest/cleanapi_test.cc @@ -0,0 +1,28 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <tesseract/baseapi.h> + +// Dummy enum in the global namespace that checks for collision with awkward +// names. +// If this test fails to compile, clean up the includes in tesseract/baseapi.h! +// They are not supposed to drag in definitions of any of the tesseract +// types included in this enum! +enum NameTester { ABORT, OKAY, LOG, BLOB, ELIST, TBOX, TPOINT, WORD }; + +#include "gtest/gtest.h" + +namespace tesseract { + +// Verifies that the global namespace is clean. +TEST(CleanNamespaceTess, DummyTest) { tesseract::TessBaseAPI api; } + +} // namespace. diff --git a/tesseract/unittest/colpartition_test.cc b/tesseract/unittest/colpartition_test.cc new file mode 100644 index 00000000..caebe605 --- /dev/null +++ b/tesseract/unittest/colpartition_test.cc @@ -0,0 +1,76 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "colpartition.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TestableColPartition : public ColPartition { + public: + void SetColumnRange(int first, int last) { + set_first_column(first); + set_last_column(last); + } +}; + +class ColPartitionTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + void TearDown() {} +}; + +TEST_F(ColPartitionTest, IsInSameColumnAsReflexive) { + TestableColPartition a, b; + a.SetColumnRange(1, 2); + b.SetColumnRange(3, 3); + + EXPECT_TRUE(a.IsInSameColumnAs(a)); + EXPECT_TRUE(b.IsInSameColumnAs(b)); +} + +TEST_F(ColPartitionTest, IsInSameColumnAsBorders) { + TestableColPartition a, b, c, d; + a.SetColumnRange(0, 1); + b.SetColumnRange(1, 2); + c.SetColumnRange(2, 3); + d.SetColumnRange(4, 5); + + EXPECT_TRUE(a.IsInSameColumnAs(b)); + EXPECT_TRUE(b.IsInSameColumnAs(a)); + EXPECT_FALSE(c.IsInSameColumnAs(d)); + EXPECT_FALSE(d.IsInSameColumnAs(c)); + EXPECT_FALSE(a.IsInSameColumnAs(d)); +} + +TEST_F(ColPartitionTest, IsInSameColumnAsSuperset) { + TestableColPartition a, b; + a.SetColumnRange(4, 7); + b.SetColumnRange(2, 8); + + EXPECT_TRUE(a.IsInSameColumnAs(b)); + EXPECT_TRUE(b.IsInSameColumnAs(a)); +} + +TEST_F(ColPartitionTest, IsInSameColumnAsPartialOverlap) { + TestableColPartition a, b; + a.SetColumnRange(3, 8); + b.SetColumnRange(6, 10); + + EXPECT_TRUE(a.IsInSameColumnAs(b)); + EXPECT_TRUE(b.IsInSameColumnAs(a)); +} + +} // namespace diff --git a/tesseract/unittest/commandlineflags_test.cc b/tesseract/unittest/commandlineflags_test.cc new file mode 100644 index 00000000..7b16fbdd --- /dev/null +++ b/tesseract/unittest/commandlineflags_test.cc @@ -0,0 +1,158 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "commandlineflags.h" + +#include "include_gunit.h" + +// Flags used for testing parser. +INT_PARAM_FLAG(foo_int, 0, "Integer flag for testing"); +INT_PARAM_FLAG(bar_int, 0, "Integer flag for testing"); +DOUBLE_PARAM_FLAG(foo_double, 0.1, "Double flag for testing"); +DOUBLE_PARAM_FLAG(bar_double, 0.2, "Double flag for testing"); +STRING_PARAM_FLAG(foo_string, "foo", "String flag for testing"); +STRING_PARAM_FLAG(bar_string, "bar", "String flag for testing"); +BOOL_PARAM_FLAG(foo_bool, false, "Bool flag for testing"); +BOOL_PARAM_FLAG(bar_bool, false, "Bool flag for testing"); +// A flag whose name is a single character, tested for backward +// compatibility. This should be selected to not conflict with existing flags +// in commontraining.cpp. +STRING_PARAM_FLAG(q, "", "Single character name"); + +namespace tesseract { + +class CommandlineflagsTest : public ::testing::Test { + protected: + void TestParser(int argc, const char** const_argv) { + TestParser("", argc, const_argv); + } + void TestParser(const char* usage, int argc, const char** const_argv) { + // Make a copy of the pointer since it can be altered by the function. 
+ char** argv = const_cast<char**>(const_argv); + tesseract::ParseCommandLineFlags(usage, &argc, &argv, true); + } +}; + +TEST_F(CommandlineflagsTest, RemoveFlags) { + const char* const_argv[] = {"Progname", "--foo_int", "3", "file1.h", + "file2.h"}; + int argc = ARRAYSIZE(const_argv); + char** argv = const_cast<char**>(const_argv); + tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); + + // argv should be rearranged to look like { "Progname", "file1.h", "file2.h" } + EXPECT_EQ(3, argc); + EXPECT_STREQ("Progname", argv[0]); + EXPECT_STREQ("file1.h", argv[1]); + EXPECT_STREQ("file2.h", argv[2]); +} + +#if 0 // TODO: this test needs an update (it currently fails). +TEST_F(CommandlineflagsTest, PrintUsageAndExit) { + const char* argv[] = { "Progname", "--help" }; + EXPECT_EXIT(TestParser("Progname [flags]", ARRAYSIZE(argv), argv), + ::testing::ExitedWithCode(0), + "USAGE: Progname \\[flags\\]"); +} +#endif + +TEST_F(CommandlineflagsTest, ExitsWithErrorOnInvalidFlag) { + const char* argv[] = {"", "--test_nonexistent_flag"}; + EXPECT_EXIT(TestParser(ARRAYSIZE(argv), argv), ::testing::ExitedWithCode(1), + "ERROR: Non-existent flag"); +} + +TEST_F(CommandlineflagsTest, ParseIntegerFlags) { + const char* argv[] = {"", "--foo_int=3", "--bar_int", "-4"}; + TestParser(ARRAYSIZE(argv), argv); + EXPECT_EQ(3, FLAGS_foo_int); + EXPECT_EQ(-4, FLAGS_bar_int); + + const char* arg_no_value[] = {"", "--bar_int"}; + EXPECT_EXIT(TestParser(ARRAYSIZE(arg_no_value), arg_no_value), + ::testing::ExitedWithCode(1), "ERROR"); + + const char* arg_invalid_value[] = {"", "--bar_int", "--foo_int=3"}; + EXPECT_EXIT(TestParser(ARRAYSIZE(arg_invalid_value), arg_invalid_value), + ::testing::ExitedWithCode(1), "ERROR"); + + const char* arg_bad_format[] = {"", "--bar_int="}; + EXPECT_EXIT(TestParser(ARRAYSIZE(arg_bad_format), arg_bad_format), + ::testing::ExitedWithCode(1), "ERROR"); +} + +TEST_F(CommandlineflagsTest, ParseDoubleFlags) { + const char* argv[] = {"", "--foo_double=3.14", "--bar_double", "1.2"}; + TestParser(ARRAYSIZE(argv), argv); + + EXPECT_EQ(3.14, FLAGS_foo_double); + EXPECT_EQ(1.2, FLAGS_bar_double); + + const char* arg_no_value[] = {"", "--bar_double"}; + EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), + "ERROR"); + + const char* arg_bad_format[] = {"", "--bar_double="}; + EXPECT_EXIT(TestParser(2, arg_bad_format), ::testing::ExitedWithCode(1), + "ERROR"); +} + +TEST_F(CommandlineflagsTest, ParseStringFlags) { + const char* argv[] = {"", "--foo_string=abc", "--bar_string", "def"}; + TestParser(ARRAYSIZE(argv), argv); + + EXPECT_STREQ("abc", FLAGS_foo_string.c_str()); + EXPECT_STREQ("def", FLAGS_bar_string.c_str()); + + const char* arg_no_value[] = {"", "--bar_string"}; + EXPECT_EXIT(TestParser(2, arg_no_value), ::testing::ExitedWithCode(1), + "ERROR"); + + FLAGS_bar_string.set_value("bar"); + const char* arg_empty_string[] = {"", "--bar_string="}; + TestParser(2, arg_empty_string); + EXPECT_STREQ("", FLAGS_bar_string.c_str()); +} + +TEST_F(CommandlineflagsTest, ParseBoolFlags) { + const char* argv[] = {"", "--foo_bool=true", "--bar_bool=1"}; + FLAGS_foo_bool.set_value(false); + FLAGS_bar_bool.set_value(false); + TestParser(ARRAYSIZE(argv), argv); + // Verify changed value + EXPECT_TRUE(FLAGS_foo_bool); + EXPECT_TRUE(FLAGS_bar_bool); + + const char* inv_argv[] = {"", "--foo_bool=false", "--bar_bool=0"}; + FLAGS_foo_bool.set_value(true); + FLAGS_bar_bool.set_value(true); + TestParser(3, inv_argv); + // Verify changed value + EXPECT_FALSE(FLAGS_foo_bool); + 
EXPECT_FALSE(FLAGS_bar_bool); + + const char* arg_implied_true[] = {"", "--bar_bool"}; + FLAGS_bar_bool.set_value(false); + TestParser(2, arg_implied_true); + EXPECT_TRUE(FLAGS_bar_bool); + + const char* arg_missing_val[] = {"", "--bar_bool="}; + EXPECT_EXIT(TestParser(2, arg_missing_val), ::testing::ExitedWithCode(1), + "ERROR"); +} + +TEST_F(CommandlineflagsTest, ParseOldFlags) { + EXPECT_STREQ("", FLAGS_q.c_str()); + const char* argv[] = {"", "-q", "text"}; + TestParser(ARRAYSIZE(argv), argv); + EXPECT_STREQ("text", FLAGS_q.c_str()); +} +} // namespace diff --git a/tesseract/unittest/cycletimer.h b/tesseract/unittest/cycletimer.h new file mode 100644 index 00000000..e1a13719 --- /dev/null +++ b/tesseract/unittest/cycletimer.h @@ -0,0 +1,61 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// Portability include to match the Google test environment. + +#ifndef TESSERACT_UNITTEST_CYCLETIMER_H +#define TESSERACT_UNITTEST_CYCLETIMER_H + +#include "absl/time/clock.h" // for GetCurrentTimeNanos + +// See https://github.com/google/or-tools/blob/master/ortools/base/timer.h +class CycleTimer { +public: + CycleTimer() { + Reset(); + } + + void Reset() { + running_ = false; + sum_ = 0; + start_ = 0; + } + + // When Start() is called multiple times, only the most recent is used. + void Start() { + running_ = true; + start_ = absl::GetCurrentTimeNanos(); + } + + void Restart() { + sum_ = 0; + Start(); + } + + void Stop() { + if (running_) { + sum_ += absl::GetCurrentTimeNanos() - start_; + running_ = false; + } + } + int64_t GetInMs() const { return GetNanos() / 1000000; } + + protected: + int64_t GetNanos() const { + return running_ ? absl::GetCurrentTimeNanos() - start_ + sum_ : sum_; + } + + private: + bool running_; + int64_t start_; + int64_t sum_; +}; + +#endif // TESSERACT_UNITTEST_CYCLETIMER_H diff --git a/tesseract/unittest/dawg_test.cc b/tesseract/unittest/dawg_test.cc new file mode 100644 index 00000000..4a40b050 --- /dev/null +++ b/tesseract/unittest/dawg_test.cc @@ -0,0 +1,115 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "include_gunit.h" + +#include "ratngs.h" +#include "unicharset.h" +#include "trie.h" + +#include <cstdlib> // for system +#include <fstream> // for ifstream +#include <set> +#include <string> +#include <vector> +#include <sys/stat.h> + +#ifndef SW_TESTING +#define wordlist2dawg_prog "wordlist2dawg" +#define dawg2wordlist_prog "dawg2wordlist" +#endif + +namespace tesseract { + +// Test some basic functionality dealing with Dawgs (compressed dictionaries, +// aka Directed Acyclic Word Graphs). +class DawgTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + void LoadWordlist(const std::string& filename, std::set<std::string>* words) const { + std::ifstream file(filename); + if (file.is_open()) { + std::string line; + while (getline(file, line)) { + // Remove trailing line terminators from line. + while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) { + line.resize(line.size() - 1); + } + // Add line to set. + words->insert(line.c_str()); + } + file.close(); + } + } + std::string TessBinaryPath(const std::string& name) const { + return file::JoinPath(TESSBIN_DIR, name); + } + std::string OutputNameToPath(const std::string& name) const { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + int RunCommand(const std::string& program, const std::string& arg1, + const std::string& arg2, const std::string& arg3) const { + std::string cmdline = + TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3; + return system(cmdline.c_str()); + } + // Test that we are able to convert a wordlist file (one "word" per line) to + // a dawg (a compressed format) and then extract the original wordlist back + // out using the tools "wordlist2dawg" and "dawg2wordlist." + void TestDawgRoundTrip(const std::string& unicharset_filename, + const std::string& wordlist_filename) const { + std::set<std::string> orig_words, roundtrip_words; + std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename); + std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename); + std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg"); + std::string output_wordlist = OutputNameToPath(wordlist_filename); + LoadWordlist(orig_wordlist, &orig_words); + EXPECT_EQ( + RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0); + EXPECT_EQ( + RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), + 0); + LoadWordlist(output_wordlist, &roundtrip_words); + EXPECT_EQ(orig_words, roundtrip_words); + } +}; + +TEST_F(DawgTest, TestDawgConversion) { + TestDawgRoundTrip("eng.unicharset", "eng.wordlist.clean.freq"); +} + +TEST_F(DawgTest, TestMatching) { + UNICHARSET unicharset; + unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str()); + tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, + unicharset.size(), 0); + WERD_CHOICE space_apos(" '", unicharset); + trie.add_word_to_dawg(space_apos); + + WERD_CHOICE space(" ", unicharset); + + // partial match ok - then good! + EXPECT_TRUE(trie.prefix_in_dawg(space, false)); + // require complete match - not present. 
+ EXPECT_FALSE(trie.word_in_dawg(space)); + EXPECT_FALSE(trie.prefix_in_dawg(space, true)); + + // partial or complete match ok for full word: + EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false)); + EXPECT_TRUE(trie.word_in_dawg(space_apos)); + EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true)); +} + +} // namespace diff --git a/tesseract/unittest/denorm_test.cc b/tesseract/unittest/denorm_test.cc new file mode 100644 index 00000000..28328b15 --- /dev/null +++ b/tesseract/unittest/denorm_test.cc @@ -0,0 +1,99 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "blobs.h" +#include "normalis.h" + +#include "include_gunit.h" + +namespace tesseract { + +class DENORMTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + void TearDown() {} + + void ExpectCorrectTransform(const DENORM& denorm, const TPOINT& src, + const TPOINT& result, bool local) { + TPOINT normed; + if (local) + denorm.LocalNormTransform(src, &normed); + else + denorm.NormTransform(nullptr, src, &normed); + EXPECT_EQ(result.x, normed.x); + EXPECT_EQ(result.y, normed.y); + // Now undo + TPOINT denormed; + if (local) + denorm.LocalDenormTransform(normed, &denormed); + else + denorm.DenormTransform(nullptr, normed, &denormed); + EXPECT_EQ(src.x, denormed.x); + EXPECT_EQ(src.y, denormed.y); + } +}; + +// Tests a simple baseline-style normalization. +TEST_F(DENORMTest, NoRotations) { + DENORM denorm; + denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, + 0.0f, static_cast<float>(kBlnBaselineOffset)); + TPOINT pt1(1100, 2000); + TPOINT result1(200, kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt1, result1, true); + ExpectCorrectTransform(denorm, pt1, result1, false); + TPOINT pt2(900, 2100); + TPOINT result2(-200, 300 + kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt2, result2, true); + ExpectCorrectTransform(denorm, pt2, result2, false); +} + +// Tests a simple baseline-style normalization with a rotation. +TEST_F(DENORMTest, WithRotations) { + DENORM denorm; + FCOORD rotation90(0.0f, 1.0f); + denorm.SetupNormalization(nullptr, &rotation90, nullptr, 1000.0f, 2000.0f, 2.0f, + 3.0f, 0.0f, static_cast<float>(kBlnBaselineOffset)); + + TPOINT pt1(1100, 2000); + TPOINT result1(0, 200 + kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt1, result1, true); + ExpectCorrectTransform(denorm, pt1, result1, false); + TPOINT pt2(900, 2100); + TPOINT result2(-300, kBlnBaselineOffset - 200); + ExpectCorrectTransform(denorm, pt2, result2, true); + ExpectCorrectTransform(denorm, pt2, result2, false); +} + +// Tests a simple baseline-style normalization with a second rotation & scale. 
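+// A rough walk-through of the expected arithmetic, assuming kBlnBaselineOffset == 64
+// (its value in blobs.h): denorm maps pt1 = (1050, 2000) to
+// ((1050 - 1000) * 2, (2000 - 2000) * 3 + 64) = (100, 64) = result1. denorm2 then
+// translates by (-128, -128) and scales by (0.5, 0.25), giving (-14, -16), and the
+// 90 degree rotation maps (x, y) to (-y, x), so the final point is (16, -14),
+// i.e. (kBlnBaselineOffset / 4, -14) = result2.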
+TEST_F(DENORMTest, Multiple) { + DENORM denorm; + denorm.SetupNormalization(nullptr, nullptr, nullptr, 1000.0f, 2000.0f, 2.0f, 3.0f, + 0.0f, static_cast<float>(kBlnBaselineOffset)); + + DENORM denorm2; + FCOORD rotation90(0.0f, 1.0f); + denorm2.SetupNormalization(nullptr, &rotation90, &denorm, 128.0f, 128.0f, 0.5f, + 0.25f, 0.0f, 0.0f); + TPOINT pt1(1050, 2000); + TPOINT result1(100, kBlnBaselineOffset); + ExpectCorrectTransform(denorm, pt1, result1, true); + ExpectCorrectTransform(denorm, pt1, result1, false); + TPOINT result2(kBlnBaselineOffset / 4, -14); + ExpectCorrectTransform(denorm2, result1, result2, true); + ExpectCorrectTransform(denorm2, pt1, result2, false); +} + +} // namespace. diff --git a/tesseract/unittest/doubleptr.h b/tesseract/unittest/doubleptr.h new file mode 100644 index 00000000..38628b5f --- /dev/null +++ b/tesseract/unittest/doubleptr.h @@ -0,0 +1,93 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +/////////////////////////////////////////////////////////////////////// +// File: doubleptr.h +// Description: Double-ended pointer that keeps pointing correctly even +// when reallocated or copied. +// Author: Ray Smith +// Created: Wed Mar 14 12:22:57 PDT 2012 +// +// (C) Copyright 2012, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCUTIL_DOUBLEPTR_H_ +#define TESSERACT_CCUTIL_DOUBLEPTR_H_ + +#include "errcode.h" + +namespace tesseract { + +// A smart pointer class that implements a double-ended pointer. Each end +// points to the other end. The copy constructor and operator= have MOVE +// semantics, meaning that the relationship with the other end moves to the +// destination of the copy, leaving the source unattached. +// For this reason both the copy constructor and the operator= take a non-const +// reference argument, and the const reference versions cannot be used. +// DoublePtr is useful to incorporate into structures that are part of a +// collection such as GenericVector or STL containers, where reallocs can +// relocate the members. DoublePtr is also useful in a GenericHeap, where it +// can correctly maintain the pointer to an element of the heap despite it +// getting moved around on the heap. +class DoublePtr { + public: + DoublePtr() : other_end_(nullptr) {} + // Copy constructor steals the partner off src and is therefore a non + // const reference arg. + // Copying a const DoublePtr generates a compiler error. + DoublePtr(const DoublePtr& src) { + other_end_ = src.other_end_; + if (other_end_ != nullptr) { + other_end_->other_end_ = this; + ((DoublePtr&)src).other_end_ = nullptr; + } + } + // Operator= steals the partner off src, and therefore needs src to be a non- + // const reference. + // Assigning from a const DoublePtr generates a compiler error. 
+ void operator=(const DoublePtr& src) { + Disconnect(); + other_end_ = src.other_end_; + if (other_end_ != nullptr) { + other_end_->other_end_ = this; + ((DoublePtr&)src).other_end_ = nullptr; + } + } + + // Connects this and other, discarding any existing connections. + void Connect(DoublePtr* other) { + other->Disconnect(); + Disconnect(); + other->other_end_ = this; + other_end_ = other; + } + // Disconnects this and other, making OtherEnd() return nullptr for both. + void Disconnect() { + if (other_end_ != nullptr) { + other_end_->other_end_ = nullptr; + other_end_ = nullptr; + } + } + // Returns the pointer to the other end of the double pointer. + DoublePtr* OtherEnd() const { + return other_end_; + } + + private: + // Pointer to the other end of the link. It is always true that either + // other_end_ == nullptr or other_end_->other_end_ == this. + DoublePtr* other_end_; +}; + +} // namespace tesseract. + +#endif // THIRD_PARTY_TESSERACT_CCUTIL_DOUBLEPTR_H_ diff --git a/tesseract/unittest/equationdetect_test.cc b/tesseract/unittest/equationdetect_test.cc new file mode 100644 index 00000000..eb52231e --- /dev/null +++ b/tesseract/unittest/equationdetect_test.cc @@ -0,0 +1,549 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" + +#include "colpartitiongrid.h" +#include "equationdetect.h" +#include "tesseractclass.h" + +#include "allheaders.h" + +#include <memory> +#include <string> +#include <utility> + +#define ENABLE_IdentifySpecialText_TEST 0 +#if ENABLE_IdentifySpecialText_TEST +#define EQU_TRAINEDDATA_NAME "equ" +#else +#define EQU_TRAINEDDATA_NAME "equINTENTIONALLY_MISSING_FILE" +#endif + +namespace tesseract { + +class TestableEquationDetect : public EquationDetect { + public: + TestableEquationDetect(const char* tessdata, Tesseract* lang_tesseract) + : EquationDetect(tessdata, EQU_TRAINEDDATA_NAME) { + SetLangTesseract(lang_tesseract); + } + + // Insert a certain math and digit blobs into part. + void AddMathDigitBlobs(const int math_blobs, const int digit_blobs, + const int total_blobs, ColPartition* part) { + CHECK(part != nullptr); + CHECK_LE(math_blobs + digit_blobs, total_blobs); + int count = 0; + for (int i = 0; i < math_blobs; i++, count++) { + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_special_text_type(BSTT_MATH); + part->AddBox(blob); + } + for (int i = 0; i < digit_blobs; i++, count++) { + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_special_text_type(BSTT_DIGIT); + part->AddBox(blob); + } + for (int i = count; i < total_blobs; i++) { + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_special_text_type(BSTT_NONE); + part->AddBox(blob); + } + } + + // Set up pix_binary for lang_tesseract_. 
+ void SetPixBinary(Pix* pix) { + CHECK_EQ(1, pixGetDepth(pix)); + *(lang_tesseract_->mutable_pix_binary()) = pix; + } + + void RunIdentifySpecialText(BLOBNBOX* blob, const int height_th) { + IdentifySpecialText(blob, height_th); + } + + BlobSpecialTextType RunEstimateTypeForUnichar(const char* val) { + const UNICHARSET& unicharset = lang_tesseract_->unicharset; + return EstimateTypeForUnichar(unicharset, unicharset.unichar_to_id(val)); + } + + EquationDetect::IndentType RunIsIndented(ColPartitionGrid* part_grid, + ColPartition* part) { + this->part_grid_ = part_grid; + return IsIndented(part); + } + + bool RunIsNearSmallNeighbor(const TBOX& seed_box, const TBOX& part_box) { + return IsNearSmallNeighbor(seed_box, part_box); + } + + bool RunCheckSeedBlobsCount(ColPartition* part) { + return CheckSeedBlobsCount(part); + } + + float RunComputeForegroundDensity(const TBOX& tbox) { + return ComputeForegroundDensity(tbox); + } + + int RunCountAlignment(const GenericVector<int>& sorted_vec, const int val) { + return CountAlignment(sorted_vec, val); + } + + void RunSplitCPHorLite(ColPartition* part, + GenericVector<TBOX>* splitted_boxes) { + SplitCPHorLite(part, splitted_boxes); + } + + void RunSplitCPHor(ColPartition* part, + GenericVector<ColPartition*>* parts_splitted) { + SplitCPHor(part, parts_splitted); + } + + void TestComputeCPsSuperBBox(const TBOX& box, ColPartitionGrid* part_grid) { + CHECK(part_grid != nullptr); + part_grid_ = part_grid; + ComputeCPsSuperBBox(); + EXPECT_TRUE(*cps_super_bbox_ == box); + } +}; + +class EquationFinderTest : public testing::Test { + protected: + std::unique_ptr<TestableEquationDetect> equation_det_; + std::unique_ptr<Tesseract> tesseract_; + + // The directory for testdata; + std::string testdata_dir_; + + void SetUp() { + std::locale::global(std::locale("")); + tesseract_.reset(new Tesseract()); + tesseract_->init_tesseract(TESSDATA_DIR, "eng", OEM_TESSERACT_ONLY); + tesseract_->set_source_resolution(300); + equation_det_.reset( + new TestableEquationDetect(TESSDATA_DIR, tesseract_.get())); + equation_det_->SetResolution(300); + + testdata_dir_ = TESTDATA_DIR; + } + + void TearDown() { + tesseract_.reset(nullptr); + equation_det_.reset(nullptr); + } + + // Add a BLOCK covering the whole page. + void AddPageBlock(Pix* pix, BLOCK_LIST* blocks) { + CHECK(pix != nullptr); + CHECK(blocks != nullptr); + BLOCK_IT block_it(blocks); + BLOCK* block = + new BLOCK("", true, 0, 0, 0, 0, pixGetWidth(pix), pixGetHeight(pix)); + block_it.add_to_end(block); + } + + // Create col partitions, add into part_grid, and put them into all_parts. + void CreateColParts(const int rows, const int cols, + ColPartitionGrid* part_grid, + std::vector<ColPartition*>* all_parts) { + const int kWidth = 10, kHeight = 10; + ClearParts(all_parts); + for (int y = 0; y < rows; ++y) { + for (int x = 0; x < cols; ++x) { + int left = x * kWidth * 2, bottom = y * kHeight * 2; + TBOX box(left, bottom, left + kWidth, bottom + kHeight); + ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, + BRT_TEXT, BTFT_NONE); + part_grid->InsertBBox(true, true, part); + all_parts->push_back(part); + } + } + } + + void ClearParts(std::vector<ColPartition*>* all_parts) { + for (size_t i = 0; i < all_parts->size(); ++i) { + (*all_parts)[i]->DeleteBoxes(); + delete ((*all_parts)[i]); + } + } + + // Create a BLOBNBOX object with bounding box tbox, and add it into part. 
+ void AddBlobIntoPart(const TBOX& tbox, ColPartition* part) { + CHECK(part != nullptr); + BLOBNBOX* blob = new BLOBNBOX(); + blob->set_bounding_box(tbox); + part->AddBox(blob); + } +}; + +TEST_F(EquationFinderTest, IdentifySpecialText) { +#if !ENABLE_IdentifySpecialText_TEST + GTEST_SKIP(); +#else // TODO: missing equ_gt1.tif + // Load Image. + std::string imagefile = file::JoinPath(testdata_dir_, "equ_gt1.tif"); + Pix* pix_binary = pixRead(imagefile.c_str()); + CHECK(pix_binary != nullptr && pixGetDepth(pix_binary) == 1); + + // Get components. + BLOCK_LIST blocks; + TO_BLOCK_LIST to_blocks; + AddPageBlock(pix_binary, &blocks); + Textord* textord = tesseract_->mutable_textord(); + textord->find_components(pix_binary, &blocks, &to_blocks); + + // Identify special texts from to_blocks. + TO_BLOCK_IT to_block_it(&to_blocks); + std::map<int, int> stt_count; + for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list(); + to_block_it.forward()) { + TO_BLOCK* to_block = to_block_it.data(); + BLOBNBOX_IT blob_it(&(to_block->blobs)); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + BLOBNBOX* blob = blob_it.data(); + // blob->set_special_text_type(BSTT_NONE); + equation_det_->RunIdentifySpecialText(blob, 0); + tensorflow::gtl::InsertIfNotPresent(&stt_count, blob->special_text_type(), 0); + stt_count[blob->special_text_type()]++; + } + } + + // Verify the number, but allow a range of +/- kCountRange before squealing. + const int kCountRange = 3; + EXPECT_GE(39 + kCountRange, stt_count[BSTT_NONE]); + EXPECT_LE(39 - kCountRange, stt_count[BSTT_NONE]); + + // if you count all the subscripts etc, there are ~45 italic chars. + EXPECT_GE(45 + kCountRange, stt_count[BSTT_ITALIC]); + EXPECT_LE(45 - kCountRange, stt_count[BSTT_ITALIC]); + EXPECT_GE(41 + kCountRange, stt_count[BSTT_DIGIT]); + EXPECT_LE(41 - kCountRange, stt_count[BSTT_DIGIT]); + EXPECT_GE(50 + kCountRange, stt_count[BSTT_MATH]); + EXPECT_LE(50 - kCountRange, stt_count[BSTT_MATH]); + EXPECT_GE(10 + kCountRange, stt_count[BSTT_UNCLEAR]); + EXPECT_LE(10 - kCountRange, stt_count[BSTT_UNCLEAR]); + + // Release memory. + pixDestroy(&pix_binary); +#endif +} + +TEST_F(EquationFinderTest, EstimateTypeForUnichar) { + // Test abc characters. + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("a")); + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("c")); + + // Test punctuation characters. + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar("'")); + EXPECT_EQ(BSTT_NONE, equation_det_->RunEstimateTypeForUnichar(",")); + + // Test digits. + EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("1")); + EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("4")); + EXPECT_EQ(BSTT_DIGIT, equation_det_->RunEstimateTypeForUnichar("|")); + + // Test math symbols. 
+ EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar("(")); + EXPECT_EQ(BSTT_MATH, equation_det_->RunEstimateTypeForUnichar("+")); +} + +TEST_F(EquationFinderTest, IsIndented) { + ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000)); + + // Create five ColPartitions: + // part 1: ************ + // part 2: ********* + // part 3: ******* + // part 4: ***** + // + // part 5: ******** + TBOX box1(0, 950, 999, 999); + ColPartition* part1 = + ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part1); + TBOX box2(300, 920, 900, 940); + ColPartition* part2 = + ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part2); + TBOX box3(0, 900, 600, 910); + ColPartition* part3 = + ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part3); + TBOX box4(300, 890, 600, 899); + ColPartition* part4 = + ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part4); + TBOX box5(300, 500, 900, 510); + ColPartition* part5 = + ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part_grid.InsertBBox(true, true, part5); + + // Test + // part1 should be no indent. + EXPECT_EQ(EquationDetect::NO_INDENT, + equation_det_->RunIsIndented(&part_grid, part1)); + // part2 should be left indent in terms of part1. + EXPECT_EQ(EquationDetect::LEFT_INDENT, + equation_det_->RunIsIndented(&part_grid, part2)); + // part3 should be right indent. + EXPECT_EQ(EquationDetect::RIGHT_INDENT, + equation_det_->RunIsIndented(&part_grid, part3)); + // part4 should be both indented. + EXPECT_EQ(EquationDetect::BOTH_INDENT, + equation_det_->RunIsIndented(&part_grid, part4)); + // part5 should be no indent because it is too far from part1. + EXPECT_EQ(EquationDetect::NO_INDENT, + equation_det_->RunIsIndented(&part_grid, part5)); + + // Release memory. + part1->DeleteBoxes(); + delete (part1); + part2->DeleteBoxes(); + delete (part2); + part3->DeleteBoxes(); + delete (part3); + part4->DeleteBoxes(); + delete (part4); + part5->DeleteBoxes(); + delete (part5); +} + +TEST_F(EquationFinderTest, IsNearSmallNeighbor) { + // Create four tboxes: + // part 1, part 2 + // ***** ***** + // part 3: ***** + // + // part 4: ***************** + TBOX box1(0, 950, 499, 999); + TBOX box2(500, 950, 999, 998); + TBOX box3(0, 900, 499, 949); + TBOX box4(0, 550, 499, 590); + + // Test + // box2 should be box1's near neighbor but not vice versa. + EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box2)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box1)); + // box1 and box3 should be near neighbors of each other. + EXPECT_TRUE(equation_det_->RunIsNearSmallNeighbor(box1, box3)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box3)); + // box2 and box3 should not be near neighbors of each other. + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box3)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box2)); + + // box4 should not be the near neighbor of any one. 
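+ // (box4 lies roughly 300 pixels below the other boxes, many times their ~50 pixel
+ // height, so it should fall well outside the distance tolerance used by
+ // IsNearSmallNeighbor.)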
+ EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box1, box4)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box2, box4)); + EXPECT_FALSE(equation_det_->RunIsNearSmallNeighbor(box3, box4)); +} + +TEST_F(EquationFinderTest, CheckSeedBlobsCount) { + TBOX box(0, 950, 999, 999); + ColPartition* part1 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition* part2 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition* part3 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + ColPartition* part4 = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + + // Part 1: 8 math, 0 digit, 20 total. + equation_det_->AddMathDigitBlobs(8, 0, 20, part1); + EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part1)); + + // Part 2: 1 math, 8 digit, 20 total. + equation_det_->AddMathDigitBlobs(1, 8, 20, part2); + EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part2)); + + // Part 3: 3 math, 8 digit, 8 total. + equation_det_->AddMathDigitBlobs(3, 8, 20, part3); + EXPECT_TRUE(equation_det_->RunCheckSeedBlobsCount(part3)); + + // Part 4: 8 math, 0 digit, 8 total. + equation_det_->AddMathDigitBlobs(0, 0, 8, part4); + EXPECT_FALSE(equation_det_->RunCheckSeedBlobsCount(part4)); + + // Release memory. + part1->DeleteBoxes(); + delete (part1); + part2->DeleteBoxes(); + delete (part2); + part3->DeleteBoxes(); + delete (part3); + part4->DeleteBoxes(); + delete (part4); +} + +TEST_F(EquationFinderTest, ComputeForegroundDensity) { + // Create the pix with top half foreground, bottom half background. + int width = 1024, height = 768; + Pix* pix = pixCreate(width, height, 1); + pixRasterop(pix, 0, 0, width, height / 2, PIX_SET, nullptr, 0, 0); + TBOX box1(100, 0, 140, 140), box2(100, height / 2 - 20, 140, height / 2 + 20), + box3(100, height - 40, 140, height); + equation_det_->SetPixBinary(pix); + + // Verify + EXPECT_NEAR(0.0, equation_det_->RunComputeForegroundDensity(box1), 0.0001f); + EXPECT_NEAR(0.5, equation_det_->RunComputeForegroundDensity(box2), 0.0001f); + EXPECT_NEAR(1.0, equation_det_->RunComputeForegroundDensity(box3), 0.0001f); +} + +TEST_F(EquationFinderTest, CountAlignment) { + GenericVector<int> vec; + vec.push_back(1); + vec.push_back(1); + vec.push_back(1); + vec.push_back(100); + vec.push_back(200); + vec.push_back(200); + + // Test the right point. + EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 1)); + EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 100)); + EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 200)); + + // Test the near neighbors. + EXPECT_EQ(3, equation_det_->RunCountAlignment(vec, 3)); + EXPECT_EQ(1, equation_det_->RunCountAlignment(vec, 99)); + EXPECT_EQ(2, equation_det_->RunCountAlignment(vec, 202)); + + // Test the far neighbors. 
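+ // (150, 50 and 250 are each at least 50 away from every entry in vec, far beyond
+ // the small alignment tolerance exercised above, so no entries should be counted.)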
+ EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 150)); + EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 50)); + EXPECT_EQ(0, equation_det_->RunCountAlignment(vec, 250)); +} + +TEST_F(EquationFinderTest, ComputeCPsSuperBBox) { + Pix* pix = pixCreate(1001, 1001, 1); + equation_det_->SetPixBinary(pix); + ColPartitionGrid part_grid(10, ICOORD(0, 0), ICOORD(1000, 1000)); + + TBOX box1(0, 0, 999, 99); + ColPartition* part1 = + ColPartition::FakePartition(box1, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box2(0, 100, 499, 199); + ColPartition* part2 = + ColPartition::FakePartition(box2, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box3(500, 100, 999, 199); + ColPartition* part3 = + ColPartition::FakePartition(box3, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box4(0, 200, 999, 299); + ColPartition* part4 = + ColPartition::FakePartition(box4, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + TBOX box5(0, 900, 999, 999); + ColPartition* part5 = + ColPartition::FakePartition(box5, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + + // Add part1->part3 into part_grid and test. + part_grid.InsertBBox(true, true, part1); + part_grid.InsertBBox(true, true, part2); + part_grid.InsertBBox(true, true, part3); + TBOX super_box(0, 0, 999, 199); + equation_det_->TestComputeCPsSuperBBox(super_box, &part_grid); + + // Add part4 and test. + part_grid.InsertBBox(true, true, part4); + TBOX super_box2(0, 0, 999, 299); + equation_det_->TestComputeCPsSuperBBox(super_box2, &part_grid); + + // Add part5 and test. + part_grid.InsertBBox(true, true, part5); + TBOX super_box3(0, 0, 999, 999); + equation_det_->TestComputeCPsSuperBBox(super_box3, &part_grid); + + // Release memory. + part1->DeleteBoxes(); + delete (part1); + part2->DeleteBoxes(); + delete (part2); + part3->DeleteBoxes(); + delete (part3); + part4->DeleteBoxes(); + delete (part4); + part5->DeleteBoxes(); + delete (part5); +} + +TEST_F(EquationFinderTest, SplitCPHorLite) { + TBOX box(0, 0, 999, 99); + ColPartition* part = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part->DeleteBoxes(); + part->set_median_width(10); + GenericVector<TBOX> splitted_boxes; + + // Test an empty part. + equation_det_->RunSplitCPHorLite(part, &splitted_boxes); + EXPECT_TRUE(splitted_boxes.empty()); + + // Test with one blob. + AddBlobIntoPart(TBOX(0, 0, 10, 50), part); + equation_det_->RunSplitCPHorLite(part, &splitted_boxes); + EXPECT_EQ(1, splitted_boxes.size()); + EXPECT_TRUE(TBOX(0, 0, 10, 50) == splitted_boxes[0]); + + // Add more blob and test. + AddBlobIntoPart(TBOX(11, 0, 20, 60), part); + AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. + AddBlobIntoPart(TBOX(100, 0, 110, 15), part); + AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. + AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. + equation_det_->RunSplitCPHorLite(part, &splitted_boxes); + // Verify. + EXPECT_EQ(3, splitted_boxes.size()); + EXPECT_TRUE(TBOX(0, 0, 30, 60) == splitted_boxes[0]); + EXPECT_TRUE(TBOX(100, 0, 140, 45) == splitted_boxes[1]); + EXPECT_TRUE(TBOX(500, 0, 540, 35) == splitted_boxes[2]); + + part->DeleteBoxes(); + delete (part); +} + +TEST_F(EquationFinderTest, SplitCPHor) { + TBOX box(0, 0, 999, 99); + ColPartition* part = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part->DeleteBoxes(); + part->set_median_width(10); + GenericVector<ColPartition*> parts_splitted; + + // Test an empty part. 
+ equation_det_->RunSplitCPHor(part, &parts_splitted); + EXPECT_TRUE(parts_splitted.empty()); + // Test with one blob. + AddBlobIntoPart(TBOX(0, 0, 10, 50), part); + + equation_det_->RunSplitCPHor(part, &parts_splitted); + EXPECT_EQ(1, parts_splitted.size()); + EXPECT_TRUE(TBOX(0, 0, 10, 50) == parts_splitted[0]->bounding_box()); + + // Add more blob and test. + AddBlobIntoPart(TBOX(11, 0, 20, 60), part); + AddBlobIntoPart(TBOX(25, 0, 30, 55), part); // break point. + AddBlobIntoPart(TBOX(100, 0, 110, 15), part); + AddBlobIntoPart(TBOX(125, 0, 140, 45), part); // break point. + AddBlobIntoPart(TBOX(500, 0, 540, 35), part); // break point. + equation_det_->RunSplitCPHor(part, &parts_splitted); + + // Verify. + EXPECT_EQ(3, parts_splitted.size()); + EXPECT_TRUE(TBOX(0, 0, 30, 60) == parts_splitted[0]->bounding_box()); + EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box()); + EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box()); + + parts_splitted.delete_data_pointers(); + part->DeleteBoxes(); + delete (part); +} + +} // namespace tesseract diff --git a/tesseract/unittest/fileio_test.cc b/tesseract/unittest/fileio_test.cc new file mode 100644 index 00000000..00488918 --- /dev/null +++ b/tesseract/unittest/fileio_test.cc @@ -0,0 +1,66 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include <stdio.h> +#include <memory> + +#include "absl/strings/str_split.h" + +#include "fileio.h" +#include "include_gunit.h" + +namespace tesseract { + +TEST(FileTest, JoinPath) { + EXPECT_EQ("/abc/def", File::JoinPath("/abc", "def")); + EXPECT_EQ("/abc/def", File::JoinPath("/abc/", "def")); + EXPECT_EQ("def", File::JoinPath("", "def")); +} + +TEST(OutputBufferTest, WriteString) { + const int kMaxBufSize = 128; + char buffer[kMaxBufSize]; + for (int i = 0; i < kMaxBufSize; ++i) buffer[i] = '\0'; + FILE* fp = tmpfile(); + CHECK(fp != nullptr); + + std::unique_ptr<OutputBuffer> output(new OutputBuffer(fp)); + output->WriteString("Hello "); + output->WriteString("world!"); + + rewind(fp); + auto s = "Hello world!"; + fread(buffer, strlen(s), 1, fp); + EXPECT_STREQ(s, buffer); +} + +TEST(InputBufferTest, Read) { + const int kMaxBufSize = 128; + char buffer[kMaxBufSize]; + auto s = "Hello\n world!"; + strncpy(buffer, s, kMaxBufSize); + EXPECT_STREQ(s, buffer); + FILE* fp = tmpfile(); + CHECK(fp != nullptr); + fwrite(buffer, strlen(s), 1, fp); + rewind(fp); + + std::string str; + std::unique_ptr<InputBuffer> input(new InputBuffer(fp)); + EXPECT_TRUE(input->Read(&str)); + std::vector<std::string> lines = absl::StrSplit(str, '\n', absl::SkipEmpty()); + EXPECT_EQ(2, lines.size()); + EXPECT_EQ("Hello", lines[0]); + EXPECT_EQ(" world!", lines[1]); +} + +} // namespace diff --git a/tesseract/unittest/fuzzers/fuzzer-api.cpp b/tesseract/unittest/fuzzers/fuzzer-api.cpp new file mode 100644 index 00000000..a1e4e7c4 --- /dev/null +++ b/tesseract/unittest/fuzzers/fuzzer-api.cpp @@ -0,0 +1,101 @@ +#include <tesseract/baseapi.h> +#include <allheaders.h> + +#include <libgen.h> // for dirname +#include <cstdio> // for printf +#include <cstdlib> // for std::getenv, std::setenv +#include <string> // for std::string + +#ifndef TESSERACT_FUZZER_WIDTH +#define TESSERACT_FUZZER_WIDTH 100 +#endif + +#ifndef TESSERACT_FUZZER_HEIGHT +#define TESSERACT_FUZZER_HEIGHT 100 +#endif + +class BitReader { + private: + uint8_t const* data; + size_t size; + size_t shift; + + public: + BitReader(const uint8_t* data, size_t size) + : data(data), size(size), shift(0) {} + + int Read(void) { + if (size == 0) { + return 0; + } + + const int ret = ((*data) >> shift) & 1; + + shift++; + if (shift >= 8) { + shift = 0; + data++; + size--; + } + + return ret; + } +}; + +static tesseract::TessBaseAPI* api = nullptr; + +extern "C" int LLVMFuzzerInitialize(int* /*pArgc*/, char*** pArgv) { + if (std::getenv("TESSDATA_PREFIX") == nullptr) { + std::string binary_path = *pArgv[0]; + const std::string filepath = dirname(&binary_path[0]); + + const std::string tessdata_path = filepath + "/" + "tessdata"; + if (setenv("TESSDATA_PREFIX", tessdata_path.c_str(), 1) != 0) { + printf("Setenv failed\n"); + std::abort(); + } + } + + api = new tesseract::TessBaseAPI(); + if (api->Init(nullptr, "eng") != 0) { + printf("Cannot initialize API\n"); + abort(); + } + + /* Silence output */ + api->SetVariable("debug_file", "/dev/null"); + + return 0; +} + +static PIX* createPix(BitReader& BR, const size_t width, const size_t height) { + Pix* pix = pixCreate(width, height, 1); + + if (pix == nullptr) { + printf("pix creation failed\n"); + abort(); + } + + for (size_t i = 0; i < width; i++) { + for (size_t j = 0; j < height; j++) { + pixSetPixel(pix, i, j, BR.Read()); + } + } + + return pix; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + BitReader BR(data, size); + + auto pix = createPix(BR, TESSERACT_FUZZER_WIDTH, 
TESSERACT_FUZZER_HEIGHT); + + api->SetImage(pix); + + char* outText = api->GetUTF8Text(); + + pixDestroy(&pix); + delete[] outText; + + return 0; +} diff --git a/tesseract/unittest/fuzzers/oss-fuzz-build.sh b/tesseract/unittest/fuzzers/oss-fuzz-build.sh new file mode 100755 index 00000000..d10f2d80 --- /dev/null +++ b/tesseract/unittest/fuzzers/oss-fuzz-build.sh @@ -0,0 +1,59 @@ +#!/bin/bash -eu +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +cd $SRC/leptonica +./autogen.sh +./configure --disable-shared +make SUBDIRS=src install -j$(nproc) +ldconfig + +cd $SRC/tesseract +./autogen.sh +CXXFLAGS="$CXXFLAGS -D_GLIBCXX_DEBUG" ./configure --disable-graphics --disable-shared +make -j$(nproc) + +cp -R $SRC/tessdata $OUT + +$CXX $CXXFLAGS \ + -I $SRC/tesseract/include \ + -I/usr/local/include/leptonica \ + $SRC/tesseract/unittest/fuzzers/fuzzer-api.cpp -o $OUT/fuzzer-api \ + $SRC/tesseract/.libs/libtesseract.a \ + /usr/local/lib/liblept.a \ + /usr/lib/x86_64-linux-gnu/libtiff.a \ + /usr/lib/x86_64-linux-gnu/libpng.a \ + /usr/lib/x86_64-linux-gnu/libjpeg.a \ + /usr/lib/x86_64-linux-gnu/libjbig.a \ + /usr/lib/x86_64-linux-gnu/liblzma.a \ + -lz \ + $LIB_FUZZING_ENGINE + +$CXX $CXXFLAGS \ + -DTESSERACT_FUZZER_WIDTH=512 \ + -DTESSERACT_FUZZER_HEIGHT=256 \ + -I $SRC/tesseract/include \ + -I/usr/local/include/leptonica \ + $SRC/tesseract/unittest/fuzzers/fuzzer-api.cpp -o $OUT/fuzzer-api-512x256 \ + $SRC/tesseract/.libs/libtesseract.a \ + /usr/local/lib/liblept.a \ + /usr/lib/x86_64-linux-gnu/libtiff.a \ + /usr/lib/x86_64-linux-gnu/libpng.a \ + /usr/lib/x86_64-linux-gnu/libjpeg.a \ + /usr/lib/x86_64-linux-gnu/libjbig.a \ + /usr/lib/x86_64-linux-gnu/liblzma.a \ + -lz \ + $LIB_FUZZING_ENGINE diff --git a/tesseract/unittest/heap_test.cc b/tesseract/unittest/heap_test.cc new file mode 100644 index 00000000..c2754181 --- /dev/null +++ b/tesseract/unittest/heap_test.cc @@ -0,0 +1,202 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include_gunit.h" + +#include "doubleptr.h" +#include "genericheap.h" +#include "genericvector.h" +#include "kdpair.h" + +#include <string> +#include <utility> + +namespace tesseract { + +int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0}; + +// The fixture for testing GenericHeap and DoublePtr. 
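+// (IntKDPair and KDVector come from kdpair.h: a key/data pair of ints and a
+// GenericVector of such pairs. GenericHeap over this pair type behaves as a
+// min-heap keyed on key(), which is what the ordering checks below rely on.)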
+class HeapTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + virtual ~HeapTest(); + // Pushes the test data onto both the heap and the KDVector. + void PushTestData(GenericHeap<IntKDPair>* heap, KDVector* v) { + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + IntKDPair pair(test_data[i], i); + heap->Push(&pair); + v->push_back(pair); + } + } + // Verifies that the data in the heap matches the vector (after sorting) by + // popping everything off the heap. + void VerifyHeapVectorMatch(GenericHeap<IntKDPair>* heap, KDVector* v) { + EXPECT_FALSE(heap->empty()); + EXPECT_EQ(heap->size(), v->size()); + // Sort the vector and check that the keys come out of the heap in the same + // order as v. + // Also check that the indices match, except for 9, which is duplicated. + v->sort(); + // Check that we have increasing order. + EXPECT_LT((*v)[0].key(), v->back().key()); + for (int i = 0; i < v->size(); ++i) { + EXPECT_EQ((*v)[i].key(), heap->PeekTop().key()); + // Indices don't necessarily match for equal keys, so don't test them. + if (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) { + while (i + 1 < v->size() && (*v)[i + 1].key() == (*v)[i].key()) { + heap->Pop(nullptr); + ++i; + EXPECT_FALSE(heap->empty()); + EXPECT_EQ((*v)[i].key(), heap->PeekTop().key()); + } + } else { + // The indices must also match if the key is unique. + EXPECT_EQ((*v)[i].data(), heap->PeekTop().data()); + } + EXPECT_FALSE(heap->empty()); + EXPECT_TRUE(heap->Pop(nullptr)); + } + EXPECT_TRUE(heap->empty()); + } +}; + +// Destructor. +// It is defined here, so the compiler can create a single vtable +// instead of a weak vtable (fixes compiler warning). +HeapTest::~HeapTest() = default; + +// Tests that a sort using a GenericHeap matches the result of a sort using +// a KDVector. +TEST_F(HeapTest, SortTest) { + GenericHeap<IntKDPair> heap; + EXPECT_TRUE(heap.empty()); + KDVector v; + EXPECT_EQ(heap.size(), v.size()); + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + VerifyHeapVectorMatch(&heap, &v); +} + +// Tests that pushing some stuff, popping some stuff, and then pushing more +// stuff results in output that matches the sort using a KDVector. +// a KDVector. +TEST_F(HeapTest, MixedTest) { + GenericHeap<IntKDPair> heap; + KDVector v; + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + // Sort the vector and remove the first 5 values from both heap and v. + v.sort(); + for (int i = 0; i < 5; ++i) { + heap.Pop(nullptr); + v.remove(0); + } + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + // Heap and vector should still match! + VerifyHeapVectorMatch(&heap, &v); +} + +// Tests that PopWorst still leaves the heap in a state such that it still +// matches a sorted KDVector. +TEST_F(HeapTest, PopWorstTest) { + GenericHeap<IntKDPair> heap; + KDVector v; + // Push the test data onto both the heap and the KDVector. + PushTestData(&heap, &v); + // Get the worst element off the heap. + IntKDPair pair; + heap.PopWorst(&pair); + EXPECT_EQ(pair.key(), 65536); + EXPECT_EQ(pair.data(), 6); + // Sort and remove the worst element from the vector. + v.sort(); + v.truncate(v.size() - 1); + // After that they should still match! + VerifyHeapVectorMatch(&heap, &v); +} + +// Tests that Reshuffle works and the heap still matches a KDVector with the +// same value changed. Doubles up as a test of DoublePtr. 
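+// The pattern being exercised: each heap entry carries a DoublePtr connected to a
+// matching entry in an external vector, so the vector side can reach its heap entry
+// via OtherEnd(), rewrite the key stored there, and then call Reshuffle() to restore
+// the heap invariant.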
+TEST_F(HeapTest, RevalueTest) { + // Here the data element of the pair is a DoublePtr, which links the entries + // in the vector and heap, and we test a MAX heap. + typedef KDPairDec<int, DoublePtr> PtrPair; + GenericHeap<PtrPair> heap; + GenericVector<PtrPair> v; + // Push the test data onto both the heap and the vector. + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + PtrPair h_pair; + h_pair.key() = test_data[i]; + PtrPair v_pair; + v_pair.key() = test_data[i]; + h_pair.data().Connect(&v_pair.data()); + heap.Push(&h_pair); + v.push_back(v_pair); + } + // Test changes both ways. Index 0 is 8, so change it to -1. + v[0].key() = -1; + // v[0].data.OtherEnd() is a pointer to the data element in the appropriate + // heap entry, wherever it may be. We can change its value via that pointer. + // Without Reshuffle, that would be a terribly bad thing to do, as it violates + // the heap invariant, making the heap corrupt. + PtrPair* pair_ptr = reinterpret_cast<PtrPair*>(v[0].data().OtherEnd()); + pair_ptr->key() = v[0].key(); + heap.Reshuffle(pair_ptr); + // Index 1 is 1. Change to 32767. + v[1].key() = 32767; + pair_ptr = reinterpret_cast<PtrPair*>(v[1].data().OtherEnd()); + pair_ptr->key() = v[1].key(); + heap.Reshuffle(pair_ptr); + // After the changes, popping the heap should still match the sorted order + // of the vector. + v.sort(); + EXPECT_GT(v[0].key(), v.back().key()); + for (int i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i].key(), heap.PeekTop().key()); + EXPECT_FALSE(heap.empty()); + heap.Pop(nullptr); + } + EXPECT_TRUE(heap.empty()); +} + +#if 0 +// Helper checks that the compiler rejects use of a copy constructor with +// a const argument and the default copy constructor is properly hidden by +// the non-const version. +static void ConstRefTest(const DoublePtr& ptr1) { + DoublePtr ptr2(ptr1); // Compiler error here. + EXPECT_EQ(&ptr2, ptr2.OtherEnd()->OtherEnd()); + EXPECT_TRUE(ptr1.OtherEnd() == nullptr); +} +#endif + +// Tests that DoublePtr works as expected. +TEST_F(HeapTest, DoublePtrTest) { + DoublePtr ptr1; + DoublePtr ptr2; + ptr1.Connect(&ptr2); + // Check that the correct copy constructor is used. + DoublePtr ptr3(ptr1); + EXPECT_EQ(&ptr3, ptr3.OtherEnd()->OtherEnd()); + EXPECT_TRUE(ptr1.OtherEnd() == nullptr); + // Check that the correct operator= is used. + ptr1 = ptr3; + EXPECT_EQ(&ptr1, ptr1.OtherEnd()->OtherEnd()); + EXPECT_TRUE(ptr3.OtherEnd() == nullptr); +} + +} // namespace tesseract diff --git a/tesseract/unittest/imagedata_test.cc b/tesseract/unittest/imagedata_test.cc new file mode 100644 index 00000000..31bd2f24 --- /dev/null +++ b/tesseract/unittest/imagedata_test.cc @@ -0,0 +1,131 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> +#include <vector> + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" + +#include "imagedata.h" +#include "include_gunit.h" +#include "log.h" + +namespace tesseract { + +// Tests the caching mechanism of DocumentData/ImageData. 
+ +class ImagedataTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + ImagedataTest() {} + + // Creates a fake DocumentData, writes it to a file, and returns the filename. + std::string MakeFakeDoc(int num_pages, unsigned doc_id, + std::vector<std::string>* page_texts) { + // The size of the fake images that we will use. + const int kImageSize = 1048576; + // Not using a real image here - just an array of zeros! We are just testing + // that the truth text matches. + std::vector<char> fake_image(kImageSize, 0); + DocumentData write_doc("My document"); + for (int p = 0; p < num_pages; ++p) { + // Make some fake text that is different for each page and save it. + page_texts->push_back( + absl::StrFormat("Page %d of %d in doc %u", p, num_pages, doc_id)); + // Make an imagedata and put it in the document. + ImageData* imagedata = + ImageData::Build("noname", p, "eng", fake_image.data(), + fake_image.size(), (*page_texts)[p].c_str(), nullptr); + EXPECT_EQ(kImageSize, imagedata->MemoryUsed()); + write_doc.AddPageToDocument(imagedata); + } + // Write it to a file. + std::string filename = file::JoinPath( + FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf")); + EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr)); + return filename; + } +}; + +TEST_F(ImagedataTest, CachesProperly) { + // This test verifies that Imagedata can be stored in a DocumentData and a + // collection of them is cached correctly given limited memory. + // Number of pages to put in the fake document. + const int kNumPages = 12; + // Allowances to read the document. Big enough for 1, 3, 0, all pages. + const int kMemoryAllowances[] = {2000000, 4000000, 1000000, 100000000, 0}; + // Order in which to read the pages, with some sequential and some seeks. + const int kPageReadOrder[] = {0, 1, 2, 3, 8, 4, 5, 6, 7, 11, 10, 9, -1}; + + std::vector<std::string> page_texts; + std::string filename = MakeFakeDoc(kNumPages, 0, &page_texts); + // Now try getting it back with different memory allowances and check that + // the pages can still be read. + for (int m = 0; kMemoryAllowances[m] > 0; ++m) { + DocumentData read_doc("My document"); + EXPECT_TRUE( + read_doc.LoadDocument(filename.c_str(), 0, kMemoryAllowances[m], nullptr)); + LOG(ERROR) << "Allowance = " << kMemoryAllowances[m]; + // Read the pages in a specific order. + for (int p = 0; kPageReadOrder[p] >= 0; ++p) { + int page = kPageReadOrder[p]; + const ImageData* imagedata = read_doc.GetPage(page); + EXPECT_NE(nullptr, imagedata); + //EXPECT_NE(reinterpret_cast<ImageData*>(nullptr), imagedata); + // Check that this is the right page. + EXPECT_STREQ(page_texts[page].c_str(), + imagedata->transcription().c_str()); + } + } +} + +TEST_F(ImagedataTest, CachesMultiDocs) { + // This test verifies that DocumentCache works to store multiple DocumentData + // and the two caching strategies read images in the right order. + // Number of pages in each document. + const std::vector<int> kNumPages = {6, 5, 7}; + std::vector<std::vector<std::string>> page_texts; + std::vector<STRING> filenames; + for (size_t d = 0; d < kNumPages.size(); ++d) { + page_texts.emplace_back(std::vector<std::string>()); + std::string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back()); + filenames.push_back(STRING(filename.c_str())); + } + // Now try getting them back with different cache strategies and check that + // the pages come out in the right order. 
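+ // With CS_ROUND_ROBIN, serial page p should come from document p % num_docs,
+ // cycling through each document's pages in turn; with CS_SEQUENTIAL, the first
+ // kNumPages[0] serial pages come from document 0, the next kNumPages[0] from
+ // document 1, and so on. The index arithmetic below mirrors that mapping.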
+ DocumentCache robin_cache(8000000); + robin_cache.LoadDocuments(filenames, tesseract::CS_ROUND_ROBIN, nullptr); + DocumentCache serial_cache(8000000); + serial_cache.LoadDocuments(filenames, tesseract::CS_SEQUENTIAL, nullptr); + for (int p = 0; p <= 21; ++p) { + LOG(INFO) << "Page " << p; + const ImageData* robin_data = robin_cache.GetPageBySerial(p); + const ImageData* serial_data = serial_cache.GetPageBySerial(p); + CHECK(robin_data != nullptr); + CHECK(serial_data != nullptr); + int robin_doc = p % kNumPages.size(); + int robin_page = p / kNumPages.size() % kNumPages[robin_doc]; + // Check that this is the right page. + EXPECT_STREQ(page_texts[robin_doc][robin_page].c_str(), + robin_data->transcription().c_str()); + int serial_doc = p / kNumPages[0] % kNumPages.size(); + int serial_page = p % kNumPages[0] % kNumPages[serial_doc]; + EXPECT_STREQ(page_texts[serial_doc][serial_page].c_str(), + serial_data->transcription().c_str()); + } +} + +} // namespace. diff --git a/tesseract/unittest/include_gunit.h b/tesseract/unittest/include_gunit.h new file mode 100644 index 00000000..568326cb --- /dev/null +++ b/tesseract/unittest/include_gunit.h @@ -0,0 +1,76 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// Portability include to match the Google test environment. + +#ifndef TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ +#define TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ + +#include "errcode.h" // for ASSERT_HOST +#include "fileio.h" // for tesseract::File +#include "log.h" // for LOG +#include "gtest/gtest.h" + +const char* FLAGS_test_tmpdir = "./tmp"; + +class file : public tesseract::File { +public: + + static void MakeTmpdir() { +#if defined(_WIN32) + _mkdir(FLAGS_test_tmpdir); +#else + mkdir(FLAGS_test_tmpdir, S_IRWXU | S_IRWXG); +#endif + } + +// Create a file and write a string to it. + static bool WriteStringToFile(const std::string& contents, const std::string& filename) { + File::WriteStringToFileOrDie(contents, filename); + return true; + } + + static bool GetContents(const std::string& filename, std::string* out, int) { + return File::ReadFileToString(filename, out); + } + + static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) { + return WriteStringToFile(contents, name); + } + + static int Defaults() { + return 0; + } + + static std::string JoinPath(const std::string& s1, const std::string& s2) { + return tesseract::File::JoinPath(s1, s2); + } + + static std::string JoinPath(const std::string& s1, const std::string& s2, + const std::string& s3) { + return JoinPath(JoinPath(s1, s2), s3); + } +}; + +#define ARRAYSIZE(arr) (sizeof(arr) / sizeof(arr[0])) + +// /usr/include/tensorflow/core/platform/default/logging.h defines the CHECK* macros. 
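+// The fallbacks below are only compiled when no such header has already supplied
+// CHECK; each one routes a "Check failed" message to LOG(FATAL) when its condition
+// does not hold, e.g. CHECK(fp != nullptr) as used in several of these tests.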
+#if !defined(CHECK) +#define CHECK(condition) \ + if (!(condition)) \ + LOG(FATAL) << "Check failed: " #condition " " +#define CHECK_EQ(test, value) CHECK((test) == (value)) +#define CHECK_GT(test, value) CHECK((test) > (value)) +#define CHECK_LT(test, value) CHECK((test) < (value)) +#define CHECK_LE(test, value) CHECK((test) <= (value)) +#define CHECK_OK(test) CHECK(test) +#endif + +#endif // TESSERACT_UNITTEST_INCLUDE_GUNIT_H_ diff --git a/tesseract/unittest/indexmapbidi_test.cc b/tesseract/unittest/indexmapbidi_test.cc new file mode 100644 index 00000000..bdd3c895 --- /dev/null +++ b/tesseract/unittest/indexmapbidi_test.cc @@ -0,0 +1,117 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cmath> +#include <cstdio> +#include <string> + +#include "indexmapbidi.h" + +#include "include_gunit.h" + +const int kPrimeLimit = 1000; + +namespace tesseract { + +class IndexMapBiDiTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + public: + std::string OutputNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + // Computes primes up to kPrimeLimit, using the sieve of Eratosthenes. + void ComputePrimes(IndexMapBiDi* map) { + map->Init(kPrimeLimit + 1, false); + map->SetMap(2, true); + // Set all the odds to true. + for (int i = 3; i <= kPrimeLimit; i += 2) map->SetMap(i, true); + int factor_limit = static_cast<int>(sqrt(1.0 + kPrimeLimit)); + for (int f = 3; f <= factor_limit; f += 2) { + if (map->SparseToCompact(f) >= 0) { + for (int m = 2; m * f <= kPrimeLimit; ++m) map->SetMap(f * m, false); + } + } + map->Setup(); + } + + void TestPrimes(const IndexMap& map) { + // Now all primes are mapped in the sparse map to their index. + // According to Wikipedia, the 168th prime is 997, and it has compact + // index 167 because we are indexing from 0. + EXPECT_EQ(167, map.SparseToCompact(997)); + EXPECT_EQ(997, map.CompactToSparse(167)); + // 995, 996, 998, 999 are not prime. + EXPECT_EQ(-1, map.SparseToCompact(995)); + EXPECT_EQ(-1, map.SparseToCompact(996)); + EXPECT_EQ(-1, map.SparseToCompact(998)); + EXPECT_EQ(-1, map.SparseToCompact(999)); + // The 167th prime is 991. + EXPECT_EQ(991, map.CompactToSparse(166)); + // There are 168 primes in 0..1000. + EXPECT_EQ(168, map.CompactSize()); + EXPECT_EQ(kPrimeLimit + 1, map.SparseSize()); + } +}; + +// Tests the sieve of Eratosthenes as a way of testing setup. +TEST_F(IndexMapBiDiTest, Primes) { + IndexMapBiDi map; + ComputePrimes(&map); + TestPrimes(map); + // It still works if we assign it to another. + IndexMapBiDi map2; + map2.CopyFrom(map); + TestPrimes(map2); + // Or if we assign it to a base class. + IndexMap base_map; + base_map.CopyFrom(map); + TestPrimes(base_map); + // Test file i/o too. 
+ std::string filename = OutputNameToPath("primesmap"); + FILE* fp = fopen(filename.c_str(), "wb"); + CHECK(fp != nullptr); + EXPECT_TRUE(map.Serialize(fp)); + fclose(fp); + fp = fopen(filename.c_str(), "rb"); + CHECK(fp != nullptr); + IndexMapBiDi read_map; + EXPECT_TRUE(read_map.DeSerialize(false, fp)); + fclose(fp); + TestPrimes(read_map); +} + +// Tests the many-to-one setup feature. +TEST_F(IndexMapBiDiTest, ManyToOne) { + // Test the example in the comment on CompleteMerges. + IndexMapBiDi map; + map.Init(13, false); + map.SetMap(2, true); + map.SetMap(4, true); + map.SetMap(7, true); + map.SetMap(9, true); + map.SetMap(11, true); + map.Setup(); + map.Merge(map.SparseToCompact(2), map.SparseToCompact(9)); + map.Merge(map.SparseToCompact(4), map.SparseToCompact(11)); + map.CompleteMerges(); + EXPECT_EQ(3, map.CompactSize()); + EXPECT_EQ(13, map.SparseSize()); + EXPECT_EQ(1, map.SparseToCompact(4)); + EXPECT_EQ(4, map.CompactToSparse(1)); + EXPECT_EQ(1, map.SparseToCompact(11)); +} + +} // namespace. diff --git a/tesseract/unittest/intfeaturemap_test.cc b/tesseract/unittest/intfeaturemap_test.cc new file mode 100644 index 00000000..e95aa0c3 --- /dev/null +++ b/tesseract/unittest/intfeaturemap_test.cc @@ -0,0 +1,129 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "intfeaturemap.h" +#include "intfeaturespace.h" + +#include "include_gunit.h" + +// Random re-quantization to test that they don't have to be easy. +// WARNING! Change these and change the expected_misses calculation below. +const int kXBuckets = 16; +const int kYBuckets = 24; +const int kThetaBuckets = 13; + +namespace tesseract { + +class IntFeatureMapTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + // Expects that the given vector has contiguous integer values in the + // range [start, end). + void ExpectContiguous(const GenericVector<int>& v, int start, int end) { + for (int i = start; i < end; ++i) { + EXPECT_EQ(i, v[i - start]); + } + } +}; + +// Tests the IntFeatureMap and implicitly the IntFeatureSpace underneath. +TEST_F(IntFeatureMapTest, Exhaustive) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because IntFeatureSpace is missing. + GTEST_SKIP(); +#else + IntFeatureSpace space; + space.Init(kXBuckets, kYBuckets, kThetaBuckets); + IntFeatureMap map; + map.Init(space); + int total_size = kIntFeatureExtent * kIntFeatureExtent * kIntFeatureExtent; + std::unique_ptr<INT_FEATURE_STRUCT[]> features( + new INT_FEATURE_STRUCT[total_size]); + // Fill the features with every value. 
+ for (int y = 0; y < kIntFeatureExtent; ++y) { + for (int x = 0; x < kIntFeatureExtent; ++x) { + for (int theta = 0; theta < kIntFeatureExtent; ++theta) { + int f_index = (y * kIntFeatureExtent + x) * kIntFeatureExtent + theta; + features[f_index].X = x; + features[f_index].Y = y; + features[f_index].Theta = theta; + } + } + } + GenericVector<int> index_features; + map.IndexAndSortFeatures(features.get(), total_size, &index_features); + EXPECT_EQ(total_size, index_features.size()); + int total_buckets = kXBuckets * kYBuckets * kThetaBuckets; + GenericVector<int> map_features; + int misses = map.MapIndexedFeatures(index_features, &map_features); + EXPECT_EQ(0, misses); + EXPECT_EQ(total_buckets, map_features.size()); + ExpectContiguous(map_features, 0, total_buckets); + EXPECT_EQ(total_buckets, map.compact_size()); + EXPECT_EQ(total_buckets, map.sparse_size()); + + // Every offset should be within dx, dy, dtheta of the start point. + int dx = kIntFeatureExtent / kXBuckets + 1; + int dy = kIntFeatureExtent / kYBuckets + 1; + int dtheta = kIntFeatureExtent / kThetaBuckets + 1; + int bad_offsets = 0; + for (int index = 0; index < total_buckets; ++index) { + for (int dir = -tesseract::kNumOffsetMaps; dir <= tesseract::kNumOffsetMaps; + ++dir) { + int offset_index = map.OffsetFeature(index, dir); + if (dir == 0) { + EXPECT_EQ(index, offset_index); + } else if (offset_index >= 0) { + INT_FEATURE_STRUCT f = map.InverseIndexFeature(index); + INT_FEATURE_STRUCT f2 = map.InverseIndexFeature(offset_index); + EXPECT_TRUE(f.X != f2.X || f.Y != f2.Y || f.Theta != f2.Theta); + EXPECT_LE(abs(f.X - f2.X), dx); + EXPECT_LE(abs(f.Y - f2.Y), dy); + int theta_delta = abs(f.Theta - f2.Theta); + if (theta_delta > kIntFeatureExtent / 2) + theta_delta = kIntFeatureExtent - theta_delta; + EXPECT_LE(theta_delta, dtheta); + } else { + ++bad_offsets; + INT_FEATURE_STRUCT f = map.InverseIndexFeature(index); + } + } + } + EXPECT_LE(bad_offsets, (kXBuckets + kYBuckets) * kThetaBuckets); + + // To test the mapping further, delete the 1st and last map feature, and + // test again. + map.DeleteMapFeature(0); + map.DeleteMapFeature(total_buckets - 1); + map.FinalizeMapping(nullptr); + map.IndexAndSortFeatures(features.get(), total_size, &index_features); + // Has no effect on index features. + EXPECT_EQ(total_size, index_features.size()); + misses = map.MapIndexedFeatures(index_features, &map_features); + int expected_misses = (kIntFeatureExtent / kXBuckets) * + (kIntFeatureExtent / kYBuckets) * + (kIntFeatureExtent / kThetaBuckets + 1); + expected_misses += (kIntFeatureExtent / kXBuckets) * + (kIntFeatureExtent / kYBuckets + 1) * + (kIntFeatureExtent / kThetaBuckets); + EXPECT_EQ(expected_misses, misses); + EXPECT_EQ(total_buckets - 2, map_features.size()); + ExpectContiguous(map_features, 0, total_buckets - 2); + EXPECT_EQ(total_buckets - 2, map.compact_size()); + EXPECT_EQ(total_buckets, map.sparse_size()); +#endif +} + +} // namespace. diff --git a/tesseract/unittest/intsimdmatrix_test.cc b/tesseract/unittest/intsimdmatrix_test.cc new file mode 100644 index 00000000..cdfbaa2c --- /dev/null +++ b/tesseract/unittest/intsimdmatrix_test.cc @@ -0,0 +1,135 @@ +/////////////////////////////////////////////////////////////////////// +// File: intsimdmatrix_test.cc +// Author: rays@google.com (Ray Smith) +// +// Copyright 2017 Google Inc. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "intsimdmatrix.h" +#include <memory> +#include <vector> +#include <gtest/gtest.h> +#include <gtest/internal/gtest-port.h> +#include "include_gunit.h" +#include "matrix.h" +#include "simddetect.h" +#include "tprintf.h" + +namespace tesseract { + +class IntSimdMatrixTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + // Makes a random weights matrix of the given size. + GENERIC_2D_ARRAY<int8_t> InitRandom(int no, int ni) { + GENERIC_2D_ARRAY<int8_t> a(no, ni, 0); + for (int i = 0; i < no; ++i) { + for (int j = 0; j < ni; ++j) { + a(i, j) = static_cast<int8_t>(random_.SignedRand(INT8_MAX)); + } + } + return a; + } + // Makes a random input vector of the given size, with rounding up. + std::vector<int8_t> RandomVector(int size, const IntSimdMatrix& matrix) { + int rounded_size = matrix.RoundInputs(size); + std::vector<int8_t> v(rounded_size, 0); + for (int i = 0; i < size; ++i) { + v[i] = static_cast<int8_t>(random_.SignedRand(INT8_MAX)); + } + return v; + } + // Makes a random scales vector of the given size. + std::vector<double> RandomScales(int size) { + std::vector<double> v(size); + for (int i = 0; i < size; ++i) { + v[i] = (1.0 + random_.SignedRand(1.0)) / INT8_MAX; + } + return v; + } + // Tests a range of sizes and compares the results against the generic version. + void ExpectEqualResults(const IntSimdMatrix& matrix) { + double total = 0.0; + for (int num_out = 1; num_out < 130; ++num_out) { + for (int num_in = 1; num_in < 130; ++num_in) { + GENERIC_2D_ARRAY<int8_t> w = InitRandom(num_out, num_in + 1); + std::vector<int8_t> u = RandomVector(num_in, matrix); + std::vector<double> scales = RandomScales(num_out); + int ro = num_out; + if (IntSimdMatrix::intSimdMatrix) + ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro); + std::vector<double> base_result(ro); + base_result.resize(num_out); + IntSimdMatrix::MatrixDotVector(w, scales, u.data(), base_result.data()); + std::vector<double> test_result(ro); + test_result.resize(num_out); + std::vector<int8_t> shaped_wi; + int32_t rounded_num_out; + matrix.Init(w, shaped_wi, rounded_num_out); + scales.reserve(rounded_num_out); + if (matrix.matrixDotVectorFunction) { + matrix.matrixDotVectorFunction(w.dim1(), w.dim2(), &shaped_wi[0], + &scales[0], &u[0], &test_result[0]); + } else { + IntSimdMatrix::MatrixDotVector(w, scales, u.data(), test_result.data()); + } + for (int i = 0; i < num_out; ++i) { + EXPECT_FLOAT_EQ(base_result[i], test_result[i]) << "i=" << i; + total += base_result[i]; + } + } + } + // Compare sum of all results with expected value. + EXPECT_FLOAT_EQ(total, 337849.39354684710); + } + + TRand random_; +}; + +// Test the C++ implementation without SIMD. +TEST_F(IntSimdMatrixTest, C) { + static const IntSimdMatrix matrix = {nullptr, 1, 1, 1, 1}; + ExpectEqualResults(matrix); +} + +// Tests that the SSE implementation gets the same result as the vanilla. 
+TEST_F(IntSimdMatrixTest, SSE) { +#if defined(HAVE_SSE4_1) + if (!SIMDDetect::IsSSEAvailable()) { + GTEST_LOG_(INFO) << "No SSE found! Not tested!"; + GTEST_SKIP(); + } + ExpectEqualResults(IntSimdMatrix::intSimdMatrixSSE); +#else + GTEST_LOG_(INFO) << "SSE unsupported! Not tested!"; + GTEST_SKIP(); +#endif +} + +// Tests that the AVX2 implementation gets the same result as the vanilla. +TEST_F(IntSimdMatrixTest, AVX2) { +#if defined(HAVE_AVX2) + if (!SIMDDetect::IsAVX2Available()) { + GTEST_LOG_(INFO) << "No AVX2 found! Not tested!"; + GTEST_SKIP(); + } + ExpectEqualResults(IntSimdMatrix::intSimdMatrixAVX2); +#else + GTEST_LOG_(INFO) << "AVX2 unsupported! Not tested!"; + GTEST_SKIP(); +#endif +} + +} // namespace tesseract diff --git a/tesseract/unittest/lang_model_test.cc b/tesseract/unittest/lang_model_test.cc new file mode 100644 index 00000000..b059c18c --- /dev/null +++ b/tesseract/unittest/lang_model_test.cc @@ -0,0 +1,217 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> // for std::string + +#include "absl/strings/str_cat.h" + +#include "gmock/gmock.h" // for testing::ElementsAreArray + +#include "include_gunit.h" +#include "lang_model_helpers.h" +#include "log.h" // for LOG +#include "lstmtrainer.h" +#include "unicharset_training_utils.h" + +namespace tesseract { + +std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); +} + +// This is an integration test that verifies that CombineLangModel works to +// the extent that an LSTMTrainer can be initialized with the result, and it +// can encode strings. More importantly, the test verifies that adding an extra +// character to the unicharset does not change the encoding of strings. +TEST(LangModelTest, AddACharacter) { + constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&"; + constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹"; + // Setup the arguments. + std::string script_dir = LANGDATA_DIR; + std::string eng_dir = file::JoinPath(script_dir, "eng"); + std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset"); + UNICHARSET unicharset; + EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str())); + std::string version_str = "TestVersion"; + file::MakeTmpdir(); + std::string output_dir = FLAGS_test_tmpdir; + LOG(INFO) << "Output dir=" << output_dir << "\n"; + std::string lang1 = "eng"; + bool pass_through_recoder = false; + std::vector<STRING> words, puncs, numbers; + // If these reads fail, we get a warning message and an empty list of words. + ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr) + .split('\n', &words); + EXPECT_GT(words.size(), 0); + ReadFile(file::JoinPath(eng_dir, "eng.punc"), nullptr).split('\n', &puncs); + EXPECT_GT(puncs.size(), 0); + ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr) + .split('\n', &numbers); + EXPECT_GT(numbers.size(), 0); + bool lang_is_rtl = false; + // Generate the traineddata file. 
+ EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang1, pass_through_recoder, words, puncs,
+ numbers, lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata1 =
+ file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
+ LSTMTrainer trainer1;
+ trainer1.InitCharSet(traineddata1);
+ std::vector<int> labels1;
+ EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
+ STRING test1_decoded = trainer1.DecodeLabels(labels1);
+ std::string test1_str(&test1_decoded[0], test1_decoded.length());
+ LOG(INFO) << "Labels1=" << test1_str << "\n";
+
+ // Add a new character to the unicharset and try again.
+ int size_before = unicharset.size();
+ unicharset.unichar_insert("₹");
+ SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
+ &unicharset);
+ EXPECT_EQ(size_before + 1, unicharset.size());
+ // Generate the traineddata file.
+ std::string lang2 = "extended";
+ EXPECT_EQ(EXIT_SUCCESS,
+ CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang2, pass_through_recoder, words, puncs, numbers,
+ lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata2 =
+ file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
+ LSTMTrainer trainer2;
+ trainer2.InitCharSet(traineddata2);
+ std::vector<int> labels2;
+ EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
+ STRING test2_decoded = trainer2.DecodeLabels(labels2);
+ std::string test2_str(&test2_decoded[0], test2_decoded.length());
+ LOG(INFO) << "Labels2=" << test2_str << "\n";
+ // Encode kTestStringRupees.
+ std::vector<int> labels3;
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
+ STRING test3_decoded = trainer2.DecodeLabels(labels3);
+ std::string test3_str(&test3_decoded[0], test3_decoded.length());
+ LOG(INFO) << "labels3=" << test3_str << "\n";
+ // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
+ // Since TensorFlow's CTC implementation insists on having the null be the
+ // last label, and we want to be compatible, null has to be renumbered when
+ // we add a class.
+ int null1 = trainer1.null_char();
+ int null2 = trainer2.null_char();
+ EXPECT_EQ(null1 + 1, null2);
+ std::vector<int> labels1_v(labels1.size());
+ for (int i = 0; i < labels1.size(); ++i) {
+ if (labels1[i] == null1)
+ labels1_v[i] = null2;
+ else
+ labels1_v[i] = labels1[i];
+ }
+ EXPECT_THAT(labels1_v,
+ testing::ElementsAreArray(&labels2[0], labels2.size()));
+ // To make sure we are not cheating somehow, we can now encode the Rupee
+ // symbol, which we could not do before.
+ EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
+}
+
+// Same as above test, for hin instead of eng.
+TEST(LangModelTest, AddACharacterHindi) {
+ constexpr char kTestString[] = "हिन्दी में एक लाइन लिखें";
+ constexpr char kTestStringRupees[] = "हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००";
+ // Setup the arguments.
+ std::string script_dir = LANGDATA_DIR;
+ std::string hin_dir = file::JoinPath(script_dir, "hin");
+ std::string unicharset_path = TestDataNameToPath("hin_beam.unicharset");
+ UNICHARSET unicharset;
+ EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
+ std::string version_str = "TestVersion";
+ file::MakeTmpdir();
+ std::string output_dir = FLAGS_test_tmpdir;
+ LOG(INFO) << "Output dir=" << output_dir << "\n";
+ std::string lang1 = "hin";
+ bool pass_through_recoder = false;
+ std::vector<STRING> words, puncs, numbers;
+ // If these reads fail, we get a warning message and an empty list of words.
+ ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr)
+ .split('\n', &words);
+ EXPECT_GT(words.size(), 0);
+ ReadFile(file::JoinPath(hin_dir, "hin.punc"), nullptr).split('\n', &puncs);
+ EXPECT_GT(puncs.size(), 0);
+ ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr)
+ .split('\n', &numbers);
+ EXPECT_GT(numbers.size(), 0);
+ bool lang_is_rtl = false;
+ // Generate the traineddata file.
+ EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang1, pass_through_recoder, words, puncs,
+ numbers, lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata1 =
+ file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
+ LSTMTrainer trainer1;
+ trainer1.InitCharSet(traineddata1);
+ std::vector<int> labels1;
+ EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1));
+ STRING test1_decoded = trainer1.DecodeLabels(labels1);
+ std::string test1_str(&test1_decoded[0], test1_decoded.length());
+ LOG(INFO) << "Labels1=" << test1_str << "\n";
+
+ // Add a new character to the unicharset and try again.
+ int size_before = unicharset.size();
+ unicharset.unichar_insert("₹");
+ SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false,
+ &unicharset);
+ EXPECT_EQ(size_before + 1, unicharset.size());
+ // Generate the traineddata file.
+ std::string lang2 = "extendedhin";
+ EXPECT_EQ(EXIT_SUCCESS,
+ CombineLangModel(unicharset, script_dir, version_str, output_dir,
+ lang2, pass_through_recoder, words, puncs, numbers,
+ lang_is_rtl, nullptr, nullptr));
+ // Init a trainer with it, and encode kTestString.
+ std::string traineddata2 =
+ file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
+ LSTMTrainer trainer2;
+ trainer2.InitCharSet(traineddata2);
+ std::vector<int> labels2;
+ EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2));
+ STRING test2_decoded = trainer2.DecodeLabels(labels2);
+ std::string test2_str(&test2_decoded[0], test2_decoded.length());
+ LOG(INFO) << "Labels2=" << test2_str << "\n";
+ // Encode kTestStringRupees.
+ std::vector<int> labels3;
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3));
+ STRING test3_decoded = trainer2.DecodeLabels(labels3);
+ std::string test3_str(&test3_decoded[0], test3_decoded.length());
+ LOG(INFO) << "labels3=" << test3_str << "\n";
+ // Copy labels1 to a std::vector, renumbering the null char to match trainer2.
+ // Since TensorFlow's CTC implementation insists on having the null be the
+ // last label, and we want to be compatible, null has to be renumbered when
+ // we add a class.
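+ // (Concretely: exactly one character was inserted, so the checks below
+ // expect null2 == null1 + 1, and every occurrence of null1 in labels1 is
+ // rewritten to null2.)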
+ int null1 = trainer1.null_char();
+ int null2 = trainer2.null_char();
+ EXPECT_EQ(null1 + 1, null2);
+ std::vector<int> labels1_v(labels1.size());
+ for (int i = 0; i < labels1.size(); ++i) {
+ if (labels1[i] == null1)
+ labels1_v[i] = null2;
+ else
+ labels1_v[i] = labels1[i];
+ }
+ EXPECT_THAT(labels1_v,
+ testing::ElementsAreArray(&labels2[0], labels2.size()));
+ // To make sure we are not cheating somehow, we can now encode the Rupee
+ // symbol, which we could not do before.
+ EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
+ EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
+}
+
+} // namespace tesseract diff --git a/tesseract/unittest/layout_test.cc b/tesseract/unittest/layout_test.cc new file mode 100644 index 00000000..8a20c908 --- /dev/null +++ b/tesseract/unittest/layout_test.cc @@ -0,0 +1,234 @@ +// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+
+#include "include_gunit.h"
+
+#include "allheaders.h"
+#include <tesseract/baseapi.h>
+#include "coutln.h"
+#include "log.h" // for LOG
+#include "mutableiterator.h"
+#include "ocrblock.h" // for class BLOCK
+#include "pageres.h"
+#include "polyblk.h"
+#include <tesseract/resultiterator.h>
+#include "stepblob.h"
+
+namespace tesseract {
+
+/** String name for each block type. Keep in sync with PolyBlockType. */
+static const char* kPolyBlockNames[] = {
+ "Unknown",
+ "Flowing Text",
+ "Heading Text",
+ "Pullout Text",
+ "Equation",
+ "Inline Equation",
+ "Table",
+ "Vertical Text",
+ "Caption Text",
+ "Flowing Image",
+ "Heading Image",
+ "Pullout Image",
+ "Horizontal Line",
+ "Vertical Line",
+ "Noise",
+ "" // End marker for testing that sizes match.
+};
+
+const char* kStrings8087_054[] = {
+ "dat", "Dalmatian", "", "DAMAGED DURING", "margarine,", nullptr};
+const PolyBlockType kBlocks8087_054[] = {PT_HEADING_TEXT, PT_FLOWING_TEXT,
+ PT_PULLOUT_IMAGE, PT_CAPTION_TEXT,
+ PT_FLOWING_TEXT};
+
+// The fixture for testing Tesseract.
+class LayoutTest : public testing::Test {
+ protected:
+ std::string TestDataNameToPath(const std::string& name) {
+ return file::JoinPath(TESTING_DIR, "/" + name);
+ }
+ std::string TessdataPath() {
+ return file::JoinPath(TESSDATA_DIR, "");
+ }
+
+ LayoutTest() { src_pix_ = nullptr; }
+ ~LayoutTest() { pixDestroy(&src_pix_); }
+
+ void SetImage(const char* filename, const char* lang) {
+ pixDestroy(&src_pix_);
+ src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
+ api_.Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY);
+ api_.SetPageSegMode(tesseract::PSM_AUTO);
+ api_.SetImage(src_pix_);
+ }
+
+ // Tests reading order and block finding (very roughly) by iterating
+ // over the blocks, expecting that they contain the strings in order,
+ // allowing for other blocks in between.
+ // An empty string should match an image block, and a nullptr string
+ // indicates the end of the array.
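+ // For example, kStrings8087_054/kBlocks8087_054 above pair "dat" with
+ // PT_HEADING_TEXT and the empty string with PT_PULLOUT_IMAGE, so that image
+ // block is matched on type alone.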
+ void VerifyBlockTextOrder(const char* strings[], const PolyBlockType* blocks, + ResultIterator* it) { + it->Begin(); + int string_index = 0; + int block_index = 0; + do { + char* block_text = it->GetUTF8Text(tesseract::RIL_BLOCK); + if (block_text != nullptr && it->BlockType() == blocks[string_index] && + strstr(block_text, strings[string_index]) != nullptr) { + LOG(INFO) << "Found string " << strings[string_index] + << " in block " << block_index + << " of type " << kPolyBlockNames[blocks[string_index]] << "\n"; + // Found this one. + ++string_index; + } else if (it->BlockType() == blocks[string_index] && + block_text == nullptr && strings[string_index][0] == '\0') { + LOG(INFO) << "Found block of type " << kPolyBlockNames[blocks[string_index]] + << " at block " << block_index << "\n"; + // Found this one. + ++string_index; + } else { + LOG(INFO) << "No match found in block with text:\n" << block_text; + } + delete[] block_text; + ++block_index; + if (strings[string_index] == nullptr) break; + } while (it->Next(tesseract::RIL_BLOCK)); + EXPECT_TRUE(strings[string_index] == nullptr); + } + + // Tests that approximate order of the biggest text blocks is correct. + // Correctness is tested by the following simple rules: + // If a block overlaps its predecessor in x, then it must be below it. + // otherwise, if the block is not below its predecessor, then it must + // be to the left of it if right_to_left is true, or to the right otherwise. + void VerifyRoughBlockOrder(bool right_to_left, ResultIterator* it) { + int prev_left = 0; + int prev_right = 0; + int prev_bottom = 0; + it->Begin(); + do { + int left, top, right, bottom; + if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) && + PTIsTextType(it->BlockType()) && right - left > 800 && + bottom - top > 200) { + if (prev_right > prev_left) { + if (std::min(right, prev_right) > std::max(left, prev_left)) { + EXPECT_GE(top, prev_bottom) << "Overlapping block should be below"; + } else if (top < prev_bottom) { + if (right_to_left) { + EXPECT_GE(prev_left, right) << "Block should be to the left"; + } else { + EXPECT_GE(left, prev_right) << "Block should be to the right"; + } + } + } + prev_left = left; + prev_right = right; + prev_bottom = bottom; + } + } while (it->Next(tesseract::RIL_BLOCK)); + } + + // Tests that every blob assigned to the biggest text blocks is contained + // fully within its block by testing that the block polygon winds around + // the center of the bounding boxes of the outlines in the blob. + void VerifyTotalContainment(int winding_target, MutableIterator* it) { + it->Begin(); + do { + int left, top, right, bottom; + if (it->BoundingBox(tesseract::RIL_BLOCK, &left, &top, &right, &bottom) && + PTIsTextType(it->BlockType()) && right - left > 800 && + bottom - top > 200) { + const PAGE_RES_IT* pr_it = it->PageResIt(); + POLY_BLOCK* pb = pr_it->block()->block->pdblk.poly_block(); + CHECK(pb != nullptr); + FCOORD skew = pr_it->block()->block->skew(); + EXPECT_GT(skew.x(), 0.0f); + EXPECT_GT(skew.y(), 0.0f); + // Iterate the words in the block. + MutableIterator word_it = *it; + do { + const PAGE_RES_IT* w_it = word_it.PageResIt(); + // Iterate the blobs in the word. + C_BLOB_IT b_it(w_it->word()->word->cblob_list()); + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + C_BLOB* blob = b_it.data(); + // Iterate the outlines in the blob. 
+ C_OUTLINE_IT ol_it(blob->out_list());
+ for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
+ C_OUTLINE* ol = ol_it.data();
+ TBOX box = ol->bounding_box();
+ ICOORD middle((box.left() + box.right()) / 2,
+ (box.top() + box.bottom()) / 2);
+ EXPECT_EQ(winding_target, pb->winding_number(middle));
+ }
+ }
+ } while (word_it.Next(tesseract::RIL_WORD) &&
+ !word_it.IsAtBeginningOf(tesseract::RIL_BLOCK));
+ }
+ } while (it->Next(tesseract::RIL_BLOCK));
+ }
+
+ Pix* src_pix_;
+ std::string ocr_text_;
+ tesseract::TessBaseAPI api_;
+};
+
+// Tests that array sizes match their intended size.
+TEST_F(LayoutTest, ArraySizeTest) {
+ int size = 0;
+ for (size = 0; kPolyBlockNames[size][0] != '\0'; ++size)
+ ;
+ EXPECT_EQ(size, PT_COUNT);
+}
+
+// Tests that Tesseract gets the important blocks and in the right order
+// on a UNLV page numbered 8087_054.3B.tif. (Dubrovnik)
+TEST_F(LayoutTest, UNLV8087_054) {
+ SetImage("8087_054.3B.tif", "eng");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ // Check iterator position.
+ tesseract::ResultIterator* it = api_.GetIterator();
+ VerifyBlockTextOrder(kStrings8087_054, kBlocks8087_054, it);
+ delete it;
+}
+
+// Tests that Tesseract gets the important blocks and in the right order
+// on GOOGLE:13510798882202548:74:84.sj-79.tif (Hebrew image)
+// TODO: replace hebrew.png by Google image referred above
+TEST_F(LayoutTest, HebrewOrderingAndSkew) {
+ SetImage("hebrew.png", "eng");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ tesseract::MutableIterator* it = api_.GetMutableIterator();
+ // In eng mode, block order should not be RTL.
+ VerifyRoughBlockOrder(false, it);
+ VerifyTotalContainment(1, it);
+ delete it;
+ // Now try again using Hebrew.
+ SetImage("hebrew.png", "heb");
+ // Just run recognition.
+ EXPECT_EQ(api_.Recognize(nullptr), 0);
+ it = api_.GetMutableIterator();
+ // In heb mode, block order should be RTL.
+ VerifyRoughBlockOrder(true, it);
+ // And blobs should still be fully contained.
+ VerifyTotalContainment(-1, it);
+ delete it;
+}
+
+} // namespace diff --git a/tesseract/unittest/ligature_table_test.cc b/tesseract/unittest/ligature_table_test.cc new file mode 100644 index 00000000..0047f857 --- /dev/null +++ b/tesseract/unittest/ligature_table_test.cc @@ -0,0 +1,111 @@ +// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "commandlineflags.h"
+#include "fileio.h"
+#include "include_gunit.h"
+#include "ligature_table.h"
+#include "pango_font_info.h"
+
+namespace tesseract {
+
+const char kEngNonLigatureText[] = "fidelity effigy ſteep";
+// Same as above text, but with "fi" in the first word and "ffi" in the second
+// word replaced with their respective ligatures.
+const char kEngLigatureText[] = "ﬁdelity eﬃgy ſteep";
+// Same as kEngNonLigatureText but with "fi" in both words replaced with their
+// ligature. The test Verdana font does not support the "ffi" or "ſt" ligatures.
+const char kRenderableEngLigatureText[] = "ﬁdelity efﬁgy ſteep";
+
+static PangoFontMap* font_map;
+
+class LigatureTableTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ lig_table_ = LigatureTable::Get();
+ if (!font_map) {
+ font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
+ }
+ pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
+ }
+
+ static void SetUpTestCase() {
+ static std::locale system_locale("");
+ std::locale::global(system_locale);
+
+ FLAGS_fonts_dir = TESTING_DIR;
+ FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
+ file::MakeTmpdir();
+ PangoFontInfo::SoftInitFontConfig(); // init early
+ }
+ LigatureTable* lig_table_;
+};
+
+TEST_F(LigatureTableTest, DoesFillLigatureTables) {
+ EXPECT_GT(lig_table_->norm_to_lig_table().size(), 0);
+ EXPECT_GT(lig_table_->lig_to_norm_table().size(), 0);
+}
+
+TEST_F(LigatureTableTest, DoesAddLigatures) {
+ EXPECT_STREQ(kEngLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, nullptr).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesAddLigaturesWithSupportedFont) {
+ PangoFontInfo font;
+ EXPECT_TRUE(font.ParseFontDescriptionName("Verdana"));
+printf("1:%s\n", kRenderableEngLigatureText);
+printf("2:%s\n", lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+ EXPECT_STREQ(kRenderableEngLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesNotAddLigaturesWithUnsupportedFont) {
+ PangoFontInfo font;
+ EXPECT_TRUE(font.ParseFontDescriptionName("Lohit Hindi"));
+ EXPECT_STREQ(kEngNonLigatureText,
+ lig_table_->AddLigatures(kEngNonLigatureText, &font).c_str());
+}
+
+TEST_F(LigatureTableTest, DoesRemoveLigatures) {
+ EXPECT_STREQ(kEngNonLigatureText,
+ lig_table_->RemoveLigatures(kEngLigatureText).c_str());
+}
+
+TEST_F(LigatureTableTest, TestCustomLigatures) {
+ const char* kTestCases[] = {
+ "act", "a\uE003", "publiſh", "publi\uE006", "ſince",
+ "\uE007nce", "aſleep", "a\uE008eep", "neceſſary", "nece\uE009ary",
+ };
+ for (size_t i = 0; i < ARRAYSIZE(kTestCases); i += 2) {
+ EXPECT_STREQ(kTestCases[i + 1],
+ lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
+ EXPECT_STREQ(kTestCases[i],
+ lig_table_->RemoveLigatures(kTestCases[i + 1]).c_str());
+ EXPECT_STREQ(kTestCases[i],
+ lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
+ }
+}
+
+TEST_F(LigatureTableTest, TestRemovesCustomLigatures) {
+ const char* kTestCases[] = {
+ "fiction",
+ "fi\uE003ion",
+ "fiction",
+ };
+ for (size_t i = 0; i < ARRAYSIZE(kTestCases); i += 3) {
+ EXPECT_STREQ(kTestCases[i + 1],
+ lig_table_->AddLigatures(kTestCases[i], nullptr).c_str());
+ EXPECT_STREQ(kTestCases[i + 2],
+ lig_table_->RemoveCustomLigatures(kTestCases[i + 1]).c_str());
+ }
+}
+} // namespace diff --git a/tesseract/unittest/linlsq_test.cc b/tesseract/unittest/linlsq_test.cc new file mode 100644 index 00000000..2ca0ea9e --- /dev/null +++ b/tesseract/unittest/linlsq_test.cc @@ -0,0 +1,118 @@ +// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "linlsq.h" + +#include "include_gunit.h" + +namespace tesseract { + +class LLSQTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + public: + void TearDown() {} + + void ExpectCorrectLine(const LLSQ& llsq, double m, double c, double rms, + double pearson, double tolerance) { + EXPECT_NEAR(m, llsq.m(), tolerance); + EXPECT_NEAR(c, llsq.c(llsq.m()), tolerance); + EXPECT_NEAR(rms, llsq.rms(llsq.m(), llsq.c(llsq.m())), tolerance); + EXPECT_NEAR(pearson, llsq.pearson(), tolerance); + } + FCOORD PtsMean(const std::vector<FCOORD>& pts) { + FCOORD total(0, 0); + for (const auto& p : pts) { + total += p; + } + return (pts.size() > 0) ? total / pts.size() : total; + } + void VerifyRmsOrth(const std::vector<FCOORD>& pts, const FCOORD& orth) { + LLSQ llsq; + FCOORD xavg = PtsMean(pts); + FCOORD nvec = !orth; + nvec.normalise(); + double expected_answer = 0; + for (const auto& p : pts) { + llsq.add(p.x(), p.y()); + double dot = nvec % (p - xavg); + expected_answer += dot * dot; + } + expected_answer /= pts.size(); + expected_answer = sqrt(expected_answer); + EXPECT_NEAR(expected_answer, llsq.rms_orth(orth), 0.0001); + } + void ExpectCorrectVector(const LLSQ& llsq, FCOORD correct_mean_pt, + FCOORD correct_vector, float tolerance) { + FCOORD mean_pt = llsq.mean_point(); + FCOORD vector = llsq.vector_fit(); + EXPECT_NEAR(correct_mean_pt.x(), mean_pt.x(), tolerance); + EXPECT_NEAR(correct_mean_pt.y(), mean_pt.y(), tolerance); + EXPECT_NEAR(correct_vector.x(), vector.x(), tolerance); + EXPECT_NEAR(correct_vector.y(), vector.y(), tolerance); + } +}; + +// Tests a simple baseline-style normalization. +TEST_F(LLSQTest, BasicLines) { + LLSQ llsq; + llsq.add(1.0, 1.0); + llsq.add(2.0, 2.0); + ExpectCorrectLine(llsq, 1.0, 0.0, 0.0, 1.0, 1e-6); + float half_root_2 = sqrt(2.0) / 2.0f; + ExpectCorrectVector(llsq, FCOORD(1.5f, 1.5f), + FCOORD(half_root_2, half_root_2), 1e-6); + llsq.remove(2.0, 2.0); + llsq.add(1.0, 2.0); + llsq.add(10.0, 1.0); + llsq.add(-8.0, 1.0); + // The point at 1,2 pulls the result away from what would otherwise be a + // perfect fit to a horizontal line by 0.25 unit, with rms error of 0.433. + ExpectCorrectLine(llsq, 0.0, 1.25, 0.433, 0.0, 1e-2); + ExpectCorrectVector(llsq, FCOORD(1.0f, 1.25f), FCOORD(1.0f, 0.0f), 1e-3); + llsq.add(1.0, 2.0, 10.0); + // With a heavy weight, the point at 1,2 pulls the line nearer. + ExpectCorrectLine(llsq, 0.0, 1.786, 0.41, 0.0, 1e-2); + ExpectCorrectVector(llsq, FCOORD(1.0f, 1.786f), FCOORD(1.0f, 0.0f), 1e-3); +} + +// Tests a simple baseline-style normalization with a rotation. 
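+// (Roughly: the points added below are symmetric about x = 1, so the fitted
+// mean point should stay near (1, 0) and the fitted direction near the
+// vertical unit vector (0, 1), which is what ExpectCorrectVector checks.)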
+TEST_F(LLSQTest, Vectors) { + LLSQ llsq; + llsq.add(1.0, 1.0); + llsq.add(1.0, -1.0); + ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-6); + llsq.add(0.9, -2.0); + llsq.add(1.1, -3.0); + llsq.add(0.9, 2.0); + llsq.add(1.10001, 3.0); + ExpectCorrectVector(llsq, FCOORD(1.0f, 0.0f), FCOORD(0.0f, 1.0f), 1e-3); +} + +// Verify that rms_orth() actually calculates: +// sqrt( sum (!nvec * (x_i - x_avg))^2 / n) +TEST_F(LLSQTest, RmsOrthWorksAsIntended) { + std::vector<FCOORD> pts; + pts.push_back(FCOORD(0.56, 0.95)); + pts.push_back(FCOORD(0.09, 0.09)); + pts.push_back(FCOORD(0.13, 0.77)); + pts.push_back(FCOORD(0.16, 0.83)); + pts.push_back(FCOORD(0.45, 0.79)); + VerifyRmsOrth(pts, FCOORD(1, 0)); + VerifyRmsOrth(pts, FCOORD(1, 1)); + VerifyRmsOrth(pts, FCOORD(1, 2)); + VerifyRmsOrth(pts, FCOORD(2, 1)); +} + +} // namespace. diff --git a/tesseract/unittest/list_test.cc b/tesseract/unittest/list_test.cc new file mode 100644 index 00000000..e6a2bf1d --- /dev/null +++ b/tesseract/unittest/list_test.cc @@ -0,0 +1,68 @@ +// (C) Copyright 2020, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#if 0 // TODO: add tests for CLIST +#include "clst.h" +#endif +#include "elst.h" +#if 0 // TODO: add tests for ELIST2 +#include "elst2.h" +#endif + +namespace tesseract { + +class ListTest : public ::testing::Test { + protected: + void SetUp() override { + static std::locale system_locale(""); + std::locale::global(system_locale); + } +}; + +class Elst : public ELIST_LINK { + public: + Elst(unsigned n) : value(n) { + } + unsigned value; +}; + +ELISTIZEH(Elst) +ELISTIZE(Elst) + +TEST_F(ListTest, TestELIST) { + Elst_LIST list; + auto it = ELIST_ITERATOR(&list); + for (unsigned i = 0; i < 10; i++) { + auto* elst = new Elst(i); + //EXPECT_TRUE(elst->empty()); + //EXPECT_EQ(elst->length(), 0); + it.add_to_end(elst); + } + it.move_to_first(); + unsigned n = 0; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + auto* elst = reinterpret_cast<Elst*>(it.data()); + EXPECT_EQ(elst->value, n); + n++; + } + it.forward(); + n++; + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + auto* elst = reinterpret_cast<Elst*>(it.extract()); + EXPECT_EQ(elst->value, n % 10); + n++; + delete elst; + } + // TODO: add more tests for ELIST +} + +} // namespace tesseract. diff --git a/tesseract/unittest/loadlang_test.cc b/tesseract/unittest/loadlang_test.cc new file mode 100644 index 00000000..ba7a9f6d --- /dev/null +++ b/tesseract/unittest/loadlang_test.cc @@ -0,0 +1,251 @@ +/////////////////////////////////////////////////////////////////////// +// File: loadlang_test.cc +// Description: Test loading of All languages and Scripts for Tesseract. +// Tests for All languages and scripts are Disabled by default. +// Force the disabled test to run if required by using the +// --gtest_also_run_disabled_tests argument. 
Author: Shree Devi Kumar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include <memory> // std::unique_ptr +#include <time.h> +#include <tesseract/baseapi.h> +#include "include_gunit.h" + +namespace tesseract { + +class QuickTest : public testing::Test { + protected: + virtual void SetUp() { start_time_ = time(nullptr); } + virtual void TearDown() { + const time_t end_time = time(nullptr); + EXPECT_TRUE(end_time - start_time_ <= 25) + << "The test took too long - " + << ::testing::PrintToString(end_time - start_time_); + } + time_t start_time_; +}; + +void LangLoader(const char* lang, const char* tessdatadir) { + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract for $lang."; + api->End(); +} + +// For all languages + +class LoadLanguage : public QuickTest, + public ::testing::WithParamInterface<const char*> {}; + +TEST_P(LoadLanguage, afr) { LangLoader("afr", GetParam()); } +TEST_P(LoadLanguage, amh) { LangLoader("amh", GetParam()); } +TEST_P(LoadLanguage, ara) { LangLoader("ara", GetParam()); } +TEST_P(LoadLanguage, asm) { LangLoader("asm", GetParam()); } +TEST_P(LoadLanguage, aze) { LangLoader("aze", GetParam()); } +TEST_P(LoadLanguage, aze_cyrl) { LangLoader("aze_cyrl", GetParam()); } +TEST_P(LoadLanguage, bel) { LangLoader("bel", GetParam()); } +TEST_P(LoadLanguage, ben) { LangLoader("ben", GetParam()); } +TEST_P(LoadLanguage, bod) { LangLoader("bod", GetParam()); } +TEST_P(LoadLanguage, bos) { LangLoader("bos", GetParam()); } +TEST_P(LoadLanguage, bre) { LangLoader("bre", GetParam()); } +TEST_P(LoadLanguage, bul) { LangLoader("bul", GetParam()); } +TEST_P(LoadLanguage, cat) { LangLoader("cat", GetParam()); } +TEST_P(LoadLanguage, ceb) { LangLoader("ceb", GetParam()); } +TEST_P(LoadLanguage, ces) { LangLoader("ces", GetParam()); } +TEST_P(LoadLanguage, chi_sim) { LangLoader("chi_sim", GetParam()); } +TEST_P(LoadLanguage, chi_sim_vert) { LangLoader("chi_sim_vert", GetParam()); } +TEST_P(LoadLanguage, chi_tra) { LangLoader("chi_tra", GetParam()); } +TEST_P(LoadLanguage, chi_tra_vert) { LangLoader("chi_tra_vert", GetParam()); } +TEST_P(LoadLanguage, chr) { LangLoader("chr", GetParam()); } +TEST_P(LoadLanguage, cos) { LangLoader("cos", GetParam()); } +TEST_P(LoadLanguage, cym) { LangLoader("cym", GetParam()); } +TEST_P(LoadLanguage, dan) { LangLoader("dan", GetParam()); } +TEST_P(LoadLanguage, deu) { LangLoader("deu", GetParam()); } +TEST_P(LoadLanguage, div) { LangLoader("div", GetParam()); } +TEST_P(LoadLanguage, dzo) { LangLoader("dzo", GetParam()); } +TEST_P(LoadLanguage, ell) { LangLoader("ell", GetParam()); } +TEST_P(LoadLanguage, eng) { LangLoader("eng", GetParam()); } +TEST_P(LoadLanguage, enm) { LangLoader("enm", GetParam()); } +TEST_P(LoadLanguage, epo) { LangLoader("epo", GetParam()); } +TEST_P(LoadLanguage, est) { LangLoader("est", GetParam()); } +TEST_P(LoadLanguage, eus) { LangLoader("eus", 
GetParam()); } +TEST_P(LoadLanguage, fao) { LangLoader("fao", GetParam()); } +TEST_P(LoadLanguage, fas) { LangLoader("fas", GetParam()); } +TEST_P(LoadLanguage, fil) { LangLoader("fil", GetParam()); } +TEST_P(LoadLanguage, fin) { LangLoader("fin", GetParam()); } +TEST_P(LoadLanguage, fra) { LangLoader("fra", GetParam()); } +TEST_P(LoadLanguage, frk) { LangLoader("frk", GetParam()); } +TEST_P(LoadLanguage, frm) { LangLoader("frm", GetParam()); } +TEST_P(LoadLanguage, fry) { LangLoader("fry", GetParam()); } +TEST_P(LoadLanguage, gla) { LangLoader("gla", GetParam()); } +TEST_P(LoadLanguage, gle) { LangLoader("gle", GetParam()); } +TEST_P(LoadLanguage, glg) { LangLoader("glg", GetParam()); } +TEST_P(LoadLanguage, grc) { LangLoader("grc", GetParam()); } +TEST_P(LoadLanguage, guj) { LangLoader("guj", GetParam()); } +TEST_P(LoadLanguage, hat) { LangLoader("hat", GetParam()); } +TEST_P(LoadLanguage, heb) { LangLoader("heb", GetParam()); } +TEST_P(LoadLanguage, hin) { LangLoader("hin", GetParam()); } +TEST_P(LoadLanguage, hrv) { LangLoader("hrv", GetParam()); } +TEST_P(LoadLanguage, hun) { LangLoader("hun", GetParam()); } +TEST_P(LoadLanguage, hye) { LangLoader("hye", GetParam()); } +TEST_P(LoadLanguage, iku) { LangLoader("iku", GetParam()); } +TEST_P(LoadLanguage, ind) { LangLoader("ind", GetParam()); } +TEST_P(LoadLanguage, isl) { LangLoader("isl", GetParam()); } +TEST_P(LoadLanguage, ita) { LangLoader("ita", GetParam()); } +TEST_P(LoadLanguage, ita_old) { LangLoader("ita_old", GetParam()); } +TEST_P(LoadLanguage, jav) { LangLoader("jav", GetParam()); } +TEST_P(LoadLanguage, jpn) { LangLoader("jpn", GetParam()); } +TEST_P(LoadLanguage, jpn_vert) { LangLoader("jpn_vert", GetParam()); } +TEST_P(LoadLanguage, kan) { LangLoader("kan", GetParam()); } +TEST_P(LoadLanguage, kat) { LangLoader("kat", GetParam()); } +TEST_P(LoadLanguage, kat_old) { LangLoader("kat_old", GetParam()); } +TEST_P(LoadLanguage, kaz) { LangLoader("kaz", GetParam()); } +TEST_P(LoadLanguage, khm) { LangLoader("khm", GetParam()); } +TEST_P(LoadLanguage, kir) { LangLoader("kir", GetParam()); } +// TEST_P(LoadLanguage, kmr) {LangLoader("kmr" , GetParam());} +TEST_P(LoadLanguage, kor) { LangLoader("kor", GetParam()); } +TEST_P(LoadLanguage, kor_vert) { LangLoader("kor_vert", GetParam()); } +TEST_P(LoadLanguage, lao) { LangLoader("lao", GetParam()); } +TEST_P(LoadLanguage, lat) { LangLoader("lat", GetParam()); } +TEST_P(LoadLanguage, lav) { LangLoader("lav", GetParam()); } +TEST_P(LoadLanguage, lit) { LangLoader("lit", GetParam()); } +TEST_P(LoadLanguage, ltz) { LangLoader("ltz", GetParam()); } +TEST_P(LoadLanguage, mal) { LangLoader("mal", GetParam()); } +TEST_P(LoadLanguage, mar) { LangLoader("mar", GetParam()); } +TEST_P(LoadLanguage, mkd) { LangLoader("mkd", GetParam()); } +TEST_P(LoadLanguage, mlt) { LangLoader("mlt", GetParam()); } +TEST_P(LoadLanguage, mon) { LangLoader("mon", GetParam()); } +TEST_P(LoadLanguage, mri) { LangLoader("mri", GetParam()); } +TEST_P(LoadLanguage, msa) { LangLoader("msa", GetParam()); } +TEST_P(LoadLanguage, mya) { LangLoader("mya", GetParam()); } +TEST_P(LoadLanguage, nep) { LangLoader("nep", GetParam()); } +TEST_P(LoadLanguage, nld) { LangLoader("nld", GetParam()); } +TEST_P(LoadLanguage, nor) { LangLoader("nor", GetParam()); } +TEST_P(LoadLanguage, oci) { LangLoader("oci", GetParam()); } +TEST_P(LoadLanguage, ori) { LangLoader("ori", GetParam()); } +TEST_P(LoadLanguage, osd) { LangLoader("osd", GetParam()); } +TEST_P(LoadLanguage, pan) { LangLoader("pan", GetParam()); } +TEST_P(LoadLanguage, pol) { 
LangLoader("pol", GetParam()); } +TEST_P(LoadLanguage, por) { LangLoader("por", GetParam()); } +TEST_P(LoadLanguage, pus) { LangLoader("pus", GetParam()); } +TEST_P(LoadLanguage, que) { LangLoader("que", GetParam()); } +TEST_P(LoadLanguage, ron) { LangLoader("ron", GetParam()); } +TEST_P(LoadLanguage, rus) { LangLoader("rus", GetParam()); } +TEST_P(LoadLanguage, san) { LangLoader("san", GetParam()); } +TEST_P(LoadLanguage, sin) { LangLoader("sin", GetParam()); } +TEST_P(LoadLanguage, slk) { LangLoader("slk", GetParam()); } +TEST_P(LoadLanguage, slv) { LangLoader("slv", GetParam()); } +TEST_P(LoadLanguage, snd) { LangLoader("snd", GetParam()); } +TEST_P(LoadLanguage, spa) { LangLoader("spa", GetParam()); } +TEST_P(LoadLanguage, spa_old) { LangLoader("spa_old", GetParam()); } +TEST_P(LoadLanguage, sqi) { LangLoader("sqi", GetParam()); } +TEST_P(LoadLanguage, srp) { LangLoader("srp", GetParam()); } +TEST_P(LoadLanguage, srp_latn) { LangLoader("srp_latn", GetParam()); } +TEST_P(LoadLanguage, sun) { LangLoader("sun", GetParam()); } +TEST_P(LoadLanguage, swa) { LangLoader("swa", GetParam()); } +TEST_P(LoadLanguage, swe) { LangLoader("swe", GetParam()); } +TEST_P(LoadLanguage, syr) { LangLoader("syr", GetParam()); } +TEST_P(LoadLanguage, tam) { LangLoader("tam", GetParam()); } +TEST_P(LoadLanguage, tat) { LangLoader("tat", GetParam()); } +TEST_P(LoadLanguage, tel) { LangLoader("tel", GetParam()); } +TEST_P(LoadLanguage, tgk) { LangLoader("tgk", GetParam()); } +TEST_P(LoadLanguage, tha) { LangLoader("tha", GetParam()); } +TEST_P(LoadLanguage, tir) { LangLoader("tir", GetParam()); } +TEST_P(LoadLanguage, ton) { LangLoader("ton", GetParam()); } +TEST_P(LoadLanguage, tur) { LangLoader("tur", GetParam()); } +TEST_P(LoadLanguage, uig) { LangLoader("uig", GetParam()); } +TEST_P(LoadLanguage, ukr) { LangLoader("ukr", GetParam()); } +TEST_P(LoadLanguage, urd) { LangLoader("urd", GetParam()); } +TEST_P(LoadLanguage, uzb) { LangLoader("uzb", GetParam()); } +TEST_P(LoadLanguage, uzb_cyrl) { LangLoader("uzb_cyrl", GetParam()); } +TEST_P(LoadLanguage, vie) { LangLoader("vie", GetParam()); } +TEST_P(LoadLanguage, yid) { LangLoader("yid", GetParam()); } +TEST_P(LoadLanguage, yor) { LangLoader("yor", GetParam()); } + +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadLanguage, + ::testing::Values(TESSDATA_DIR "_fast")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadLanguage, + ::testing::Values(TESSDATA_DIR "_best")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadLanguage, + ::testing::Values(TESSDATA_DIR)); + +// For all scripts + +class LoadScript : public QuickTest, + public ::testing::WithParamInterface<const char*> {}; + +TEST_P(LoadScript, Arabic) { LangLoader("script/Arabic", GetParam()); } +TEST_P(LoadScript, Armenian) { LangLoader("script/Armenian", GetParam()); } +TEST_P(LoadScript, Bengali) { LangLoader("script/Bengali", GetParam()); } +TEST_P(LoadScript, Canadian_Aboriginal) { + LangLoader("script/Canadian_Aboriginal", GetParam()); +} +TEST_P(LoadScript, Cherokee) { LangLoader("script/Cherokee", GetParam()); } +TEST_P(LoadScript, Cyrillic) { LangLoader("script/Cyrillic", GetParam()); } +TEST_P(LoadScript, Devanagari) { LangLoader("script/Devanagari", GetParam()); } +TEST_P(LoadScript, Ethiopic) { LangLoader("script/Ethiopic", GetParam()); } +TEST_P(LoadScript, Fraktur) { LangLoader("script/Fraktur", GetParam()); } +TEST_P(LoadScript, Georgian) { LangLoader("script/Georgian", GetParam()); } +TEST_P(LoadScript, Greek) { LangLoader("script/Greek", GetParam()); } +TEST_P(LoadScript, 
Gujarati) { LangLoader("script/Gujarati", GetParam()); } +TEST_P(LoadScript, Gurmukhi) { LangLoader("script/Gurmukhi", GetParam()); } +TEST_P(LoadScript, HanS) { LangLoader("script/HanS", GetParam()); } +TEST_P(LoadScript, HanS_vert) { LangLoader("script/HanS_vert", GetParam()); } +TEST_P(LoadScript, HanT) { LangLoader("script/HanT", GetParam()); } +TEST_P(LoadScript, HanT_vert) { LangLoader("script/HanT_vert", GetParam()); } +TEST_P(LoadScript, Hangul) { LangLoader("script/Hangul", GetParam()); } +TEST_P(LoadScript, Hangul_vert) { + LangLoader("script/Hangul_vert", GetParam()); +} +TEST_P(LoadScript, Hebrew) { LangLoader("script/Hebrew", GetParam()); } +TEST_P(LoadScript, Japanese) { LangLoader("script/Japanese", GetParam()); } +TEST_P(LoadScript, Japanese_vert) { + LangLoader("script/Japanese_vert", GetParam()); +} +TEST_P(LoadScript, Kannada) { LangLoader("script/Kannada", GetParam()); } +TEST_P(LoadScript, Khmer) { LangLoader("script/Khmer", GetParam()); } +TEST_P(LoadScript, Lao) { LangLoader("script/Lao", GetParam()); } +TEST_P(LoadScript, Latin) { LangLoader("script/Latin", GetParam()); } +TEST_P(LoadScript, Malayalam) { LangLoader("script/Malayalam", GetParam()); } +TEST_P(LoadScript, Myanmar) { LangLoader("script/Myanmar", GetParam()); } +TEST_P(LoadScript, Oriya) { LangLoader("script/Oriya", GetParam()); } +TEST_P(LoadScript, Sinhala) { LangLoader("script/Sinhala", GetParam()); } +TEST_P(LoadScript, Syriac) { LangLoader("script/Syriac", GetParam()); } +TEST_P(LoadScript, Tamil) { LangLoader("script/Tamil", GetParam()); } +TEST_P(LoadScript, Telugu) { LangLoader("script/Telugu", GetParam()); } +TEST_P(LoadScript, Thaana) { LangLoader("script/Thaana", GetParam()); } +TEST_P(LoadScript, Thai) { LangLoader("script/Thai", GetParam()); } +TEST_P(LoadScript, Tibetan) { LangLoader("script/Tibetan", GetParam()); } +TEST_P(LoadScript, Vietnamese) { LangLoader("script/Vietnamese", GetParam()); } + +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_fast, LoadScript, + ::testing::Values(TESSDATA_DIR "_fast")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata_best, LoadScript, + ::testing::Values(TESSDATA_DIR "_best")); +INSTANTIATE_TEST_SUITE_P(DISABLED_Tessdata, LoadScript, + ::testing::Values(TESSDATA_DIR)); + +class LoadLang : public QuickTest {}; + +// Test Load of English here, as the parameterized tests are disabled by +// default. +TEST_F(LoadLang, engFast) { LangLoader("eng", TESSDATA_DIR "_fast"); } +TEST_F(LoadLang, engBest) { LangLoader("eng", TESSDATA_DIR "_best"); } +TEST_F(LoadLang, engBestInt) { LangLoader("eng", TESSDATA_DIR); } + +// Use class LoadLang for languages which are NOT there in all three repos +TEST_F(LoadLang, kmrFast) { LangLoader("kmr", TESSDATA_DIR "_fast"); } +TEST_F(LoadLang, kmrBest) { LangLoader("kmr", TESSDATA_DIR "_best"); } +// TEST_F(LoadLang, kmrBestInt) {LangLoader("kmr" , TESSDATA_DIR);} + +} // namespace diff --git a/tesseract/unittest/log.h b/tesseract/unittest/log.h new file mode 100644 index 00000000..0b21f3ee --- /dev/null +++ b/tesseract/unittest/log.h @@ -0,0 +1,67 @@ +/////////////////////////////////////////////////////////////////////// +// File: log.h +// Description: Include for custom log message for unittest for tesseract. +// based on +// https://stackoverflow.com/questions/16491675/how-to-send-custom-message-in-google-c-testing-framework +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_UNITTEST_LOG_H_ +#define TESSERACT_UNITTEST_LOG_H_ + +// This is a minimal implementation of the TensorFlow logging API +// which is sufficient for the Tesseract unit tests. + +// See tensorflow/core/platform/default/logging.h for the original code. + +#include <iostream> + +enum LogLevel { + INFO, WARNING, ERROR, FATAL +}; + +// Avoid conflict with logging.h from TensorFlow. +#undef LOG + +static inline std::ostream& LOG(enum LogLevel level) +{ + switch (level) { + case INFO: + std::cout << "[INFO] "; + break; + case WARNING: + std::cout << "[WARN] "; + break; + case ERROR: + std::cout << "[ERROR] "; + break; + case FATAL: + std::cout << "[FATAL] "; + break; + } + return std::cout; +} + +// Avoid conflict with logging.h from TensorFlow. +#undef QCHECK + +// https://github.com/google/ion/blob/master/ion/base/logging.h +static inline std::ostream& QCHECK(bool condition) +{ + if (condition) { + static std::ostream null_stream(nullptr); + return null_stream; + } + return std::cout; +} + +#endif // TESSERACT_UNITTEST_LOG_H_ diff --git a/tesseract/unittest/lstm_recode_test.cc b/tesseract/unittest/lstm_recode_test.cc new file mode 100644 index 00000000..5365bf4b --- /dev/null +++ b/tesseract/unittest/lstm_recode_test.cc @@ -0,0 +1,45 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lstm_test.h" + +namespace tesseract { + +// Tests that training with unicharset recoding learns faster than without, +// for Korean. This test is split in two, so it can be run sharded. + +TEST_F(LSTMTrainerTest, RecodeTestKorBase) { + // A basic single-layer, bi-di 1d LSTM on Korean. + SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-full", "kor/kor.unicharset", + "kor.Arial_Unicode_MS.exp0.lstmf", false, true, 5e-4, false, "kor"); + double kor_full_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(kor_full_err, 88); +// EXPECT_GT(kor_full_err, 85); + LOG(INFO) << "********** Expected < 88 ************\n" ; +} + +TEST_F(LSTMTrainerTest, RecodeTestKor) { + // A basic single-layer, bi-di 1d LSTM on Korean. + SetupTrainer("[1,1,0,32 Lbx96 O1c1]", "kor-recode", "kor/kor.unicharset", + "kor.Arial_Unicode_MS.exp0.lstmf", true, true, 5e-4, false, "kor"); + double kor_recode_err = TrainIterations(kTrainerIterations); + EXPECT_LT(kor_recode_err, 60); + LOG(INFO) << "********** Expected < 60 ************\n" ; +} + +// Tests that the given string encodes and decodes back to the same +// with both recode on and off for Korean. 
+ +TEST_F(LSTMTrainerTest, EncodeDecodeBothTestKor) { + TestEncodeDecodeBoth("kor", "한국어 위키백과에 오신 것을 환영합니다!"); +} + +} // namespace tesseract. diff --git a/tesseract/unittest/lstm_squashed_test.cc b/tesseract/unittest/lstm_squashed_test.cc new file mode 100644 index 00000000..1dd08746 --- /dev/null +++ b/tesseract/unittest/lstm_squashed_test.cc @@ -0,0 +1,31 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lstm_test.h" + +namespace tesseract { + +// Tests that a Squashed network learns correctly. +// Almost as fast as the 2d-lstm. +TEST_F(LSTMTrainerTest, TestSquashed) { + // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom, and + // a small convolution/maxpool below that. + // Match training conditions to those typically used with this spec: + // recoding on, adam on. + SetupTrainerEng("[1,32,0,1 Ct3,3,16 Mp3,3 Lfys48 Lbx96 O1c1]", + "SQU-2-layer-lstm", /*recode*/ true, /*adam*/ true); + double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2); + EXPECT_LT(lstm_2d_err, 80); + LOG(INFO) << "********** < 80 ************\n" ; + TestIntMode(kTrainerIterations); +} + +} // namespace tesseract. diff --git a/tesseract/unittest/lstm_test.cc b/tesseract/unittest/lstm_test.cc new file mode 100644 index 00000000..930384a6 --- /dev/null +++ b/tesseract/unittest/lstm_test.cc @@ -0,0 +1,221 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Generating the training data: +// If the format of the lstmf (ImageData) file changes, the training data will +// have to be regenerated as follows: +// +// Use --xsize 800 for text2image to be similar to original training data. +// +// src/training/tesstrain.sh --fonts_dir /usr/share/fonts --lang eng \ +// --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \ +// --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \ +// --fontlist "Arial" --maxpages 10 +// + +#include "lstm_test.h" + +namespace tesseract { + +// Tests that some simple networks can learn Arial and meet accuracy targets. +TEST_F(LSTMTrainerTest, BasicTest) { + // A Convolver sliding window classifier without LSTM. 
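+ // Reading the VGSL-style spec below (descriptive only, not exhaustive):
+ // Ct5,5,16 is a 5x5 convolution with 16 outputs and tanh, Mp4,4 a 4x4
+ // maxpool, and O1c1 a 1-d CTC output layer; later specs use Lfx/Lbx/Lfys
+ // for forward, bidirectional and y-summarizing LSTMs.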
+ SetupTrainer( + "[1,32,0,1 Ct5,5,16 Mp4,4 Ct1,1,16 Ct3,3,128 Mp4,1 Ct1,1,64 S2,1 " + "Ct1,1,64O1c1]", + "no-lstm", "eng/eng.unicharset", "eng.Arial.exp0.lstmf", false, false, + 2e-4, false, "eng"); + double non_lstm_err = TrainIterations(kTrainerIterations * 4); + EXPECT_LT(non_lstm_err, 98); + LOG(INFO) << "********** Expected < 98 ************\n" ; + + // A basic single-layer, single direction LSTM. + SetupTrainerEng("[1,1,0,32 Lfx100 O1c1]", "1D-lstm", false, false); + double lstm_uni_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_uni_err, 86); + LOG(INFO) << "********** Expected < 86 ************\n" ; + // Beats the convolver. (Although it does have a lot more weights, it still + // iterates faster.) + EXPECT_LT(lstm_uni_err, non_lstm_err); +} + +// Color learns almost as fast as normalized grey/2D. +TEST_F(LSTMTrainerTest, ColorTest) { + // A basic single-layer, single direction LSTM. + SetupTrainerEng("[1,32,0,3 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2D-color-lstm", true, true); + double lstm_uni_err = TrainIterations(kTrainerIterations); + EXPECT_LT(lstm_uni_err, 85); +// EXPECT_GT(lstm_uni_err, 66); + LOG(INFO) << "********** Expected < 85 ************\n" ; +} + +TEST_F(LSTMTrainerTest, BidiTest) { + // A basic single-layer, bi-di 1d LSTM. + SetupTrainerEng("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", false, false); + double lstm_bi_err = TrainIterations(kTrainerIterations); + EXPECT_LT(lstm_bi_err, 75); + LOG(INFO) << "********** Expected < 75 ************\n" ; + // Int mode training is dead, so convert the trained network to int and check + // that its error rate is close to the float version. + TestIntMode(kTrainerIterations); +} + +// Tests that a 2d-2-layer network learns correctly. +// It takes a lot of iterations to get there. +TEST_F(LSTMTrainerTest, Test2D) { + // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + double lstm_2d_err = TrainIterations(kTrainerIterations * 3 / 2 ); + EXPECT_LT(lstm_2d_err, 98); +// EXPECT_GT(lstm_2d_err, 90); + LOG(INFO) << "********** Expected < 98 ************\n" ; + // Int mode training is dead, so convert the trained network to int and check + // that its error rate is close to the float version. + TestIntMode(kTrainerIterations); +} + +// Tests that a 2d-2-layer network with Adam does *a lot* better than +// without it. +TEST_F(LSTMTrainerTest, TestAdam) { + // A 2-layer LSTM with a 2-D feature-extracting LSTM on the bottom. + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, true); + double lstm_2d_err = TrainIterations(kTrainerIterations); + EXPECT_LT(lstm_2d_err, 70); + LOG(INFO) << "********** Expected < 70 ************\n" ; + TestIntMode(kTrainerIterations); +} + +// Trivial test of training speed on a fairly complex network. +TEST_F(LSTMTrainerTest, SpeedTest) { + SetupTrainerEng( + "[1,30,0,1 Ct5,5,16 Mp2,2 L2xy24 Ct1,1,48 Mp5,1 Ct1,1,32 S3,1 Lbx64 " + "O1c1]", + "2-D-2-layer-lstm", false, true); + TrainIterations(kTrainerIterations); + LOG(INFO) << "********** *** ************\n" ; +} + +// Tests that two identical networks trained the same get the same results. +// Also tests that the same happens with a serialize/deserialize in the middle. 
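+// (Sketch of the flow below: train network A and snapshot it with
+// SaveTrainingDump, train an identical network B and expect identical error
+// metrics, then reload the snapshot with ReadTrainingDump and expect the
+// continued training to match as well.)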
+TEST_F(LSTMTrainerTest, DeterminismTest) { + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + double lstm_2d_err_a = TrainIterations(kTrainerIterations); + double act_error_a = trainer_->ActivationError(); + double char_error_a = trainer_->CharError(); + std::vector<char> trainer_a_data; + EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), + &trainer_a_data)); + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + double lstm_2d_err_b = TrainIterations(kTrainerIterations); + double act_error_b = trainer_->ActivationError(); + double char_error_b = trainer_->CharError(); + EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b); + EXPECT_FLOAT_EQ(act_error_a, act_error_b); + EXPECT_FLOAT_EQ(char_error_a, char_error_b); + // Now train some more iterations. + lstm_2d_err_b = TrainIterations(kTrainerIterations / 3); + act_error_b = trainer_->ActivationError(); + char_error_b = trainer_->CharError(); + // Unpack into a new trainer and train that some more too. + SetupTrainerEng("[1,32,0,1 S4,2 L2xy16 Ct1,1,16 S8,1 Lbx100 O1c1]", + "2-D-2-layer-lstm", false, false); + EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_a_data, trainer_.get())); + lstm_2d_err_a = TrainIterations(kTrainerIterations / 3); + act_error_a = trainer_->ActivationError(); + char_error_a = trainer_->CharError(); + EXPECT_FLOAT_EQ(lstm_2d_err_a, lstm_2d_err_b); + EXPECT_FLOAT_EQ(act_error_a, act_error_b); + EXPECT_FLOAT_EQ(char_error_a, char_error_b); + LOG(INFO) << "********** *** ************\n" ; +} + +// The baseline network against which to test the built-in softmax. +TEST_F(LSTMTrainerTest, SoftmaxBaselineTest) { + // A basic single-layer, single direction LSTM. + SetupTrainerEng("[1,1,0,32 Lfx96 O1c1]", "1D-lstm", false, true); + double lstm_uni_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_uni_err, 60); +// EXPECT_GT(lstm_uni_err, 48); + LOG(INFO) << "********** Expected < 60 ************\n" ; + // Check that it works in int mode too. + TestIntMode(kTrainerIterations); + // If we run TestIntMode again, it tests that int_mode networks can + // serialize and deserialize correctly. + double delta = TestIntMode(kTrainerIterations); + // The two tests (both of int mode this time) should be almost identical. + LOG(INFO) << "Delta in Int mode error rates = " << delta << "\n"; + EXPECT_LT(delta, 0.01); +} + +// Tests that the built-in softmax does better than the external one, +// which has an error rate slightly less than 55%, as tested by +// SoftmaxBaselineTest. +TEST_F(LSTMTrainerTest, SoftmaxTest) { + // LSTM with a built-in softmax can beat the external softmax. + SetupTrainerEng("[1,1,0,32 LS96]", "Lstm-+-softmax", false, true); + double lstm_sm_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_sm_err, 49.0); + LOG(INFO) << "********** Expected < 49 ************\n" ; + // Check that it works in int mode too. + TestIntMode(kTrainerIterations); +} + +// Tests that the built-in encoded softmax does better than the external one. +// It takes a lot of iterations to get there. +TEST_F(LSTMTrainerTest, EncodedSoftmaxTest) { + // LSTM with a built-in encoded softmax can beat the external softmax. + SetupTrainerEng("[1,1,0,32 LE96]", "Lstm-+-softmax", false, true); + double lstm_sm_err = TrainIterations(kTrainerIterations * 2); + EXPECT_LT(lstm_sm_err, 62.0); + LOG(INFO) << "********** Expected < 62 ************\n" ; + // Check that it works in int mode too. 
+ TestIntMode(kTrainerIterations); +} + +// Tests that layer access methods work correctly. +TEST_F(LSTMTrainerTest, TestLayerAccess) { + // A 2-layer LSTM with a Squashed feature-extracting LSTM on the bottom. + SetupTrainerEng("[1,32,0,1 Ct5,5,16 Mp2,2 Lfys32 Lbx128 O1c1]", "SQU-lstm", + false, false); + // Number of layers. + const int kNumLayers = 8; + // Expected layer names. + const char* kLayerIds[kNumLayers] = {":0", ":1:0", ":1:1", ":2", + ":3:0", ":4:0", ":4:1:0", ":5"}; + const char* kLayerNames[kNumLayers] = {"Input", "Convolve", "ConvNL", + "Maxpool", "Lfys32", "Lbx128LTR", + "Lbx128", "Output"}; + // Expected number of weights. + const int kNumWeights[kNumLayers] = {0, + 0, + 16 * (25 + 1), + 0, + 32 * (4 * (32 + 16 + 1)), + 128 * (4 * (128 + 32 + 1)), + 128 * (4 * (128 + 32 + 1)), + 112 * (2 * 128 + 1)}; + + auto layers = trainer_->EnumerateLayers(); + EXPECT_EQ(kNumLayers, layers.size()); + for (int i = 0; i < kNumLayers && i < layers.size(); ++i) { + EXPECT_STREQ(kLayerIds[i], layers[i].c_str()); + EXPECT_STREQ(kLayerNames[i], + trainer_->GetLayer(layers[i])->name().c_str()); + EXPECT_EQ(kNumWeights[i], trainer_->GetLayer(layers[i])->num_weights()); + } +} + +} // namespace tesseract. diff --git a/tesseract/unittest/lstm_test.h b/tesseract/unittest/lstm_test.h new file mode 100644 index 00000000..4f3d9572 --- /dev/null +++ b/tesseract/unittest/lstm_test.h @@ -0,0 +1,189 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TESSERACT_UNITTEST_LSTM_TEST_H_ +#define TESSERACT_UNITTEST_LSTM_TEST_H_ + +#include <memory> +#include <string> +#include <utility> + +#include "include_gunit.h" + +#include "absl/strings/str_cat.h" +#include "tprintf.h" +#include "helpers.h" + +#include "functions.h" +#include "lang_model_helpers.h" +#include "log.h" // for LOG +#include "lstmtrainer.h" +#include "unicharset.h" + +namespace tesseract { + +#if DEBUG_DETAIL == 0 +// Number of iterations to run all the trainers. +const int kTrainerIterations = 600; +// Number of iterations between accuracy checks. +const int kBatchIterations = 100; +#else +// Number of iterations to run all the trainers. +const int kTrainerIterations = 2; +// Number of iterations between accuracy checks. +const int kBatchIterations = 1; +#endif + +// The fixture for testing LSTMTrainer. 
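+// It provides helpers to build a trainer from a VGSL spec (SetupTrainer /
+// SetupTrainerEng), to train and report the best char error (TrainIterations),
+// to measure error without updating weights (TestIterations), to compare float
+// and int inference (TestIntMode), and to round-trip a string through the
+// label encoder (TestEncodeDecode / TestEncodeDecodeBoth).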
+class LSTMTrainerTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + LSTMTrainerTest() {} + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTDATA_DIR, + "" + name); + } + std::string TessDataNameToPath(const std::string& name) { + return file::JoinPath(TESSDATA_DIR, + "" + name); + } + std::string TestingNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, + "" + name); + } + + void SetupTrainerEng(const std::string& network_spec, const std::string& model_name, + bool recode, bool adam) { + SetupTrainer(network_spec, model_name, "eng/eng.unicharset", + "eng.Arial.exp0.lstmf", recode, adam, 5e-4, false, "eng"); + } + void SetupTrainer(const std::string& network_spec, const std::string& model_name, + const std::string& unicharset_file, const std::string& lstmf_file, + bool recode, bool adam, double learning_rate, + bool layer_specific, const std::string& kLang) { +// constexpr char kLang[] = "eng"; // Exact value doesn't matter. + std::string unicharset_name = TestDataNameToPath(unicharset_file); + UNICHARSET unicharset; + ASSERT_TRUE(unicharset.load_from_file(unicharset_name.c_str(), false)); + std::string script_dir = file::JoinPath( + LANGDATA_DIR, ""); + std::vector<STRING> words; + EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, "", FLAGS_test_tmpdir, + kLang, !recode, words, words, words, false, + nullptr, nullptr)); + std::string model_path = file::JoinPath(FLAGS_test_tmpdir, model_name); + std::string checkpoint_path = model_path + "_checkpoint"; + trainer_.reset(new LSTMTrainer(model_path.c_str(), checkpoint_path.c_str(), + 0, 0)); + trainer_->InitCharSet(file::JoinPath(FLAGS_test_tmpdir, kLang, + absl::StrCat(kLang, ".traineddata"))); + int net_mode = adam ? NF_ADAM : 0; + // Adam needs a higher learning rate, due to not multiplying the effective + // rate by 1/(1-momentum). + if (adam) learning_rate *= 20.0; + if (layer_specific) net_mode |= NF_LAYER_SPECIFIC_LR; + EXPECT_TRUE(trainer_->InitNetwork(network_spec.c_str(), -1, net_mode, 0.1, + learning_rate, 0.9, 0.999)); + std::vector<STRING> filenames; + filenames.push_back(STRING(TestDataNameToPath(lstmf_file).c_str())); + EXPECT_TRUE(trainer_->LoadAllTrainingData(filenames, CS_SEQUENTIAL, false)); + LOG(INFO) << "Setup network:" << model_name << "\n" ; + } + // Trains for a given number of iterations and returns the char error rate. + double TrainIterations(int max_iterations) { + int iteration = trainer_->training_iteration(); + int iteration_limit = iteration + max_iterations; + double best_error = 100.0; + do { + STRING log_str; + int target_iteration = iteration + kBatchIterations; + // Train a few. + double mean_error = 0.0; + while (iteration < target_iteration && iteration < iteration_limit) { + trainer_->TrainOnLine(trainer_.get(), false); + iteration = trainer_->training_iteration(); + mean_error += trainer_->LastSingleError(ET_CHAR_ERROR); + } + trainer_->MaintainCheckpoints(nullptr, &log_str); + iteration = trainer_->training_iteration(); + mean_error *= 100.0 / kBatchIterations; + if (mean_error < best_error) best_error = mean_error; + } while (iteration < iteration_limit); + LOG(INFO) << "Trainer error rate = " << best_error << "\n"; + return best_error; + } + // Tests for a given number of iterations and returns the char error rate. 
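+  // Unlike TrainIterations, this only runs the forward pass via
+  // PrepareForBackward to accumulate ET_CHAR_ERROR; no weights are updated.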
+ double TestIterations(int max_iterations) { + CHECK_GT(max_iterations, 0); + int iteration = trainer_->sample_iteration(); + double mean_error = 0.0; + int error_count = 0; + while (error_count < max_iterations) { + const ImageData& trainingdata = + *trainer_->mutable_training_data()->GetPageBySerial(iteration); + NetworkIO fwd_outputs, targets; + if (trainer_->PrepareForBackward(&trainingdata, &fwd_outputs, &targets) != + UNENCODABLE) { + mean_error += trainer_->NewSingleError(ET_CHAR_ERROR); + ++error_count; + } + trainer_->SetIteration(++iteration); + } + mean_error *= 100.0 / max_iterations; + LOG(INFO) << "Tester error rate = " << mean_error << "\n" ; + return mean_error; + } + // Tests that the current trainer_ can be converted to int mode and still gets + // within 1% of the error rate. Returns the increase in error from float to + // int. + double TestIntMode(int test_iterations) { + std::vector<char> trainer_data; + EXPECT_TRUE(trainer_->SaveTrainingDump(NO_BEST_TRAINER, trainer_.get(), + &trainer_data)); + // Get the error on the next few iterations in float mode. + double float_err = TestIterations(test_iterations); + // Restore the dump, convert to int and test error on that. + EXPECT_TRUE(trainer_->ReadTrainingDump(trainer_data, trainer_.get())); + trainer_->ConvertToInt(); + double int_err = TestIterations(test_iterations); + EXPECT_LT(int_err, float_err + 1.0); + return int_err - float_err; + } + // Sets up a trainer with the given language and given recode+ctc condition. + // It then verifies that the given str encodes and decodes back to the same + // string. + void TestEncodeDecode(const std::string& lang, const std::string& str, bool recode) { + std::string unicharset_name = lang + "/" + lang + ".unicharset"; + std::string lstmf_name = lang + ".Arial_Unicode_MS.exp0.lstmf"; + SetupTrainer("[1,1,0,32 Lbx100 O1c1]", "bidi-lstm", unicharset_name, + lstmf_name, recode, true, 5e-4, true, lang); + std::vector<int> labels; + EXPECT_TRUE(trainer_->EncodeString(str.c_str(), &labels)); + STRING decoded = trainer_->DecodeLabels(labels); + std::string decoded_str(&decoded[0], decoded.length()); + EXPECT_EQ(str, decoded_str); + } + // Calls TestEncodeDeode with both recode on and off. + void TestEncodeDecodeBoth(const std::string& lang, const std::string& str) { + TestEncodeDecode(lang, str, false); + TestEncodeDecode(lang, str, true); + } + + std::unique_ptr<LSTMTrainer> trainer_; +}; + +} // namespace tesseract. + +#endif // THIRD_PARTY_TESSERACT_UNITTEST_LSTM_TEST_H_ diff --git a/tesseract/unittest/lstmtrainer_test.cc b/tesseract/unittest/lstmtrainer_test.cc new file mode 100644 index 00000000..967d1fe5 --- /dev/null +++ b/tesseract/unittest/lstmtrainer_test.cc @@ -0,0 +1,106 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "lstm_test.h" + +namespace tesseract { + +TEST_F(LSTMTrainerTest, EncodesEng) { + TestEncodeDecodeBoth("eng", + "The quick brown 'fox' jumps over: the lazy dog!"); +} + +TEST_F(LSTMTrainerTest, EncodesKan) { + TestEncodeDecodeBoth("kan", "ಫ್ರಬ್ರವರಿ ತತ್ವಾಂಶಗಳೆಂದರೆ ಮತ್ತು ಜೊತೆಗೆ ಕ್ರಮವನ್ನು"); +} + +TEST_F(LSTMTrainerTest, EncodesKor) { + TestEncodeDecodeBoth("kor", + "이는 것으로 다시 넣을 수는 있지만 선택의 의미는"); +} + +TEST_F(LSTMTrainerTest, MapCoder) { + LSTMTrainer fra_trainer; + fra_trainer.InitCharSet(TestDataNameToPath("fra/fra.traineddata")); + LSTMTrainer deu_trainer; + deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata")); + // A string that uses characters common to French and German. + std::string kTestStr = "The quick brown 'fox' jumps over: the lazy dog!"; + std::vector<int> deu_labels; + EXPECT_TRUE(deu_trainer.EncodeString(kTestStr.c_str(), &deu_labels)); + // The french trainer cannot decode them correctly. + STRING badly_decoded = fra_trainer.DecodeLabels(deu_labels); + std::string bad_str(&badly_decoded[0], badly_decoded.length()); + LOG(INFO) << "bad_str fra=" << bad_str << "\n"; + EXPECT_NE(kTestStr, bad_str); + // Encode the string as fra. + std::vector<int> fra_labels; + EXPECT_TRUE(fra_trainer.EncodeString(kTestStr.c_str(), &fra_labels)); + // Use the mapper to compute what the labels are as deu. + std::vector<int> mapping = fra_trainer.MapRecoder(deu_trainer.GetUnicharset(), + deu_trainer.GetRecoder()); + std::vector<int> mapped_fra_labels(fra_labels.size(), -1); + for (int i = 0; i < fra_labels.size(); ++i) { + mapped_fra_labels[i] = mapping[fra_labels[i]]; + EXPECT_NE(-1, mapped_fra_labels[i]) << "i=" << i << ", ch=" << kTestStr[i]; + EXPECT_EQ(mapped_fra_labels[i], deu_labels[i]) + << "i=" << i << ", ch=" << kTestStr[i] + << " has deu label=" << deu_labels[i] << ", but mapped to " + << mapped_fra_labels[i]; + } + // The german trainer can now decode them correctly. + STRING decoded = deu_trainer.DecodeLabels(mapped_fra_labels); + std::string ok_str(&decoded[0], decoded.length()); + LOG(INFO) << "ok_str deu=" << ok_str << "\n"; + EXPECT_EQ(kTestStr, ok_str); +} + +// Tests that the actual fra model can be converted to the deu character set +// and still read an eng image with 100% accuracy. +TEST_F(LSTMTrainerTest, ConvertModel) { + // Setup a trainer with a deu charset. + LSTMTrainer deu_trainer; + deu_trainer.InitCharSet(TestDataNameToPath("deu/deu.traineddata")); + // Load the fra traineddata, strip out the model, and save to a tmp file. + TessdataManager mgr; + std::string fra_data = + file::JoinPath(TESSDATA_DIR "_best", "fra.traineddata"); + CHECK(mgr.Init(fra_data.c_str())); + LOG(INFO) << "Load " << fra_data << "\n"; + file::MakeTmpdir(); + std::string model_path = file::JoinPath(FLAGS_test_tmpdir, "fra.lstm"); + CHECK(mgr.ExtractToFile(model_path.c_str())); + LOG(INFO) << "Extract " << model_path << "\n"; + // Load the fra model into the deu_trainer, and save the converted model. + CHECK(deu_trainer.TryLoadingCheckpoint(model_path.c_str(), fra_data.c_str())); + LOG(INFO) << "Checkpoint load for " << model_path << " and " << fra_data << "\n"; + std::string deu_data = file::JoinPath(FLAGS_test_tmpdir, "deu.traineddata"); + CHECK(deu_trainer.SaveTraineddata(deu_data.c_str())); + LOG(INFO) << "Save " << deu_data << "\n"; + // Now run the saved model on phototest. (See BasicTesseractTest in + // baseapi_test.cc). 
+ TessBaseAPI api; + api.Init(FLAGS_test_tmpdir, "deu", tesseract::OEM_LSTM_ONLY); + Pix* src_pix = pixRead(TestingNameToPath("phototest.tif").c_str()); + CHECK(src_pix); + api.SetImage(src_pix); + std::unique_ptr<char[]> result(api.GetUTF8Text()); + std::string truth_text; + CHECK_OK(file::GetContents(TestingNameToPath("phototest.gold.txt"), + &truth_text, file::Defaults())); + + EXPECT_STREQ(truth_text.c_str(), result.get()); + pixDestroy(&src_pix); +} + +} // namespace tesseract diff --git a/tesseract/unittest/mastertrainer_test.cc b/tesseract/unittest/mastertrainer_test.cc new file mode 100644 index 00000000..0f93e221 --- /dev/null +++ b/tesseract/unittest/mastertrainer_test.cc @@ -0,0 +1,298 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Although this is a trivial-looking test, it exercises a lot of code: +// SampleIterator has to correctly iterate over the correct characters, or +// it will fail. +// The canonical and cloud features computed by TrainingSampleSet need to +// be correct, along with the distance caches, organizing samples by font +// and class, indexing of features, distance calculations. +// IntFeatureDist has to work, or the canonical samples won't work. +// Mastertrainer has ability to read tr files and set itself up tested. +// Finally the serialize/deserialize test ensures that MasterTrainer, +// TrainingSampleSet, TrainingSample can all serialize/deserialize correctly +// enough to reproduce the same results. + +#include "include_gunit.h" + +#include "log.h" // for LOG +#include "unicharset.h" +#include "errorcounter.h" +#include "mastertrainer.h" +#include "shapeclassifier.h" +#include "shapetable.h" +#include "trainingsample.h" +#include "commontraining.h" + +#include "absl/strings/numbers.h" // for safe_strto32 +#include "absl/strings/str_split.h" // for absl::StrSplit + +#include <string> +#include <utility> +#include <vector> + +using namespace tesseract; + +// Specs of the MockClassifier. +static const int kNumTopNErrs = 10; +static const int kNumTop2Errs = kNumTopNErrs + 20; +static const int kNumTop1Errs = kNumTop2Errs + 30; +static const int kNumTopTopErrs = kNumTop1Errs + 25; +static const int kNumNonReject = 1000; +static const int kNumCorrect = kNumNonReject - kNumTop1Errs; +// The total number of answers is given by the number of non-rejects plus +// all the multiple answers. +static const int kNumAnswers = kNumNonReject + 2 * (kNumTop2Errs - kNumTopNErrs) + + (kNumTop1Errs - kNumTop2Errs) + + (kNumTopTopErrs - kNumTop1Errs); + +#ifndef DISABLED_LEGACY_ENGINE +static bool safe_strto32(const std::string& str, int* pResult) +{ + long n = strtol(str.c_str(), nullptr, 0); + *pResult = n; + return true; +} +#endif + +// Mock ShapeClassifier that cheats by looking at the correct answer, and +// creates a specific pattern of errors that can be tested. 
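+// The kNum* constants above are cumulative cut-offs on the running sample
+// count: samples 1-10 are top-n errors, 11-30 top-2 errors, 31-60 top-1
+// errors, 61-85 near-miss top errors, the remainder of the first 1000 are
+// answered correctly (one of them with a deliberately wrong font), and every
+// sample after the first 1000 is rejected.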
+class MockClassifier : public ShapeClassifier { + public: + explicit MockClassifier(ShapeTable* shape_table) + : shape_table_(shape_table), num_done_(0), done_bad_font_(false) { + // Add a false font answer to the shape table. We pick a random unichar_id, + // add a new shape for it with a false font. Font must actually exist in + // the font table, but not match anything in the first 1000 samples. + false_unichar_id_ = 67; + false_shape_ = shape_table_->AddShape(false_unichar_id_, 25); + } + virtual ~MockClassifier() {} + + // Classifies the given [training] sample, writing to results. + // If debug is non-zero, then various degrees of classifier dependent debug + // information is provided. + // If keep_this (a shape index) is >= 0, then the results should always + // contain keep_this, and (if possible) anything of intermediate confidence. + // The return value is the number of classes saved in results. + int ClassifySample(const TrainingSample& sample, Pix* page_pix, + int debug, UNICHAR_ID keep_this, + std::vector<ShapeRating>* results) override { + results->clear(); + // Everything except the first kNumNonReject is a reject. + if (++num_done_ > kNumNonReject) return 0; + + int class_id = sample.class_id(); + int font_id = sample.font_id(); + int shape_id = shape_table_->FindShape(class_id, font_id); + // Get ids of some wrong answers. + int wrong_id1 = shape_id > 10 ? shape_id - 1 : shape_id + 1; + int wrong_id2 = shape_id > 10 ? shape_id - 2 : shape_id + 2; + if (num_done_ <= kNumTopNErrs) { + // The first kNumTopNErrs are top-n errors. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + } else if (num_done_ <= kNumTop2Errs) { + // The next kNumTop2Errs - kNumTopNErrs are top-2 errors. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + results->push_back(ShapeRating(wrong_id2, 0.875f)); + results->push_back(ShapeRating(shape_id, 0.75f)); + } else if (num_done_ <= kNumTop1Errs) { + // The next kNumTop1Errs - kNumTop2Errs are top-1 errors. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + results->push_back(ShapeRating(shape_id, 0.8f)); + } else if (num_done_ <= kNumTopTopErrs) { + // The next kNumTopTopErrs - kNumTop1Errs are cases where the actual top + // is not correct, but do not count as a top-1 error because the rating + // is close enough to the top answer. + results->push_back(ShapeRating(wrong_id1, 1.0f)); + results->push_back(ShapeRating(shape_id, 0.99f)); + } else if (!done_bad_font_ && class_id == false_unichar_id_) { + // There is a single character with a bad font. + results->push_back(ShapeRating(false_shape_, 1.0f)); + done_bad_font_ = true; + } else { + // Everything else is correct. + results->push_back(ShapeRating(shape_id, 1.0f)); + } + return results->size(); + } + // Provides access to the ShapeTable that this classifier works with. + const ShapeTable* GetShapeTable() const override { return shape_table_; } + + private: + // Borrowed pointer to the ShapeTable. + ShapeTable* shape_table_; + // Unichar_id of a random character that occurs after the first 60 samples. + int false_unichar_id_; + // Shape index of prepared false answer for false_unichar_id. + int false_shape_; + // The number of classifications we have processed. + int num_done_; + // True after the false font has been emitted. + bool done_bad_font_; +}; + +const double kMin1lDistance = 0.25; + +// The fixture for testing Tesseract. 
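+// LoadMasterTrainer builds a MasterTrainer and ShapeTable from the
+// eng.Arial.exp0.tr training file plus the eng unicharset, xheights and font
+// properties; VerifyIl1 then checks shape distances between I, l and 1.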
+class MasterTrainerTest : public testing::Test { +#ifndef DISABLED_LEGACY_ENGINE + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); + } + std::string TmpNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + + MasterTrainerTest() { + shape_table_ = nullptr; + master_trainer_ = nullptr; + } + ~MasterTrainerTest() { + delete shape_table_; + } + + // Initializes the master_trainer_ and shape_table_. + // if load_from_tmp, then reloads a master trainer that was saved by a + // previous call in which it was false. + void LoadMasterTrainer() { + FLAGS_output_trainer = TmpNameToPath("tmp_trainer").c_str(); + FLAGS_F = file::JoinPath(LANGDATA_DIR, "font_properties").c_str(); + FLAGS_X = TestDataNameToPath("eng.xheights").c_str(); + FLAGS_U = TestDataNameToPath("eng.unicharset").c_str(); + std::string tr_file_name(TestDataNameToPath("eng.Arial.exp0.tr")); + const char* argv[] = {tr_file_name.c_str()}; + int argc = 1; + STRING file_prefix; + delete shape_table_; + shape_table_ = nullptr; + master_trainer_ = + LoadTrainingData(argc, argv, false, &shape_table_, &file_prefix); + EXPECT_TRUE(master_trainer_ != nullptr); + EXPECT_TRUE(shape_table_ != nullptr); + } + + // EXPECTs that the distance between I and l in Arial is 0 and that the + // distance to 1 is significantly not 0. + void VerifyIl1() { + // Find the font id for Arial. + int font_id = master_trainer_->GetFontInfoId("Arial"); + EXPECT_GE(font_id, 0); + // Track down the characters we are interested in. + int unichar_I = master_trainer_->unicharset().unichar_to_id("I"); + EXPECT_GT(unichar_I, 0); + int unichar_l = master_trainer_->unicharset().unichar_to_id("l"); + EXPECT_GT(unichar_l, 0); + int unichar_1 = master_trainer_->unicharset().unichar_to_id("1"); + EXPECT_GT(unichar_1, 0); + // Now get the shape ids. + int shape_I = shape_table_->FindShape(unichar_I, font_id); + EXPECT_GE(shape_I, 0); + int shape_l = shape_table_->FindShape(unichar_l, font_id); + EXPECT_GE(shape_l, 0); + int shape_1 = shape_table_->FindShape(unichar_1, font_id); + EXPECT_GE(shape_1, 0); + + float dist_I_l = + master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_l); + // No tolerance here. We expect that I and l should match exactly. + EXPECT_EQ(0.0f, dist_I_l); + float dist_l_I = + master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_I); + // BOTH ways. + EXPECT_EQ(0.0f, dist_l_I); + + // l/1 on the other hand should be distinct. + float dist_l_1 = + master_trainer_->ShapeDistance(*shape_table_, shape_l, shape_1); + EXPECT_GT(dist_l_1, kMin1lDistance); + float dist_1_l = + master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_l); + EXPECT_GT(dist_1_l, kMin1lDistance); + + // So should I/1. + float dist_I_1 = + master_trainer_->ShapeDistance(*shape_table_, shape_I, shape_1); + EXPECT_GT(dist_I_1, kMin1lDistance); + float dist_1_I = + master_trainer_->ShapeDistance(*shape_table_, shape_1, shape_I); + EXPECT_GT(dist_1_I, kMin1lDistance); + } + + // Objects declared here can be used by all tests in the test case for Foo. + ShapeTable* shape_table_; + std::unique_ptr<MasterTrainer> master_trainer_; +#endif +}; + +// Tests that the MasterTrainer correctly loads its data and reaches the correct +// conclusion over the distance between Arial I l and 1. 
+TEST_F(MasterTrainerTest, Il1Test) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because LoadTrainingData is missing. + GTEST_SKIP(); +#else + // Initialize the master_trainer_ and load the Arial tr file. + LoadMasterTrainer(); + VerifyIl1(); +#endif +} + +// Tests the ErrorCounter using a MockClassifier to check that it counts +// error categories correctly. +TEST_F(MasterTrainerTest, ErrorCounterTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because LoadTrainingData is missing. + GTEST_SKIP(); +#else + // Initialize the master_trainer_ from the saved tmp file. + LoadMasterTrainer(); + // Add the space character to the shape_table_ if not already present to + // count junk. + if (shape_table_->FindShape(0, -1) < 0) shape_table_->AddShape(0, 0); + // Make a mock classifier. + auto shape_classifier = std::make_unique<MockClassifier>(shape_table_); + // Get the accuracy report. + STRING accuracy_report; + master_trainer_->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, 0, + false, shape_classifier.get(), + &accuracy_report); + LOG(INFO) << accuracy_report.c_str(); + std::string result_string = accuracy_report.c_str(); + std::vector<std::string> results = + absl::StrSplit(result_string, '\t', absl::SkipEmpty()); + EXPECT_EQ(tesseract::CT_SIZE + 1, results.size()); + int result_values[tesseract::CT_SIZE]; + for (int i = 0; i < tesseract::CT_SIZE; ++i) { + EXPECT_TRUE(safe_strto32(results[i + 1], &result_values[i])); + } + // These tests are more-or-less immune to additions to the number of + // categories or changes in the training data. + int num_samples = master_trainer_->GetSamples()->num_raw_samples(); + EXPECT_EQ(kNumCorrect, result_values[tesseract::CT_UNICHAR_TOP_OK]); + EXPECT_EQ(1, result_values[tesseract::CT_FONT_ATTR_ERR]); + EXPECT_EQ(kNumTopTopErrs, result_values[tesseract::CT_UNICHAR_TOPTOP_ERR]); + EXPECT_EQ(kNumTop1Errs, result_values[tesseract::CT_UNICHAR_TOP1_ERR]); + EXPECT_EQ(kNumTop2Errs, result_values[tesseract::CT_UNICHAR_TOP2_ERR]); + EXPECT_EQ(kNumTopNErrs, result_values[tesseract::CT_UNICHAR_TOPN_ERR]); + // Each of the TOPTOP errs also counts as a multi-unichar. + EXPECT_EQ(kNumTopTopErrs - kNumTop1Errs, + result_values[tesseract::CT_OK_MULTI_UNICHAR]); + EXPECT_EQ(num_samples - kNumNonReject, result_values[tesseract::CT_REJECT]); + EXPECT_EQ(kNumAnswers, result_values[tesseract::CT_NUM_RESULTS]); +#endif +} diff --git a/tesseract/unittest/matrix_test.cc b/tesseract/unittest/matrix_test.cc new file mode 100644 index 00000000..c900308d --- /dev/null +++ b/tesseract/unittest/matrix_test.cc @@ -0,0 +1,137 @@ +/////////////////////////////////////////////////////////////////////// +// File: matrix_test.cc +// Author: rays@google.com (Ray Smith) +// +// Copyright 2016 Google Inc. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#include "matrix.h" +#include "include_gunit.h" + +namespace tesseract { + +class MatrixTest : public ::testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + + // Fills src_ with data so it can pretend to be a tensor thus: + // dims_=[5, 4, 3, 2] + // array_=[0, 1, 2, ....119] + // tensor=[[[[0, 1][2, 3][4, 5]] + // [[6, 7][8, 9][10, 11]] + // [[12, 13][14, 15][16, 17]] + // [[18, 19][20, 21][22, 23]]] + // [[[24, 25]... + MatrixTest() { + src_.Resize(1, kInputSize_, 0); + for (int i = 0; i < kInputSize_; ++i) { + src_.put(0, i, i); + } + for (int i = 0; i < kNumDims_; ++i) dims_[i] = 5 - i; + } + // Number of dimensions in src_. + static const int kNumDims_ = 4; + // Number of elements in src_. + static const int kInputSize_ = 120; + // Size of each dimension in src_; + int dims_[kNumDims_]; + // Input array filled with [0,kInputSize). + GENERIC_2D_ARRAY<int> src_; +}; + +// Tests that the RotatingTranspose function does the right thing for various +// transformations. +// dims=[5, 4, 3, 2]->[5, 2, 4, 3] +TEST_F(MatrixTest, RotatingTranspose_3_1) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 3, 1, &m); + m.ResizeNoInit(kInputSize_ / 3, 3); + // Verify that the result is: + // output tensor=[[[[0, 2, 4][6, 8, 10][12, 14, 16][18, 20, 22]] + // [[1, 3, 5][7, 9, 11][13, 15, 17][19, 21, 23]]] + // [[[24, 26, 28]... + EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(2, m(0, 1)); + EXPECT_EQ(4, m(0, 2)); + EXPECT_EQ(6, m(1, 0)); + EXPECT_EQ(1, m(4, 0)); + EXPECT_EQ(24, m(8, 0)); + EXPECT_EQ(26, m(8, 1)); + EXPECT_EQ(25, m(12, 0)); +} + +// dims=[5, 4, 3, 2]->[3, 5, 4, 2] +TEST_F(MatrixTest, RotatingTranspose_2_0) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 2, 0, &m); + m.ResizeNoInit(kInputSize_ / 2, 2); + // Verify that the result is: + // output tensor=[[[[0, 1][6, 7][12, 13][18, 19]] + // [[24, 25][30, 31][36, 37][42, 43]] + // [[48, 49][54, 55][60, 61][66, 67]] + // [[72, 73][78, 79][84, 85][90, 91]] + // [[96, 97][102, 103][108, 109][114, 115]]] + // [[[2,3]... + EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(1, m(0, 1)); + EXPECT_EQ(6, m(1, 0)); + EXPECT_EQ(7, m(1, 1)); + EXPECT_EQ(24, m(4, 0)); + EXPECT_EQ(25, m(4, 1)); + EXPECT_EQ(30, m(5, 0)); + EXPECT_EQ(2, m(20, 0)); +} + +// dims=[5, 4, 3, 2]->[5, 3, 2, 4] +TEST_F(MatrixTest, RotatingTranspose_1_3) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 1, 3, &m); + m.ResizeNoInit(kInputSize_ / 4, 4); + // Verify that the result is: + // output tensor=[[[[0, 6, 12, 18][1, 7, 13, 19]] + // [[2, 8, 14, 20][3, 9, 15, 21]] + // [[4, 10, 16, 22][5, 11, 17, 23]]] + // [[[24, 30, 36, 42]... + EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(6, m(0, 1)); + EXPECT_EQ(1, m(1, 0)); + EXPECT_EQ(2, m(2, 0)); + EXPECT_EQ(3, m(3, 0)); + EXPECT_EQ(4, m(4, 0)); + EXPECT_EQ(5, m(5, 0)); + EXPECT_EQ(24, m(6, 0)); + EXPECT_EQ(30, m(6, 1)); +} + +// dims=[5, 4, 3, 2]->[4, 3, 5, 2] +TEST_F(MatrixTest, RotatingTranspose_0_2) { + GENERIC_2D_ARRAY<int> m; + src_.RotatingTranspose(dims_, kNumDims_, 0, 2, &m); + m.ResizeNoInit(kInputSize_ / 2, 2); + // Verify that the result is: + // output tensor=[[[[0, 1][24, 25][48, 49][72, 73][96, 97]] + // [[2, 3][26, 27][50, 51][74, 75][98, 99]] + // [[4, 5][28, 29][52, 53][76, 77][100, 101]]] + // [[[6, 7]... 
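+  // m is viewed as (kInputSize_ / 2) rows of 2, so the row index walks the
+  // flattened [4, 3, 5] outer dimensions of the transposed tensor and the
+  // column index picks within the innermost pairs shown in the comment above.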
+ EXPECT_EQ(0, m(0, 0)); + EXPECT_EQ(1, m(0, 1)); + EXPECT_EQ(24, m(1, 0)); + EXPECT_EQ(25, m(1, 1)); + EXPECT_EQ(96, m(4, 0)); + EXPECT_EQ(97, m(4, 1)); + EXPECT_EQ(2, m(5, 0)); + EXPECT_EQ(6, m(15, 0)); +} + +} // namespace diff --git a/tesseract/unittest/networkio_test.cc b/tesseract/unittest/networkio_test.cc new file mode 100644 index 00000000..3c25f14f --- /dev/null +++ b/tesseract/unittest/networkio_test.cc @@ -0,0 +1,217 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "networkio.h" +#include "stridemap.h" +#ifdef INCLUDE_TENSORFLOW +#include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D +#endif + +namespace tesseract { + +class NetworkioTest : public ::testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + +#ifdef INCLUDE_TENSORFLOW + // Sets up an Array2d object of the given size, initialized to increasing + // values starting with start. + std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) { + std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize)); + int value = start; + for (int y = 0; y < ysize; ++y) { + for (int x = 0; x < xsize; ++x) { + (*a)(y, x) = value++; + } + } + return a; + } + // Sets up a NetworkIO with a batch of 2 "images" of known values. + void SetupNetworkIO(NetworkIO* nio) { + std::vector<std::unique_ptr<xla::Array2D<int>>> arrays; + arrays.push_back(SetupArray(3, 4, 0)); + arrays.push_back(SetupArray(4, 5, 12)); + std::vector<std::pair<int, int>> h_w_sizes; + for (size_t i = 0; i < arrays.size(); ++i) { + h_w_sizes.emplace_back(arrays[i].get()->height(), + arrays[i].get()->width()); + } + StrideMap stride_map; + stride_map.SetStride(h_w_sizes); + nio->ResizeToMap(true, stride_map, 2); + // Iterate over the map, setting nio's contents from the arrays. + StrideMap::Index index(stride_map); + do { + int value = (*arrays[index.index(FD_BATCH)])(index.index(FD_HEIGHT), + index.index(FD_WIDTH)); + nio->SetPixel(index.t(), 0, 128 + value, 0.0f, 128.0f); + nio->SetPixel(index.t(), 1, 128 - value, 0.0f, 128.0f); + } while (index.Increment()); + } +#endif +}; + +// Tests that the initialization via SetPixel works and the resize correctly +// fills with zero where image sizes don't match. +TEST_F(NetworkioTest, InitWithZeroFill) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + nio.Resize2d(true, 32, 2); + int width = nio.Width(); + for (int t = 0; t < width; ++t) { + nio.SetPixel(t, 0, 0, 0.0f, 128.0f); + nio.SetPixel(t, 1, 0, 0.0f, 128.0f); + } + // The initialization will wipe out all previously set values. + SetupNetworkIO(&nio); + nio.ZeroInvalidElements(); + StrideMap::Index index(nio.stride_map()); + int next_t = 0; + int pos = 0; + do { + int t = index.t(); + // The indexed values just increase monotonically. + int value = nio.i(t)[0]; + EXPECT_EQ(value, pos); + value = nio.i(t)[1]; + EXPECT_EQ(value, -pos); + // When we skip t values, the data is always 0. 
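+    // (The two images are 3x4 and 4x5; padding to the common stride leaves
+    // 32 valid slots out of 40, matching the checks after the loop.)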
+ while (next_t < t) { + EXPECT_EQ(nio.i(next_t)[0], 0); + EXPECT_EQ(nio.i(next_t)[1], 0); + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +// Tests that CopyWithYReversal works. +TEST_F(NetworkioTest, CopyWithYReversal) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + SetupNetworkIO(&nio); + NetworkIO copy; + copy.CopyWithYReversal(nio); + StrideMap::Index index(copy.stride_map()); + int next_t = 0; + int pos = 0; + std::vector<int> expected_values = { + 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 27, 28, 29, 30, + 31, 22, 23, 24, 25, 26, 17, 18, 19, 20, 21, 12, 13, 14, 15, 16}; + do { + int t = index.t(); + // The indexed values match the expected values. + int value = copy.i(t)[0]; + EXPECT_EQ(value, expected_values[pos]); + value = copy.i(t)[1]; + EXPECT_EQ(value, -expected_values[pos]); + // When we skip t values, the data is always 0. + while (next_t < t) { + EXPECT_EQ(copy.i(next_t)[0], 0) << "Failure t = " << next_t; + EXPECT_EQ(copy.i(next_t)[1], 0) << "Failure t = " << next_t; + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +// Tests that CopyWithXReversal works. +TEST_F(NetworkioTest, CopyWithXReversal) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + SetupNetworkIO(&nio); + NetworkIO copy; + copy.CopyWithXReversal(nio); + StrideMap::Index index(copy.stride_map()); + int next_t = 0; + int pos = 0; + std::vector<int> expected_values = { + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 16, 15, 14, 13, + 12, 21, 20, 19, 18, 17, 26, 25, 24, 23, 22, 31, 30, 29, 28, 27}; + do { + int t = index.t(); + // The indexed values match the expected values. + int value = copy.i(t)[0]; + EXPECT_EQ(value, expected_values[pos]); + value = copy.i(t)[1]; + EXPECT_EQ(value, -expected_values[pos]); + // When we skip t values, the data is always 0. + while (next_t < t) { + EXPECT_EQ(copy.i(next_t)[0], 0) << "Failure t = " << next_t; + EXPECT_EQ(copy.i(next_t)[1], 0) << "Failure t = " << next_t; + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +// Tests that CopyWithXYTranspose works. +TEST_F(NetworkioTest, CopyWithXYTranspose) { +#ifdef INCLUDE_TENSORFLOW + NetworkIO nio; + SetupNetworkIO(&nio); + NetworkIO copy; + copy.CopyWithXYTranspose(nio); + StrideMap::Index index(copy.stride_map()); + int next_t = 0; + int pos = 0; + std::vector<int> expected_values = { + 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 17, 22, 27, + 13, 18, 23, 28, 14, 19, 24, 29, 15, 20, 25, 30, 16, 21, 26, 31}; + do { + int t = index.t(); + // The indexed values match the expected values. + int value = copy.i(t)[0]; + EXPECT_EQ(value, expected_values[pos]); + value = copy.i(t)[1]; + EXPECT_EQ(value, -expected_values[pos]); + // When we skip t values, the data is always 0. 
+ while (next_t < t) { + EXPECT_EQ(copy.i(next_t)[0], 0); + EXPECT_EQ(copy.i(next_t)[1], 0); + ++next_t; + } + ++pos; + ++next_t; + } while (index.Increment()); + EXPECT_EQ(pos, 32); + EXPECT_EQ(next_t, 40); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +} // namespace diff --git a/tesseract/unittest/normstrngs_test.cc b/tesseract/unittest/normstrngs_test.cc new file mode 100644 index 00000000..301bbd68 --- /dev/null +++ b/tesseract/unittest/normstrngs_test.cc @@ -0,0 +1,422 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/str_format.h" // for absl::StrFormat +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" +#include <tesseract/unichar.h> +#ifdef INCLUDE_TENSORFLOW +#include "util/utf8/unilib.h" // for UniLib +#endif + +#include "include_gunit.h" + +namespace tesseract { + +#if defined(MISSING_CODE) +static std::string EncodeAsUTF8(const char32 ch32) { + UNICHAR uni_ch(ch32); + return std::string(uni_ch.utf8(), uni_ch.utf8_len()); +} +#endif + +TEST(NormstrngsTest, BasicText) { + const char* kBasicText = "AbCd Ef"; + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kBasicText, + &result)); + EXPECT_STREQ(kBasicText, result.c_str()); +} + +TEST(NormstrngsTest, LigatureText) { + const char* kTwoByteLigText = "ij"; // U+0133 (ij) -> ij + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kTwoByteLigText, + &result)); + EXPECT_STREQ("ij", result.c_str()); + + const char* kThreeByteLigText = "finds"; // U+FB01 (fi) -> fi + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kThreeByteLigText, + &result)); + EXPECT_STREQ("finds", result.c_str()); +} + +TEST(NormstrngsTest, OcrSpecificNormalization) { + const char* kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+027 (') + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kSingleQuoteText, + &result)); + EXPECT_STREQ("'Hi", result.c_str()); + + const char* kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+022 (") + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kDoubleQuoteText, + &result)); + EXPECT_STREQ("\"Hi", result.c_str()); + + const char* kEmDash = "Hi—"; // U+2014 (—) -> U+02D (-) + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNormalize, + GraphemeNorm::kNormalize, kEmDash, &result)); + EXPECT_STREQ("Hi-", result.c_str()); + // Without the ocr normalization, these changes are not made. 
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kSingleQuoteText, + &result)); + EXPECT_STREQ(kSingleQuoteText, result.c_str()); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kDoubleQuoteText, + &result)); + EXPECT_STREQ(kDoubleQuoteText, result.c_str()); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kEmDash, &result)); + EXPECT_STREQ(kEmDash, result.c_str()); +} + +// Sample text used in tests. +const char kEngText[] = "the quick brown fox jumps over the lazy dog"; +const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; +const char kKorText[] = "이는 것으로"; +// Hindi words containing illegal vowel sequences. +const char* kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात", + "कहीअे", "पत्रिाका", "छह्णाीस"}; +// Thai illegal sequences. +const char* kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"}; + +TEST(NormstrngsTest, DetectsCorrectText) { + std::string chars; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kEngText, &chars)); + EXPECT_STREQ(kEngText, chars.c_str()); + + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kHinText, &chars)) + << "Incorrect text: '" << kHinText << "'"; + EXPECT_STREQ(kHinText, chars.c_str()); + + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, kKorText, &chars)); + EXPECT_STREQ(kKorText, chars.c_str()); +} + +TEST(NormstrngsTest, DetectsIncorrectText) { + for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedHinWords); ++i) { + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, + kBadlyFormedHinWords[i], nullptr)) + << kBadlyFormedHinWords[i]; + } + for (size_t i = 0; i < ARRAYSIZE(kBadlyFormedThaiWords); ++i) { + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFKC, OCRNorm::kNone, + GraphemeNorm::kNormalize, + kBadlyFormedThaiWords[i], nullptr)) + << kBadlyFormedThaiWords[i]; + } +} + +TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) { + std::string nonindic = "Here's some latin text."; + std::string dest; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, nonindic.c_str(), + &dest)) + << PrintString32WithUnicodes(nonindic); + EXPECT_EQ(dest, nonindic); +} + +TEST(NormstrngsTest, NoLonelyJoiners) { + std::string str = "x\u200d\u0d06\u0d34\u0d02"; + std::vector<std::string> glyphs; + // Returns true, but the joiner is gone. + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("x")); + EXPECT_EQ(glyphs[1], std::string("\u0d06")); + EXPECT_EQ(glyphs[2], std::string("\u0d34\u0d02")); +} + +TEST(NormstrngsTest, NoLonelyJoinersPlus) { + std::string str = "\u0d2a\u200d+\u0d2a\u0d4b"; + std::vector<std::string> glyphs; + // Returns true, but the joiner is gone. 
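+  // U+0D2A is MALAYALAM LETTER PA and U+0D4B the vowel sign OO; with the '+'
+  // in between, the ZWJ (U+200D) after the first PA has nothing to join to
+  // and is dropped, leaving the three glyphs checked below.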
+ EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("\u0d2a")); + EXPECT_EQ(glyphs[1], std::string("+")); + EXPECT_EQ(glyphs[2], std::string("\u0d2a\u0d4b")); +} + +TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) { + std::string str = "\u200d+\u200c\u200d"; + // Returns true, but the joiners are gone. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, std::string("+")); + str = "\u200d\u200c\u200d"; + // Without the plus, the string is invalid. + std::string result; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)) + << PrintString32WithUnicodes(result); +} + +TEST(NormstrngsTest, JoinersStayInArabic) { + std::string str = "\u0628\u200c\u0628\u200d\u0628"; + // Returns true, string untouched. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 5, 5, 2, str); +} + +TEST(NormstrngsTest, DigitOK) { + std::string str = "\u0cea"; // Digit 4. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); +} + +TEST(NormstrngsTest, DandaOK) { + std::string str = "\u0964"; // Single danda. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); + str = "\u0965"; // Double danda. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, str); +} + +TEST(NormstrngsTest, AllScriptsRegtest) { + // Tests some valid text in a large number of scripts, some of which were + // found to be rejected by an earlier version. + const std::vector<std::pair<std::string, std::string>> kScriptText( + {{"Arabic", + " فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن" + "توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة " + "مجموعه هیچ اثری در فنون هنر و ادب و ترجمه، تقدیم پیشگاه ارجمند " + "سازنده تاریخ نگاه میکرد و به اصطلاح انسان و فطرت انسانی را زیربنای"}, + {"Armenian", + "անտիկ աշխարհի փիլիսոփաների կենսագրությունը, թե′ նրանց ուս-" + "պատրաստւում է դալ (բուլամա): Կովկասում կաթից նաև պատ-" + "Հոգաբարձութեան յղել այդ անձին յիմարութիւնը հաստա-" + "գծերը եւ միջագծերը կը համրուին վարէն վեր:"}, + {"Bengali", + "এসে দাঁড়ায় দাও খানি উঁচিয়ে নিয়ে । ঝরনার স্বচ্ছ জলে প্রতিবিম্বিত " + "পাঠিয়ে, গোবিন্দ স্মরণ করে, নির্ভয়ে রওনা হয়েছিল। তাতে সে " + "সুলতার। মনে পড়ে বিয়ের সময় বাবা এদের বাড়ি থেকে ঘুরে " + "কিন্তু তারপর মাতৃহৃদয় কেমন করে আছে? কী"}, + {"Cyrillic", + "достей, є ще нагороди й почесті, є хай і сумнівна, але слава, " + "вып., 96б). Параўн. найсвятший у 1 знач., насвятейший у 1 знач., " + "»Правді«, — гітлерівські окупанти винищували нижчі раси, після дру- " + "І знов майдан зачорнів од народу. Всередині чоло-"}, + {"Devanagari", + "डा॰ नै हात्तीमाथि चढेर त्यो भएनेर आइपुगे। राजालाई देखी " + "बाबतीत लिहिणे ही एक मोठीच जबाबदारी आहे. काकासाहेबांच्या कार्याचा " + "प्रबंध, आधोगिक प्रबंध तथा बैंकिंग एवम वाणिज्य आदि विषयों में " + "चित्रकृती दिल्या. शंभराहून अधिक देश आज आपापले चित्रपट निर्माण करीत"}, + {"Greek", + "Μέσα ένα τετράδιο είχα στριμώξει το πρώτο " + "νον αξίως τού ευαγγελίου τού χριστού πολιτεύεσθε, ίνα " + "οὐδεμία ὑπ' αὐτοῦ μνεία γίνεται τῶν οἰκείων χωρίων. " + "είτα την φάσιν αυτήν ην ούτος εποιήσατο κατά του Μίκω-"}, + {"Gujarati", + "ઉપહારગૃહે ને નાટ્યસ્થળે આ એ જ તેલ કડકડતું " + "શકી. ભાવવધારો અટકાવી નથી શકી અને બેકારીને " + "ત્યાં વાંકુથી પાછે આવ્યો, ચોરીનો માલ સોંપવા ! " + "કહી. 
એણે રેશમના કપડામાં વીંટી રાખેલ કુંવરીની છબી"}, + {"Gurmukhi", + "ਯਾਦ ਰਹੇ ਕਿ ‘ਨਫਰਤ ’ ਦਾ ਵਿਸ਼ਾ ਕ੍ਰਾਤੀ ਨਹੀ ਹੈ ਅਤੇ ਕਵੀ ਦੀ ਇਹ " + "ਮਹਾਂ ਨੰਦਾ ਕੋਲ ਇਕ ਚੀਜ਼ ਸੀ ਉਹ ਸੀ ਸਚ, ਕੋਰਾ ਸਚ, ਬੇਧਤ੍ਰਕ ਕਹਿੳ " + "ਭੂਰਾ ਸਾਨੂੰ ਥੜਾ ਚੰਗਾ ਲਗਦਾ ਸੀ । ਉਸ ਦਾ ਇਕ ਪੈਰ ਜਨਮ ਤੋ " + "ਨੂੰ ਇਹ ਅਧਿਕਾਰ ਦਿੱਤਾ ਕਿ ਉਹ ਸਿੱਖ ਵਿਰੋਧ ਦਾ ਸੰਗਠਨ ਕਰੇ ਅਤੇ 3 ਸਤੰਬਰ,"}, + {"Hangul", + "로 들어갔다. 이대통령은 아이젠하워 대통령의 뒷모습을 보면서 " + "그것뿐인 줄 아요? 노름도 했다 캅니다. 빌어묵을 놈이 그러 " + "의 가장 과학적 태도이며, 우리 역사를 가장 정확하게 학습할 수 있는 " + "마르크스 레" + "각하는 그는 그들의 식사보장을 위해 때때로 집에"}, + {"HanS", + "大凡世界上的先生可 分 三 种: 第一种只会教书, 只会拿一 " + "书像是探宝一样,在茶叶店里我买过西湖龙井﹑黄山毛峰﹑福建的铁观音﹑大红" + " " + "持 “左” 倾冒险主义的干部,便扣上 “富农 " + "笑说:“我听说了,王总工程师也跟我说过了,只是工作忙,谁"}, + {"HanT", + "叁、 銀行資產管理的群組分析模式 " + "民國六十三年,申請就讀台灣大學歷史研究所,並從事著述," + "質言之﹐在社會結構中﹐性質﹑特徵﹑地位相類似的一羣人﹐由於 " + "董橋,一九四二年生,福建晉江人,國立成功大學外"}, + {"Hebrew", + " אֵ-לִי, אֵ-לִי, כֵּיַצד מְטַפְּסִים בְּקִירוֹת שֶׁל זְכוּכִי" + " הראשון חוצה אותי שוב. אני בסיבוב הרביעי, הוא בטח מתחיל את" + " ווערטער געהאט, אבער דער עיקר איז ניט דאָס וואָרט, נאָר" + " על גחלת היהדות המקורית בעירך, נתת צביון ואופי מיוחד"}, + {"Japanese", + "は異民族とみなされていた。楚の荘王(前613〜前 " + "を詳細に吟味する。実際の治療活動の領域は便宜上、(1) 障害者 " + "困難性は多角企業の場合原則として部門別に判断されている.). " + "☆ご希望の団体には見本をお送りします"}, + {"Kannada", + "ಕೂಡ ಯುದ್ಧ ಮಾಡಿ ಜಯಪಡೆ. ನಂತರ ನಗರದೊಳಕ್ಕೆ ನಡೆ ಇದನ್ನು " + "ಅಸಹ್ಯದೃಶ್ಯ ಯಾರಿಗಾದರೂ ನಾಚಿಕೆತರುವಂತಹದಾಗಿದೆ. ಆರೋಗ್ಯ ದೃಷ್ಟಿ " + "ಯಾಗಲಿ, ಮೋಹನನಾಗಲಿ ಇಂಥ ಬಿಸಿಲಿನಲ್ಲಿ ಎಂದೂ ಬಹಳ ಹೊತ್ತು " + "\"ಇದೆ...ಖಂಡಿತಾ ಇದೆ\" ಅಂದ ಮನಸ್ಸಿನಲ್ಲಿಯೇ ವಂದಿಸುತ್ತಾ,"}, + {"Khmer", + "សិតសក់និងផ្លាស់សម្លៀកបំពាក់ពេលយប់ចេញ។ " + "និយាយអំពីនគរនេះ ប្រាប់ដល់លោកទាំងមូលឲ្យដឹងច្បាស់លាស់អំពី " + "កន្លះកាថាសម្រាប់ទន្ទេញឲ្យងាយចាំ បោះពុម្ពនៅក្នុងទ្រង់ទ្រាយបច្ចុប្បន្ន " + "ឯកសារនេះបានផ្សព្វផ្សាយនៅក្នុងសន្និសីទ"}, + {"Lao", + "ເອີຍ ! ຟັງສຽງຟ້າມັນຮ້ອງຮ່ວນ ມັນດັງໄກໆ ເອີຍ " + "ໄດລຽງດູລາວມາດວບຄວາມລາບາກຫລາຍ; " + "ບາງໄດ້ ເຈົ້າລອງສູ້ບໍ່ໄດ້ຈຶ່ງຫນີລົງມາວຽງຈັນ. " + "ລົບອອກຈາກ 3 ເຫລືອ 1, ຂ້ອຍຂຽນ 1 (1)"}, + {"Latin", + "režisoru, palīdzēja to manu domīgo, kluso Dzejas metru ielikt " + "Ešte nedávno sa chcel mladý Novomeský „liečiť” " + "tiivisia kysymyksiä, mistä seuraa, että spekula- | don luonteesta " + "Grabiel Sanchez, yang bertani selama 120 tahun meninggal"}, + {"Malayalam", + "അമൂർത്തചിത്രമായിരിക്കും. ഛേ! ആ വീട്ടിലേക്ക് അവളൊന്നിച്ച് പോകേണ്ടതാ " + "മൃഗങ്ങൾക്ക് എന്തെക്കിലും പറ്റിയാൽ മാത്രം ഞാനതു " + "വെലക്ക് വേണമെങ്കിൽ തരാം. എന്തോ തരും? പറ. " + "എല്ലാം കഴിഞ്ഞ് സീനിയറിന്റെ അടുത്തു ചെന്ന് കാൽതൊട്ട"}, + {"Tamil", + "பொருத்தமாகப் பாடினாள் நம் ஔவைப் பாட்டி. காவிரி " + "உள்ளடக்கி நிற்பது விநோத வார்த்தையின் அஃறிணை " + "சூரிய கிரஹண சமயத்தில் குருக்ஷேத்திரம் செல்வது " + "காலங்களில் வெளியே போகும்பொழுது, 'ஸார்', 'ஸார்',"}, + {"Telugu", + "1892లో ఆమె 10వ సంవత్సరంలో గుంటూరు తాలూకా వేములాపాడు " + "ఫండ్స్ చట్టము'నందు చేయబడెను. తరువాత క్రీ. శ. " + "సంచారము చేయును. మీరు ఇప్పుడే కాళకాలయమునకు " + "ఎంతటి సరళమైన భాషలో వ్రాశాడో విశదమవుతుంది. పైగా ఆనాటి భాష"}, + {"Thai", + "อ้อ! กับนัง....แม่ยอดพระกลิ่น นั่นเอง ! หรับก็ย่อมจะรู้โดยชัดเจนว่า " + "ถ้าตราบใดยังมีเรือปืนอยู่ใกล้ ๆ แล้ว ตราบนั้น " + "พระดำรินี้ ที่มีคตีทำกรวยหมากและธูปเทียน " + "อันยานมีเรือเปนต้นฃ้ามยาก ฯ เพราะว่าแม่น้ำนั่นมีน้ำใสยิ่ง แม้เพียง"}, + {"Vietnamese", + "vợ đến tai mụ hung thần Xăng-tô- mê-a. Mụ vô cùng " + "chiếc xe con gấu chạy qua nhà. 
Nhưng thỉnh thoảng " + "hòa hoãn với người Pháp để cho họ được dựng một ngôi nhà thờ nhỏ bằng " + "Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}}); + + for (const auto& p : kScriptText) { + std::string normalized; + EXPECT_TRUE(tesseract::NormalizeUTF8String( + tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, + tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized)) + << "Script=" << p.first << " text=" << p.second; + } +} + +TEST(NormstrngsTest, IsWhitespace) { + // U+0020 is whitespace + EXPECT_TRUE(IsWhitespace(' ')); + EXPECT_TRUE(IsWhitespace('\t')); + EXPECT_TRUE(IsWhitespace('\r')); + EXPECT_TRUE(IsWhitespace('\n')); + // U+2000 through U+200A + for (char32 ch = 0x2000; ch <= 0x200A; ++ch) { + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + EXPECT_TRUE(IsWhitespace(ch)); + } + // U+3000 is whitespace + EXPECT_TRUE(IsWhitespace(0x3000)); + // ZWNBSP is not considered a space. + EXPECT_FALSE(IsWhitespace(0xFEFF)); +} + +TEST(NormstrngsTest, SpanUTF8Whitespace) { + EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\n")); + EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\nabc")); + EXPECT_EQ(0, SpanUTF8Whitespace("abc \t\r\nabc")); + EXPECT_EQ(0, SpanUTF8Whitespace("")); +} + +TEST(NormstrngsTest, SpanUTF8NotWhitespace) { + const char kHinText[] = "पिताने विवाह"; + const char kKorText[] = "이는 것으로 다시 넣을"; + const char kMixedText[] = "والفكر 123 والصراع abc"; + + EXPECT_EQ(0, SpanUTF8NotWhitespace("")); + EXPECT_EQ(0, SpanUTF8NotWhitespace(" abc")); + EXPECT_EQ(0, SpanUTF8NotWhitespace("\rabc")); + EXPECT_EQ(0, SpanUTF8NotWhitespace("\tabc")); + EXPECT_EQ(0, SpanUTF8NotWhitespace("\nabc")); + EXPECT_EQ(3, SpanUTF8NotWhitespace("abc def")); + EXPECT_EQ(18, SpanUTF8NotWhitespace(kHinText)); + EXPECT_EQ(6, SpanUTF8NotWhitespace(kKorText)); + EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText)); +} + +// Test that the method clones the util/utf8/unilib definition of +// interchange validity. +TEST(NormstrngsTest, IsInterchangeValid) { +#ifdef INCLUDE_TENSORFLOW + const int32_t kMinUnicodeValue = 33; + const int32_t kMaxUnicodeValue = 0x10FFFF; + for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch)); + } +#else + GTEST_SKIP(); +#endif +} + +// Test that the method clones the util/utf8/unilib definition of +// 7-bit ASCII interchange validity. +TEST(NormstrngsTest, IsInterchangeValid7BitAscii) { +#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW) + const int32_t kMinUnicodeValue = 33; + const int32_t kMaxUnicodeValue = 0x10FFFF; + for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + std::string str = EncodeAsUTF8(ch); + EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str), + IsInterchangeValid7BitAscii(ch)); + } +#else + // Skipped because of missing UniLib::IsInterchangeValid7BitAscii. + GTEST_SKIP(); +#endif +} + +// Test that the method clones the util/utf8/unilib definition of +// fullwidth-halfwidth . +TEST(NormstrngsTest, FullwidthToHalfwidth) { + // U+FF21 -> U+0041 (Latin capital letter A) + EXPECT_EQ('A', FullwidthToHalfwidth(0xFF21)); + // U+FF05 -> U+0025 (percent sign) + EXPECT_EQ('%', FullwidthToHalfwidth(0xFF05)); + // U+FFE6 -> U+20A9 (won sign) + EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6)); + +#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW) + // Skipped because of missing UniLib::FullwidthToHalfwidth. 
+ const int32_t kMinUnicodeValue = 33; + const int32_t kMaxUnicodeValue = 0x10FFFF; + for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { + if (!IsValidCodepoint(ch)) continue; + SCOPED_TRACE(absl::StrFormat("Failed at U+%x", ch)); + std::string str = EncodeAsUTF8(ch); + const std::string expected_half_str = + UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true); + EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch))); + } +#endif +} + +} // namespace tesseract diff --git a/tesseract/unittest/normstrngs_test.h b/tesseract/unittest/normstrngs_test.h new file mode 100644 index 00000000..3b459348 --- /dev/null +++ b/tesseract/unittest/normstrngs_test.h @@ -0,0 +1,84 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ +#define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ + +#include <sstream> // for std::stringstream +#include <string> +#include <vector> +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include <tesseract/unichar.h> + +namespace tesseract { + +inline std::string CodepointList(const std::vector<char32>& str32) { + std::stringstream result; + int total_chars = str32.size(); + result << std::hex; + for (int i = 0; i < total_chars; ++i) { + result << "[" << str32[i] << "]"; + } + return result.str(); +} + +inline std::string PrintString32WithUnicodes(const std::string& str) { + std::vector<char32> str32 = UNICHAR::UTF8ToUTF32(str.c_str()); + return absl::StrCat("\"", str, "\" ", CodepointList(str32)); +} + +inline std::string PrintStringVectorWithUnicodes(const std::vector<std::string>& glyphs) { + std::string result; + for (const auto& s : glyphs) { + result += "Glyph:"; + result += PrintString32WithUnicodes(s) + "\n"; + } + return result; +} + +inline void ExpectGraphemeModeResults(const std::string& str, UnicodeNormMode u_mode, + int unicode_count, int glyph_count, + int grapheme_count, + const std::string& target_str) { + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), unicode_count) + << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, + GraphemeNormMode::kGlyphSplit, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), glyph_count) + << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, + GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), grapheme_count) + << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, absl::StrJoin(glyphs.begin(), glyphs.end(), "")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(u_mode, OCRNorm::kNone, + GraphemeNormMode::kSingleString, + true, str.c_str(), 
&glyphs)); + EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(target_str, glyphs[0]); + std::string result; + EXPECT_TRUE(NormalizeUTF8String( + u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result)); + EXPECT_EQ(target_str, result); +} + +} // namespace tesseract + +#endif // TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ diff --git a/tesseract/unittest/nthitem_test.cc b/tesseract/unittest/nthitem_test.cc new file mode 100644 index 00000000..4d08ffae --- /dev/null +++ b/tesseract/unittest/nthitem_test.cc @@ -0,0 +1,120 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kdpair.h" + +#include "include_gunit.h" + +namespace tesseract { + +int test_data[] = {8, 1, 2, -4, 7, 9, 65536, 4, 9, 0, -32767, 6, 7}; + +// The fixture for testing GenericHeap and DoublePtr. +class NthItemTest : public testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + + public: + virtual ~NthItemTest(); + // Pushes the test data onto the KDVector. + void PushTestData(KDVector* v) { + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + IntKDPair pair(test_data[i], i); + v->push_back(pair); + } + } +}; + +// Destructor. +// It is defined here, so the compiler can create a single vtable +// instead of a weak vtable (fixes compiler warning). +NthItemTest::~NthItemTest() = default; + +// Tests basic results. +TEST_F(NthItemTest, GeneralTest) { + KDVector v; + // Push the test data onto the KDVector. + PushTestData(&v); + // Get the min item. + int index = v.choose_nth_item(0); + // The result is -32767. + EXPECT_EQ(-32767, v[index].key()); + // Get the max item. + index = v.choose_nth_item(v.size() - 1); + // The result is 65536. + EXPECT_EQ(65536, v[index].key()); + // Invalid items are silently truncated to valid. + // Get the min item. + index = v.choose_nth_item(-1); + // The result is -32767. + EXPECT_EQ(-32767, v[index].key()); + // Get the max item. + index = v.choose_nth_item(v.size()); + // The result is 65536. + EXPECT_EQ(65536, v[index].key()); +} + +// Tests results on boring data with lots of duplication. +TEST_F(NthItemTest, BoringTest) { + KDVector v; + // Push the test data onto the KDVector. + int test_data[] = {8, 8, 8, 8, 8, 7, 7, 7, 7}; + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { + IntKDPair pair(test_data[i], i); + v.push_back(pair); + } + // The 3rd item is 7 but the 4th is 8.. + int index = v.choose_nth_item(3); + // The result is 7. + EXPECT_EQ(7, v[index].key()); + index = v.choose_nth_item(4); + // The result is 8. + EXPECT_EQ(8, v[index].key()); + // Get the min item. + index = v.choose_nth_item(0); + // The result is 7. + EXPECT_EQ(7, v[index].key()); + // Get the max item. + index = v.choose_nth_item(v.size() - 1); + // The result is 8. + EXPECT_EQ(8, v[index].key()); +} + +// Tests that a unique median in an odd-size array is found correctly. +TEST_F(NthItemTest, UniqueTest) { + KDVector v; + // Push the test data onto the KDVector. 
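+  // test_data holds 13 keys, so choose_nth_item(size() / 2) below selects the
+  // true median, i.e. the 7th smallest key.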
+ PushTestData(&v); + // Get the median item. + int index = v.choose_nth_item(v.size() / 2); + // The result is 6, it started out at index 11. + EXPECT_EQ(6, v[index].key()); + EXPECT_EQ(11, v[index].data()); +} + +// Tests that an equal median is found correctly. +TEST_F(NthItemTest, EqualTest) { + KDVector v; + // Push the test data onto the KDVector. + PushTestData(&v); + // Add an extra 8. This makes the median 7. + IntKDPair pair(8, 13); + v.push_back(pair); + // Get the median item. + int index = v.choose_nth_item(v.size() / 2); + // The result is 7, it started out at index 4 or 12. + EXPECT_EQ(7, v[index].key()); + EXPECT_TRUE(v[index].data() == 4 || v[index].data() == 12); +} + +} // namespace tesseract diff --git a/tesseract/unittest/osd_test.cc b/tesseract/unittest/osd_test.cc new file mode 100644 index 00000000..5100a6f9 --- /dev/null +++ b/tesseract/unittest/osd_test.cc @@ -0,0 +1,133 @@ +/////////////////////////////////////////////////////////////////////// +// File: osd_test.cc +// Description: OSD Tests for Tesseract. +// Author: ShreeDevi Kumar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// based on https://gist.github.com/amitdo/7c7a522004dd79b398340c9595b377e1 + +// expects clones of tessdata, tessdata_fast and tessdata_best repos + +//#include "log.h" +#include <iostream> +#include <memory> // std::unique_ptr +#include <string> +#include <tesseract/baseapi.h> +#include "include_gunit.h" +#include "allheaders.h" + +namespace tesseract { + +class TestClass : public testing::Test { + protected: +}; + +#ifndef DISABLED_LEGACY_ENGINE +static void OSDTester(int expected_deg, const char* imgname, const char* tessdatadir) { + // log.info() << tessdatadir << " for image: " << imgname << std::endl; + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, "osd")) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + int orient_deg; + float orient_conf; + const char* script_name; + float script_conf; + bool detected = api->DetectOrientationScript(&orient_deg, &orient_conf, + &script_name, &script_conf); + ASSERT_FALSE(!detected) << "Failed to detect OSD."; + printf( + "************ Orientation in degrees: %d, Orientation confidence: %.2f\n" + " Script: %s, Script confidence: %.2f\n", + orient_deg, orient_conf, script_name, script_conf); + EXPECT_EQ(expected_deg, orient_deg); + api->End(); + pixDestroy(&image); +} +#endif + +class OSDTest : public TestClass, + public ::testing::WithParamInterface< + std::tuple<int, const char*, const char*>> {}; + +TEST_P(OSDTest, MatchOrientationDegrees) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because TessBaseAPI::DetectOrientationScript is missing. 
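+  // Orientation/script detection is provided by the legacy engine, so there
+  // is nothing meaningful to run in this configuration.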
+ GTEST_SKIP(); +#else + OSDTester(std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam())); +#endif +} + +INSTANTIATE_TEST_SUITE_P( + TessdataEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR))); + +INSTANTIATE_TEST_SUITE_P( + TessdataBestEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR "_best"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastEngEuroHebrew, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/phototest.tif", + TESTING_DIR "/eurotext.tif", + TESTING_DIR "/hebrew.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastRotated90, OSDTest, + ::testing::Combine(::testing::Values(90), + ::testing::Values(TESTING_DIR + "/phototest-rotated-R.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastRotated180, OSDTest, + ::testing::Combine(::testing::Values(180), + ::testing::Values(TESTING_DIR + "/phototest-rotated-180.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastRotated270, OSDTest, + ::testing::Combine(::testing::Values(270), + ::testing::Values(TESTING_DIR + "/phototest-rotated-L.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastDevaRotated270, OSDTest, + ::testing::Combine(::testing::Values(270), + ::testing::Values(TESTING_DIR + "/devatest-rotated-270.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +INSTANTIATE_TEST_SUITE_P( + TessdataFastDeva, OSDTest, + ::testing::Combine(::testing::Values(0), + ::testing::Values(TESTING_DIR "/devatest.png"), + ::testing::Values(TESSDATA_DIR "_fast"))); + +} // namespace diff --git a/tesseract/unittest/pagesegmode_test.cc b/tesseract/unittest/pagesegmode_test.cc new file mode 100644 index 00000000..60dcf8da --- /dev/null +++ b/tesseract/unittest/pagesegmode_test.cc @@ -0,0 +1,114 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(_WIN32) +#include <io.h> // for _access +#else +#include <unistd.h> // for access +#endif +#include <string> +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "helpers.h" +#include "log.h" +#include "include_gunit.h" + +namespace tesseract { + +// Replacement for std::filesystem::exists (C++-17) +static bool file_exists(const char* filename) { +#if defined(_WIN32) + return _access(filename, 0) == 0; +#else + return access(filename, 0) == 0; +#endif +} + +// The fixture for testing Tesseract. 
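+// It loads a test image, OCRs a sub-rectangle of it under a chosen page
+// segmentation mode, and compares the recognized text against an expected
+// string.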
+class PageSegModeTest : public testing::Test { + protected: + PageSegModeTest() = default; + ~PageSegModeTest() { + pixDestroy(&src_pix_); + } + + void SetUp() override { + static std::locale system_locale(""); + std::locale::global(system_locale); + } + + void SetImage(const char* filename) { + pixDestroy(&src_pix_); + src_pix_ = pixRead(filename); + api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY); + api_.SetImage(src_pix_); + } + + // Tests that the given rectangle produces exactly the given text in the + // given segmentation mode (after chopping off the last 2 newlines.) + void VerifyRectText(tesseract::PageSegMode mode, const char* str, + int left, int top, int width, int height) { + api_.SetPageSegMode(mode); + api_.SetRectangle(left, top, width, height); + char* result = api_.GetUTF8Text(); + chomp_string(result); + chomp_string(result); + EXPECT_STREQ(str, result); + delete[] result; + } + + // Tests that the given rectangle does NOT produce the given text in the + // given segmentation mode. + void NotRectText(tesseract::PageSegMode mode, const char* str, + int left, int top, int width, int height) { + api_.SetPageSegMode(mode); + api_.SetRectangle(left, top, width, height); + char* result = api_.GetUTF8Text(); + EXPECT_STRNE(str, result); + delete[] result; + } + + Pix* src_pix_ = nullptr; + std::string ocr_text_; + tesseract::TessBaseAPI api_; +}; + +// Tests the single-word segmentation mode, and that it performs correctly +// and differently to line and block mode. +TEST_F(PageSegModeTest, WordTest) { + std::string filename = file::JoinPath(TESTING_DIR, "segmodeimg.tif"); + if (!file_exists(filename.c_str())) { + LOG(INFO) << "Skip test because of missing " << filename << '\n'; + GTEST_SKIP(); + } else { + SetImage(filename.c_str()); + // Test various rectangles around the inverse page number. + VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1419, 264, 69, 34); + VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1411, 252, 78, 62); + VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1396, 218, 114, 102); + // Test a random pair of words as a line + VerifyRectText(tesseract::PSM_SINGLE_LINE, + "What should", 237, 393, 256, 36); + // Test a random pair of words as a word + VerifyRectText(tesseract::PSM_SINGLE_WORD, + "Whatshould", 237, 393, 256, 36); + // Test single block mode. + VerifyRectText(tesseract::PSM_SINGLE_BLOCK, + "both the\nfrom the", 237, 450, 172, 94); + // But doesn't work in line or word mode. + NotRectText(tesseract::PSM_SINGLE_LINE, + "both the\nfrom the", 237, 450, 172, 94); + NotRectText(tesseract::PSM_SINGLE_WORD, + "both the\nfrom the", 237, 450, 172, 94); + } +} + +} // namespace diff --git a/tesseract/unittest/pango_font_info_test.cc b/tesseract/unittest/pango_font_info_test.cc new file mode 100644 index 00000000..5d1c7af7 --- /dev/null +++ b/tesseract/unittest/pango_font_info_test.cc @@ -0,0 +1,334 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
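+//
+// The tests below cover PangoFontInfo (font description parsing, spacing
+// metrics, rendering coverage) and FontUtils (font availability, listing and
+// selection), using the fonts installed under the testing directory.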
+ +#include <cstdio> +#include <string> +#include <pango/pango.h> +#include "include_gunit.h" +#include "commandlineflags.h" +#include "fileio.h" +#include "pango_font_info.h" +#include "absl/strings/str_cat.h" // for absl::StrCat +#include "gmock/gmock-matchers.h" // for EXPECT_THAT +#ifdef INCLUDE_TENSORFLOW +#include "util/utf8/unicodetext.h" // for UnicodeText +#endif + +namespace tesseract { + +// Fonts in testdata directory +const char* kExpectedFontNames[] = { + "Arab", + "Arial Bold Italic", + "DejaVu Sans Ultra-Light", + "Lohit Hindi", +#if PANGO_VERSION <= 12005 + "Times New Roman", +#else + "Times New Roman,", // Pango v1.36.2 requires a trailing ',' +#endif + "UnBatang", + "Verdana" +}; + +// Sample text used in tests. +const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع"; +const char kEngText[] = "the quick brown fox jumps over the lazy dog"; +const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; +const char kKorText[] = "이는 것으로"; +// Hindi words containing illegal vowel sequences. +const char* kBadlyFormedHinWords[] = { +#if PANGO_VERSION <= 12005 + "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस", +#endif + // Pango v1.36.2 will render the above words even though they are invalid. + "प्रंात", nullptr +}; + +static PangoFontMap* font_map; + +class PangoFontInfoTest : public ::testing::Test { + protected: + void SetUp() override { + if (!font_map) { + font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); + } + pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map)); + } + + // Creates a fake fonts.conf file that points to the testdata fonts for + // fontconfig to initialize with. + static void SetUpTestCase() { + static std::locale system_locale(""); + std::locale::global(system_locale); + + FLAGS_fonts_dir = TESTING_DIR; + FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; + file::MakeTmpdir(); + PangoFontInfo::SoftInitFontConfig(); // init early + } + + PangoFontInfo font_info_; +}; + +TEST_F(PangoFontInfoTest, TestNonDefaultConstructor) { + PangoFontInfo font("Arial Bold Italic 12"); + EXPECT_EQ(12, font.font_size()); + EXPECT_EQ("Arial", font.family_name()); +} + +TEST_F(PangoFontInfoTest, DoesParseFontDescriptionName) { + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Bold Italic 12")); + EXPECT_EQ(12, font_info_.font_size()); + EXPECT_EQ("Arial", font_info_.family_name()); + + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Verdana 10")); + EXPECT_EQ(10, font_info_.font_size()); + EXPECT_EQ("Verdana", font_info_.family_name()); + + EXPECT_TRUE(font_info_.ParseFontDescriptionName("DejaVu Sans Ultra-Light")); + EXPECT_EQ("DejaVu Sans", font_info_.family_name()); +} + +TEST_F(PangoFontInfoTest, DoesParseMissingFonts) { + // Font family one of whose faces exists but this one doesn't. + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12")); + EXPECT_EQ(12, font_info_.font_size()); + EXPECT_EQ("Arial", font_info_.family_name()); + + // Font family that doesn't exist in testdata. It will still parse the + // description name. But without the file, it will not be able to populate + // some font family details, like is_monospace(). 
+ EXPECT_TRUE(font_info_.ParseFontDescriptionName("Georgia 10")); + EXPECT_EQ(10, font_info_.font_size()); + EXPECT_EQ("Georgia", font_info_.family_name()); +} + +TEST_F(PangoFontInfoTest, DoesGetSpacingProperties) { + EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12")); + int x_bearing, x_advance; + EXPECT_TRUE(font_info_.GetSpacingProperties("A", &x_bearing, &x_advance)); + EXPECT_GT(x_advance, 0); + EXPECT_TRUE(font_info_.GetSpacingProperties("a", &x_bearing, &x_advance)); + EXPECT_GT(x_advance, 0); +} + +TEST_F(PangoFontInfoTest, CanRenderString) { + font_info_.ParseFontDescriptionName("Verdana 12"); + EXPECT_TRUE(font_info_.CanRenderString(kEngText, strlen(kEngText))); + + font_info_.ParseFontDescriptionName("UnBatang 12"); + EXPECT_TRUE(font_info_.CanRenderString(kKorText, strlen(kKorText))); + + font_info_.ParseFontDescriptionName("Lohit Hindi 12"); + EXPECT_TRUE(font_info_.CanRenderString(kHinText, strlen(kHinText))); +} + +TEST_F(PangoFontInfoTest, CanRenderLigature) { + font_info_.ParseFontDescriptionName("Arab 12"); + const char kArabicLigature[] = "لا"; + EXPECT_TRUE( + font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature))); + + printf("Next word\n"); + EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText))); +} + +TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) { + font_info_.ParseFontDescriptionName("Verdana 12"); + EXPECT_FALSE(font_info_.CanRenderString(kKorText, strlen(kKorText))); +} + +TEST_F(PangoFontInfoTest, CannotRenderInvalidString) { + font_info_.ParseFontDescriptionName("Lohit Hindi 12"); + for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) { + EXPECT_FALSE(font_info_.CanRenderString(kBadlyFormedHinWords[i], + strlen(kBadlyFormedHinWords[i]))) + << "Can render " << kBadlyFormedHinWords[i]; + } +} + +TEST_F(PangoFontInfoTest, CanDropUncoveredChars) { + font_info_.ParseFontDescriptionName("Verdana 12"); + // Verdana cannot render the "ff" ligature + std::string word = "office"; + EXPECT_EQ(1, font_info_.DropUncoveredChars(&word)); + EXPECT_EQ("oice", word); + + // Don't drop non-letter characters like word joiners. + const char* kJoiners[] = { + "\u2060", // U+2060 (WJ) + "\u200C", // U+200C (ZWJ) + "\u200D" // U+200D (ZWNJ) + }; + for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) { + word = kJoiners[i]; + EXPECT_EQ(0, font_info_.DropUncoveredChars(&word)); + EXPECT_STREQ(kJoiners[i], word.c_str()); + } +} + +// ------------------------ FontUtils ------------------------------------ + +class FontUtilsTest : public ::testing::Test { + protected: + void SetUp() override { + file::MakeTmpdir(); + } + // Creates a fake fonts.conf file that points to the testdata fonts for + // fontconfig to initialize with. 
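+  // Setting FLAGS_fonts_dir and FLAGS_fontconfig_tmpdir below is what points
+  // the generated fontconfig configuration at the test fonts.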
+ static void SetUpTestCase() { + FLAGS_fonts_dir = TESTING_DIR; + FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; + if (!font_map) { + font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); + } + pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map)); + } + +#ifdef INCLUDE_TENSORFLOW + void CountUnicodeChars(const char* utf8_text, + std::unordered_map<char32, int64_t>* ch_map) { + ch_map->clear(); + UnicodeText ut; + ut.PointToUTF8(utf8_text, strlen(utf8_text)); + for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) { +#if 0 + if (UnicodeProps::IsWhitespace(*it)) continue; +#else + if (std::isspace(*it)) continue; +#endif + ++(*ch_map)[*it]; + } + } +#endif +}; + +TEST_F(FontUtilsTest, DoesFindAvailableFonts) { + EXPECT_TRUE(FontUtils::IsAvailableFont("Arial Bold Italic")); + EXPECT_TRUE(FontUtils::IsAvailableFont("Verdana")); + EXPECT_TRUE(FontUtils::IsAvailableFont("DejaVu Sans Ultra-Light")); + + // Test that we can support font name convention for Pango v1.30.2 even when + // we are running an older version. + EXPECT_TRUE(FontUtils::IsAvailableFont("Times New Roman,")); +} + +TEST_F(FontUtilsTest, DoesDetectMissingFonts) { + // Only bold italic face is available. + EXPECT_FALSE(FontUtils::IsAvailableFont("Arial")); + // Don't have a ttf for the Courier family. + EXPECT_FALSE(FontUtils::IsAvailableFont("Courier")); + // Pango "synthesizes" the italic font from the available Verdana Regular and + // includes it in its list, but it is not really loadable. + EXPECT_FALSE(FontUtils::IsAvailableFont("Verdana Italic")); + // We have "Dejavu Sans Ultra-Light" but not its medium weight counterpart. + EXPECT_FALSE(FontUtils::IsAvailableFont("DejaVu Sans")); +} + +TEST_F(FontUtilsTest, DoesListAvailableFonts) { + const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts(); + EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames)); + for (auto& font : fonts) { + PangoFontInfo font_info; + EXPECT_TRUE(font_info.ParseFontDescriptionName(font)); + } +} + +#ifdef INCLUDE_TENSORFLOW +TEST_F(FontUtilsTest, DoesFindBestFonts) { + std::string fonts_list; + std::unordered_map<char32, int64_t> ch_map; + CountUnicodeChars(kEngText, &ch_map); + EXPECT_EQ(26, ch_map.size()); // 26 letters + std::vector<std::pair<const char*, std::vector<bool> > > font_flags; + std::string best_list = FontUtils::BestFonts(ch_map, &font_flags); + EXPECT_TRUE(best_list.size()); + // All fonts except Lohit Hindi should render English text. + EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size()); + + CountUnicodeChars(kKorText, &ch_map); + best_list = FontUtils::BestFonts(ch_map, &font_flags); + EXPECT_TRUE(best_list.size()); + // Only UnBatang font family is able to render korean. 
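+  // BestFonts() should therefore return exactly one usable font entry,
+  // and that entry should name UnBatang.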
+ EXPECT_EQ(1, font_flags.size()); + EXPECT_STREQ("UnBatang", font_flags[0].first); +} +#endif + +TEST_F(FontUtilsTest, DoesSelectFont) { + const char* kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr}; + const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr}; + for (int i = 0; kLangText[i] != nullptr; ++i) { + SCOPED_TRACE(kLangNames[i]); + std::vector<std::string> graphemes; + std::string selected_font; + EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), + &selected_font, &graphemes)); + EXPECT_TRUE(selected_font.size()); + EXPECT_TRUE(graphemes.size()); + } +} + +TEST_F(FontUtilsTest, DoesFailToSelectFont) { + const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع"; + std::vector<std::string> graphemes; + std::string selected_font; + EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), + &selected_font, &graphemes)); +} + +#if 0 +// Needs fix. FontUtils::GetAllRenderableCharacters was removed +// because of deprecated pango_coverage_max. +TEST_F(FontUtilsTest, GetAllRenderableCharacters) { + const int32_t kHindiChar = 0x0905; + const int32_t kArabicChar = 0x0623; + const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator + const int32_t kOghamChar = 0x1680; // Ogham space mark + std::vector<bool> unicode_mask; + FontUtils::GetAllRenderableCharacters(&unicode_mask); + EXPECT_TRUE(unicode_mask['A']); + EXPECT_TRUE(unicode_mask['1']); + EXPECT_TRUE(unicode_mask[kHindiChar]); + EXPECT_TRUE(unicode_mask[kArabicChar]); + EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian. +#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham + EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham. +#endif + unicode_mask.clear(); + + std::vector<std::string> selected_fonts; + selected_fonts.push_back("Lohit Hindi"); + FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask); + EXPECT_TRUE(unicode_mask['1']); + EXPECT_TRUE(unicode_mask[kHindiChar]); + EXPECT_FALSE(unicode_mask['A']); // Lohit doesn't render English, + EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic, + EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian, + EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham. + unicode_mask.clear(); + + // Check that none of the included fonts cover the Mongolian or Ogham space + // characters. + for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) { + SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f])); + FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask); +#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham + EXPECT_FALSE(unicode_mask[kOghamChar]); +#endif + EXPECT_FALSE(unicode_mask[kMongolianChar]); + unicode_mask.clear(); + } +} +#endif + +} // namespace diff --git a/tesseract/unittest/paragraphs_test.cc b/tesseract/unittest/paragraphs_test.cc new file mode 100644 index 00000000..16134cac --- /dev/null +++ b/tesseract/unittest/paragraphs_test.cc @@ -0,0 +1,705 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> // for std::string + +#include "absl/strings/str_cat.h" // for absl::StrCat +#include "absl/strings/str_join.h" // for absl::StrJoin +#include "absl/strings/str_split.h" // for absl::StrSplit + +#include "include_gunit.h" // for TEST +#include "log.h" // for LOG + +#include "genericvector.h" +// ccmain +#include "paragraphs.h" +#include "paragraphs_internal.h" +// ccstruct +#include "ocrpara.h" + +namespace tesseract { + +// Functions for making monospace ASCII trial text for the paragraph detector. +const ParagraphJustification kLeft = JUSTIFICATION_LEFT; +const ParagraphJustification kCenter = JUSTIFICATION_CENTER; +const ParagraphJustification kRight = JUSTIFICATION_RIGHT; +const ParagraphJustification kUnknown = JUSTIFICATION_UNKNOWN; + +enum TextModelInputType { + PCONT = 0, // Continuation line of a paragraph (default). + PSTART = 1, // First line of a paragraph. + PNONE = 2, // Not a paragraph line. +}; + +struct TextAndModel { + const char* ascii; + TextModelInputType model_type; + + // fields corresponding to PARA (see ccstruct/ocrpara.h) + ParagraphModel model; + bool is_very_first_or_continuation; + bool is_list_item; +}; + +// Imagine that the given text is typewriter ASCII with each character ten +// pixels wide and twenty pixels high and return an appropriate row_info. +void AsciiToRowInfo(const char* text, int row_number, RowInfo* info) { + const int kCharWidth = 10; + const int kLineSpace = 30; + info->text = text; + info->has_leaders = + strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr; + info->has_drop_cap = false; + info->pix_ldistance = info->pix_rdistance = 0; + info->average_interword_space = kCharWidth; + info->pix_xheight = kCharWidth; + info->lword_text = info->rword_text = ""; + info->ltr = true; + + std::vector<std::string> words = absl::StrSplit(text, ' ', absl::SkipEmpty()); + info->num_words = words.size(); + if (info->num_words < 1) return; + + info->lword_text = words[0].c_str(); + info->rword_text = words[words.size() - 1].c_str(); + int lspace = 0; + while (lspace < info->text.size() && text[lspace] == ' ') { + lspace++; + } + int rspace = 0; + while (rspace < info->text.size() && + text[info->text.size() - rspace - 1] == ' ') { + rspace++; + } + + int top = -kLineSpace * row_number; + int bottom = top - kLineSpace; + int row_right = kCharWidth * info->text.size(); + int lword_width = kCharWidth * info->lword_text.size(); + int rword_width = kCharWidth * info->rword_text.size(); + info->pix_ldistance = lspace * kCharWidth; + info->pix_rdistance = rspace * kCharWidth; + info->lword_box = + TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top); + info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom, + row_right - info->pix_rdistance, top); + LeftWordAttributes( + nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item, + &info->lword_likely_starts_idea, &info->lword_likely_ends_idea); + RightWordAttributes( + nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item, + &info->rword_likely_starts_idea, &info->rword_likely_ends_idea); +} + +void MakeAsciiRowInfos(const TextAndModel* row_infos, int n, + std::vector<RowInfo>* output) { + output->clear(); + RowInfo info; + for (int i = 0; i < n; i++) { + AsciiToRowInfo(row_infos[i].ascii, i, &info); + output->push_back(info); + } +} + +// Given n rows of reference ground truth, evaluate whether the n 
rows +// of PARA * pointers yield the same paragraph breakpoints. +void EvaluateParagraphDetection(const TextAndModel* correct, int n, + const GenericVector<PARA*>& detector_output) { + int incorrect_breaks = 0; + int missed_breaks = 0; + int poorly_matched_models = 0; + int bad_crowns = 0; + int bad_list_items = 0; + ASSERT_EQ(detector_output.size(), n); + for (int i = 1; i < n; i++) { + bool has_break = correct[i].model_type != PCONT; + bool detected_break = (detector_output[i - 1] != detector_output[i]); + if (has_break && !detected_break) missed_breaks++; + if (detected_break && !has_break) incorrect_breaks++; + if (has_break) { + if (correct[i].model_type == PNONE) { + if (detector_output[i]->model != nullptr) { + poorly_matched_models++; + } + } else { + if (correct[i].model.justification() != kUnknown && + (detector_output[i]->model == nullptr || + !correct[i].model.Comparable(*detector_output[i]->model))) { + poorly_matched_models++; + } + } + if (correct[i].is_very_first_or_continuation ^ + detector_output[i]->is_very_first_or_continuation) { + bad_crowns++; + } + if (correct[i].is_list_item ^ detector_output[i]->is_list_item) { + bad_list_items++; + } + } + } + EXPECT_EQ(incorrect_breaks, 0); + EXPECT_EQ(missed_breaks, 0); + EXPECT_EQ(poorly_matched_models, 0); + EXPECT_EQ(bad_list_items, 0); + EXPECT_EQ(bad_crowns, 0); + if (incorrect_breaks || missed_breaks || poorly_matched_models || + bad_list_items || bad_crowns) { + std::vector<std::string> dbg_lines; + dbg_lines.push_back("# =========================="); + dbg_lines.push_back("# Correct paragraph breaks:"); + dbg_lines.push_back("# =========================="); + for (int i = 0; i < n; i++) { + if (correct[i].model_type != PCONT) { + dbg_lines.push_back(absl::StrCat( + correct[i].ascii, " # ", correct[i].model.ToString().c_str(), + correct[i].is_very_first_or_continuation ? " crown" : "", + correct[i].is_list_item ? " li" : "")); + } else { + dbg_lines.push_back(correct[i].ascii); + } + } + dbg_lines.push_back(""); + dbg_lines.push_back("# =========================="); + dbg_lines.push_back("# Paragraph detector output:"); + dbg_lines.push_back("# =========================="); + for (int i = 0; i < n; i++) { + std::string annotation; + if (i == 0 || (detector_output[i - 1] != detector_output[i])) { + if (detector_output[i] && detector_output[i]->model) { + annotation += absl::StrCat( + " # ", detector_output[i]->model->ToString().c_str(), + detector_output[i]->is_very_first_or_continuation ? " crown" : "", + detector_output[i]->is_list_item ? 
" li" : ""); + } else { + annotation = " # Unmodeled paragraph."; + } + } + dbg_lines.push_back(absl::StrCat(correct[i].ascii, annotation)); + } + LOG(INFO) << "Discrepency!\n" << absl::StrJoin(dbg_lines, "\n"); + } +} + +void TestParagraphDetection(const TextAndModel* correct, int num_rows) { + std::vector<RowInfo> row_infos; + GenericVector<PARA*> row_owners; + PARA_LIST paragraphs; + std::vector<ParagraphModel*> models; + + MakeAsciiRowInfos(correct, num_rows, &row_infos); + int debug_level(3); + tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, ¶graphs, + &models); + EvaluateParagraphDetection(correct, num_rows, row_owners); + for (auto* model : models) { + delete model; + } +} + +TEST(ParagraphsTest, ListItemsIdentified) { + EXPECT_TRUE(tesseract::AsciiLikelyListItem("iii")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("A.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("B.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("C.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("1.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("2.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("3.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("1")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("2")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("3")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("[[1]]")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-1.")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-2")); + EXPECT_TRUE(tesseract::AsciiLikelyListItem("(A)(i)")); + + EXPECT_FALSE(tesseract::AsciiLikelyListItem("The")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("first")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("house")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("Oregonian.")); + EXPECT_FALSE(tesseract::AsciiLikelyListItem("on.")); +} + +typedef ParagraphModel PModel; + +const TextAndModel kTwoSimpleParagraphs[] = { + {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top", PCONT, PModel(), false, false}, + {"of the page and takes 3 lines. ", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestSimpleParagraphDetection) { + TestParagraphDetection(kTwoSimpleParagraphs, + ABSL_ARRAYSIZE(kTwoSimpleParagraphs)); +} + +const TextAndModel kFewCluesWithCrown[] = { + {"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0), + true, false}, + {"of the page and takes two lines.", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is a continuation from", PCONT, PModel(), false, false}, + {"a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. 
", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestFewCluesWithCrown) { + TestParagraphDetection(kFewCluesWithCrown, + ABSL_ARRAYSIZE(kFewCluesWithCrown)); +} + +const TextAndModel kCrownedParagraph[] = { + {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), + true, false}, + {"often not indented as the rest ", PCONT, PModel(), false, false}, + {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, + {"less it should be counted as the", PCONT, PModel(), false, false}, + {"same type of paragraph. ", PCONT, PModel(), false, false}, + {" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"graphs are both indented two ", PCONT, PModel(), false, false}, + {"spaces. ", PCONT, PModel(), false, false}, + {" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestCrownParagraphDetection) { + TestParagraphDetection(kCrownedParagraph, ABSL_ARRAYSIZE(kCrownedParagraph)); +} + +const TextAndModel kFlushLeftParagraphs[] = { + {"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, + {"flush left paragraphs (those", PCONT, PModel(), false, false}, + {"with no body indent) are not", PCONT, PModel(), false, false}, + {"actually crowns. ", PCONT, PModel(), false, false}, + {"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false}, + {"also flush left aligned. Usual-", PCONT, PModel(), false, false}, + {"ly, these paragraphs are set", PCONT, PModel(), false, false}, + {"apart vertically by some white-", PCONT, PModel(), false, false}, + {"space, but you can also detect", PCONT, PModel(), false, false}, + {"them by observing the big empty", PCONT, PModel(), false, false}, + {"space at the ends of the para-", PCONT, PModel(), false, false}, + {"graphs. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsText, TestRealFlushLeftParagraphs) { + TestParagraphDetection(kFlushLeftParagraphs, + ABSL_ARRAYSIZE(kFlushLeftParagraphs)); +} + +const TextAndModel kSingleFullPageContinuation[] = { + {"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false}, + {"continuation. It flows from", PCONT, PModel(), false, false}, + {"line to line, using the full", PCONT, PModel(), false, false}, + {"column width with no clear", PCONT, PModel(), false, false}, + {"paragraph break, because it", PCONT, PModel(), false, false}, + {"actually doesn't have one. It", PCONT, PModel(), false, false}, + {"is the middle of one monster", PCONT, PModel(), false, false}, + {"paragraph continued from the", PCONT, PModel(), false, false}, + {"previous page and continuing", PCONT, PModel(), false, false}, + {"onto the next page. There-", PCONT, PModel(), false, false}, + {"fore, it ends up getting", PCONT, PModel(), false, false}, + {"marked as a crown and then", PCONT, PModel(), false, false}, + {"getting re-marked as any ex-", PCONT, PModel(), false, false}, + {"isting model. 
Not great, but", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestSingleFullPageContinuation) { + const TextAndModel* correct = kSingleFullPageContinuation; + int num_rows = ABSL_ARRAYSIZE(kSingleFullPageContinuation); + std::vector<RowInfo> row_infos; + GenericVector<PARA*> row_owners; + PARA_LIST paragraphs; + std::vector<ParagraphModel*> models; + models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10)); + MakeAsciiRowInfos(correct, num_rows, &row_infos); + tesseract::DetectParagraphs(3, &row_infos, &row_owners, ¶graphs, &models); + EvaluateParagraphDetection(correct, num_rows, row_owners); + for (auto* model : models) { + delete model; + } +} + +const TextAndModel kRightAligned[] = { + {"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, + {" uncommon in Left-to-Right", PCONT, PModel(), false, false}, + {" languages, but they do", PCONT, PModel(), false, false}, + {" exist.", PCONT, PModel(), false, false}, + {" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false}, + {" horribly tiny paragraphs in", PCONT, PModel(), false, false}, + {" tables on which we have no", PCONT, PModel(), false, false}, + {" chance anyways.", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestRightAlignedParagraph) { + TestParagraphDetection(kRightAligned, ABSL_ARRAYSIZE(kRightAligned)); +} + +const TextAndModel kTinyParagraphs[] = { + {" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"obvious paragraph text, you might", PCONT, PModel(), false, false}, + {"find short exchanges of dialogue ", PCONT, PModel(), false, false}, + {"between characters. ", PCONT, PModel(), false, false}, + {" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"mark a new paragraph whenever one", PCONT, PModel(), false, false}, + {"of the statistics (left, right or", PCONT, PModel(), false, false}, + {"center) changes from one text-", PCONT, PModel(), false, false}, + {"line to the next. Such an", PCONT, PModel(), false, false}, + {"approach would misclassify the", PCONT, PModel(), false, false}, + {"tiny paragraphs above as a single", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestTinyParagraphs) { + TestParagraphDetection(kTinyParagraphs, ABSL_ARRAYSIZE(kTinyParagraphs)); +} + +const TextAndModel kComplexPage1[] = { + {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false}, + {" Centered Title ", PCONT, PModel(), false, false}, + {" Paragraph Detection ", PCONT, PModel(), false, false}, + {" OCR TEAM ", PCONT, PModel(), false, false}, + {" 10 November 2010 ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top", PCONT, PModel(), false, false}, + {"of the page and takes 3 lines. 
", PCONT, PModel(), false, false}, + {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is ", PCONT, PModel(), false, false}, + {"indented just like this second ", PCONT, PModel(), false, false}, + {"paragraph. ", PCONT, PModel(), false, false}, + {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), + true, false}, + {" looks like the prior text ", PCONT, PModel(), false, false}, + {" but it is indented more ", PCONT, PModel(), false, false}, + {" and is fully justified. ", PCONT, PModel(), false, false}, + {" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"centered text, block quotes, ", PCONT, PModel(), false, false}, + {"normal paragraphs, and lists ", PCONT, PModel(), false, false}, + {"like what follows? ", PCONT, PModel(), false, false}, + {"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" looking for lines where the ", PCONT, PModel(), false, false}, + {" first word of the next line ", PCONT, PModel(), false, false}, + {" would fit on the previous ", PCONT, PModel(), false, false}, + {" line. ", PCONT, PModel(), false, false}, + {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" Python and try it out. ", PCONT, PModel(), false, false}, + {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" mistakes. ", PCONT, PModel(), false, false}, + {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"you can try to identify source ", PCONT, PModel(), false, false}, + {"code. Ouch! ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestComplexPage1) { + TestParagraphDetection(kComplexPage1, ABSL_ARRAYSIZE(kComplexPage1)); +} + +// The same as above, but wider. +const TextAndModel kComplexPage2[] = { + {" Awesome ", PSTART, + PModel(kCenter, 0, 0, 0, 0), false, false}, + {" Centered Title ", PCONT, PModel(), false, false}, + {" Paragraph Detection ", PCONT, PModel(), false, false}, + {" OCR TEAM ", PCONT, PModel(), false, false}, + {" 10 November 2010 ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"This paragraph starts at the top of", PCONT, PModel(), false, false}, + {"the page and takes 3 lines. ", PCONT, PModel(), false, false}, + {" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"which indicates that the first ", PCONT, PModel(), false, false}, + {"paragraph is not a continuation ", PCONT, PModel(), false, false}, + {"from a previous page, as it is in- ", PCONT, PModel(), false, false}, + {"dented just like this second para- ", PCONT, PModel(), false, false}, + {"graph. ", PCONT, PModel(), false, false}, + {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), + true, false}, + {" looks like the prior text ", PCONT, PModel(), false, false}, + {" but it is indented more ", PCONT, PModel(), false, false}, + {" and is fully justified. 
", PCONT, PModel(), false, false}, + {" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"ed text, block quotes, normal para-", PCONT, PModel(), false, false}, + {"graphs, and lists like what follow?", PCONT, PModel(), false, false}, + {"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!! + {"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" looking for lines where the ", PCONT, PModel(), false, false}, + {" first word of the next line ", PCONT, PModel(), false, false}, + {" would fit on the previous line. ", PCONT, PModel(), false, false}, + {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" Python and try it out. ", PCONT, PModel(), false, false}, + {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" mistakes. ", PCONT, PModel(), false, false}, + {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), + false, true}, + {" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"you can try to identify source ", PCONT, PModel(), false, false}, + {"code. Ouch! ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestComplexPage2) { + TestParagraphDetection(kComplexPage2, ABSL_ARRAYSIZE(kComplexPage2)); +} + +const TextAndModel kSubtleCrown[] = { + {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), + true, false}, + {"often not indented as the rest ", PCONT, PModel(), false, false}, + {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false}, + {"less it should be counted as the", PCONT, PModel(), false, false}, + {"same type of paragraph. ", PCONT, PModel(), false, false}, + {" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false}, + {"should suffice. ", PCONT, PModel(), false, false}, + {" 1235 ", PNONE, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestSubtleCrown) { + TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown) - 1); +} + +TEST(ParagraphsTest, TestStrayLineInBlock) { + TestParagraphDetection(kSubtleCrown, ABSL_ARRAYSIZE(kSubtleCrown)); +} + +const TextAndModel kUnlvRep3AO[] = { + {" Defined contribution plans cover employees in Australia, New", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, false}, + {"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), false, false}, + {"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, false}, + {"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false, false}, + {"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, false}, + {"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, false}, + {"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false, false}, + {" In addition to providing pension benefits, the Company pro- ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, false}, + {"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, false}, + {"Statement No. 106 effective January 1, 1992. 
Previously, the ", PCONT, PModel(), false, false}, + {"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, false}, + {"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, false}, + {"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, false}, + {"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, false}, + {" The U.S. plan covering the parent company is the largest plan.", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, false}, + {"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, false}, + {"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, false}, + {"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), false, false}, + {"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, false}, + {"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, false}, + {"credited service. The Company has the ability to change these ", PCONT, PModel(), false, false}, + {"benefits at any time. ", PCONT, PModel(), false, false}, + {" Effective October 1993, the Company amended its health ", PSTART, + PModel(kLeft, 0, 50, 0, 0), false, false}, + {"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false, false}, + {"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, false}, + {"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, false}, + {"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, false}, + {"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, false}, + {"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, false}, + {"for 1994 by approximately $83. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, TestUnlvInsurance) { + TestParagraphDetection(kUnlvRep3AO, ABSL_ARRAYSIZE(kUnlvRep3AO)); +} + +// The basic outcome we want for something with a bunch of leader dots is that +// we group each logical entry as a separate item. Without knowledge of +// leaders, we would most likely mark the text below as a simple right aligned +// paragraph or two. +// This example comes from Volume 9886293, Page 5 +const TextAndModel kTableOfContents[] = { + {"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in the West . . 
.5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, + {" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false}, +}; + +TEST(ParagraphsTest, TestSplitsOutLeaderLines) { + TestParagraphDetection(kTableOfContents, ABSL_ARRAYSIZE(kTableOfContents)); +} + +const TextAndModel kTextWithSourceCode[] = { + {" A typical page of a programming book may contain", PSTART, + PModel(kLeft, 0, 20, 0, 0), false, false}, + {"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false}, + {"being described in prose. Such examples should be", PCONT, PModel(), false, false}, + {"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false}, + {"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false}, + {"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false}, + {"source code would lead to a bad reading experience", PCONT, PModel(), false, false}, + {"when the text is re-flowed. ", PCONT, PModel(), false, false}, + {" Let's show this by describing the function fact-", PSTART, + PModel(kLeft, 0, 20, 0, 0), false, false}, + {"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false}, + {"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false}, + {"that the typical C implementation will only work ", PCONT, PModel(), false, false}, + {"for values less than about 12: ", PCONT, PModel(), false, false}, + {" ", PNONE, PModel(), false, false}, + {" # Naive implementation in C ", PCONT, PModel(), false, false}, + {" int factorial(int n) { ", PCONT, PModel(), false, false}, + {" if (n < 2) ", PCONT, PModel(), false, false}, + {" return 1; ", PCONT, PModel(), false, false}, + {" return n * factorial(n - 1); ", PCONT, PModel(), false, false}, + {" } ", PCONT, PModel(), false, false}, + {" ", PCONT, PModel(), false, false}, + {" The C programming language does not have built- ", PSTART, + PModel(kLeft, 0, 20, 0, 0), false, false}, + {"in support for detecting integer overflow, so this", PCONT, PModel(), false, false}, + {"naive implementation simply returns random values ", PCONT, PModel(), false, false}, + {"if even a moderate sized n is provided. ", PCONT, PModel(), false, false}, +}; + +TEST(ParagraphsTest, NotDistractedBySourceCode) { + TestParagraphDetection(kTextWithSourceCode, + ABSL_ARRAYSIZE(kTextWithSourceCode)); +} + +const TextAndModel kOldManAndSea[] = { + {"royal palm which are called guano and in it there was a bed, a", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), false, false}, + {"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), false, false}, + {"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), false, false}, + {"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), false, false}, + {"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), false, false}, + {"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), false, false}, + {"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), false, false}, + {"shirt. ", PCONT, PModel(), false, false}, + {" \"What do you have to eat?\" the boy asked. 
", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"A pot of yellow rice with fish. Do you want some?\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"No. I will eat at home. Do you want me to make the fire?\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"No. I will make it later on. Or I may eat the rice cold.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"May I take the cast net?\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"Of course.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" There was no cast net and the boy remembered when they had", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"sold it. But they went through this fiction every day. There was no", PCONT, PModel(), false, false}, + {"pot of yellow rice and fish and the boy knew this too. " + " ", PCONT, PModel(), false, false}, + {" \"Eighty-five is a lucky number,\" the old man said. \"How", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"would you like to see me bring one in that dressed out over a " + "thou-", PCONT, PModel(), false, false}, + {"sand pounds? " + " ", PCONT, PModel(), false, false}, + {" \"I'll get the cast net and go for sardines. Will you sit in the " + "sun", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"in the doorway?\" " + " ", PCONT, PModel(), false, false}, + {" \"Yes. I have yesterday's paper and I will read the baseball.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" The boy did not know whether yesterday's paper was a fiction", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"too. But the old man brought it out from under the bed. ", PCONT, PModel(), false, false}, + {" \"Pedrico gave it to me at the bodega,\" he explained. " + " ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"I'll be back when I have the sardines. I'll keep yours and mine", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"together on ice and we can share them in the morning. When I", PCONT, PModel(), false, false}, + {"come back you can tell me about the baseball.\" ", PCONT, PModel(), false, false}, + {" \"The Yankees cannot lose.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"But I fear the Indians of Cleveland.\" ", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {" \"Have faith in the Yankees my son. 
Think of the great Di-", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"Maggio.\" ", PCONT, PModel(), false, false}, + {" \"I fear both the Tigers of Detroit and the Indians of Cleve-", + PSTART, PModel(kLeft, 0, 50, 0, 0), false, false}, + {"land.\" ", PCONT, PModel(), false, false} +}; + +TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) { + TestParagraphDetection(kOldManAndSea, ABSL_ARRAYSIZE(kOldManAndSea)); +} + +const TextAndModel kNewZealandIndex[] = { + {"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 138 ", PCONT, PModel(), false, false}, + {"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 145 ", PCONT, PModel(), false, false}, + {"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {" 85 ", PCONT, PModel(), false, false}, + {"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}, + {"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false} +}; + +TEST(ParagraphsTest, IndexPageTest) { + TestParagraphDetection(kNewZealandIndex, ABSL_ARRAYSIZE(kNewZealandIndex)); +} + +// TODO(eger): Add some right-to-left examples, and fix the algorithm as needed. + +} // namespace diff --git a/tesseract/unittest/params_model_test.cc b/tesseract/unittest/params_model_test.cc new file mode 100644 index 00000000..8627ab8e --- /dev/null +++ b/tesseract/unittest/params_model_test.cc @@ -0,0 +1,75 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> // std::string +#include <vector> + +#include "include_gunit.h" +#include "params_model.h" +#include "serialis.h" // TFile +#include "tprintf.h" // tprintf + +namespace tesseract { + +// Test some basic I/O of params model files (automated learning of language +// model weights). +#ifndef DISABLED_LEGACY_ENGINE +static bool LoadFromFile(tesseract::ParamsModel& model, const char* lang, const char* full_path) { + tesseract::TFile fp; + if (!fp.Open(full_path, nullptr)) { + tprintf("Error opening file %s\n", full_path); + return false; + } + return model.LoadFromFp(lang, &fp); +} +#endif + +class ParamsModelTest : public testing::Test { +#ifndef DISABLED_LEGACY_ENGINE + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } + + std::string TestDataNameToPath(const std::string& name) const { + return file::JoinPath(TESTDATA_DIR, name); + } + std::string OutputNameToPath(const std::string& name) const { + return file::JoinPath(FLAGS_test_tmpdir, name); + } + // Test that we are able to load a params model, save it, reload it, + // and verify that the re-serialized version is the same as the original. + void TestParamsModelRoundTrip(const std::string& params_model_filename) const { + tesseract::ParamsModel orig_model; + tesseract::ParamsModel duplicate_model; + file::MakeTmpdir(); + std::string orig_file = TestDataNameToPath(params_model_filename); + std::string out_file = OutputNameToPath(params_model_filename); + + EXPECT_TRUE(LoadFromFile(orig_model, "eng", orig_file.c_str())); + EXPECT_TRUE(orig_model.SaveToFile(out_file.c_str())); + + EXPECT_TRUE(LoadFromFile(duplicate_model, "eng", out_file.c_str())); + EXPECT_TRUE(orig_model.Equivalent(duplicate_model)); + } +#endif +}; + +TEST_F(ParamsModelTest, TestEngParamsModelIO) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because ParamsModel::LoadFromFp is missing. + GTEST_SKIP(); +#else + TestParamsModelRoundTrip("eng.params_model"); +#endif +} + +} // namespace diff --git a/tesseract/unittest/progress_test.cc b/tesseract/unittest/progress_test.cc new file mode 100644 index 00000000..dbe30269 --- /dev/null +++ b/tesseract/unittest/progress_test.cc @@ -0,0 +1,165 @@ +/////////////////////////////////////////////////////////////////////// +// File: progress_test.cc +// Description: Progress reporting API Test for Tesseract. +// Author: Jaroslaw Kubik +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +// expects clone of tessdata_fast repo in ../../tessdata_fast + +#include "include_gunit.h" + +#include <tesseract/baseapi.h> +#include <tesseract/ocrclass.h> + +#include "allheaders.h" +#include "gmock/gmock.h" + +#include <fstream> +#include <iostream> +#include <locale> +#include <memory> // std::unique_ptr +#include <string> + +#include <time.h> + +namespace tesseract { + +class QuickTest : public testing::Test { + protected: + virtual void SetUp() { start_time_ = time(nullptr); } + virtual void TearDown() { + const time_t end_time = time(nullptr); + EXPECT_TRUE(end_time - start_time_ <= 25) + << "The test took too long - " + << ::testing::PrintToString(end_time - start_time_); + } + time_t start_time_; +}; + +class ClassicMockProgressSink { + public: + MOCK_METHOD1(classicProgress, bool(int)); + MOCK_METHOD1(cancel, bool(int)); + + ETEXT_DESC monitor; + + ClassicMockProgressSink() { + monitor.progress_callback = [](int progress, int, int, int, int) -> bool { + return instance->classicProgress(progress); + }; + monitor.cancel = [](void* ths, int words) -> bool { + return ((ClassicMockProgressSink*)ths)->cancel(words); + }; + monitor.cancel_this = this; + instance = this; + } + + static ClassicMockProgressSink* instance; +}; + +ClassicMockProgressSink* ClassicMockProgressSink::instance = nullptr; + +class NewMockProgressSink : public ClassicMockProgressSink { + public: + MOCK_METHOD1(progress, bool(int)); + + NewMockProgressSink() { + monitor.progress_callback2 = [](ETEXT_DESC* ths, int, int, int, + int) -> bool { + return ((NewMockProgressSink*)ths->cancel_this)->progress(ths->progress); + }; + } +}; + +void ClassicProgressTester(const char* imgname, const char* tessdatadir, + const char* lang) { + using ::testing::_; + using ::testing::AllOf; + using ::testing::AtLeast; + using ::testing::DoAll; + using ::testing::Gt; + using ::testing::Le; + using ::testing::Return; + using ::testing::SaveArg; + + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + + ClassicMockProgressSink progressSink; + + int currentProgress = -1; + EXPECT_CALL(progressSink, + classicProgress(AllOf(Gt<int&>(currentProgress), Le(100)))) + .Times(AtLeast(5)) + .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false))); + EXPECT_CALL(progressSink, cancel(_)) + .Times(AtLeast(5)) + .WillRepeatedly(Return(false)); + + EXPECT_EQ(api->Recognize(&progressSink.monitor), false); + EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%"; + + api->End(); + pixDestroy(&image); +} + +void NewProgressTester(const char* imgname, const char* tessdatadir, + const char* lang) { + using ::testing::_; + using ::testing::AllOf; + using ::testing::AtLeast; + using ::testing::DoAll; + using ::testing::Gt; + using ::testing::Le; + using ::testing::Return; + using ::testing::SaveArg; + + std::unique_ptr<tesseract::TessBaseAPI> api(new tesseract::TessBaseAPI()); + ASSERT_FALSE(api->Init(tessdatadir, lang)) + << "Could not initialize tesseract."; + Pix* image = pixRead(imgname); + ASSERT_TRUE(image != nullptr) << "Failed to read test image."; + api->SetImage(image); + + NewMockProgressSink progressSink; + + int currentProgress = -1; + EXPECT_CALL(progressSink, classicProgress(_)).Times(0); + EXPECT_CALL(progressSink, 
progress(AllOf(Gt<int&>(currentProgress), Le(100)))) + .Times(AtLeast(5)) + .WillRepeatedly(DoAll(SaveArg<0>(&currentProgress), Return(false))); + EXPECT_CALL(progressSink, cancel(_)) + .Times(AtLeast(5)) + .WillRepeatedly(Return(false)); + + EXPECT_EQ(api->Recognize(&progressSink.monitor), false); + EXPECT_GE(currentProgress, 50) << "The reported progress did not reach 50%"; + + api->End(); + pixDestroy(&image); +} + +TEST(QuickTest, ClassicProgressReporting) { + ClassicProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", + "eng"); +} + +TEST(QuickTest, NewProgressReporting) { + NewProgressTester(TESTING_DIR "/phototest.tif", TESSDATA_DIR "_fast", "eng"); +} + +} // namespace diff --git a/tesseract/unittest/qrsequence_test.cc b/tesseract/unittest/qrsequence_test.cc new file mode 100644 index 00000000..783228d8 --- /dev/null +++ b/tesseract/unittest/qrsequence_test.cc @@ -0,0 +1,69 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include <algorithm> +#include <vector> + +#include "cycletimer.h" +#include "include_gunit.h" +#include "log.h" +#include "qrsequence.h" + +namespace tesseract { + +class TestableQRSequenceGenerator : public QRSequenceGenerator { + public: + explicit TestableQRSequenceGenerator(const int& N) : QRSequenceGenerator(N) {} + // Overriding scope for testing + using QRSequenceGenerator::GetBinaryReversedInteger; +}; + +// Verifies binary inversion for a small range. +TEST(QRSequenceGenerator, GetBinaryReversedInteger) { + const int kRangeSize = 8; + TestableQRSequenceGenerator generator(kRangeSize); + int reversed_vals[kRangeSize] = {0, 4, 2, 6, 1, 5, 3, 7}; + for (int i = 0; i < kRangeSize; ++i) + EXPECT_EQ(reversed_vals[i], generator.GetBinaryReversedInteger(i)); +} + +// Trivial test fixture for a parameterized test. +class QRSequenceGeneratorTest : public ::testing::TestWithParam<int> { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } +}; + +TEST_P(QRSequenceGeneratorTest, GeneratesValidSequence) { + const int kRangeSize = GetParam(); + TestableQRSequenceGenerator generator(kRangeSize); + std::vector<int> vals(kRangeSize); + CycleTimer timer; + timer.Restart(); + for (int i = 0; i < kRangeSize; ++i) vals[i] = generator.GetVal(); + LOG(INFO) << kRangeSize << "-length sequence took " << timer.GetInMs() << "ms"; + // Sort the numbers to verify that we've covered the range without repetition. + std::sort(vals.begin(), vals.end()); + for (int i = 0; i < kRangeSize; ++i) { + EXPECT_EQ(i, vals[i]); + if (i != vals[i]) { + LOG(INFO) << "Aborting remaining comparisons"; + break; + } + } +} + +// Run a parameterized test using the following range sizes. 
+INSTANTIATE_TEST_SUITE_P(RangeTest, QRSequenceGeneratorTest, + ::testing::Values(2, 7, 8, 9, 16, 1e2, 1e4, 1e6)); +} // namespace diff --git a/tesseract/unittest/recodebeam_test.cc b/tesseract/unittest/recodebeam_test.cc new file mode 100644 index 00000000..6e9bc4e3 --- /dev/null +++ b/tesseract/unittest/recodebeam_test.cc @@ -0,0 +1,483 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "include_gunit.h" +#include "log.h" // for LOG + +#include "genericvector.h" +#include "recodebeam.h" +#include "matrix.h" +#include "pageres.h" +#include "ratngs.h" +#include "unicharcompress.h" +#include "normstrngs.h" +#include "unicharset_training_utils.h" + +#include "helpers.h" + +#include "absl/strings/str_format.h" // for absl::StrFormat + +namespace tesseract { + +// Number of characters to test beam search with. +const int kNumChars = 100; +// Amount of extra random data to pad with after. +const int kPadding = 64; +// Dictionary test data. +// The top choice is: "Gef s wordsright.". +// The desired phrase is "Gets words right.". +// There is a competing dictionary phrase: "Get swords right.". +// ... due to the following errors from the network: +// f stronger than t in "Get". +// weak space between Gef and s and between s and words. +// weak space between words and right. +const char* kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d", + "s", "", "r", "i", "g", "h", "t", ".", nullptr}; +const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, + 0.89, 0.99, 0.99, 0.99, 0.99, 0.95, + 0.99, 0.90, 0.90, 0.90, 0.95, 0.75}; +const char* kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h", + "S", " ", "t", "I", "9", "b", "f", ",", nullptr}; +const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, + 0.10, 0.01, 0.01, 0.01, 0.01, 0.05, + 0.01, 0.09, 0.09, 0.09, 0.05, 0.25}; + +const char* kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr}; +const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98}; +const char* kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr}; +const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01}; + +const char* kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr}; +const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97}; +const char* kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr}; +const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01}; + +class RecodeBeamTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + RecodeBeamTest() : lstm_dict_(&ccutil_) {} + ~RecodeBeamTest() { lstm_dict_.End(); } + + // Loads and compresses the given unicharset. 
+ void LoadUnicharset(const std::string& unicharset_name) { + std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, + "radical-stroke.txt"); + std::string unicharset_file = + file::JoinPath(TESTDATA_DIR, unicharset_name); + std::string radical_data; + CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, + file::Defaults())); + CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str())); + unichar_null_char_ = ccutil_.unicharset.has_special_codes() + ? UNICHAR_BROKEN + : ccutil_.unicharset.size(); + STRING radical_str(radical_data.c_str()); + EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, + &radical_str)); + RecodedCharID code; + recoder_.EncodeUnichar(unichar_null_char_, &code); + encoded_null_char_ = code(0); + // Space should encode as itself. + recoder_.EncodeUnichar(UNICHAR_SPACE, &code); + EXPECT_EQ(UNICHAR_SPACE, code(0)); + std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt"); + STRING encoding = recoder_.GetEncodingAsString(ccutil_.unicharset); + std::string encoding_str(&encoding[0], encoding.size()); + CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults())); + LOG(INFO) << "Wrote encoding to:" << output_name << "\n"; + } + // Loads the dictionary. + void LoadDict(const std::string& lang) { + std::string traineddata_name = lang + ".traineddata"; + std::string traineddata_file = + file::JoinPath(TESTDATA_DIR, traineddata_name); + lstm_dict_.SetupForLoad(nullptr); + tesseract::TessdataManager mgr; + mgr.Init(traineddata_file.c_str()); + lstm_dict_.LoadLSTM(lang.c_str(), &mgr); + lstm_dict_.FinishLoad(); + } + + // Expects the appropriate results from the compressed_ ccutil_.unicharset. + void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output, + const GenericVector<int>& transcription) { + // Get the utf8 string of the transcription. + std::string truth_utf8; + for (int i = 0; i < transcription.size(); ++i) { + truth_utf8 += ccutil_.unicharset.id_to_unichar(transcription[i]); + } + PointerVector<WERD_RES> words; + ExpectCorrect(output, truth_utf8, nullptr, &words); + } + void ExpectCorrect(const GENERIC_2D_ARRAY<float>& output, + const std::string& truth_utf8, Dict* dict, + PointerVector<WERD_RES>* words) { + RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict); + beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr); + // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug: + // beam_search.DebugBeams(ccutil_.unicharset); + std::vector<int> labels, xcoords; + beam_search.ExtractBestPathAsLabels(&labels, &xcoords); + LOG(INFO) << "Labels size = " << labels.size() << " coords " + << xcoords.size() << "\n"; + // Now decode using recoder_. + std::string decoded; + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + RecodedCharID code; + int index = start; + int uni_id = INVALID_UNICHAR_ID; + do { + code.Set(code.length(), labels[index++]); + uni_id = recoder_.DecodeUnichar(code); + } while (index < labels.size() && + code.length() < RecodedCharID::kMaxCodeLen && + (uni_id == INVALID_UNICHAR_ID || + !recoder_.IsValidFirstCode(labels[index]))); + EXPECT_NE(INVALID_UNICHAR_ID, uni_id) + << "index=" << index << "/" << labels.size(); + // To the extent of truth_utf8, we expect decoded to match, but if + // transcription is shorter, that is OK too, as we may just be testing + // that we get a valid sequence when padded with random data. 
+ if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) + decoded += ccutil_.unicharset.id_to_unichar(uni_id); + end = index; + } + EXPECT_EQ(truth_utf8, decoded); + + // Check that ExtractBestPathAsUnicharIds does the same thing. + std::vector<int> unichar_ids; + std::vector<float> certainties, ratings; + beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, + &unichar_ids, &certainties, + &ratings, &xcoords); + std::string u_decoded; + float total_rating = 0.0f; + for (int u = 0; u < unichar_ids.size(); ++u) { + // To the extent of truth_utf8, we expect decoded to match, but if + // transcription is shorter, that is OK too, as we may just be testing + // that we get a valid sequence when padded with random data. + if (u_decoded.size() < truth_utf8.size()) { + const char* str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]); + total_rating += ratings[u]; + LOG(INFO) << absl::StrFormat("%d:u_id=%d=%s, c=%g, r=%g, r_sum=%g @%d", u, + unichar_ids[u], str, certainties[u], + ratings[u], total_rating, xcoords[u]) << "\n"; + if (str[0] == ' ') total_rating = 0.0f; + u_decoded += str; + } + } + EXPECT_EQ(truth_utf8, u_decoded); + + // Check that ExtractBestPathAsWords does the same thing. + TBOX line_box(0, 0, 100, 10); + for (int i = 0; i < 2; ++i) { + beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, + &ccutil_.unicharset, words); + std::string w_decoded; + for (int w = 0; w < words->size(); ++w) { + const WERD_RES* word = (*words)[w]; + if (w_decoded.size() < truth_utf8.size()) { + if (!w_decoded.empty() && word->word->space()) w_decoded += " "; + w_decoded += word->best_choice->unichar_string().c_str(); + } + LOG(INFO) << absl::StrFormat("Word:%d = %s, c=%g, r=%g, perm=%d", w, + word->best_choice->unichar_string().c_str(), + word->best_choice->certainty(), + word->best_choice->rating(), + word->best_choice->permuter()) << "\n"; + } + std::string w_trunc(w_decoded.data(), truth_utf8.size()); + if (truth_utf8 != w_trunc) { + tesseract::NormalizeUTF8String( + tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize, + tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded); + w_trunc.assign(w_decoded.data(), truth_utf8.size()); + } + EXPECT_EQ(truth_utf8, w_trunc); + } + } + // Generates easy encoding of the given unichar_ids, and pads with at least + // padding of random data. + GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs( + const GenericVector<int>& unichar_ids, int padding) { + int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen; + int num_codes = recoder_.code_range(); + GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f); + // Fill with random data. + TRand random; + for (int t = 0; t < width; ++t) { + for (int i = 0; i < num_codes; ++i) + outputs(t, i) = random.UnsignedRand(0.25); + } + int t = 0; + for (int i = 0; i < unichar_ids.size(); ++i) { + RecodedCharID code; + int len = recoder_.EncodeUnichar(unichar_ids[i], &code); + EXPECT_NE(0, len); + for (int j = 0; j < len; ++j) { + // Make the desired answer a clear winner. + if (j > 0 && code(j) == code(j - 1)) { + // We will collapse adjacent equal codes so put a null in between. + outputs(t++, encoded_null_char_) = 1.0f; + } + outputs(t++, code(j)) = 1.0f; + } + // Put a 0 as a null char in between. + outputs(t++, encoded_null_char_) = 1.0f; + } + // Normalize the probs. 
+ for (int t = 0; t < width; ++t) { + double sum = 0.0; + for (int i = 0; i < num_codes; ++i) sum += outputs(t, i); + for (int i = 0; i < num_codes; ++i) outputs(t, i) /= sum; + } + + return outputs; + } + // Encodes a utf8 string (character) as unichar_id, then recodes, and sets + // the score for the appropriate sequence of codes, returning the ending t. + int EncodeUTF8(const char* utf8_str, float score, int start_t, TRand* random, + GENERIC_2D_ARRAY<float>* outputs) { + int t = start_t; + std::vector<int> unichar_ids; + EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, + nullptr, nullptr)); + if (unichar_ids.empty() || utf8_str[0] == '\0') { + unichar_ids.clear(); + unichar_ids.push_back(unichar_null_char_); + } + int num_ids = unichar_ids.size(); + for (int u = 0; u < num_ids; ++u) { + RecodedCharID code; + int len = recoder_.EncodeUnichar(unichar_ids[u], &code); + EXPECT_NE(0, len); + for (int i = 0; i < len; ++i) { + // Apply the desired score. + (*outputs)(t++, code(i)) = score; + if (random != nullptr && + t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { + int dups = static_cast<int>(random->UnsignedRand(3.0)); + for (int d = 0; d < dups; ++d) { + // Duplicate the desired score. + (*outputs)(t++, code(i)) = score; + } + } + } + if (random != nullptr && + t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) { + int dups = static_cast<int>(random->UnsignedRand(3.0)); + for (int d = 0; d < dups; ++d) { + // Add a random number of nulls as well. + (*outputs)(t++, encoded_null_char_) = score; + } + } + } + return t; + } + // Generates an encoding of the given 4 arrays as synthetic network scores. + // uses scores1 for chars1 and scores2 for chars2, and everything else gets + // the leftovers shared out equally. Note that empty string encodes as the + // null_char_. + GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char* chars1[], + const float scores1[], + const char* chars2[], + const float scores2[], + TRand* random) { + int width = 0; + while (chars1[width] != nullptr) ++width; + int padding = width * RecodedCharID::kMaxCodeLen; + int num_codes = recoder_.code_range(); + GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f); + int t = 0; + for (int i = 0; i < width; ++i) { + // In case there is overlap in the codes between 1st and 2nd choice, it + // is better to encode the 2nd choice first. + int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs); + int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs); + // Advance t to the max end, setting everything else to the leftovers. + int max_t = std::max(end_t1, end_t2); + while (t < max_t) { + double total_score = 0.0; + for (int j = 0; j < num_codes; ++j) total_score += outputs(t, j); + double null_remainder = (1.0 - total_score) / 2.0; + double remainder = null_remainder / (num_codes - 2); + if (outputs(t, encoded_null_char_) < null_remainder) { + outputs(t, encoded_null_char_) += null_remainder; + } else { + remainder += remainder; + } + for (int j = 0; j < num_codes; ++j) { + if (outputs(t, j) == 0.0f) outputs(t, j) = remainder; + } + ++t; + } + } + // Fill the rest with null chars. 
+ while (t < width + padding) { + outputs(t++, encoded_null_char_) = 1.0f; + } + return outputs; + } + UnicharCompress recoder_; + int unichar_null_char_ = 0; + int encoded_null_char_ = 0; + CCUtil ccutil_; + Dict lstm_dict_; +}; + +TEST_F(RecodeBeamTest, DoesChinese) { + LOG(INFO) << "Testing chi_tra" << "\n"; + LoadUnicharset("chi_tra.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); + LOG(INFO) << "Testing chi_sim" << "\n"; + LoadUnicharset("chi_sim.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + transcription.clear(); + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + outputs = GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesJapanese) { + LOG(INFO) << "Testing jpn" << "\n"; + LoadUnicharset("jpn.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesKorean) { + LOG(INFO) << "Testing kor" << "\n"; + LoadUnicharset("kor.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesKannada) { + LOG(INFO) << "Testing kan" << "\n"; + LoadUnicharset("kan.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesMarathi) { + LOG(INFO) << "Testing mar" << "\n"; + LoadUnicharset("mar.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. + GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DoesEnglish) { + LOG(INFO) << "Testing eng" << "\n"; + LoadUnicharset("eng.unicharset"); + // Correctly reproduce the first kNumchars characters from easy output. 
+ GenericVector<int> transcription; + for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) + transcription.push_back(i); + GENERIC_2D_ARRAY<float> outputs = + GenerateRandomPaddedOutputs(transcription, kPadding); + ExpectCorrect(outputs, transcription); +} + +TEST_F(RecodeBeamTest, DISABLED_EngDictionary) { + LOG(INFO) << "Testing eng dictionary" << "\n"; + LoadUnicharset("eng_beam.unicharset"); + GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs( + kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr); + std::string default_str; + for (int i = 0; kGWRTops[i] != nullptr; ++i) default_str += kGWRTops[i]; + PointerVector<WERD_RES> words; + ExpectCorrect(outputs, default_str, nullptr, &words); + // Now try again with the dictionary. + LoadDict("eng_beam"); + ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words); +} + +TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) { + LOG(INFO) << "Testing zh_hans dictionary" << "\n"; + LoadUnicharset("zh_hans.unicharset"); + GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs( + kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr); + PointerVector<WERD_RES> words; + ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words); + // Each is an individual word, with permuter = top choice. + EXPECT_EQ(7, words.size()); + for (int w = 0; w < words.size(); ++w) { + EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter()); + } + // Now try again with the dictionary. + LoadDict("zh_hans"); + ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words); + // Number of words expected. + const int kNumWords = 5; + // Content of the words. + const char* kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"}; + // Permuters of the words. + const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, + TOP_CHOICE_PERM, TOP_CHOICE_PERM, + SYSTEM_DAWG_PERM}; + EXPECT_EQ(kNumWords, words.size()); + for (int w = 0; w < kNumWords && w < words.size(); ++w) { + EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str()); + EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter()); + } +} + +// Tests that a recoder built with decomposed unicode allows true ctc +// arbitrary duplicates and inserted nulls inside the multicode sequence. +TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) { + LOG(INFO) << "Testing duplicates in multi-code sequences" << "\n"; + LoadUnicharset("vie.d.unicharset"); + tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset); + TRand random; + GENERIC_2D_ARRAY<float> outputs = GenerateSyntheticOutputs( + kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random); + PointerVector<WERD_RES> words; + std::string truth_str; + tesseract::NormalizeUTF8String( + tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, + tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str); + ExpectCorrect(outputs, truth_str, nullptr, &words); +} + +} // namespace diff --git a/tesseract/unittest/rect_test.cc b/tesseract/unittest/rect_test.cc new file mode 100644 index 00000000..5d9d439f --- /dev/null +++ b/tesseract/unittest/rect_test.cc @@ -0,0 +1,176 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "rect.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TBOXTest : public testing::Test { + public: + void SetUp() { + std::locale::global(std::locale("")); + } + + void TearDown() {} +}; + +TEST_F(TBOXTest, OverlapInside) { + TBOX a(10, 10, 20, 20); + TBOX b(11, 11, 12, 12); + + EXPECT_TRUE(a.overlap(b)); + EXPECT_TRUE(b.overlap(a)); + EXPECT_DOUBLE_EQ(0.01, a.overlap_fraction(b)); + EXPECT_DOUBLE_EQ(1.0, b.overlap_fraction(a)); +} + +TEST_F(TBOXTest, OverlapBoolCorners) { + TBOX mid(10, 10, 30, 30); + TBOX bottom_left(5, 5, 15, 15); + TBOX top_left(5, 25, 15, 35); + // other corners covered by symmetry + + EXPECT_TRUE(mid.overlap(bottom_left)); + EXPECT_TRUE(bottom_left.overlap(mid)); + EXPECT_TRUE(mid.overlap(top_left)); + EXPECT_TRUE(top_left.overlap(mid)); +} + +TEST_F(TBOXTest, OverlapFractionCorners) { + TBOX mid(10, 10, 30, 30); + TBOX bottom_left(5, 5, 15, 15); + TBOX top_left(5, 25, 15, 35); + // other corners covered by symmetry + + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), + mid.overlap_fraction(bottom_left)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), + bottom_left.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (20.0 * 20.0), mid.overlap_fraction(top_left)); + EXPECT_DOUBLE_EQ((5.0 * 5.0) / (10.0 * 10.0), top_left.overlap_fraction(mid)); +} + +TEST_F(TBOXTest, OverlapBoolSides) { + TBOX mid(10, 10, 30, 30); + TBOX left(5, 15, 15, 25); + TBOX bottom(15, 5, 25, 15); + // other sides covered by symmetry + + EXPECT_TRUE(mid.overlap(left)); + EXPECT_TRUE(left.overlap(mid)); + EXPECT_TRUE(mid.overlap(bottom)); + EXPECT_TRUE(bottom.overlap(mid)); +} + +TEST_F(TBOXTest, OverlapFractionSides) { + TBOX mid(10, 10, 30, 30); + TBOX left(5, 15, 15, 25); + TBOX bottom(15, 5, 25, 15); + // other sides covered by symmetry + + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(left)); + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), left.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (20.0 * 20.0), mid.overlap_fraction(bottom)); + EXPECT_DOUBLE_EQ((5.0 * 10.0) / (10.0 * 10.0), bottom.overlap_fraction(mid)); +} + +TEST_F(TBOXTest, OverlapBoolSpan) { + TBOX mid(10, 10, 30, 30); + TBOX vertical(15, 5, 25, 35); + TBOX horizontal(5, 15, 35, 25); + // other sides covered by symmetry in other test cases + + EXPECT_TRUE(mid.overlap(vertical)); + EXPECT_TRUE(vertical.overlap(mid)); + EXPECT_TRUE(mid.overlap(horizontal)); + EXPECT_TRUE(horizontal.overlap(mid)); +} + +TEST_F(TBOXTest, OverlapFractionSpan) { + TBOX mid(10, 10, 30, 30); + TBOX vertical(15, 5, 25, 35); + TBOX horizontal(5, 15, 35, 25); + // other sides covered by symmetry in other test cases + + EXPECT_DOUBLE_EQ((10.0 * 20.0) / (20.0 * 20.0), + mid.overlap_fraction(vertical)); + EXPECT_DOUBLE_EQ((10.0 * 20.0) / (10.0 * 30.0), + vertical.overlap_fraction(mid)); + EXPECT_DOUBLE_EQ((20.0 * 10.0) / (20.0 * 20.0), + mid.overlap_fraction(horizontal)); + EXPECT_DOUBLE_EQ((20.0 * 10.0) / (30.0 * 10.0), + horizontal.overlap_fraction(mid)); +} + +// TODO(nbeato): pretty much all cases +TEST_F(TBOXTest, OverlapOutsideTests) { + TBOX mid(10, 10, 30, 30); + TBOX left(0, 
15, 5, 25); + + EXPECT_FALSE(mid.overlap(left)); + EXPECT_FALSE(left.overlap(mid)); + EXPECT_DOUBLE_EQ(0.0, mid.overlap_fraction(left)); + EXPECT_DOUBLE_EQ(0.0, left.overlap_fraction(mid)); +} + +TEST_F(TBOXTest, OverlapXFraction) { + TBOX a(10, 10, 20, 20); + TBOX b(12, 100, 26, 200); + TBOX c(0, 0, 100, 100); + TBOX d(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(8.0 / 10.0, a.x_overlap_fraction(b)); + EXPECT_DOUBLE_EQ(8.0 / 14.0, b.x_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(1.0, a.x_overlap_fraction(c)); + EXPECT_DOUBLE_EQ(10.0 / 100.0, c.x_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(0.0, a.x_overlap_fraction(d)); + EXPECT_DOUBLE_EQ(0.0, d.x_overlap_fraction(a)); +} + +TEST_F(TBOXTest, OverlapYFraction) { + TBOX a(10, 10, 20, 20); + TBOX b(100, 12, 200, 26); + TBOX c(0, 0, 100, 100); + TBOX d(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(8.0 / 10.0, a.y_overlap_fraction(b)); + EXPECT_DOUBLE_EQ(8.0 / 14.0, b.y_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(1.0, a.y_overlap_fraction(c)); + EXPECT_DOUBLE_EQ(10.0 / 100.0, c.y_overlap_fraction(a)); + EXPECT_DOUBLE_EQ(0.0, a.y_overlap_fraction(d)); + EXPECT_DOUBLE_EQ(0.0, d.y_overlap_fraction(a)); +} + +TEST_F(TBOXTest, OverlapXFractionZeroSize) { + TBOX zero(10, 10, 10, 10); + TBOX big(0, 0, 100, 100); + TBOX small(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(1.0, zero.x_overlap_fraction(big)); + EXPECT_DOUBLE_EQ(0.0, big.x_overlap_fraction(zero)); + EXPECT_DOUBLE_EQ(0.0, zero.x_overlap_fraction(small)); + EXPECT_DOUBLE_EQ(0.0, small.x_overlap_fraction(zero)); +} + +TEST_F(TBOXTest, OverlapYFractionZeroSize) { + TBOX zero(10, 10, 10, 10); + TBOX big(0, 0, 100, 100); + TBOX small(0, 0, 1, 1); + + EXPECT_DOUBLE_EQ(1.0, zero.y_overlap_fraction(big)); + EXPECT_DOUBLE_EQ(0.0, big.y_overlap_fraction(zero)); + EXPECT_DOUBLE_EQ(0.0, zero.y_overlap_fraction(small)); + EXPECT_DOUBLE_EQ(0.0, small.y_overlap_fraction(zero)); +} + +} // namespace diff --git a/tesseract/unittest/resultiterator_test.cc b/tesseract/unittest/resultiterator_test.cc new file mode 100644 index 00000000..50e18949 --- /dev/null +++ b/tesseract/unittest/resultiterator_test.cc @@ -0,0 +1,612 @@ + +#include <tesseract/resultiterator.h> +#include <string> +#include "allheaders.h" +#include <tesseract/baseapi.h> +#include "genericvector.h" +#include "scrollview.h" + +#include "include_gunit.h" +#include "log.h" // for LOG +#include "absl/strings/str_format.h" // for absl::StrFormat + +namespace tesseract { + +// DEFINE_string(tess_config, "", "config file for tesseract"); +// DEFINE_bool(visual_test, false, "Runs a visual test using scrollview"); + +// Helper functions for converting to STL vectors +template <typename T> +void ToVector(const GenericVector<T>& from, std::vector<T>* to) { + to->clear(); + for (int i = 0; i < from.size(); i++) to->push_back(from[i]); +} + +template <typename T> +void ToVector(const std::vector<T>& from, std::vector<T>* to) { + to->clear(); + for (int i = 0; i < from.size(); i++) to->push_back(from[i]); +} + +// The fixture for testing Tesseract. 
+class ResultIteratorTest : public testing::Test { + protected: + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR , name); + } + std::string TessdataPath() { + return file::JoinPath(TESSDATA_DIR, ""); + } + std::string OutputNameToPath(const std::string& name) { + file::MakeTmpdir(); + return file::JoinPath(FLAGS_test_tmpdir, name); + } + + ResultIteratorTest() { src_pix_ = nullptr; } + ~ResultIteratorTest() {} + + void SetImage(const char* filename) { + src_pix_ = pixRead(TestDataNameToPath(filename).c_str()); + api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY); +// if (!FLAGS_tess_config.empty()) +// api_.ReadConfigFile(FLAGS_tess_config.c_str()); + api_.SetPageSegMode(tesseract::PSM_AUTO); + api_.SetImage(src_pix_); + pixDestroy(&src_pix_); + src_pix_ = api_.GetInputImage(); + } + + // Rebuilds the image using the binary images at the given level, and + // EXPECTs that the number of pixels in the xor of the rebuilt image with + // the original is at most max_diff. + void VerifyRebuild(int max_diff, PageIteratorLevel level, PageIterator* it) { + it->Begin(); + int width = pixGetWidth(src_pix_); + int height = pixGetHeight(src_pix_); + int depth = pixGetDepth(src_pix_); + Pix* pix = pixCreate(width, height, depth); + EXPECT_TRUE(depth == 1 || depth == 8); + if (depth == 8) pixSetAll(pix); + do { + int left, top, right, bottom; + PageIteratorLevel im_level = level; + // If the return is false, it is a non-text block so get the block image. + if (!it->BoundingBox(level, &left, &top, &right, &bottom)) { + im_level = tesseract::RIL_BLOCK; + EXPECT_TRUE(it->BoundingBox(im_level, &left, &top, &right, &bottom)); + } + LOG(INFO) << "BBox: [L:" << left << ", T:" << top << ", R:" << right + << ", B:" << bottom << "]" << "\n"; + Pix* block_pix; + if (depth == 1) { + block_pix = it->GetBinaryImage(im_level); + pixRasterop(pix, left, top, right - left, bottom - top, + PIX_SRC ^ PIX_DST, block_pix, 0, 0); + } else { + block_pix = it->GetImage(im_level, 2, src_pix_, &left, &top); + pixRasterop(pix, left, top, pixGetWidth(block_pix), + pixGetHeight(block_pix), PIX_SRC & PIX_DST, block_pix, 0, + 0); + } + CHECK(block_pix != nullptr); + pixDestroy(&block_pix); + } while (it->Next(level)); +// if (base::GetFlag(FLAGS_v) >= 1) +// pixWrite(OutputNameToPath("rebuilt.png").c_str(), pix, IFF_PNG); + pixRasterop(pix, 0, 0, width, height, PIX_SRC ^ PIX_DST, src_pix_, 0, 0); + if (depth == 8) { + Pix* binary_pix = pixThresholdToBinary(pix, 128); + pixDestroy(&pix); + pixInvert(binary_pix, binary_pix); + pix = binary_pix; + } +// if (base::GetFlag(FLAGS_v) >= 1) +// pixWrite(OutputNameToPath("rebuiltxor.png").c_str(), pix, IFF_PNG); + l_int32 pixcount; + pixCountPixels(pix, &pixcount, nullptr); + if (pixcount > max_diff) { + std::string outfile = OutputNameToPath("failedxor.png"); + LOG(INFO) << "outfile = " << outfile << "\n"; + pixWrite(outfile.c_str(), pix, IFF_PNG); + } + pixDestroy(&pix); + LOG(INFO) << absl::StrFormat("At level %d: pix diff = %d\n", level, pixcount); + EXPECT_LE(pixcount, max_diff); +// if (base::GetFlag(FLAGS_v) > 1) CHECK_LE(pixcount, max_diff); + } + + // Rebuilds the text from the iterator strings at the given level, and + // EXPECTs that the rebuild string exactly matches the truth string. 
+ void VerifyIteratorText(const std::string& truth, PageIteratorLevel level, + ResultIterator* it) { + LOG(INFO) << "Text Test Level " << level << "\n"; + it->Begin(); + std::string result; + do { + char* text = it->GetUTF8Text(level); + result += text; + delete[] text; + if ((level == tesseract::RIL_WORD || level == tesseract::RIL_SYMBOL) && + it->IsAtFinalElement(tesseract::RIL_WORD, level)) { + if (it->IsAtFinalElement(tesseract::RIL_TEXTLINE, level)) { + result += '\n'; + } else { + result += ' '; + } + if (it->IsAtFinalElement(tesseract::RIL_PARA, level) && + !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) + result += '\n'; + } + } while (it->Next(level)); + EXPECT_STREQ(truth.c_str(), result.c_str()) + << "Rebuild failed at Text Level " << level; + } + + void VerifyRebuilds(int block_limit, int para_limit, int line_limit, + int word_limit, int symbol_limit, PageIterator* it) { + VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it); + VerifyRebuild(para_limit, tesseract::RIL_PARA, it); + VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it); + VerifyRebuild(word_limit, tesseract::RIL_WORD, it); + VerifyRebuild(symbol_limit, tesseract::RIL_SYMBOL, it); + } + + void VerifyAllText(const std::string& truth, ResultIterator* it) { + VerifyIteratorText(truth, tesseract::RIL_BLOCK, it); + VerifyIteratorText(truth, tesseract::RIL_PARA, it); + VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it); + VerifyIteratorText(truth, tesseract::RIL_WORD, it); + VerifyIteratorText(truth, tesseract::RIL_SYMBOL, it); + } + + // Verifies that ResultIterator::CalculateTextlineOrder() produces the right + // results given an array of word directions (word_dirs[num_words]), an + // expected output reading order + // (expected_reading_order[num_reading_order_entries]) and a given reading + // context (ltr or rtl). + void ExpectTextlineReadingOrder(bool in_ltr_context, + const StrongScriptDirection* word_dirs, + int num_words, int* expected_reading_order, + int num_reading_order_entries) const { + std::vector<StrongScriptDirection> gv_word_dirs; + for (int i = 0; i < num_words; i++) { + gv_word_dirs.push_back(word_dirs[i]); + } + + std::vector<int> output; + ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, + &output); + // STL vector can be used with EXPECT_EQ, so convert... + std::vector<int> correct_order( + expected_reading_order, + expected_reading_order + num_reading_order_entries); + std::vector<int> calculated_order; + ToVector(output, &calculated_order); + EXPECT_EQ(correct_order, calculated_order); + } + + // Verify that ResultIterator::CalculateTextlineOrder() produces sane output + // for a given array of word_dirs[num_words] in ltr or rtl context. + // Sane means that the output contains some permutation of the indices + // 0..[num_words - 1] interspersed optionally with negative (marker) values. 
+ void VerifySaneTextlineOrder(bool in_ltr_context, + const StrongScriptDirection* word_dirs, + int num_words) const { + std::vector<StrongScriptDirection> gv_word_dirs; + for (int i = 0; i < num_words; i++) { + gv_word_dirs.push_back(word_dirs[i]); + } + + std::vector<int> output; + ResultIterator::CalculateTextlineOrder(in_ltr_context, gv_word_dirs, + &output); + ASSERT_GE(output.size(), num_words); + std::vector<int> output_copy(output); + std::sort(output_copy.begin(), output_copy.end()); + bool sane = true; + int j = 0; + while (j < output_copy.size() && output_copy[j] < 0) j++; + for (int i = 0; i < num_words; i++, j++) { + if (output_copy[j] != i) { + sane = false; + break; + } + } + if (j != output_copy.size()) { + sane = false; + } + if (!sane) { + std::vector<int> output_copy2, empty; + ToVector(output, &output_copy2); + EXPECT_EQ(output_copy2, empty) + << " permutation of 0.." << num_words - 1 << " not found in " + << (in_ltr_context ? "ltr" : "rtl") << " context."; + } + } + + // Objects declared here can be used by all tests in the test case for Foo. + Pix* src_pix_; // Borrowed from api_. Do not destroy. + std::string ocr_text_; + tesseract::TessBaseAPI api_; +}; + +// Tests layout analysis output (and scrollview) on the UNLV page numbered +// 8087_054.3G.tif. (Dubrovnik), but only if --visual_test is true. +// +//TEST_F(ResultIteratorTest, VisualTest) { +// if (!FLAGS_visual_test) return; +// const char* kIms[] = {"8087_054.3G.tif", "8071_093.3B.tif", nullptr}; +// for (int i = 0; kIms[i] != nullptr; ++i) { +// SetImage(kIms[i]); +// // Just run layout analysis. +// PageIterator* it = api_.AnalyseLayout(); +// EXPECT_FALSE(it == nullptr); +// // Make a scrollview window for the display. +// int width = pixGetWidth(src_pix_); +// int height = pixGetHeight(src_pix_); +// ScrollView* win = +// new ScrollView(kIms[i], 100, 100, width / 2, height / 2, width, height); +// win->Image(src_pix_, 0, 0); +// it->Begin(); +// ScrollView::Color color = ScrollView::RED; +// win->Brush(ScrollView::NONE); +// do { +// Pta* pts = it->BlockPolygon(); +// if (pts != nullptr) { +// win->Pen(color); +// int num_pts = ptaGetCount(pts); +// l_float32 x, y; +// ptaGetPt(pts, num_pts - 1, &x, &y); +// win->SetCursor(static_cast<int>(x), static_cast<int>(y)); +// for (int p = 0; p < num_pts; ++p) { +// ptaGetPt(pts, p, &x, &y); +// win->DrawTo(static_cast<int>(x), static_cast<int>(y)); +// } +// } +// ptaDestroy(&pts); +// } while (it->Next(tesseract::RIL_BLOCK)); +// win->Update(); +// delete win->AwaitEvent(SVET_DESTROY); +// delete win; +// delete it; +// } +//} + +// Tests that Tesseract gets exactly the right answer on phototest. +TEST_F(ResultIteratorTest, EasyTest) { + SetImage("phototest.tif"); + // Just run layout analysis. + PageIterator* p_it = api_.AnalyseLayout(); + EXPECT_FALSE(p_it == nullptr); + // Check iterator position. + EXPECT_TRUE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK)); + // This should be a single block. + EXPECT_FALSE(p_it->Next(tesseract::RIL_BLOCK)); + EXPECT_FALSE(p_it->IsAtBeginningOf(tesseract::RIL_BLOCK)); + + // The images should rebuild almost perfectly. + LOG(INFO) << "Verifying image rebuilds 1 (pageiterator)" << "\n"; + VerifyRebuilds(10, 10, 0, 0, 0, p_it); + delete p_it; + + char* result = api_.GetUTF8Text(); + ocr_text_ = result; + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // The images should rebuild almost perfectly. 
+ LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" << "\n"; + VerifyRebuilds(8, 8, 0, 0, 40, r_it); + // Test the text. + LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" << "\n"; + VerifyAllText(ocr_text_, r_it); + + // The images should rebuild almost perfectly. + LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" << "\n"; + VerifyRebuilds(8, 8, 0, 0, 40, r_it); + + r_it->Begin(); + // Test baseline of the first line. + int x1, y1, x2, y2; + r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2); + LOG(INFO) << absl::StrFormat("Baseline (%d,%d)->(%d,%d)", x1, y1, x2, y2) << "\n"; + // Make sure we have a decent vector. + EXPECT_GE(x2, x1 + 400); + // The point 200,116 should be very close to the baseline. + // (x3,y3) is the vector from (x1,y1) to (200,116) + int x3 = 200 - x1; + int y3 = 116 - y1; + x2 -= x1; + y2 -= y1; + // The cross product (x2,y1)x(x3,y3) should be small. + int product = x2 * y3 - x3 * y2; + EXPECT_LE(abs(product), x2); + + // Test font attributes for each word. + do { + bool bold, italic, underlined, monospace, serif, smallcaps; + int pointsize, font_id; + const char* font = + r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, + &serif, &smallcaps, &pointsize, &font_id); + float confidence = r_it->Confidence(tesseract::RIL_WORD); + EXPECT_GE(confidence, 80.0f); + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + LOG(INFO) << absl::StrFormat("Word %s in font %s, id %d, size %d, conf %g", + word_str, font, font_id, pointsize, confidence) << "\n"; + delete[] word_str; + EXPECT_FALSE(bold); + EXPECT_FALSE(italic); + EXPECT_FALSE(underlined); + EXPECT_FALSE(monospace); + EXPECT_FALSE(serif); + // The text is about 31 pixels high. Above we say the source is 200 ppi, + // which translates to: + // 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts + EXPECT_GE(pointsize, 11.16 - 1.50); + EXPECT_LE(pointsize, 11.16 + 1.50); + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; +} + +// Tests image rebuild on the UNLV page numbered 8087_054.3B.tif. (Dubrovnik) +TEST_F(ResultIteratorTest, ComplexTest) { + SetImage("8087_054.3B.tif"); + // Just run layout analysis. + PageIterator* it = api_.AnalyseLayout(); + EXPECT_FALSE(it == nullptr); + // The images should rebuild almost perfectly. + VerifyRebuilds(2073, 2073, 2080, 2081, 2090, it); + delete it; +} + +// Tests image rebuild on the UNLV page numbered 8087_054.3G.tif. (Dubrovnik) +TEST_F(ResultIteratorTest, GreyTest) { + SetImage("8087_054.3G.tif"); + // Just run layout analysis. + PageIterator* it = api_.AnalyseLayout(); + EXPECT_FALSE(it == nullptr); + // The images should rebuild almost perfectly. + VerifyRebuilds(600, 600, 600, 600, 600, it); + delete it; +} + +// Tests that Tesseract gets smallcaps and dropcaps. +TEST_F(ResultIteratorTest, SmallCapDropCapTest) { + SetImage("8071_093.3B.tif"); + char* result = api_.GetUTF8Text(); + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the words. + int found_dropcaps = 0; + int found_smallcaps = 0; + int false_positives = 0; + do { + bool bold, italic, underlined, monospace, serif, smallcaps; + int pointsize, font_id; + r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, + &smallcaps, &pointsize, &font_id); + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + if (word_str != nullptr) { + LOG(INFO) << absl::StrFormat("Word %s is %s", word_str, + smallcaps ? 
"SMALLCAPS" : "Normal") << "\n"; + if (r_it->SymbolIsDropcap()) { + ++found_dropcaps; + } + if (strcmp(word_str, "SHE") == 0 || strcmp(word_str, "MOPED") == 0 || + strcmp(word_str, "RALPH") == 0 || + strcmp(word_str, "KINNEY") == 0 || // Not working yet. + strcmp(word_str, "BENNETT") == 0) { + EXPECT_TRUE(smallcaps) << word_str; + ++found_smallcaps; + } else { + if (smallcaps) ++false_positives; + } + // No symbol other than the first of any word should be dropcap. + ResultIterator s_it(*r_it); + while (s_it.Next(tesseract::RIL_SYMBOL) && + !s_it.IsAtBeginningOf(tesseract::RIL_WORD)) { + if (s_it.SymbolIsDropcap()) { + char* sym_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); + LOG(ERROR) << absl::StrFormat("Symbol %s of word %s is dropcap", sym_str, + word_str); + delete[] sym_str; + } + EXPECT_FALSE(s_it.SymbolIsDropcap()); + } + delete[] word_str; + } + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; + EXPECT_EQ(1, found_dropcaps); + EXPECT_GE(4, found_smallcaps); + EXPECT_LE(false_positives, 3); +} + +#if 0 +// TODO(rays) uncomment on the next change to layout analysis. +// CL 22736106 breaks it, but it is fixed in the change when +// the textline finders start to collapse. + +// Tests that Tesseract gets subscript and superscript. +// TODO(rays) This test is a bit feeble, due to bad textline finding on this +// image, so beef up the test a bit when we get less false positive subs. +TEST_F(ResultIteratorTest, SubSuperTest) { + SetImage("0146_281.3B.tif"); + char* result = api_.GetUTF8Text(); + delete [] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the symbols. + // Accuracy isn't great, so just count up and expect a decent count of + // positives and negatives. + const char kAllowedSupers[] = "O0123456789-"; + int found_subs = 0; + int found_supers = 0; + int found_normal = 0; + do { + if (r_it->SymbolIsSubscript()) { + ++found_subs; + } else if (r_it->SymbolIsSuperscript()) { + result = r_it->GetUTF8Text(tesseract::RIL_SYMBOL); + if (strchr(kAllowedSupers, result[0]) == nullptr) { + char* word = r_it->GetUTF8Text(tesseract::RIL_WORD); + LOG(ERROR) << absl::StrFormat("Char %s in word %s is unexpected super!", + result, word); + delete [] word; + EXPECT_TRUE(strchr(kAllowedSupers, result[0]) != nullptr); + } + delete [] result; + ++found_supers; + } else { + ++found_normal; + } + } while (r_it->Next(tesseract::RIL_SYMBOL)); + delete r_it; + LOG(INFO) << absl::StrFormat("Subs = %d, supers= %d, normal = %d", + found_subs, found_supers, found_normal) << "\n"; + EXPECT_GE(found_subs, 25); + EXPECT_GE(found_supers, 25); + EXPECT_GE(found_normal, 1350); +} +#endif + +static const StrongScriptDirection dL = DIR_LEFT_TO_RIGHT; +static const StrongScriptDirection dR = DIR_RIGHT_TO_LEFT; +static const StrongScriptDirection dN = DIR_NEUTRAL; + +// Test that a sequence of words that could be interpreted to start from +// the left side left-to-right or from the right side right-to-left is +// interpreted appropriately in different contexts. 
+TEST_F(ResultIteratorTest, DualStartTextlineOrderTest) { + const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dR, dR, dR}; + int reading_order_rtl_context[] = {7, 6, 5, 4, ResultIterator::kMinorRunStart, + 0, 1, 2, 3, ResultIterator::kMinorRunEnd}; + int reading_order_ltr_context[] = {0, 1, + 2, 3, + 4, ResultIterator::kMinorRunStart, + 7, 6, + 5, ResultIterator::kMinorRunEnd}; + + ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_ltr_context, + ABSL_ARRAYSIZE(reading_order_ltr_context)); + ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_rtl_context, + ABSL_ARRAYSIZE(reading_order_rtl_context)); +} + +// Tests that clearly left-direction text (with no right-to-left indications) +// comes out strictly left to right no matter the context. +TEST_F(ResultIteratorTest, LeftwardTextlineOrderTest) { + const StrongScriptDirection word_dirs[] = {dL, dL, dN, dL, dN, dN, dL, dL}; + // The order here is just left to right, nothing fancy. + int reading_order_ltr_context[] = {0, 1, 2, 3, 4, 5, 6, 7}; + // In the strange event that this shows up in an RTL paragraph, nonetheless + // just presume the whole thing is an LTR line. + int reading_order_rtl_context[] = { + ResultIterator::kMinorRunStart, 0, 1, 2, 3, 4, 5, 6, 7, + ResultIterator::kMinorRunEnd}; + + ExpectTextlineReadingOrder(true, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_ltr_context, + ABSL_ARRAYSIZE(reading_order_ltr_context)); + ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_rtl_context, + ABSL_ARRAYSIZE(reading_order_rtl_context)); +} + +// Test that right-direction text comes out strictly right-to-left in +// a right-to-left context. +TEST_F(ResultIteratorTest, RightwardTextlineOrderTest) { + const StrongScriptDirection word_dirs[] = {dR, dR, dN, dR, dN, dN, dR, dR}; + // The order here is just right-to-left, nothing fancy. + int reading_order_rtl_context[] = {7, 6, 5, 4, 3, 2, 1, 0}; + ExpectTextlineReadingOrder(false, word_dirs, ABSL_ARRAYSIZE(word_dirs), + reading_order_rtl_context, + ABSL_ARRAYSIZE(reading_order_rtl_context)); +} + +TEST_F(ResultIteratorTest, TextlineOrderSanityCheck) { + // Iterate through all 7-word sequences and make sure that the output + // contains each of the indices 0..6 exactly once. + const int kNumWords(7); + const int kNumCombos = 1 << (2 * kNumWords); // 4 ^ 7 combinations + StrongScriptDirection word_dirs[kNumWords]; + for (int i = 0; i < kNumCombos; i++) { + // generate the next combination. + int tmp = i; + for (int j = 0; j < kNumWords; j++) { + word_dirs[j] = static_cast<StrongScriptDirection>(tmp % 4); + tmp = tmp / 4; + } + VerifySaneTextlineOrder(true, word_dirs, kNumWords); + VerifySaneTextlineOrder(false, word_dirs, kNumWords); + } +} + +// TODO: Missing image +TEST_F(ResultIteratorTest, DISABLED_NonNullChoicesTest) { + SetImage("5318c4b679264.jpg"); + char* result = api_.GetUTF8Text(); + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the words. 
+ do { + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + if (word_str != nullptr) { + LOG(INFO) << absl::StrFormat("Word %s:", word_str) << "\n"; + ResultIterator s_it = *r_it; + do { + tesseract::ChoiceIterator c_it(s_it); + do { + const char* char_str = c_it.GetUTF8Text(); + if (char_str == nullptr) + LOG(INFO) << "Null char choice" << "\n"; + else + LOG(INFO) << "Char choice " << char_str << "\n"; + CHECK(char_str != nullptr); + } while (c_it.Next()); + } while ( + !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && + s_it.Next(tesseract::RIL_SYMBOL)); + delete[] word_str; + } + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; +} + +// TODO: Missing image +TEST_F(ResultIteratorTest, NonNullConfidencesTest) { +// SetImage("line6.tiff"); + SetImage("trainingitalline.tif"); + api_.SetPageSegMode(tesseract::PSM_SINGLE_BLOCK); + // Force recognition so we can used the result iterator. + // We don't care about the return from GetUTF8Text. + char* result = api_.GetUTF8Text(); + delete[] result; + ResultIterator* r_it = api_.GetIterator(); + // Iterate over the words. + do { + char* word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); + if (word_str != nullptr) { + EXPECT_FALSE(r_it->Empty(tesseract::RIL_WORD)); + EXPECT_FALSE(r_it->Empty(tesseract::RIL_SYMBOL)); + ResultIterator s_it = *r_it; + do { + const char* char_str = s_it.GetUTF8Text(tesseract::RIL_SYMBOL); + CHECK(char_str != nullptr); + float confidence = s_it.Confidence(tesseract::RIL_SYMBOL); + LOG(INFO) << absl::StrFormat("Char %s has confidence %g\n", char_str, + confidence); + delete[] char_str; + } while ( + !s_it.IsAtFinalElement(tesseract::RIL_WORD, tesseract::RIL_SYMBOL) && + s_it.Next(tesseract::RIL_SYMBOL)); + delete[] word_str; + } else { + LOG(INFO) << "Empty word found" << "\n"; + } + } while (r_it->Next(tesseract::RIL_WORD)); + delete r_it; +} + +} // namespace diff --git a/tesseract/unittest/scanutils_test.cc b/tesseract/unittest/scanutils_test.cc new file mode 100644 index 00000000..e6917fce --- /dev/null +++ b/tesseract/unittest/scanutils_test.cc @@ -0,0 +1,114 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <iostream> // for cout + +#include "include_gunit.h" +#include "scanutils.h" + +namespace tesseract { + +class ScanutilsTest : public ::testing::Test { + protected: + void SetUp() override { + } +}; + +TEST_F(ScanutilsTest, DoesScanf) { + // This test verifies that tfscanf does Scanf the same as stdio fscanf. + // There are probably a gazillion more test cases that could be added, but + // these brought the tesseract and unittest test results in line. 
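+  // tfscanf() comes from scanutils.h (included above) and is used here as a
+  // drop-in stand-in for stdio fscanf: the same file is opened twice and each
+  // format string is parsed with both functions, which must agree on return
+  // counts and on the parsed values.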
+ std::string filename = file::JoinPath(TESTDATA_DIR, "scanftest.txt"); + FILE* fp1 = fopen(filename.c_str(), "r"); + if (fp1 == nullptr) { + std::cout << "Failed to open file " << filename << '\n'; + GTEST_SKIP(); + } + FILE* fp2 = fopen(filename.c_str(), "r"); + if (fp2 == nullptr) { + std::cout << "Failed to open file " << filename << '\n'; + fclose(fp1); + GTEST_SKIP(); + } + // The file contains this: + // 42.5 17 0.001000 -0.001000 + // 0 1 123 -123 0x100 + // abcdefghijklmnopqrstuvwxyz + // abcdefghijklmnopqrstuvwxyz + // MF 25 6.25e-2 0.5e5 -1e+4 + // 42 MF 25 6.25e-2 0.5 + // 24 + const int kNumFloats = 4; + float f1[kNumFloats], f2[kNumFloats]; + int r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]); + int r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]); + EXPECT_EQ(r1, kNumFloats); + EXPECT_EQ(r2, kNumFloats); + if (r1 == r2) { + for (int i = 0; i < r1; ++i) { + EXPECT_FLOAT_EQ(f1[i], f2[i]); + } + } + const int kNumInts = 5; + int i1[kNumInts], i2[kNumInts]; + r1 = fscanf(fp1, "%d %d %d %d %i", &i1[0], &i1[1], &i1[2], &i1[3], &i1[4]); + r2 = tfscanf(fp2, "%d %d %d %d %i", &i2[0], &i2[1], &i2[2], &i2[3], &i2[4]); + EXPECT_EQ(r1, kNumInts); + EXPECT_EQ(r2, kNumInts); + if (r1 == r2) { + for (int i = 0; i < kNumInts; ++i) { + EXPECT_EQ(i1[i], i2[i]); + } + } + const int kStrLen = 1024; + char s1[kStrLen]; + char s2[kStrLen]; + r1 = fscanf(fp1, "%1023s", s1); + r2 = tfscanf(fp2, "%1023s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(26, strlen(s2)); + r1 = fscanf(fp1, "%20s", s1); + r2 = tfscanf(fp2, "%20s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(20, strlen(s2)); + // Now read the rest of the alphabet. + r1 = fscanf(fp1, "%1023s", s1); + r2 = tfscanf(fp2, "%1023s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(6, strlen(s2)); + r1 = fscanf(fp1, "%1023s", s1); + r2 = tfscanf(fp2, "%1023s", s2); + EXPECT_EQ(r1, r2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(2, strlen(s2)); + r1 = fscanf(fp1, "%f %f %f %f", &f1[0], &f1[1], &f1[2], &f1[3]); + r2 = tfscanf(fp2, "%f %f %f %f", &f2[0], &f2[1], &f2[2], &f2[3]); + EXPECT_EQ(r1, r2); + for (int i = 0; i < kNumFloats; ++i) EXPECT_FLOAT_EQ(f1[i], f2[i]); + // Test the * for field suppression. + r1 = fscanf(fp1, "%d %*s %*d %*f %*f", &i1[0]); + r2 = tfscanf(fp2, "%d %*s %*d %*f %*f", &i2[0]); + EXPECT_EQ(r1, r2); + EXPECT_EQ(i1[0], i2[0]); + // We should still see the next value and no phantoms. + r1 = fscanf(fp1, "%d %1023s", &i1[0], s1); + r2 = tfscanf(fp2, "%d %1023s", &i2[0], s2); + EXPECT_EQ(r1, r2); + EXPECT_EQ(1, r2); + EXPECT_EQ(i1[0], i2[0]); + fclose(fp2); + fclose(fp1); +} + +} // namespace diff --git a/tesseract/unittest/shapetable_test.cc b/tesseract/unittest/shapetable_test.cc new file mode 100644 index 00000000..285ed833 --- /dev/null +++ b/tesseract/unittest/shapetable_test.cc @@ -0,0 +1,182 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
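+
+// A minimal sketch of the Shape calls exercised below (illustrative only;
+// 3 and 101 are just the unichar id and font id these tests happen to use):
+//
+//   Shape shape;
+//   shape.AddToShape(3 /* unichar_id */, 101 /* font_id */);
+//   EXPECT_TRUE(shape.ContainsUnichar(3));
+//   EXPECT_TRUE(shape.ContainsUnicharAndFont(3, 101));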
+ +#include <string> +#include <utility> + +#include "absl/strings/str_format.h" // for absl::StrFormat + +#include "include_gunit.h" + +#include "serialis.h" +#include "shapetable.h" +#include "unicharset.h" + +namespace tesseract { + +#ifndef DISABLED_LEGACY_ENGINE + +static std::string TmpNameToPath(const std::string& name) { + return file::JoinPath(FLAGS_test_tmpdir, name); +} + +// Sets up a simple shape with some unichars. +static void Setup352(int font_id, Shape* shape) { + shape->AddToShape(3, font_id); + shape->AddToShape(5, font_id); + shape->AddToShape(2, font_id); +} + +// Verifies some properties of the 352 shape. +static void Expect352(int font_id, const Shape& shape) { + EXPECT_EQ(3, shape.size()); + EXPECT_TRUE(shape.ContainsUnichar(2)); + EXPECT_TRUE(shape.ContainsUnichar(3)); + EXPECT_TRUE(shape.ContainsUnichar(5)); + EXPECT_FALSE(shape.ContainsUnichar(1)); + EXPECT_TRUE(shape.ContainsUnicharAndFont(2, font_id)); + EXPECT_FALSE(shape.ContainsUnicharAndFont(2, font_id - 1)); + EXPECT_FALSE(shape.ContainsUnicharAndFont(font_id, 2)); + // It should be a subset of itself. + EXPECT_TRUE(shape.IsSubsetOf(shape)); +} + +#endif + +// The fixture for testing Shape. +class ShapeTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } +}; + +// Tests that a Shape works as expected for all the basic functions. +TEST_F(ShapeTest, BasicTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because Shape is missing. + GTEST_SKIP(); +#else + Shape shape1; + EXPECT_EQ(0, shape1.size()); + Setup352(101, &shape1); + Expect352(101, shape1); + // It should still work after file I/O. + std::string filename = TmpNameToPath("shapefile"); + FILE* fp = fopen(filename.c_str(), "wb"); + ASSERT_TRUE(fp != nullptr); + EXPECT_TRUE(shape1.Serialize(fp)); + fclose(fp); + TFile tfp; + EXPECT_TRUE(tfp.Open(filename.c_str(), nullptr)); + Shape shape2; + EXPECT_TRUE(shape2.DeSerialize(&tfp)); + Expect352(101, shape2); + // They should be subsets of each other. + EXPECT_TRUE(shape1.IsSubsetOf(shape2)); + EXPECT_TRUE(shape2.IsSubsetOf(shape1)); + // They should be equal unichars. + EXPECT_TRUE(shape1.IsEqualUnichars(&shape2)); + // and still pass afterwards. + Expect352(101, shape1); + Expect352(101, shape2); +#endif +} + +// Tests AddShape separately, as it takes quite a bit of work. +TEST_F(ShapeTest, AddShapeTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because Shape is missing. + GTEST_SKIP(); +#else + Shape shape1; + Setup352(101, &shape1); + Expect352(101, shape1); + // Now setup a different shape with different content. + Shape shape2; + shape2.AddToShape(3, 101); // Duplicates shape1. + shape2.AddToShape(5, 110); // Different font to shape1. + shape2.AddToShape(7, 101); // Different unichar to shape1. + // They should NOT be subsets of each other. + EXPECT_FALSE(shape1.IsSubsetOf(shape2)); + EXPECT_FALSE(shape2.IsSubsetOf(shape1)); + // Now add shape2 to shape1. + shape1.AddShape(shape2); + // Test subsets again. + EXPECT_FALSE(shape1.IsSubsetOf(shape2)); + EXPECT_TRUE(shape2.IsSubsetOf(shape1)); + EXPECT_EQ(4, shape1.size()); + EXPECT_FALSE(shape1.ContainsUnichar(1)); + EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 101)); + EXPECT_TRUE(shape1.ContainsUnicharAndFont(5, 110)); + EXPECT_FALSE(shape1.ContainsUnicharAndFont(3, 110)); + EXPECT_FALSE(shape1.ContainsUnicharAndFont(7, 110)); + EXPECT_FALSE(shape1.IsEqualUnichars(&shape2)); +#endif +} + +// The fixture for testing Shape. 
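+// (The table-level calls exercised by FullTest below, as a sketch mirroring
+// that test rather than a full description of the API:
+//   ShapeTable st(unicharset);
+//   int id = st.AddShape(unichar_id, font_id);   // one-entry shape
+//   st.MergeShapes(id_a, id_b);                  // folds id_b into id_a's master
+//   int master = st.MasterDestinationIndex(id_b);
+// )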
+class ShapeTableTest : public testing::Test {}; + +// Tests that a Shape works as expected for all the basic functions. +TEST_F(ShapeTableTest, FullTest) { +#ifdef DISABLED_LEGACY_ENGINE + // Skip test because Shape is missing. + GTEST_SKIP(); +#else + Shape shape1; + Setup352(101, &shape1); + // Build a shape table with the same data, but in separate shapes. + UNICHARSET unicharset; + unicharset.unichar_insert(" "); + for (int i = 1; i <= 10; ++i) { + std::string class_str = absl::StrFormat("class%d", i); + unicharset.unichar_insert(class_str.c_str()); + } + ShapeTable st(unicharset); + EXPECT_EQ(0, st.AddShape(3, 101)); + EXPECT_EQ(1, st.AddShape(5, 101)); + EXPECT_EQ(2, st.AddShape(2, 101)); + EXPECT_EQ(3, st.NumShapes()); + Expect352(101, shape1); + EXPECT_EQ(3, st.AddShape(shape1)); + for (int i = 0; i < 3; ++i) { + EXPECT_FALSE(st.MutableShape(i)->IsEqualUnichars(&shape1)); + } + EXPECT_TRUE(st.MutableShape(3)->IsEqualUnichars(&shape1)); + EXPECT_TRUE(st.AnyMultipleUnichars()); + st.DeleteShape(3); + EXPECT_FALSE(st.AnyMultipleUnichars()); + + // Now merge to make a single shape like shape1. + EXPECT_EQ(1, st.MasterUnicharCount(0)); + st.MergeShapes(0, 1); + EXPECT_EQ(3, st.MergedUnicharCount(1, 2)); + st.MergeShapes(1, 2); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(3, st.MasterUnicharCount(i)); + // Master font count is the sum of all the font counts in the shape, not + // the actual number of different fonts in the shape. + EXPECT_EQ(3, st.MasterFontCount(i)); + } + EXPECT_EQ(0, st.MasterDestinationIndex(1)); + EXPECT_EQ(0, st.MasterDestinationIndex(2)); + ShapeTable st2; + st2.AppendMasterShapes(st, nullptr); + EXPECT_EQ(1, st.NumMasterShapes()); + EXPECT_EQ(1, st2.NumShapes()); + EXPECT_TRUE(st2.MutableShape(0)->IsEqualUnichars(&shape1)); + EXPECT_TRUE(st2.AnyMultipleUnichars()); +#endif +} + +} // namespace diff --git a/tesseract/unittest/stats_test.cc b/tesseract/unittest/stats_test.cc new file mode 100644 index 00000000..58c3483d --- /dev/null +++ b/tesseract/unittest/stats_test.cc @@ -0,0 +1,59 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "genericvector.h" +#include "kdpair.h" +#include "statistc.h" + +#include "include_gunit.h" + +namespace tesseract { + +const int kTestData[] = {2, 0, 12, 1, 1, 2, 10, 1, 0, 0, 0, 2, 0, 4, 1, 1}; + +class STATSTest : public testing::Test { + public: + void SetUp() { + std::locale::global(std::locale("")); + stats_.set_range(0, 16); + for (size_t i = 0; i < ARRAYSIZE(kTestData); ++i) + stats_.add(i, kTestData[i]); + } + + void TearDown() {} + + STATS stats_; +}; + +// Tests some basic numbers from the stats_. +TEST_F(STATSTest, BasicStats) { + EXPECT_EQ(37, stats_.get_total()); + EXPECT_EQ(2, stats_.mode()); + EXPECT_EQ(12, stats_.pile_count(2)); +} + +// Tests the top_n_modes function. 
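+// (The expected modes can be reproduced by hand from kTestData; a sketch of
+// the arithmetic, not part of the API under test:
+//   mode 0 spans buckets 2..4 with counts 12, 1, 1 -> total 14,
+//     weighted mean = (2*12 + 3*1 + 4*1) / 14 = 31/14 = 2 + 3/14;
+//   mode 1 spans buckets 5..7 with counts 2, 10, 1 -> total 13,
+//     weighted mean = (5*2 + 6*10 + 7*1) / 13 = 77/13 = 5 + 12/13;
+//   mode 2 spans buckets 13..15 with counts 4, 1, 1 -> total 6,
+//     weighted mean = (13*4 + 14*1 + 15*1) / 6 = 81/6 = 13.5.)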
+TEST_F(STATSTest, TopNModes) { + GenericVector<tesseract::KDPairInc<float, int> > modes; + int num_modes = stats_.top_n_modes(3, &modes); + EXPECT_EQ(3, num_modes); + // Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14. + EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key()); + EXPECT_EQ(14, modes[0].data()); + // Mode 1 is 2 10 1 = 13 total count with a mean of 5 12/13. + EXPECT_FLOAT_EQ(5.0f + 12.0f / 13, modes[1].key()); + EXPECT_EQ(13, modes[1].data()); + // Mode 2 is 4 1 1 = 6 total count with a mean of 13.5. + EXPECT_FLOAT_EQ(13.5f, modes[2].key()); + EXPECT_EQ(6, modes[2].data()); +} + +} // namespace. diff --git a/tesseract/unittest/stridemap_test.cc b/tesseract/unittest/stridemap_test.cc new file mode 100644 index 00000000..fa1ef234 --- /dev/null +++ b/tesseract/unittest/stridemap_test.cc @@ -0,0 +1,219 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef INCLUDE_TENSORFLOW +#include <tensorflow/compiler/xla/array2d.h> // for xla::Array2D +#else +#include <array> // std::array +#endif +#include "include_gunit.h" +#include "stridemap.h" + +namespace tesseract { + +#if !defined(INCLUDE_TENSORFLOW) && 0 +namespace xla { + +template <typename T> +class Array2D : public std::vector<T> { + public: + Array2D() : std::vector<T>(std::vector<int64_t>{0, 0}) {} + + Array2D(const int64_t n1, const int64_t n2) + : std::vector<T>(std::vector<int64_t>{n1, n2}) {} + + Array2D(const int64_t n1, const int64_t n2, const T value) + : std::vector<T>({n1, n2}, value) {} +}; +} +#endif + +class StridemapTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + +#ifdef INCLUDE_TENSORFLOW + // Sets up an Array2d object of the given size, initialized to increasing + // values starting with start. + std::unique_ptr<xla::Array2D<int>> SetupArray(int ysize, int xsize, int start) { + std::unique_ptr<xla::Array2D<int>> a(new xla::Array2D<int>(ysize, xsize)); + int value = start; + for (int y = 0; y < ysize; ++y) { + for (int x = 0; x < xsize; ++x) { +#ifdef INCLUDE_TENSORFLOW + (*a)(y, x) = value++; +#else + a[y][x] = value++; +#endif + } + } + return a; + } +#endif +}; + +TEST_F(StridemapTest, Indexing) { + // This test verifies that with a batch of arrays of different sizes, the + // iteration index each of them in turn, without going out of bounds. 
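+  // (A rough sketch of the expectation: StrideMap::SetStride records one
+  // (height, width) pair per batch entry, and StrideMap::Index then walks
+  // every (batch, y, x) position in turn, so the forward loop below should
+  // visit the values 0, 1, 2, ... that SetupArray wrote, in exactly that
+  // order, and the backward loop should visit them in reverse.)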
+#ifdef INCLUDE_TENSORFLOW + std::vector<std::unique_ptr<xla::Array2D<int>>> arrays; + arrays.push_back(SetupArray(3, 4, 0)); + arrays.push_back(SetupArray(4, 5, 12)); + arrays.push_back(SetupArray(4, 4, 32)); + arrays.push_back(SetupArray(3, 5, 48)); + std::vector<std::pair<int, int>> h_w_sizes; + for (size_t i = 0; i < arrays.size(); ++i) { + h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width()); + } + StrideMap stride_map; + stride_map.SetStride(h_w_sizes); + StrideMap::Index index(stride_map); + int pos = 0; + do { + EXPECT_GE(index.t(), pos); + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + pos); + EXPECT_EQ(index.IsLast(FD_BATCH), + index.index(FD_BATCH) == arrays.size() - 1); + EXPECT_EQ( + index.IsLast(FD_HEIGHT), + index.index(FD_HEIGHT) == arrays[index.index(FD_BATCH)]->height() - 1); + EXPECT_EQ( + index.IsLast(FD_WIDTH), + index.index(FD_WIDTH) == arrays[index.index(FD_BATCH)]->width() - 1); + EXPECT_TRUE(index.IsValid()); + ++pos; + } while (index.Increment()); + LOG(INFO) << "pos=" << pos; + index.InitToLast(); + do { + --pos; + EXPECT_GE(index.t(), pos); + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + pos); + StrideMap::Index copy(index); + // Since a change in batch index changes the height and width, it isn't + // necessarily true that the position is still valid, even when changing + // to another valid batch index. + if (index.IsLast(FD_BATCH)) { + EXPECT_FALSE(copy.AddOffset(1, FD_BATCH)); + } + copy = index; + EXPECT_EQ(index.IsLast(FD_HEIGHT), !copy.AddOffset(1, FD_HEIGHT)); + copy = index; + EXPECT_EQ(index.IsLast(FD_WIDTH), !copy.AddOffset(1, FD_WIDTH)); + copy = index; + if (index.index(FD_BATCH) == 0) { + EXPECT_FALSE(copy.AddOffset(-1, FD_BATCH)); + } + copy = index; + EXPECT_EQ(index.index(FD_HEIGHT) == 0, !copy.AddOffset(-1, FD_HEIGHT)); + copy = index; + EXPECT_EQ(index.index(FD_WIDTH) == 0, !copy.AddOffset(-1, FD_WIDTH)); + copy = index; + EXPECT_FALSE(copy.AddOffset(10, FD_WIDTH)); + copy = index; + EXPECT_FALSE(copy.AddOffset(-10, FD_HEIGHT)); + EXPECT_TRUE(index.IsValid()); + } while (index.Decrement()); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +TEST_F(StridemapTest, Scaling) { + // This test verifies that with a batch of arrays of different sizes, the + // scaling/reduction functions work as expected. +#ifdef INCLUDE_TENSORFLOW + std::vector<std::unique_ptr<xla::Array2D<int>>> arrays; + arrays.push_back(SetupArray(3, 4, 0)); // 0-11 + arrays.push_back(SetupArray(4, 5, 12)); // 12-31 + arrays.push_back(SetupArray(4, 4, 32)); // 32-47 + arrays.push_back(SetupArray(3, 5, 48)); // 48-62 + std::vector<std::pair<int, int>> h_w_sizes; + for (size_t i = 0; i < arrays.size(); ++i) { + h_w_sizes.emplace_back(arrays[i].get()->height(), arrays[i].get()->width()); + } + StrideMap stride_map; + stride_map.SetStride(h_w_sizes); + + // Scale x by 2, keeping y the same. 
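+  // (Derivation of values_x2, as these expectations read: scaling x by 2
+  // halves each entry's width, rounding down, and iteration then visits the
+  // surviving (y, x) positions of the original arrays -- for the first 3x4
+  // array that is x in {0, 1} of each row, i.e. 0, 1, 4, 5, 8, 9, and the
+  // remaining arrays follow the same pattern.)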
+ std::vector<int> values_x2 = {0, 1, 4, 5, 8, 9, 12, 13, 17, 18, + 22, 23, 27, 28, 32, 33, 36, 37, 40, 41, + 44, 45, 48, 49, 53, 54, 58, 59}; + StrideMap test_map(stride_map); + test_map.ScaleXY(2, 1); + StrideMap::Index index(test_map); + int pos = 0; + do { + int expected_value = values_x2[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_x2.size()); + + test_map = stride_map; + // Scale y by 2, keeping x the same. + std::vector<int> values_y2 = {0, 1, 2, 3, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 32, 33, 34, 35, + 36, 37, 38, 39, 48, 49, 50, 51, 52}; + test_map.ScaleXY(1, 2); + index.InitToFirst(); + pos = 0; + do { + int expected_value = values_y2[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_y2.size()); + + test_map = stride_map; + // Scale x and y by 2. + std::vector<int> values_xy2 = {0, 1, 12, 13, 17, 18, 32, 33, 36, 37, 48, 49}; + test_map.ScaleXY(2, 2); + index.InitToFirst(); + pos = 0; + do { + int expected_value = values_xy2[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_xy2.size()); + + test_map = stride_map; + // Reduce Width to 1. + std::vector<int> values_x_to_1 = {0, 4, 8, 12, 17, 22, 27, + 32, 36, 40, 44, 48, 53, 58}; + test_map.ReduceWidthTo1(); + index.InitToFirst(); + pos = 0; + do { + int expected_value = values_x_to_1[pos++]; + EXPECT_EQ((*arrays.at(index.index(FD_BATCH)))(index.index(FD_HEIGHT), + index.index(FD_WIDTH)), + expected_value); + } while (index.Increment()); + EXPECT_EQ(pos, values_x_to_1.size()); +#else + LOG(INFO) << "Skip test because of missing xla::Array2D"; + GTEST_SKIP(); +#endif +} + +} // namespace diff --git a/tesseract/unittest/stringrenderer_test.cc b/tesseract/unittest/stringrenderer_test.cc new file mode 100644 index 00000000..8cba6e4f --- /dev/null +++ b/tesseract/unittest/stringrenderer_test.cc @@ -0,0 +1,564 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
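+
+// The call pattern repeated throughout these tests, as a minimal sketch
+// (the font name and 600x600 page size are simply the values the tests use):
+//
+//   StringRenderer renderer("Verdana 10", 600, 600);
+//   Pix* pix = nullptr;
+//   renderer.RenderToImage(text, strlen(text), &pix);
+//   const std::vector<BoxChar*>& boxes = renderer.GetBoxes();
+//   pixDestroy(&pix);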
+ +#include "include_gunit.h" + +#include "boxchar.h" +#include "boxread.h" +#include "commandlineflags.h" +#include "stringrenderer.h" +#include "strngs.h" + +#include "absl/strings/str_split.h" // for absl::StrSplit +#include "allheaders.h" + +#include <memory> +#include <string> + +BOOL_PARAM_FLAG(display, false, "Display image for inspection"); + +namespace tesseract { + +const char kEngText[] = "the quick brown fox jumps over the lazy dog"; +const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा"; + +const char kKorText[] = "이는 것으로 다시 넣을 1234 수는 있지만 선택의 의미는"; +const char kArabicText[] = + "والفكر والصراع ، بالتأمل والفهم والتحليل ، " + "بالعلم والفن ، وأخيرا بالضحك أوبالبكاء ، "; +const char kMixedText[] = "والفكر 123 والصراع abc"; + +const char kEngNonLigatureText[] = "fidelity"; +// Same as kEngNonLigatureText, but with "fi" replaced with its ligature. +const char kEngLigatureText[] = "fidelity"; + +static PangoFontMap* font_map; + +class StringRendererTest : public ::testing::Test { + protected: + void SetUp() override { + if (!font_map) { + font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT); + } + pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map)); + } + + static void SetUpTestCase() { + static std::locale system_locale(""); + std::locale::global(system_locale); + + l_chooseDisplayProg(L_DISPLAY_WITH_XZGV); + FLAGS_fonts_dir = TESTING_DIR; + FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; + file::MakeTmpdir(); + PangoFontInfo::SoftInitFontConfig(); // init early + } + + void DisplayClusterBoxes(Pix* pix) { + if (!FLAGS_display) return; + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + Boxa* boxes = boxaCreate(0); + for (const auto& boxchar : boxchars) { + if (boxchar->box()) + boxaAddBox(boxes, const_cast<Box*>(boxchar->box()), L_CLONE); + } + Pix* box_pix = pixDrawBoxaRandom(pix, boxes, 1); + boxaDestroy(&boxes); + pixDisplay(box_pix, 0, 0); + pixDestroy(&box_pix); + } + std::unique_ptr<StringRenderer> renderer_; +}; + +TEST_F(StringRendererTest, DoesRenderToImage) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + renderer_.reset(new StringRenderer("UnBatang 10", 600, 600)); + EXPECT_EQ(strlen(kKorText), + renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + renderer_.reset(new StringRenderer("Lohit Hindi 10", 600, 600)); + EXPECT_EQ(strlen(kHinText), + renderer_->RenderToImage(kHinText, strlen(kHinText), &pix)); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + // RTL text + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + EXPECT_EQ(strlen(kArabicText), + renderer_->RenderToImage(kArabicText, strlen(kArabicText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + // Mixed direction Arabic + english text + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + EXPECT_EQ(strlen(kMixedText), + renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, 
DoesRenderToImageWithUnderline) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + // Underline all words but NOT intervening spaces. + renderer_->set_underline_start_prob(1.0); + renderer_->set_underline_continuation_prob(0); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + renderer_->ClearBoxes(); + + // Underline all words AND intervening spaces. + renderer_->set_underline_start_prob(1.0); + renderer_->set_underline_continuation_prob(1.0); + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + renderer_->ClearBoxes(); + + // Underline words and intervening spaces with 0.5 prob. + renderer_->set_underline_start_prob(0.5); + renderer_->set_underline_continuation_prob(0.5); + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesHandleNewlineCharacters) { + const char kRawText[] = "\n\n\n A \nB \nC \n\n\n"; + const char kStrippedText[] = " A B C "; // text with newline chars removed + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kRawText), + renderer_->RenderToImage(kRawText, strlen(kRawText), &pix)); + EXPECT_TRUE(pix != nullptr); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + // 3 characters + 4 spaces => 7 boxes + EXPECT_EQ(7, boxchars.size()); + if (boxchars.size() == 7) { + // Verify the text content of the boxchars + for (size_t i = 0; i < boxchars.size(); ++i) { + EXPECT_EQ(std::string(1, kStrippedText[i]), boxchars[i]->ch()); + } + } + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesRenderLigatures) { + renderer_.reset(new StringRenderer("Arab 12", 600, 250)); + const char kArabicLigature[] = "لا"; + + Pix* pix = nullptr; + EXPECT_EQ( + strlen(kArabicLigature), + renderer_->RenderToImage(kArabicLigature, strlen(kArabicLigature), &pix)); + EXPECT_TRUE(pix != nullptr); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + const std::vector<BoxChar*>& boxes = renderer_->GetBoxes(); + EXPECT_EQ(1, boxes.size()); + EXPECT_TRUE(boxes[0]->box() != nullptr); + EXPECT_STREQ(kArabicLigature, boxes[0]->ch().c_str()); + DisplayClusterBoxes(pix); + pixDestroy(&pix); + + renderer_.reset(new StringRenderer("Arab 12", 600, 250)); + const char kArabicMixedText[] = "والفكر والصراع 1234,\nوالفكر لا والصراع"; + renderer_->RenderToImage(kArabicMixedText, strlen(kArabicMixedText), &pix); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +static int FindBoxCharXCoord(const std::vector<BoxChar*>& boxchars, + const std::string& ch) { + for (const auto& boxchar : boxchars) { + if (boxchar->ch() == ch) return boxchar->box()->x; + } + return INT_MAX; +} + +TEST_F(StringRendererTest, ArabicBoxcharsInLTROrder) { + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + Pix* pix = nullptr; + // Arabic letters should be in decreasing x-coordinates + const char kArabicWord[] = "\u0644\u0627\u0641\u0643\u0631"; + const std::string kRevWord = "\u0631\u0643\u0641\u0627\u0644"; + renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix); 
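+  // GetBoxesStr() is expected to serialize the rendered boxchars as box-file
+  // style text, which ReadMemBoxes() parses back below; only the per-box text
+  // strings (&texts) are kept here and the geometry arguments are left null.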
+ std::string boxes_str = renderer_->GetBoxesStr(); + // Decode to get the box text strings. + EXPECT_FALSE(boxes_str.empty()); + std::vector<STRING> texts; + EXPECT_TRUE(ReadMemBoxes(0, false, boxes_str.c_str(), false, nullptr, &texts, + nullptr, nullptr)); + std::string ltr_str; + for (size_t i = 0; i < texts.size(); ++i) { + ltr_str += texts[i].c_str(); + } + // The string should come out perfectly reversed, despite there being a + // ligature. + EXPECT_EQ(ltr_str, kRevWord); + // Just to prove there was a ligature, the number of texts is less than the + // number of unicodes. + EXPECT_LT(texts.size(), 5); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesOutputBoxcharsInReadingOrder) { + renderer_.reset(new StringRenderer("Arab 10", 600, 600)); + Pix* pix = nullptr; + // Arabic letters should be in decreasing x-coordinates + const char kArabicWord[] = "والفكر"; + renderer_->RenderToImage(kArabicWord, strlen(kArabicWord), &pix); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + for (size_t i = 1; i < boxchars.size(); ++i) { + EXPECT_GT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) + << boxchars[i - 1]->ch(); + } + pixDestroy(&pix); + + // English letters should be in increasing x-coordinates + const char kEnglishWord[] = "Google"; + renderer_->ClearBoxes(); + renderer_->RenderToImage(kEnglishWord, strlen(kEnglishWord), &pix); + EXPECT_EQ(boxchars.size(), strlen(kEnglishWord)); + for (size_t i = 1; i < boxchars.size(); ++i) { + EXPECT_LT(boxchars[i - 1]->box()->x, boxchars[i]->box()->x) + << boxchars[i - 1]->ch(); + } + pixDestroy(&pix); + + // Mixed text should satisfy both. + renderer_->ClearBoxes(); + renderer_->RenderToImage(kMixedText, strlen(kMixedText), &pix); + EXPECT_LT(FindBoxCharXCoord(boxchars, "a"), FindBoxCharXCoord(boxchars, "b")); + EXPECT_LT(FindBoxCharXCoord(boxchars, "1"), FindBoxCharXCoord(boxchars, "2")); + EXPECT_GT(FindBoxCharXCoord(boxchars, "و"), FindBoxCharXCoord(boxchars, "ر")); + pixDestroy(&pix); +} + +TEST_F(StringRendererTest, DoesRenderVerticalText) { + Pix* pix = nullptr; + renderer_.reset(new StringRenderer("UnBatang 10", 600, 600)); + renderer_->set_vertical_text(true); + EXPECT_EQ(strlen(kKorText), + renderer_->RenderToImage(kKorText, strlen(kKorText), &pix)); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + DisplayClusterBoxes(pix); + pixDestroy(&pix); +} + +// Checks that we preserve charboxes across RenderToImage calls, with +// appropriate page numbers. 
+TEST_F(StringRendererTest, DoesKeepAllImageBoxes) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + int num_boxes_per_page = 0; + const int kNumTrials = 2; + for (int i = 0; i < kNumTrials; ++i) { + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + EXPECT_TRUE(pix != nullptr); + pixDestroy(&pix); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + if (!num_boxes_per_page) { + num_boxes_per_page = renderer_->GetBoxes().size(); + } else { + EXPECT_EQ((i + 1) * num_boxes_per_page, renderer_->GetBoxes().size()); + } + for (int j = i * num_boxes_per_page; j < (i + 1) * num_boxes_per_page; + ++j) { + EXPECT_EQ(i, renderer_->GetBoxes()[j]->page()); + } + } +} + +TEST_F(StringRendererTest, DoesClearBoxes) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + pixDestroy(&pix); + EXPECT_GT(renderer_->GetBoxes().size(), 0); + const int num_boxes_per_page = renderer_->GetBoxes().size(); + + renderer_->ClearBoxes(); + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + pixDestroy(&pix); + EXPECT_EQ(num_boxes_per_page, renderer_->GetBoxes().size()); +} + +TEST_F(StringRendererTest, DoesLigatureTextForRendering) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + renderer_->set_add_ligatures(true); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngNonLigatureText), + renderer_->RenderToImage(kEngNonLigatureText, + strlen(kEngNonLigatureText), &pix)); + pixDestroy(&pix); + // There should be one less box than letters due to the 'fi' ligature. + EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size()); + // The output box text should be ligatured. + EXPECT_STREQ("fi", renderer_->GetBoxes()[0]->ch().c_str()); +} + +TEST_F(StringRendererTest, DoesRetainInputLigatureForRendering) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngLigatureText), + renderer_->RenderToImage(kEngLigatureText, strlen(kEngLigatureText), + &pix)); + pixDestroy(&pix); + // There should be one less box than letters due to the 'fi' ligature. + EXPECT_EQ(strlen(kEngNonLigatureText) - 1, renderer_->GetBoxes().size()); + // The output box text should be ligatured. + EXPECT_STREQ("\uFB01", renderer_->GetBoxes()[0]->ch().c_str()); +} + +TEST_F(StringRendererTest, DoesStripUnrenderableWords) { + // Verdana should only be able to render the english letters and numbers in + // the mixed text. 
+ renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + std::string text(kMixedText); + EXPECT_GT(renderer_->StripUnrenderableWords(&text), 0); + EXPECT_EQ(" 123 abc", text); +} + +TEST_F(StringRendererTest, DoesRenderWordBoxes) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + renderer_->set_output_word_boxes(true); + Pix* pix = nullptr; + EXPECT_EQ(strlen(kEngText), + renderer_->RenderToImage(kEngText, strlen(kEngText), &pix)); + pixDestroy(&pix); + // Verify #boxchars = #words + #spaces + std::vector<std::string> words = + absl::StrSplit(kEngText, ' ', absl::SkipEmpty()); + const int kNumSpaces = words.size() - 1; + const int kExpectedNumBoxes = words.size() + kNumSpaces; + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + EXPECT_EQ(kExpectedNumBoxes, boxchars.size()); + // Verify content of words and spaces + for (size_t i = 0; i < boxchars.size(); i += 2) { + EXPECT_EQ(words[i / 2], boxchars[i]->ch()); + if (i < boxchars.size() - 1) { + EXPECT_EQ(" ", boxchars[i + 1]->ch()); + EXPECT_TRUE(boxchars[i + 1]->box() == nullptr); + } + } +} + +TEST_F(StringRendererTest, DoesRenderWordBoxesFromMultiLineText) { + renderer_.reset(new StringRenderer("Verdana 10", 600, 600)); + renderer_->set_output_word_boxes(true); + Pix* pix = nullptr; + const char kMultlineText[] = "the quick brown fox\njumps over the lazy dog"; + EXPECT_EQ(strlen(kMultlineText), + renderer_->RenderToImage(kMultlineText, strlen(kEngText), &pix)); + pixDestroy(&pix); + // Verify #boxchars = #words + #spaces + #newlines + std::vector<std::string> words = + absl::StrSplit(kMultlineText, absl::ByAnyChar(" \n"), absl::SkipEmpty()); + const int kNumSeparators = words.size() - 1; + const int kExpectedNumBoxes = words.size() + kNumSeparators; + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + EXPECT_EQ(kExpectedNumBoxes, boxchars.size()); + // Verify content of words and spaces + for (size_t i = 0; i < boxchars.size(); i += 2) { + EXPECT_EQ(words[i / 2], boxchars[i]->ch()); + if (i + 1 < boxchars.size()) { + EXPECT_EQ(" ", boxchars[i + 1]->ch()); + EXPECT_TRUE(boxchars[i + 1]->box() == nullptr); + } + } +} + +TEST_F(StringRendererTest, DoesRenderAllFontsToImage) { + renderer_.reset(new StringRenderer("Verdana 10", 1200, 1200)); + size_t offset = 0; + std::string font_used; + do { + Pix* pix = nullptr; + font_used.clear(); + offset += renderer_->RenderAllFontsToImage( + 1.0, kEngText + offset, strlen(kEngText + offset), &font_used, &pix); + if (offset < strlen(kEngText)) { + EXPECT_TRUE(pix != nullptr); + EXPECT_STRNE("", font_used.c_str()); + } + if (FLAGS_display) pixDisplay(pix, 0, 0); + pixDestroy(&pix); + } while (offset < strlen(kEngText)); +} + +TEST_F(StringRendererTest, DoesNotRenderWordJoiner) { + renderer_.reset(new StringRenderer("Verdana 10", 500, 200)); + const std::string word = "A- -B C-D A BC"; + const std::string joined_word = StringRenderer::InsertWordJoiners(word); + Pix* pix = nullptr; + renderer_->RenderToImage(joined_word.c_str(), joined_word.length(), &pix); + pixDestroy(&pix); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + const std::string kWordJoinerUTF8 = "\u2060"; + ASSERT_EQ(word.length(), boxchars.size()); + for (size_t i = 0; i < boxchars.size(); ++i) { + EXPECT_NE(kWordJoinerUTF8, boxchars[i]->ch()); + EXPECT_EQ(word.substr(i, 1), boxchars[i]->ch()); + } +} + +TEST_F(StringRendererTest, DISABLED_DoesDropUncoveredChars) { + renderer_.reset(new StringRenderer("Verdana 10", 500, 200)); + 
renderer_->set_drop_uncovered_chars(true); + const std::string kWord = "office"; + const std::string kCleanWord = "oice"; + Pix* pix = nullptr; + EXPECT_FALSE( + renderer_->font().CanRenderString(kWord.c_str(), kWord.length())); + EXPECT_FALSE(renderer_->font().CoversUTF8Text(kWord.c_str(), kWord.length())); + int offset = renderer_->RenderToImage(kWord.c_str(), kWord.length(), &pix); + pixDestroy(&pix); + const std::vector<BoxChar*>& boxchars = renderer_->GetBoxes(); + EXPECT_EQ(kWord.length(), offset); + ASSERT_EQ(kCleanWord.length(), boxchars.size()); + for (size_t i = 0; i < boxchars.size(); ++i) { + EXPECT_EQ(kCleanWord.substr(i, 1), boxchars[i]->ch()); + } +} + +// ------------ StringRenderer::ConvertBasicLatinToFullwidthLatin() ------------ + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesConvertBasicLatin) { + const std::string kHalfAlpha = "ABCD"; + const std::string kFullAlpha = "ABCD"; + EXPECT_EQ(kFullAlpha, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfAlpha)); + + const std::string kHalfDigit = "0123"; + const std::string kFullDigit = "0123"; + EXPECT_EQ(kFullDigit, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfDigit)); + + const std::string kHalfSym = "()[]:;!?"; + const std::string kFullSym = "()[]:;!?"; + EXPECT_EQ(kFullSym, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSym)); +} + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertFullwidthLatin) { + const std::string kFullAlpha = "ABCD"; + EXPECT_EQ(kFullAlpha, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullAlpha)); + + const std::string kFullDigit = "0123"; + EXPECT_EQ(kFullDigit, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullDigit)); + + const std::string kFullSym = "()[]:;!?"; + EXPECT_EQ(kFullSym, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSym)); +} + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertNonLatin) { + const std::string kHalfKana = "アイウエオ"; + const std::string kFullKana = "アイウエオ"; + EXPECT_EQ(kHalfKana, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfKana)); + EXPECT_EQ(kFullKana, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullKana)); +} + +TEST(ConvertBasicLatinToFullwidthLatinTest, DoesNotConvertSpace) { + const std::string kHalfSpace = " "; + const std::string kFullSpace = " "; + EXPECT_EQ(kHalfSpace, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kHalfSpace)); + EXPECT_EQ(kFullSpace, + StringRenderer::ConvertBasicLatinToFullwidthLatin(kFullSpace)); +} + +// ------------ StringRenderer::ConvertFullwidthLatinToBasicLatin() ------------ + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesConvertFullwidthLatin) { + const std::string kHalfAlpha = "ABCD"; + const std::string kFullAlpha = "ABCD"; + EXPECT_EQ(kHalfAlpha, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullAlpha)); + + const std::string kHalfDigit = "0123"; + const std::string kFullDigit = "0123"; + EXPECT_EQ(kHalfDigit, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullDigit)); + + const std::string kHalfSym = "()[]:;!?"; + const std::string kFullSym = "()[]:;!?"; + EXPECT_EQ(kHalfSym, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSym)); +} + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertBasicLatin) { + const std::string kHalfAlpha = "ABCD"; + EXPECT_EQ(kHalfAlpha, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfAlpha)); + + const std::string kHalfDigit = "0123"; + EXPECT_EQ(kHalfDigit, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfDigit)); + + const std::string kHalfSym = 
"()[]:;!?"; + EXPECT_EQ(kHalfSym, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSym)); +} + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertNonLatin) { + const std::string kHalfKana = "アイウエオ"; + const std::string kFullKana = "アイウエオ"; + EXPECT_EQ(kHalfKana, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfKana)); + EXPECT_EQ(kFullKana, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullKana)); +} + +TEST(ConvertFullwidthLatinToBasicLatinTest, DoesNotConvertSpace) { + const std::string kHalfSpace = " "; + const std::string kFullSpace = " "; + EXPECT_EQ(kHalfSpace, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kHalfSpace)); + EXPECT_EQ(kFullSpace, + StringRenderer::ConvertFullwidthLatinToBasicLatin(kFullSpace)); +} +} // namespace diff --git a/tesseract/unittest/syntaxnet/base.h b/tesseract/unittest/syntaxnet/base.h new file mode 100644 index 00000000..5dabbbda --- /dev/null +++ b/tesseract/unittest/syntaxnet/base.h @@ -0,0 +1,61 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef SYNTAXNET_BASE_H_ +#define SYNTAXNET_BASE_H_ + +#include <functional> +#include <string> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +#include "google/protobuf/util/message_differencer.h" + + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/default/integral_types.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/protobuf.h" + + + +using tensorflow::int8; +using tensorflow::int16; +using tensorflow::int32; +using tensorflow::int64; +using tensorflow::uint8; +using tensorflow::uint16; +using tensorflow::uint64; +using tensorflow::uint32; +using tensorflow::protobuf::TextFormat; +using tensorflow::mutex_lock; +using tensorflow::mutex; +using std::map; +using std::pair; +using std::vector; +using std::unordered_map; +using std::unordered_set; +typedef signed int char32; + +using tensorflow::StringPiece; +using std::string; + + + // namespace syntaxnet + +#endif // SYNTAXNET_BASE_H_ diff --git a/tesseract/unittest/tablefind_test.cc b/tesseract/unittest/tablefind_test.cc new file mode 100644 index 00000000..df6d511c --- /dev/null +++ b/tesseract/unittest/tablefind_test.cc @@ -0,0 +1,261 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "colpartition.h" +#include "colpartitiongrid.h" +#include "tablefind.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TestableTableFinder : public tesseract::TableFinder { + public: + using TableFinder::GapInXProjection; + using TableFinder::HasLeaderAdjacent; + using TableFinder::InsertLeaderPartition; + using TableFinder::InsertTextPartition; + using TableFinder::set_global_median_blob_width; + using TableFinder::set_global_median_ledding; + using TableFinder::set_global_median_xheight; + using TableFinder::SplitAndInsertFragmentedTextPartition; + + void ExpectPartition(const TBOX& box) { + tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_); + gsearch.SetUniqueMode(true); + gsearch.StartFullSearch(); + ColPartition* part = nullptr; + bool found = false; + while ((part = gsearch.NextFullSearch()) != nullptr) { + if (part->bounding_box().left() == box.left() && + part->bounding_box().bottom() == box.bottom() && + part->bounding_box().right() == box.right() && + part->bounding_box().top() == box.top()) { + found = true; + } + } + EXPECT_TRUE(found); + } + void ExpectPartitionCount(int expected_count) { + tesseract::ColPartitionGridSearch gsearch(&fragmented_text_grid_); + gsearch.SetUniqueMode(true); + gsearch.StartFullSearch(); + ColPartition* part = nullptr; + int count = 0; + while ((part = gsearch.NextFullSearch()) != nullptr) { + ++count; + } + EXPECT_EQ(expected_count, count); + } +}; + +class TableFinderTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + free_boxes_it_.set_to_list(&free_boxes_); + finder_.reset(new TestableTableFinder()); + finder_->Init(1, ICOORD(0, 0), ICOORD(500, 500)); + // gap finding + finder_->set_global_median_xheight(5); + finder_->set_global_median_blob_width(5); + } + + void TearDown() { + if (partition_.get() != nullptr) partition_->DeleteBoxes(); + DeletePartitionListBoxes(); + finder_.reset(nullptr); + } + + void MakePartition(int x_min, int y_min, int x_max, int y_max) { + MakePartition(x_min, y_min, x_max, y_max, 0, 0); + } + + void MakePartition(int x_min, int y_min, int x_max, int y_max, + int first_column, int last_column) { + if (partition_.get() != nullptr) partition_->DeleteBoxes(); + TBOX box; + box.set_to_given_coords(x_min, y_min, x_max, y_max); + partition_.reset( + ColPartition::FakePartition(box, PT_UNKNOWN, BRT_UNKNOWN, BTFT_NONE)); + partition_->set_first_column(first_column); + partition_->set_last_column(last_column); + } + + void InsertTextPartition(ColPartition* part) { + finder_->InsertTextPartition(part); + free_boxes_it_.add_after_then_move(part); + } + + void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max) { + InsertLeaderPartition(x_min, y_min, x_max, y_max, 0, 0); + } + + void InsertLeaderPartition(int x_min, int y_min, int x_max, int y_max, + int first_column, int last_column) { + TBOX box; + box.set_to_given_coords(x_min, y_min, x_max, y_max); + ColPartition* part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, + BRT_UNKNOWN, BTFT_LEADER); + part->set_first_column(first_column); + part->set_last_column(last_column); + finder_->InsertLeaderPartition(part); + free_boxes_it_.add_after_then_move(part); + } + + void DeletePartitionListBoxes() { + for (free_boxes_it_.mark_cycle_pt(); !free_boxes_it_.cycled_list(); + free_boxes_it_.forward()) { + ColPartition* part = free_boxes_it_.data(); + 
part->DeleteBoxes(); + } + } + + std::unique_ptr<TestableTableFinder> finder_; + std::unique_ptr<ColPartition> partition_; + + private: + tesseract::ColPartition_CLIST free_boxes_; + tesseract::ColPartition_C_IT free_boxes_it_; +}; + +TEST_F(TableFinderTest, GapInXProjectionNoGap) { + int data[100]; + for (int i = 0; i < 100; ++i) data[i] = 10; + EXPECT_FALSE(finder_->GapInXProjection(data, 100)); +} + +TEST_F(TableFinderTest, GapInXProjectionEdgeGap) { + int data[100]; + for (int i = 0; i < 10; ++i) data[i] = 2; + for (int i = 10; i < 90; ++i) data[i] = 10; + for (int i = 90; i < 100; ++i) data[i] = 2; + EXPECT_FALSE(finder_->GapInXProjection(data, 100)); +} + +TEST_F(TableFinderTest, GapInXProjectionExists) { + int data[100]; + for (int i = 0; i < 10; ++i) data[i] = 10; + for (int i = 10; i < 90; ++i) data[i] = 2; + for (int i = 90; i < 100; ++i) data[i] = 10; + EXPECT_TRUE(finder_->GapInXProjection(data, 100)); +} + +TEST_F(TableFinderTest, HasLeaderAdjacentOverlapping) { + InsertLeaderPartition(90, 0, 150, 5); + MakePartition(0, 0, 100, 10); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 25, 100, 40); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(145, 0, 200, 20); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(40, 0, 50, 4); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); +} + +TEST_F(TableFinderTest, HasLeaderAdjacentNoOverlap) { + InsertLeaderPartition(90, 10, 150, 15); + MakePartition(0, 10, 85, 20); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 25, 100, 40); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 0, 100, 10); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + // TODO(nbeato): is this a useful metric? case fails + // MakePartition(160, 0, 200, 15); // leader is primarily above it + // EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); +} + +TEST_F(TableFinderTest, HasLeaderAdjacentPreservesColumns) { + InsertLeaderPartition(90, 0, 150, 5, 1, 2); + MakePartition(0, 0, 85, 10, 0, 0); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 0, 100, 10, 0, 1); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(0, 0, 200, 10, 0, 5); + EXPECT_TRUE(finder_->HasLeaderAdjacent(*partition_)); + MakePartition(155, 0, 200, 10, 5, 5); + EXPECT_FALSE(finder_->HasLeaderAdjacent(*partition_)); +} + +// TODO(nbeato): Only testing a splitting case. Add more... +// Also test non-split cases. +TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicPass) { + finder_->set_global_median_blob_width(3); + finder_->set_global_median_xheight(10); + + TBOX part_box(10, 5, 100, 15); + ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); + all->set_type(PT_FLOWING_TEXT); + all->set_blob_type(BRT_TEXT); + all->set_flow(BTFT_CHAIN); + all->set_left_margin(10); + all->set_right_margin(100); + TBOX blob_box = part_box; + for (int i = 10; i <= 20; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + for (int i = 35; i <= 55; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + for (int i = 80; i <= 95; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + // TODO(nbeato): Ray's newer code... 
+ // all->ClaimBoxes(); + all->ComputeLimits(); // This is to make sure median iinfo is set. + InsertTextPartition(all); // This is to delete blobs + ColPartition* fragment_me = all->CopyButDontOwnBlobs(); + + finder_->SplitAndInsertFragmentedTextPartition(fragment_me); + finder_->ExpectPartition(TBOX(11, 5, 24, 15)); + finder_->ExpectPartition(TBOX(36, 5, 59, 15)); + finder_->ExpectPartition(TBOX(81, 5, 99, 15)); + finder_->ExpectPartitionCount(3); +} + +TEST_F(TableFinderTest, SplitAndInsertFragmentedPartitionsBasicFail) { + finder_->set_global_median_blob_width(3); + finder_->set_global_median_xheight(10); + + TBOX part_box(10, 5, 100, 15); + ColPartition* all = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1)); + all->set_type(PT_FLOWING_TEXT); + all->set_blob_type(BRT_TEXT); + all->set_flow(BTFT_CHAIN); + all->set_left_margin(10); + all->set_right_margin(100); + TBOX blob_box = part_box; + for (int i = 10; i <= 95; i += 5) { + blob_box.set_left(i + 1); + blob_box.set_right(i + 4); + all->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(blob_box))); + } + // TODO(nbeato): Ray's newer code... + // all->ClaimBoxes(); + all->ComputeLimits(); // This is to make sure median iinfo is set. + InsertTextPartition(all); // This is to delete blobs + ColPartition* fragment_me = all->CopyButDontOwnBlobs(); + + finder_->SplitAndInsertFragmentedTextPartition(fragment_me); + finder_->ExpectPartition(TBOX(11, 5, 99, 15)); + finder_->ExpectPartitionCount(1); +} + +} // namespace diff --git a/tesseract/unittest/tablerecog_test.cc b/tesseract/unittest/tablerecog_test.cc new file mode 100644 index 00000000..3dfb32c5 --- /dev/null +++ b/tesseract/unittest/tablerecog_test.cc @@ -0,0 +1,316 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
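+
+// The fixtures below wire a TableRecognizer (and a StructuredTable) to a text
+// ColPartitionGrid and a line ColPartitionGrid; a sketch of that setup,
+// mirroring the SetUp() methods further down rather than prescribing the API:
+//
+//   TableRecognizer recognizer;
+//   recognizer.Init();
+//   recognizer.set_text_grid(&text_grid);   // partitions built from text blobs
+//   recognizer.set_line_grid(&line_grid);   // partitions built from ruled lines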
+ +#include <memory> + +#include "colpartition.h" +#include "colpartitiongrid.h" +#include "tablerecog.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TestableTableRecognizer : public tesseract::TableRecognizer { + public: + using TableRecognizer::FindLinesBoundingBox; + using TableRecognizer::HasSignificantLines; + using TableRecognizer::RecognizeLinedTable; + using TableRecognizer::RecognizeTable; + using TableRecognizer::RecognizeWhitespacedTable; +}; + +class TestableStructuredTable : public tesseract::StructuredTable { + public: + using StructuredTable::CountHorizontalIntersections; + using StructuredTable::CountVerticalIntersections; + using StructuredTable::FindLinedStructure; + using StructuredTable::FindWhitespacedColumns; + using StructuredTable::FindWhitespacedStructure; + using StructuredTable::VerifyLinedTableCells; + + void InjectCellY(int y) { + cell_y_.push_back(y); + cell_y_.sort(); + } + void InjectCellX(int x) { + cell_x_.push_back(x); + cell_x_.sort(); + } + + void ExpectCellX(int x_min, int second, int add, int almost_done, int x_max) { + ASSERT_EQ(0, (almost_done - second) % add); + EXPECT_EQ(3 + (almost_done - second) / add, cell_x_.size()); + EXPECT_EQ(x_min, cell_x_.get(0)); + EXPECT_EQ(x_max, cell_x_.get(cell_x_.size() - 1)); + for (int i = 1; i < cell_x_.size() - 1; ++i) { + EXPECT_EQ(second + add * (i - 1), cell_x_.get(i)); + } + } + + void ExpectSortedX() { + EXPECT_GT(cell_x_.size(), 0); + for (int i = 1; i < cell_x_.size(); ++i) { + EXPECT_LT(cell_x_.get(i - 1), cell_x_.get(i)); + } + } +}; + +class SharedTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + ICOORD bleft(0, 0); + ICOORD tright(1000, 1000); + text_grid_.reset(new ColPartitionGrid(5, bleft, tright)); + line_grid_.reset(new ColPartitionGrid(5, bleft, tright)); + } + + void TearDown() { + tesseract::ColPartition_IT memory(&allocated_parts_); + for (memory.mark_cycle_pt(); !memory.cycled_list(); memory.forward()) { + memory.data()->DeleteBoxes(); + } + } + + void InsertPartitions() { + for (int row = 0; row < 800; row += 20) + for (int col = 0; col < 500; col += 25) + InsertPartition(col + 1, row + 1, col + 24, row + 19); + } + + void InsertPartition(int left, int bottom, int right, int top) { + TBOX box(left, bottom, right, top); + ColPartition* part = + ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); + part->set_median_width(3); + part->set_median_height(3); + text_grid_->InsertBBox(true, true, part); + + tesseract::ColPartition_IT add_it(&allocated_parts_); + add_it.add_after_stay_put(part); + } + + void InsertLines() { + line_box_.set_to_given_coords( + 100 - line_grid_->gridsize(), 10 - line_grid_->gridsize(), + 450 + line_grid_->gridsize(), 50 + line_grid_->gridsize()); + for (int i = 10; i <= 50; i += 10) InsertHorizontalLine(100, 450, i); + for (int i = 100; i <= 450; i += 50) InsertVerticalLine(i, 10, 50); + + for (int i = 100; i <= 200; i += 20) InsertHorizontalLine(0, 100, i); + } + + void InsertHorizontalLine(int left, int right, int y) { + TBOX box(left, y - line_grid_->gridsize(), right, + y + line_grid_->gridsize()); + ColPartition* part = + ColPartition::FakePartition(box, PT_HORZ_LINE, BRT_HLINE, BTFT_NONE); + line_grid_->InsertBBox(true, true, part); + + tesseract::ColPartition_IT add_it(&allocated_parts_); + add_it.add_after_stay_put(part); + } + void InsertVerticalLine(int x, int bottom, int top) { + TBOX box(x - line_grid_->gridsize(), bottom, x + line_grid_->gridsize(), + top); + 
ColPartition* part = + ColPartition::FakePartition(box, PT_VERT_LINE, BRT_VLINE, BTFT_NONE); + line_grid_->InsertBBox(true, true, part); + + tesseract::ColPartition_IT add_it(&allocated_parts_); + add_it.add_after_stay_put(part); + } + + void InsertCellsInLines() { + for (int y = 10; y <= 50; y += 10) + for (int x = 100; x <= 450; x += 50) + InsertPartition(x + 1, y + 1, x + 49, y + 9); + } + + TBOX line_box_; + std::unique_ptr<ColPartitionGrid> text_grid_; + std::unique_ptr<ColPartitionGrid> line_grid_; + ColPartition_LIST allocated_parts_; +}; + +class TableRecognizerTest : public SharedTest { + protected: + void SetUp() { + SharedTest::SetUp(); + recognizer_.reset(new TestableTableRecognizer()); + recognizer_->Init(); + recognizer_->set_text_grid(text_grid_.get()); + recognizer_->set_line_grid(line_grid_.get()); + } + + std::unique_ptr<TestableTableRecognizer> recognizer_; +}; + +class StructuredTableTest : public SharedTest { + protected: + void SetUp() { + SharedTest::SetUp(); + table_.reset(new TestableStructuredTable()); + table_->Init(); + table_->set_text_grid(text_grid_.get()); + table_->set_line_grid(line_grid_.get()); + } + + std::unique_ptr<TestableStructuredTable> table_; +}; + +TEST_F(TableRecognizerTest, HasSignificantLinesBasicPass) { + InsertLines(); + TBOX smaller_guess(120, 15, 370, 45); + TBOX larger_guess(90, 5, 490, 70); + EXPECT_TRUE(recognizer_->HasSignificantLines(line_box_)); + EXPECT_TRUE(recognizer_->HasSignificantLines(larger_guess)); + EXPECT_TRUE(recognizer_->HasSignificantLines(smaller_guess)); +} + +TEST_F(TableRecognizerTest, HasSignificantLinesBasicFail) { + InsertLines(); + TBOX box(370, 35, 500, 45); + EXPECT_FALSE(recognizer_->HasSignificantLines(box)); +} + +TEST_F(TableRecognizerTest, HasSignificantLinesHorizontalOnlyFails) { + InsertLines(); + TBOX box(0, 100, 200, 200); + EXPECT_FALSE(recognizer_->HasSignificantLines(box)); +} + +TEST_F(TableRecognizerTest, FindLinesBoundingBoxBasic) { + InsertLines(); + TBOX box(0, 0, 200, 50); + bool result = recognizer_->FindLinesBoundingBox(&box); + EXPECT_TRUE(result); + EXPECT_EQ(line_box_.left(), box.left()); + EXPECT_EQ(line_box_.right(), box.right()); + EXPECT_EQ(line_box_.bottom(), box.bottom()); + EXPECT_EQ(line_box_.top(), box.top()); +} + +TEST_F(TableRecognizerTest, RecognizeLinedTableBasic) { + InsertLines(); + TBOX guess(120, 15, 370, 45); + tesseract::StructuredTable table; + table.set_text_grid(text_grid_.get()); + table.set_line_grid(line_grid_.get()); + + EXPECT_TRUE(recognizer_->RecognizeLinedTable(guess, &table)); + EXPECT_EQ(line_box_.bottom(), table.bounding_box().bottom()); + EXPECT_EQ(line_box_.top(), table.bounding_box().top()); + EXPECT_EQ(line_box_.left(), table.bounding_box().left()); + EXPECT_EQ(line_box_.right(), table.bounding_box().right()); + EXPECT_EQ(line_box_.area(), table.bounding_box().area()); + EXPECT_EQ(7, table.column_count()); + EXPECT_EQ(4, table.row_count()); + EXPECT_EQ(28, table.cell_count()); + EXPECT_TRUE(table.is_lined()); +} + +TEST_F(TableRecognizerTest, RecognizeWhitespacedTableBasic) { + InsertPartitions(); + TBOX guess(0, 0, 500, 800); + + tesseract::StructuredTable table; + table.set_text_grid(text_grid_.get()); + table.set_line_grid(line_grid_.get()); + EXPECT_TRUE(recognizer_->RecognizeWhitespacedTable(guess, &table)); + EXPECT_EQ(1, table.bounding_box().bottom()); + EXPECT_EQ(799, table.bounding_box().top()); + EXPECT_EQ(1, table.bounding_box().left()); + EXPECT_EQ(499, table.bounding_box().right()); + EXPECT_EQ(798 * 498, table.bounding_box().area()); + 
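+  // Arithmetic behind the expectations below: InsertPartitions() places a
+  // partition every 25 px in x and every 20 px in y over a 500 x 800 region,
+  // so the table should come back with 500/25 = 20 columns, 800/20 = 40 rows
+  // and 20 * 40 = 800 cells.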
EXPECT_EQ(500 / 25, table.column_count()); + EXPECT_EQ(800 / 20, table.row_count()); + EXPECT_EQ(500 * 800 / 20 / 25, table.cell_count()); + EXPECT_FALSE(table.is_lined()); +} + +TEST_F(StructuredTableTest, CountVerticalIntersectionsAll) { + table_->set_bounding_box(TBOX(0, 0, 1000, 1000)); + InsertPartition(0, 0, 100, 10); + InsertPartition(1, 12, 43, 21); + EXPECT_EQ(2, table_->CountVerticalIntersections(4)); + EXPECT_EQ(2, table_->CountVerticalIntersections(20)); + EXPECT_EQ(2, table_->CountVerticalIntersections(40)); + EXPECT_EQ(1, table_->CountVerticalIntersections(50)); + EXPECT_EQ(1, table_->CountVerticalIntersections(60)); + EXPECT_EQ(1, table_->CountVerticalIntersections(80)); + EXPECT_EQ(1, table_->CountVerticalIntersections(95)); + EXPECT_EQ(0, table_->CountVerticalIntersections(104)); + EXPECT_EQ(0, table_->CountVerticalIntersections(150)); +} + +TEST_F(StructuredTableTest, CountHorizontalIntersectionsAll) { + table_->set_bounding_box(TBOX(0, 0, 1000, 1000)); + InsertPartition(0, 3, 100, 10); + InsertPartition(110, 5, 200, 16); + + EXPECT_EQ(0, table_->CountHorizontalIntersections(0)); + EXPECT_EQ(1, table_->CountHorizontalIntersections(4)); + EXPECT_EQ(2, table_->CountHorizontalIntersections(8)); + EXPECT_EQ(1, table_->CountHorizontalIntersections(12)); + EXPECT_EQ(0, table_->CountHorizontalIntersections(20)); +} + +TEST_F(StructuredTableTest, VerifyLinedTableBasicPass) { + for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + InsertLines(); + InsertCellsInLines(); + table_->set_bounding_box(line_box_); + EXPECT_TRUE(table_->VerifyLinedTableCells()); +} + +TEST_F(StructuredTableTest, VerifyLinedTableHorizontalFail) { + for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + InsertLines(); + InsertCellsInLines(); + InsertPartition(101, 11, 299, 19); + table_->set_bounding_box(line_box_); + EXPECT_FALSE(table_->VerifyLinedTableCells()); +} + +TEST_F(StructuredTableTest, VerifyLinedTableVerticalFail) { + for (int y = 10; y <= 50; y += 10) table_->InjectCellY(y); + for (int x = 100; x <= 450; x += 50) table_->InjectCellX(x); + InsertLines(); + InsertCellsInLines(); + InsertPartition(151, 21, 199, 39); + table_->set_bounding_box(line_box_); + EXPECT_FALSE(table_->VerifyLinedTableCells()); +} + +TEST_F(StructuredTableTest, FindWhitespacedColumnsBasic) { + InsertPartitions(); + TBOX guess(0, 0, 500, 800); + table_->set_bounding_box(guess); + table_->FindWhitespacedColumns(); + table_->ExpectCellX(1, 25, 25, 475, 499); +} + +TEST_F(StructuredTableTest, FindWhitespacedColumnsSorted) { + InsertPartitions(); + TBOX guess(0, 0, 500, 800); + table_->set_bounding_box(guess); + table_->FindWhitespacedColumns(); + table_->ExpectSortedX(); +} + +// TODO(nbeato): check failure cases +// TODO(nbeato): check Recognize processes correctly on trivial real examples. + +} // namespace diff --git a/tesseract/unittest/tabvector_test.cc b/tesseract/unittest/tabvector_test.cc new file mode 100644 index 00000000..dab0ace8 --- /dev/null +++ b/tesseract/unittest/tabvector_test.cc @@ -0,0 +1,130 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <memory> + +#include "tabvector.h" + +#include "include_gunit.h" + +namespace tesseract { + +class TabVectorTest : public testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + vector_.reset(); + } + + void TearDown() {} + + void MakeSimpleTabVector(int x1, int y1, int x2, int y2) { + vector_.reset(new TabVector()); + vector_->set_startpt(ICOORD(x1, y1)); + vector_->set_endpt(ICOORD(x2, y2)); + } + + std::unique_ptr<TabVector> vector_; +}; + +TEST_F(TabVectorTest, SetStartEndPointsMatch) { + vector_.reset(new TabVector()); + ICOORD start(51, 65); + ICOORD end(7568, 234); + // Test coordinates individually to avoid adding an ostream operator + // explicitly to the ICOORD class (Droid doesn't support it). + vector_->set_startpt(start); + EXPECT_EQ(start.x(), vector_->startpt().x()); + EXPECT_EQ(start.y(), vector_->startpt().y()); + vector_->set_endpt(end); + EXPECT_EQ(end.x(), vector_->endpt().x()); + EXPECT_EQ(end.y(), vector_->endpt().y()); +} + +TEST_F(TabVectorTest, XAtY45DegreeSlopeInRangeExact) { + MakeSimpleTabVector(0, 0, 100, 100); + for (int y = 0; y <= 100; ++y) { + int x = vector_->XAtY(y); + EXPECT_EQ(y, x); + } +} + +TEST_F(TabVectorTest, XAtYVerticalInRangeExact) { + const int x = 120; // Arbitrary choice + MakeSimpleTabVector(x, 0, x, 100); + for (int y = 0; y <= 100; ++y) { + int result_x = vector_->XAtY(y); + EXPECT_EQ(x, result_x); + } +} + +TEST_F(TabVectorTest, XAtYHorizontal) { + const int y = 76; // arbitrary + MakeSimpleTabVector(0, y, 100, y); + EXPECT_EQ(0, vector_->XAtY(y)); + // TODO(nbeato): What's the failure condition? + // Undefined! Should not pass! Allow until resolved answer. + EXPECT_EQ(0, vector_->XAtY(10)); +} + +TEST_F(TabVectorTest, XAtYRoundingSimple) { + MakeSimpleTabVector(0, 0, 2, 10000); + int x = vector_->XAtY(1); + EXPECT_EQ(0, x); + x = vector_->XAtY(4999); + EXPECT_EQ(0, x); + x = vector_->XAtY(5001); + EXPECT_EQ(1, x); + x = vector_->XAtY(9999); + EXPECT_EQ(1, x); +} + +TEST_F(TabVectorTest, XAtYLargeNumbers) { + // Assume a document is 800 DPI, + // the width of a page is 10 inches across (8000 pixels), and + // the height of the page is 15 inches (12000 pixels). 
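+  // Worked interpolation for the values used below: on the segment
+  // (7804, 504) -> (7968, 11768),
+  // XAtY(6136) = 7804 + (7968 - 7804) * (6136 - 504) / (11768 - 504)
+  //            = 7804 + 164 * 5632 / 11264 = 7804 + 82 = 7886.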
+ MakeSimpleTabVector(7804, 504, 7968, 11768); // Arbitrary for vertical line + int x = vector_->XAtY(6136); // test mid point + EXPECT_EQ(7886, x); +} + +TEST_F(TabVectorTest, XAtYHorizontalInRangeExact) { + const int y = 120; // Arbitrary choice + MakeSimpleTabVector(50, y, 150, y); + + int x = vector_->XAtY(y); + EXPECT_EQ(50, x); +} + +TEST_F(TabVectorTest, VOverlapInRangeSimple) { + MakeSimpleTabVector(0, 0, 100, 100); + int overlap = vector_->VOverlap(90, 10); + EXPECT_EQ(80, overlap); + overlap = vector_->VOverlap(100, 0); + EXPECT_EQ(100, overlap); +} + +TEST_F(TabVectorTest, VOverlapOutOfRange) { + MakeSimpleTabVector(0, 10, 100, 90); + int overlap = vector_->VOverlap(100, 0); + EXPECT_EQ(80, overlap); +} + +TEST_F(TabVectorTest, XYFlip) { + MakeSimpleTabVector(1, 2, 3, 4); + vector_->XYFlip(); + EXPECT_EQ(2, vector_->startpt().x()); + EXPECT_EQ(1, vector_->startpt().y()); + EXPECT_EQ(4, vector_->endpt().x()); + EXPECT_EQ(3, vector_->endpt().y()); +} + +} // namespace diff --git a/tesseract/unittest/tatweel_test.cc b/tesseract/unittest/tatweel_test.cc new file mode 100644 index 00000000..4bd8b337 --- /dev/null +++ b/tesseract/unittest/tatweel_test.cc @@ -0,0 +1,114 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(_WIN32) +#include <io.h> // for _access +#else +#include <unistd.h> // for access +#endif + +#include "include_gunit.h" +#include "dawg.h" +#include "trie.h" +#include "unicharset.h" +#ifdef INCLUDE_TENSORFLOW +#include "util/utf8/unicodetext.h" // for UnicodeText +#endif + +namespace tesseract { + +// Replacement for std::filesystem::exists (C++-17) +static bool file_exists(const char* filename) { +#if defined(_WIN32) + return _access(filename, 0) == 0; +#else + return access(filename, 0) == 0; +#endif +} + +class TatweelTest : public ::testing::Test { + protected: + void SetUp() override { + static std::locale system_locale(""); + std::locale::global(system_locale); + } + + TatweelTest() { +#ifdef INCLUDE_TENSORFLOW + std::string filename = TestDataNameToPath("ara.wordlist"); + if (file_exists(filename.c_str())) { + std::string wordlist(u8"\u0640"); + CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults())); + // Put all the unicodes in the unicharset_. + UnicodeText text; + text.PointToUTF8(wordlist.data(), wordlist.size()); + int num_tatweel = 0; + for (auto it = text.begin(); it != text.end(); ++it) { + std::string utf8 = it.get_utf8_string(); + if (utf8.find(u8"\u0640") != std::string::npos) ++num_tatweel; + unicharset_.unichar_insert(utf8.c_str()); + } + LOG(INFO) << "Num tatweels in source data=" << num_tatweel; + EXPECT_GT(num_tatweel, 0); + } +#endif + } + + std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTDATA_DIR, name); + } + UNICHARSET unicharset_; +}; + +TEST_F(TatweelTest, UnicharsetIgnoresTatweel) { + // This test verifies that the unicharset ignores the Tatweel character. 
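+  // U+0640 is the Arabic tatweel (kashida) elongation mark. Insertion in the
+  // fixture is expected to have stripped it, so no unichar in the set should
+  // still contain that code point.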
+ for (int i = 0; i < unicharset_.size(); ++i) { + const char* utf8 = unicharset_.id_to_unichar(i); + EXPECT_EQ(strstr(utf8, u8"\u0640"), nullptr); + } +} + +TEST_F(TatweelTest, DictIgnoresTatweel) { + // This test verifies that the dictionary ignores the Tatweel character. + tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, + unicharset_.size(), 0); + std::string filename = TestDataNameToPath("ara.wordlist"); + if (!file_exists(filename.c_str())) { + LOG(INFO) << "Skip test because of missing " << filename; + GTEST_SKIP(); + } else { + EXPECT_TRUE(trie.read_and_add_word_list( + filename.c_str(), unicharset_, + tesseract::Trie::RRP_REVERSE_IF_HAS_RTL)); + EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false)); + } +} + +TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) { + // This test verifies that a load of an existing unicharset keeps any + // existing tatweel for backwards compatibility. + std::string filename = TestDataNameToPath("ara.unicharset"); + if (!file_exists(filename.c_str())) { + LOG(INFO) << "Skip test because of missing " << filename; + GTEST_SKIP(); + } else { + EXPECT_TRUE(unicharset_.load_from_file(filename.c_str())); + int num_tatweel = 0; + for (int i = 0; i < unicharset_.size(); ++i) { + const char* utf8 = unicharset_.id_to_unichar(i); + if (strstr(utf8, u8"\u0640") != nullptr) ++num_tatweel; + } + LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel; + EXPECT_EQ(num_tatweel, 4); + } +} + +} // namespace diff --git a/tesseract/unittest/tesseract_leaksanitizer.supp b/tesseract/unittest/tesseract_leaksanitizer.supp new file mode 100644 index 00000000..6cc39999 --- /dev/null +++ b/tesseract/unittest/tesseract_leaksanitizer.supp @@ -0,0 +1,12 @@ +# Suppress memory leaks. +# Use with LSAN_OPTIONS=suppressions=tesseract_lsan.supp +leak:FcLangSetCreate +leak:FcPatternObjectAddWithBinding +leak:FcPatternObjectInsertElt +leak:FcValueListAppend +leak:FcValueListDuplicate +leak:FcValueListPrepend +leak:IA__FcLangSetCreate +leak:IA__FcValueSave +leak:libfontconfig.so +leak:libfreetype.so diff --git a/tesseract/unittest/textlineprojection_test.cc b/tesseract/unittest/textlineprojection_test.cc new file mode 100644 index 00000000..f8423615 --- /dev/null +++ b/tesseract/unittest/textlineprojection_test.cc @@ -0,0 +1,262 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <allheaders.h> +#include <string> // for std::string + +#include "absl/strings/str_format.h" // for absl::StrFormat +#include "include_gunit.h" + +#include <tesseract/baseapi.h> +#include "colfind.h" +#include "log.h" // for LOG +#include "mutableiterator.h" +#include <tesseract/osdetect.h> +#include "pageres.h" +#include "tesseractclass.h" +#include "textlineprojection.h" + +namespace tesseract { + +// Minimum score for a STRONG_CHAIN textline. +// NOTE: Keep in sync with textlineprojection.cc. +const int kMinStrongTextValue = 6; + +// The fixture for testing Tesseract. 
+class TextlineProjectionTest : public testing::Test { + protected: + std::string OutputNameToPath(const std::string& name) { + file::MakeTmpdir(); + return file::JoinPath(FLAGS_test_tmpdir, name); + } + + TextlineProjectionTest() { + src_pix_ = nullptr; + bin_pix_ = nullptr; + tesseract_ = nullptr; + finder_ = nullptr; + denorm_ = nullptr; + projection_ = nullptr; + } + virtual ~TextlineProjectionTest() { + pixDestroy(&src_pix_); + pixDestroy(&bin_pix_); + delete finder_; + delete tesseract_; + } + + void SetImage(const char* filename) { + pixDestroy(&src_pix_); + src_pix_ = pixRead(file::JoinPath(TESTING_DIR, filename).c_str()); + api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY); + api_.SetPageSegMode(tesseract::PSM_AUTO_OSD); + api_.SetImage(src_pix_); + } + + // Ugly hacked-together function sets up projection_ and denorm_ by setting + // up for auto pagelayout, setting up a ColumnFinder, running it, and + // using accessors to get at the internal denorm and projection. + // If the coordinates have been rotated, the denorm should match + // correctly and transform coordinates back to the projection. + // We throw away all the blocks, blobs etc, and test the projection with + // the resultiterator from a separate BaseAPI run. + void SetupProjection() { + tesseract::TessdataManager mgr; + Tesseract* osd_tess = new Tesseract; + OSResults osr; + EXPECT_EQ(osd_tess->init_tesseract(TESSDATA_DIR, nullptr, "osd", + tesseract::OEM_TESSERACT_ONLY, nullptr, 0, + nullptr, nullptr, false, &mgr), + 0); + tesseract_ = new Tesseract; + EXPECT_EQ(tesseract_->init_tesseract(TESSDATA_DIR, nullptr, "eng", + tesseract::OEM_TESSERACT_ONLY, nullptr, 0, + nullptr, nullptr, false, &mgr), + 0); + bin_pix_ = api_.GetThresholdedImage(); + *tesseract_->mutable_pix_binary() = pixClone(bin_pix_); + osd_tess->set_source_resolution(api_.tesseract()->source_resolution()); + tesseract_->set_source_resolution(api_.tesseract()->source_resolution()); + int width = pixGetWidth(bin_pix_); + int height = pixGetHeight(bin_pix_); + // First make a single block covering the whole image. + BLOCK* block = new BLOCK("", true, 0, 0, 0, 0, width, height); + block->set_right_to_left(false); + BLOCK_LIST src_blocks; + BLOCK_IT block_it(&src_blocks); + block_it.add_to_end(block); + Pix* photomask_pix = nullptr; + // The blocks made by the ColumnFinder. Moved to blocks before return. + BLOCK_LIST found_blocks; + TO_BLOCK_LIST temp_blocks; + finder_ = tesseract_->SetupPageSegAndDetectOrientation( + tesseract::PSM_AUTO_OSD, &src_blocks, osd_tess, &osr, &temp_blocks, + &photomask_pix, nullptr); + TO_BLOCK_IT to_block_it(&temp_blocks); + TO_BLOCK* to_block = to_block_it.data(); + denorm_ = finder_->denorm(); + TO_BLOCK_LIST to_blocks; + BLOBNBOX_LIST diacritic_blobs; + EXPECT_GE(finder_->FindBlocks(tesseract::PSM_AUTO, nullptr, 1, to_block, + photomask_pix, nullptr, nullptr, nullptr, + &found_blocks, &diacritic_blobs, &to_blocks), + 0); + projection_ = finder_->projection(); + pixDestroy(&photomask_pix); + delete osd_tess; + } + + // Helper evaluates the given box, expects the result to be greater_than + // or !greater_than the target_value and provides diagnostics if not. + void EvaluateBox(const TBOX& box, bool greater_or_equal, int target_value, + const char* text, const char* message) { + int value = projection_->EvaluateBox(box, denorm_, false); + if (greater_or_equal != (value > target_value)) { + LOG(INFO) << absl::StrFormat( + "EvaluateBox too %s:%d vs %d for %s word '%s' at:", + greater_or_equal ? 
"low" : "high", value, target_value, message, + text); + box.print(); + value = projection_->EvaluateBox(box, denorm_, true); + } else { + LOG(INFO) << absl::StrFormat("EvaluateBox OK(%d) for %s word '%s'", + value, message, text); + } + if (greater_or_equal) { + EXPECT_GE(value, target_value); + } else { + EXPECT_LT(value, target_value); + } + } + + // Helper evaluates the DistanceOfBoxFromBox function by expecting that + // box should be nearer to true_box than false_box. + void EvaluateDistance(const TBOX& box, const TBOX& true_box, + const TBOX& false_box, const char* text, + const char* message) { + int true_dist = + projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, false); + int false_dist = + projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, false); + if (false_dist <= true_dist) { + LOG(INFO) << absl::StrFormat( + "Distance wrong:%d vs %d for %s word '%s' at:", + false_dist, true_dist, message, text); + true_box.print(); + projection_->DistanceOfBoxFromBox(box, true_box, true, denorm_, true); + projection_->DistanceOfBoxFromBox(box, false_box, true, denorm_, true); + } else { + LOG(INFO) << absl::StrFormat("Distance OK(%d vs %d) for %s word '%s'", + false_dist, true_dist, message, text); + } + } + + // Tests the projection on the word boxes of the given image. + // line_height is the cap + descender size of the text. + void VerifyBoxes(const char* imagefile, int line_height) { + SetImage(imagefile); + api_.Recognize(nullptr); + SetupProjection(); + MutableIterator* it = api_.GetMutableIterator(); + do { + char* text = it->GetUTF8Text(tesseract::RIL_WORD); + const PAGE_RES_IT* pr_it = it->PageResIt(); + WERD_RES* word = pr_it->word(); + // The word_box refers to the internal, possibly rotated, coords. + TBOX word_box = word->word->bounding_box(); + bool small_word = word_box.height() * 1.5 < line_height; + bool tall_word = word_box.height() * 1.125 > line_height; + // We pad small and tall words differently because ascenders and + // descenders affect the position and size of the upper/lower boxes. + int padding; + if (small_word) { + padding = word_box.height(); + } else if (tall_word) { + padding = word_box.height() / 3; + } else { + padding = word_box.height() / 2; + } + // Test that the word box gets a good score. + EvaluateBox(word_box, true, kMinStrongTextValue, text, "Real Word"); + + // Now test a displaced box, both above and below the word. + TBOX upper_box(word_box); + upper_box.set_bottom(word_box.top()); + upper_box.set_top(word_box.top() + padding); + EvaluateBox(upper_box, false, kMinStrongTextValue, text, "Upper Word"); + EvaluateBox(upper_box, true, -1, text, "Upper Word not vertical"); + TBOX lower_box = word_box; + lower_box.set_top(word_box.bottom()); + lower_box.set_bottom(word_box.bottom() - padding); + if (tall_word) lower_box.move(ICOORD(0, padding / 2)); + EvaluateBox(lower_box, false, kMinStrongTextValue, text, "Lower Word"); + EvaluateBox(lower_box, true, -1, text, "Lower Word not vertical"); + + // Since some words have no text below and some words have no text above + // check that at least one of the boxes satisfies BoxOutOfTextline. 
+ bool upper_or_lower_out_of_textline = + projection_->BoxOutOfHTextline(upper_box, denorm_, false) || + projection_->BoxOutOfHTextline(lower_box, denorm_, false); + if (!upper_or_lower_out_of_textline) { + projection_->BoxOutOfHTextline(upper_box, denorm_, true); + projection_->BoxOutOfHTextline(lower_box, denorm_, true); + } + EXPECT_TRUE(upper_or_lower_out_of_textline); + + // Now test DistanceOfBoxFromBox by faking a challenger word, and asking + // that each pad box be nearer to its true textline than the + // challenger. Due to the tight spacing of latin text, getting + // the right position and size of these test boxes is quite fiddly. + padding = line_height / 4; + upper_box.set_top(upper_box.bottom() + padding); + TBOX target_box(word_box); + if (!small_word) { + upper_box.move(ICOORD(0, -padding * 3 / 2)); + } + target_box.set_top(upper_box.bottom()); + TBOX upper_challenger(upper_box); + upper_challenger.set_bottom(upper_box.top()); + upper_challenger.set_top(upper_box.top() + word_box.height()); + EvaluateDistance(upper_box, target_box, upper_challenger, text, + "Upper Word"); + if (tall_word) lower_box.move(ICOORD(0, padding / 2)); + lower_box.set_bottom(lower_box.top() - padding); + target_box = word_box; + target_box.set_bottom(lower_box.top()); + TBOX lower_challenger(lower_box); + lower_challenger.set_top(lower_box.bottom()); + lower_challenger.set_bottom(lower_box.bottom() - word_box.height()); + EvaluateDistance(lower_box, target_box, lower_challenger, text, + "Lower Word"); + + delete[] text; + } while (it->Next(tesseract::RIL_WORD)); + delete it; + } + + Pix* src_pix_; + Pix* bin_pix_; + BLOCK_LIST blocks_; + std::string ocr_text_; + tesseract::TessBaseAPI api_; + Tesseract* tesseract_; + ColumnFinder* finder_; + const DENORM* denorm_; + const TextlineProjection* projection_; +}; + +// Tests all word boxes on an unrotated image. +TEST_F(TextlineProjectionTest, Unrotated) { VerifyBoxes("phototest.tif", 31); } + +// Tests character-level applyboxes on italic Times New Roman. +TEST_F(TextlineProjectionTest, Rotated) { VerifyBoxes("phototestrot.tif", 31); } + +} // namespace diff --git a/tesseract/unittest/tfile_test.cc b/tesseract/unittest/tfile_test.cc new file mode 100644 index 00000000..166405ff --- /dev/null +++ b/tesseract/unittest/tfile_test.cc @@ -0,0 +1,179 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "genericvector.h" +#include "serialis.h" + +#include "include_gunit.h" + +namespace tesseract { + +// Tests TFile and std::vector serialization by serializing and +// writing/reading. + +class TfileTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + } + + TfileTest() {} + + // Some data to serialize. + class MathData { + public: + MathData() : num_squares_(0), num_triangles_(0) {} + void Setup() { + // Setup some data. 
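+      // squares_ holds s * s for s in [0, 42); triangles_ holds the
+      // triangular numbers t * (t + 1) / 2 for t in [0, 52).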
+ for (int s = 0; s < 42; ++s) squares_.push_back(s * s); + num_squares_ = squares_.size(); + for (int t = 0; t < 52; ++t) triangles_.push_back(t * (t + 1) / 2); + num_triangles_ = triangles_.size(); + } + void ExpectEq(const MathData& other) { + // Check the data. + EXPECT_EQ(num_squares_, other.num_squares_); + for (int s = 0; s < squares_.size(); ++s) + EXPECT_EQ(squares_[s], other.squares_[s]); + EXPECT_EQ(num_triangles_, other.num_triangles_); + for (int s = 0; s < triangles_.size(); ++s) + EXPECT_EQ(triangles_[s], other.triangles_[s]); + } + bool Serialize(TFile* fp) { + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + if (!squares_.Serialize(fp)) return false; + if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (!triangles_.Serialize(fp)) return false; + return true; + } + bool DeSerialize(TFile* fp) { + if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) + return false; + if (!squares_.DeSerialize(fp)) return false; + if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (!triangles_.DeSerialize(fp)) return false; + return true; + } + bool SerializeBigEndian(TFile* fp) { + ReverseN(&num_squares_, sizeof(num_squares_)); + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + // Write an additional reversed size before the vector, which will get + // used as its size on reading. + if (fp->FWrite(&num_squares_, sizeof(num_squares_), 1) != 1) return false; + for (int i = 0; i < squares_.size(); ++i) + ReverseN(&squares_[i], sizeof(squares_[i])); + if (!squares_.Serialize(fp)) return false; + ReverseN(&num_triangles_, sizeof(num_triangles_)); + if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (fp->FWrite(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + for (int i = 0; i < triangles_.size(); ++i) + ReverseN(&triangles_[i], sizeof(triangles_[i])); + return triangles_.Serialize(fp); + } + bool DeSerializeBigEndian(TFile* fp) { + if (fp->FReadEndian(&num_squares_, sizeof(num_squares_), 1) != 1) + return false; + if (!squares_.DeSerialize(fp)) return false; + // The first element is the size that was written, so we will delete it + // and read the last element separately. + int last_element; + if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1) + return false; + squares_.remove(0); + squares_.push_back(last_element); + if (fp->FReadEndian(&num_triangles_, sizeof(num_triangles_), 1) != 1) + return false; + if (!triangles_.DeSerialize(fp)) return false; + if (fp->FReadEndian(&last_element, sizeof(last_element), 1) != 1) + return false; + triangles_.remove(0); + triangles_.push_back(last_element); + return true; + } + + private: + GenericVector<int> squares_; + int num_squares_; + GenericVector<int> triangles_; + int num_triangles_; + }; +}; + +TEST_F(TfileTest, Serialize) { + // This test verifies that Tfile can serialize a class. + MathData m1; + m1.Setup(); + std::vector<char> data; + TFile fpw; + fpw.OpenWrite(&data); + EXPECT_TRUE(m1.Serialize(&fpw)); + TFile fpr; + EXPECT_TRUE(fpr.Open(&data[0], data.size())); + MathData m2; + EXPECT_TRUE(m2.DeSerialize(&fpr)); + m1.ExpectEq(m2); + MathData m3; + EXPECT_FALSE(m3.DeSerialize(&fpr)); + fpr.Rewind(); + EXPECT_TRUE(m3.DeSerialize(&fpr)); + m1.ExpectEq(m3); +} + +TEST_F(TfileTest, FGets) { + // This test verifies that Tfile can interleave FGets with binary data. 
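+  // Layout written below: one serialized MathData, then a plain text line,
+  // then the same MathData again. FGets has to pick the text line out from
+  // between the two binary blocks.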
+ MathData m1; + std::string line_str = "This is a textline with a newline\n"; + m1.Setup(); + std::vector<char> data; + TFile fpw; + fpw.OpenWrite(&data); + EXPECT_TRUE(m1.Serialize(&fpw)); + EXPECT_EQ(1, fpw.FWrite(line_str.data(), line_str.size(), 1)); + EXPECT_TRUE(m1.Serialize(&fpw)); + // Now get back the 2 copies of m1 with the line in between. + TFile fpr; + EXPECT_TRUE(fpr.Open(&data[0], data.size())); + MathData m2; + EXPECT_TRUE(m2.DeSerialize(&fpr)); + m1.ExpectEq(m2); + const int kBufsize = 1024; + char buffer[kBufsize + 1]; + EXPECT_EQ(buffer, fpr.FGets(buffer, kBufsize)); + EXPECT_STREQ(line_str.c_str(), buffer); + MathData m3; + EXPECT_TRUE(m3.DeSerialize(&fpr)); + m1.ExpectEq(m3); +} + +TEST_F(TfileTest, BigEndian) { + // This test verifies that Tfile can auto-reverse big-endian data. + MathData m1; + m1.Setup(); + std::vector<char> data; + TFile fpw; + fpw.OpenWrite(&data); + EXPECT_TRUE(m1.SerializeBigEndian(&fpw)); + TFile fpr; + EXPECT_TRUE(fpr.Open(&data[0], data.size())); + fpr.set_swap(true); + MathData m2; + EXPECT_TRUE(m2.DeSerializeBigEndian(&fpr)); + // That serialize was destructive, so test against a fresh MathData. + MathData m3; + m3.Setup(); + m3.ExpectEq(m2); +} + +} // namespace diff --git a/tesseract/unittest/third_party/utf/rune.c b/tesseract/unittest/third_party/utf/rune.c new file mode 100644 index 00000000..3d860570 --- /dev/null +++ b/tesseract/unittest/third_party/utf/rune.c @@ -0,0 +1,357 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#include <stdarg.h> +#include <string.h> +#include "third_party/utf/utf.h" +#include "third_party/utf/utfdef.h" + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + Bad = Runeerror, +}; + +/* + * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 + * This is a slower but "safe" version of the old chartorune + * that works on strings that are not necessarily null-terminated. + * + * If you know for sure that your string is null-terminated, + * chartorune will be a bit faster. + * + * It is guaranteed not to attempt to access "length" + * past the incoming pointer. This is to avoid + * possible access violations. 
If the string appears to be + * well-formed but incomplete (i.e., to get the whole Rune + * we'd need to read past str+length) then we'll set the Rune + * to Bad and return 0. + * + * Note that if we have decoding problems for other + * reasons, we return 1 instead of 0. + */ +int +charntorune(Rune *rune, const char *str, int length) +{ + int c, c1, c2, c3; + long l; + + /* When we're not allowed to read anything */ + if(length <= 0) { + goto badlen; + } + + /* + * one character sequence (7-bit value) + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + // If we can't read more than one character we must stop + if(length <= 1) { + goto badlen; + } + + /* + * two character sequence (11-bit value) + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + // If we can't read more than two characters we must stop + if(length <= 2) { + goto badlen; + } + + /* + * three character sequence (16-bit value) + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + if (length <= 3) + goto badlen; + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } + + // Support for 5-byte or longer UTF-8 would go here, but + // since we don't have that, we'll just fall through to bad. + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +badlen: + *rune = Bad; + return 0; + +} + + +/* + * This is the older "unsafe" version, which works fine on + * null-terminated strings. + */ +int +chartorune(Rune *rune, const char *str) +{ + int c, c1, c2, c3; + long l; + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { + *consumed = charntorune(rune, str, length); + return *rune != Runeerror || *consumed == 3; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. 
*/ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +runenlen(const Rune *r, int nrune) +{ + int nb; + ulong c; /* Rune is signed, so use unsigned for range check. */ + + nb = 0; + while(nrune--) { + c = *r++; + if (c <= Rune1) + nb++; + else if (c <= Rune2) + nb += 2; + else if (c <= Rune3) + nb += 3; + else if (c <= Runemax) + nb += 4; + else + nb += 3; /* Runeerror = 0xFFFD, see runetochar */ + } + return nb; +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(uchar*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} diff --git a/tesseract/unittest/third_party/utf/utf.h b/tesseract/unittest/third_party/utf/utf.h new file mode 100644 index 00000000..06982e58 --- /dev/null +++ b/tesseract/unittest/third_party/utf/utf.h @@ -0,0 +1,246 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#ifndef _UTFH_ +#define _UTFH_ 1 + +#include <stdint.h> + +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * rune routines + */ + +/* + * These routines were written by Rob Pike and Ken Thompson + * and first appeared in Plan 9. + * SEE ALSO + * utf (7) + * tcs (1) +*/ + +// runetochar copies (encodes) one rune, pointed to by r, to at most +// UTFmax bytes starting at s and returns the number of bytes generated. 
+ +int runetochar(char* s, const Rune* r); + + +// chartorune copies (decodes) at most UTFmax bytes starting at s to +// one rune, pointed to by r, and returns the number of bytes consumed. +// If the input is not exactly in UTF format, chartorune will set *r +// to Runeerror and return 1. +// +// Note: There is no special case for a "null-terminated" string. A +// string whose first byte has the value 0 is the UTF8 encoding of the +// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal +// anywhere else in a UTF sequence. + +int chartorune(Rune* r, const char* s); + + +// charntorune is like chartorune, except that it will access at most +// n bytes of s. If the UTF sequence is incomplete within n bytes, +// charntorune will set *r to Runeerror and return 0. If it is complete +// but not in UTF format, it will set *r to Runeerror and return 1. +// +// Added 2004-09-24 by Wei-Hwa Huang + +int charntorune(Rune* r, const char* s, int n); + +// isvalidcharntorune(str, n, r, consumed) +// is a convenience function that calls "*consumed = charntorune(r, str, n)" +// and returns an int (logically boolean) indicating whether the first +// n bytes of str was a valid and complete UTF sequence. + +int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed); + +// runelen returns the number of bytes required to convert r into UTF. + +int runelen(Rune r); + + +// runenlen returns the number of bytes required to convert the n +// runes pointed to by r into UTF. + +int runenlen(const Rune* r, int n); + + +// fullrune returns 1 if the string s of length n is long enough to be +// decoded by chartorune, and 0 otherwise. This does not guarantee +// that the string contains a legal UTF encoding. This routine is used +// by programs that obtain input one byte at a time and need to know +// when a full rune has arrived. + +int fullrune(const char* s, int n); + +// The following routines are analogous to the corresponding string +// routines with "utf" substituted for "str", and "rune" substituted +// for "chr". + +// utflen returns the number of runes that are represented by the UTF +// string s. (cf. strlen) + +int utflen(const char* s); + + +// utfnlen returns the number of complete runes that are represented +// by the first n bytes of the UTF string s. If the last few bytes of +// the string contain an incompletely coded rune, utfnlen will not +// count them; in this way, it differs from utflen, which includes +// every byte of the string. (cf. strnlen) + +int utfnlen(const char* s, long n); + + +// utfrune returns a pointer to the first occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strchr) + +const char* utfrune(const char* s, Rune r); + + +// utfrrune returns a pointer to the last occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strrchr) + +const char* utfrrune(const char* s, Rune r); + + +// utfutf returns a pointer to the first occurrence of the UTF string +// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the +// null string, utfutf returns s1. (cf. strstr) + +const char* utfutf(const char* s1, const char* s2); + + +// utfecpy copies UTF sequences until a null sequence has been copied, +// but writes no sequences beyond es1. 
If any sequences are copied, +// s1 is terminated by a null sequence, and a pointer to that sequence +// is returned. Otherwise, the original s1 is returned. (cf. strecpy) + +char* utfecpy(char *s1, char *es1, const char *s2); + + + +// These functions are rune-string analogues of the corresponding +// functions in strcat (3). +// +// These routines first appeared in Plan 9. +// SEE ALSO +// memmove (3) +// rune (3) +// strcat (2) +// +// BUGS: The outcome of overlapping moves varies among implementations. + +Rune* runestrcat(Rune* s1, const Rune* s2); +Rune* runestrncat(Rune* s1, const Rune* s2, long n); + +const Rune* runestrchr(const Rune* s, Rune c); + +int runestrcmp(const Rune* s1, const Rune* s2); +int runestrncmp(const Rune* s1, const Rune* s2, long n); + +Rune* runestrcpy(Rune* s1, const Rune* s2); +Rune* runestrncpy(Rune* s1, const Rune* s2, long n); +Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); + +Rune* runestrdup(const Rune* s); + +const Rune* runestrrchr(const Rune* s, Rune c); +long runestrlen(const Rune* s); +const Rune* runestrstr(const Rune* s1, const Rune* s2); + + + +// The following routines test types and modify cases for Unicode +// characters. Unicode defines some characters as letters and +// specifies three cases: upper, lower, and title. Mappings among the +// cases are also defined, although they are not exhaustive: some +// upper case letters have no lower case mapping, and so on. Unicode +// also defines several character properties, a subset of which are +// checked by these routines. These routines are based on Unicode +// version 3.0.0. +// +// NOTE: The routines are implemented in C, so the boolean functions +// (e.g., isupperrune) return 0 for false and 1 for true. +// +// +// toupperrune, tolowerrune, and totitlerune are the Unicode case +// mappings. These routines return the character unchanged if it has +// no defined mapping. + +Rune toupperrune(Rune r); +Rune tolowerrune(Rune r); +Rune totitlerune(Rune r); + + +// isupperrune tests for upper case characters, including Unicode +// upper case letters and targets of the toupper mapping. islowerrune +// and istitlerune are defined analogously. + +int isupperrune(Rune r); +int islowerrune(Rune r); +int istitlerune(Rune r); + + +// isalpharune tests for Unicode letters; this includes ideographs in +// addition to alphabetic characters. + +int isalpharune(Rune r); + + +// isdigitrune tests for digits. Non-digit numbers, such as Roman +// numerals, are not included. + +int isdigitrune(Rune r); + + +// isideographicrune tests for ideographic characters and numbers, as +// defined by the Unicode standard. + +int isideographicrune(Rune r); + + +// isspacerune tests for whitespace characters, including "C" locale +// whitespace, Unicode defined whitespace, and the "zero-width +// non-break space" character. + +int isspacerune(Rune r); + + +// (The comments in this file were copied from the manpage files rune.3, +// isalpharune.3, and runestrcat.3. Some formatting changes were also made +// to conform to Google style. 
/JRM 11/11/05) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tesseract/unittest/third_party/utf/utfdef.h b/tesseract/unittest/third_party/utf/utfdef.h new file mode 100644 index 00000000..4b58ae87 --- /dev/null +++ b/tesseract/unittest/third_party/utf/utfdef.h @@ -0,0 +1,14 @@ +#define uchar _utfuchar +#define ushort _utfushort +#define uint _utfuint +#define ulong _utfulong +#define vlong _utfvlong +#define uvlong _utfuvlong + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; + +#define nelem(x) (sizeof(x)/sizeof((x)[0])) +#define nil ((void*)0) diff --git a/tesseract/unittest/unichar_test.cc b/tesseract/unittest/unichar_test.cc new file mode 100644 index 00000000..54394436 --- /dev/null +++ b/tesseract/unittest/unichar_test.cc @@ -0,0 +1,43 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "gmock/gmock.h" // for testing::ElementsAreArray +#include <tesseract/unichar.h> + +namespace tesseract { + +TEST(UnicharTest, Conversion) { + // This test verifies that Unichar::UTF8ToUTF32 and Unichar::UTF32ToUTF8 + // show the required conversion properties. + // Test for round-trip utf8-32-8 for 1, 2, 3 and 4 byte codes. + const char* kUTF8Src = "a\u05d0\u0ca4\U0002a714"; + const std::vector<char32> kUTF32Src = {'a', 0x5d0, 0xca4, 0x2a714}; + // Check for round-trip conversion. + std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src); + EXPECT_THAT(utf32, testing::ElementsAreArray(kUTF32Src)); + std::string utf8 = UNICHAR::UTF32ToUTF8(utf32); + EXPECT_STREQ(kUTF8Src, utf8.c_str()); +} + +TEST(UnicharTest, InvalidText) { + // This test verifies that Unichar correctly deals with invalid text. + const char* kInvalidUTF8 = "a b\200d string"; + const std::vector<char32> kInvalidUTF32 = {'a', ' ', 0x200000, 'x'}; + // Invalid utf8 produces an empty vector. + std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8); + EXPECT_TRUE(utf32.empty()); + // Invalid utf32 produces an empty string. + std::string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32); + EXPECT_TRUE(utf8.empty()); +} + +} // namespace diff --git a/tesseract/unittest/unicharcompress_test.cc b/tesseract/unittest/unicharcompress_test.cc new file mode 100644 index 00000000..1777930e --- /dev/null +++ b/tesseract/unittest/unicharcompress_test.cc @@ -0,0 +1,257 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
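The length-bounded decoder in third_party/utf above is designed for buffers that are not null-terminated and may end mid-sequence. A short hedged sketch of how a caller might walk such a buffer with charntorune (the DumpRunes helper and its output format are illustrative only; the API names come from utf.h):

```cpp
#include <cstdio>

#include "third_party/utf/utf.h"

// Decodes a buffer that is not null-terminated and may be truncated.
// charntorune() never reads past buf + len: a return of 0 (with *r set to
// Runeerror) marks an incomplete trailing sequence, while a return of 1 with
// *r == Runeerror marks malformed input that is skipped one byte at a time.
static void DumpRunes(const char* buf, int len) {
  int i = 0;
  while (i < len) {
    Rune r;
    const int consumed = charntorune(&r, buf + i, len - i);
    if (consumed == 0) break;  // incomplete sequence at the end of the buffer
    std::printf("U+%04X (%d byte%s)\n", static_cast<unsigned>(r), consumed,
                consumed == 1 ? "" : "s");
    i += consumed;
  }
}
```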
+ +#include <string> + +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_split.h" +#include "allheaders.h" + +#include "include_gunit.h" +#include "log.h" // for LOG +#include "serialis.h" +#include "tprintf.h" +#include "unicharcompress.h" + +namespace tesseract { + +class UnicharcompressTest : public ::testing::Test { + protected: + void SetUp() { + std::locale::global(std::locale("")); + file::MakeTmpdir(); + } + + // Loads and compresses the given unicharset. + void LoadUnicharset(const std::string& unicharset_name) { + std::string radical_stroke_file = + file::JoinPath(LANGDATA_DIR, "radical-stroke.txt"); + std::string unicharset_file = + file::JoinPath(TESTDATA_DIR, unicharset_name); + std::string radical_data; + CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, + file::Defaults())); + CHECK(unicharset_.load_from_file(unicharset_file.c_str())); + STRING radical_str(radical_data.c_str()); + null_char_ = + unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size(); + compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str); + // Get the encoding of the null char. + RecodedCharID code; + compressed_.EncodeUnichar(null_char_, &code); + encoded_null_char_ = code(0); + std::string output_name = file::JoinPath( + FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt")); + STRING encoding = compressed_.GetEncodingAsString(unicharset_); + std::string encoding_str(&encoding[0], encoding.size()); + CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults())); + LOG(INFO) << "Wrote encoding to:" << output_name; + } + // Serializes and de-serializes compressed_ over itself. + void SerializeAndUndo() { + std::vector<char> data; + TFile wfp; + wfp.OpenWrite(&data); + EXPECT_TRUE(compressed_.Serialize(&wfp)); + TFile rfp; + rfp.Open(&data[0], data.size()); + EXPECT_TRUE(compressed_.DeSerialize(&rfp)); + } + // Returns true if the lang is in CJK. + bool IsCJKLang(const std::string& lang) { + return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || + lang == "jpn"; + } + // Returns true if the lang is Indic. + bool IsIndicLang(const std::string& lang) { + return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || + lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" || + lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" || + lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" || + lang == "tel"; + } + + // Expects the appropriate results from the compressed_ unicharset_. + void ExpectCorrect(const std::string& lang) { + // Count the number of times each code is used in each element of + // RecodedCharID. + RecodedCharID zeros; + for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0); + int code_range = compressed_.code_range(); + std::vector<RecodedCharID> times_seen(code_range, zeros); + for (int u = 0; u <= unicharset_.size(); ++u) { + if (u != UNICHAR_SPACE && u != null_char_ && + (u == unicharset_.size() || (unicharset_.has_special_codes() && + u < SPECIAL_UNICHAR_CODES_COUNT))) { + continue; // Not used so not encoded. + } + RecodedCharID code; + int len = compressed_.EncodeUnichar(u, &code); + // Check round-trip encoding. + int unichar_id; + GenericVector<UNICHAR_ID> normed_ids; + if (u == null_char_ || u == unicharset_.size()) { + unichar_id = null_char_; + } else { + unichar_id = u; + } + EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code)); + // Check that the codes are valid. 
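+      // Each element of the code must lie in [0, code_range); times_seen
+      // tallies how often every value occurs at each position so that the
+      // coverage check after this loop can require each code to be used
+      // somewhere.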
+ for (int i = 0; i < len; ++i) { + int code_val = code(i); + EXPECT_GE(code_val, 0); + EXPECT_LT(code_val, code_range); + times_seen[code_val].Set(i, times_seen[code_val](i) + 1); + } + } + // Check that each code is used in at least one position. + for (int c = 0; c < code_range; ++c) { + int num_used = 0; + for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) { + if (times_seen[c](i) != 0) ++num_used; + } + EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range; + } + // Check that GetNextCodes/GetFinalCodes lists match the times_seen, + // and create valid codes. + RecodedCharID code; + CheckCodeExtensions(code, times_seen); + // Finally, we achieved all that using a codebook < 10% of the size of + // the original unicharset, for CK or Indic, and 20% with J, but just + // no bigger for all others. + if (IsCJKLang(lang) || IsIndicLang(lang)) { + EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10)); + } else { + EXPECT_LE(code_range, unicharset_.size() + 1); + } + LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " + << code_range; + } + // Checks for extensions of the current code that either finish a code, or + // extend it and checks those extensions recursively. + void CheckCodeExtensions(const RecodedCharID& code, + const std::vector<RecodedCharID>& times_seen) { + RecodedCharID extended = code; + int length = code.length(); + const GenericVector<int>* final_codes = compressed_.GetFinalCodes(code); + if (final_codes != nullptr) { + for (int i = 0; i < final_codes->size(); ++i) { + int ending = (*final_codes)[i]; + EXPECT_GT(times_seen[ending](length), 0); + extended.Set(length, ending); + int unichar_id = compressed_.DecodeUnichar(extended); + EXPECT_NE(INVALID_UNICHAR_ID, unichar_id); + } + } + const GenericVector<int>* next_codes = compressed_.GetNextCodes(code); + if (next_codes != nullptr) { + for (int i = 0; i < next_codes->size(); ++i) { + int extension = (*next_codes)[i]; + EXPECT_GT(times_seen[extension](length), 0); + extended.Set(length, extension); + CheckCodeExtensions(extended, times_seen); + } + } + } + + UnicharCompress compressed_; + UNICHARSET unicharset_; + int null_char_; + // The encoding of the null_char_. + int encoded_null_char_; +}; + +TEST_F(UnicharcompressTest, DoesChinese) { + LOG(INFO) << "Testing chi_tra"; + LoadUnicharset("chi_tra.unicharset"); + ExpectCorrect("chi_tra"); + LOG(INFO) << "Testing chi_sim"; + LoadUnicharset("chi_sim.unicharset"); + ExpectCorrect("chi_sim"); +} + +TEST_F(UnicharcompressTest, DoesJapanese) { + LOG(INFO) << "Testing jpn"; + LoadUnicharset("jpn.unicharset"); + ExpectCorrect("jpn"); +} + +TEST_F(UnicharcompressTest, DoesKorean) { + LOG(INFO) << "Testing kor"; + LoadUnicharset("kor.unicharset"); + ExpectCorrect("kor"); +} + +TEST_F(UnicharcompressTest, DoesKannada) { + LOG(INFO) << "Testing kan"; + LoadUnicharset("kan.unicharset"); + ExpectCorrect("kan"); + SerializeAndUndo(); + ExpectCorrect("kan"); +} + +TEST_F(UnicharcompressTest, DoesMarathi) { + LOG(INFO) << "Testing mar"; + LoadUnicharset("mar.unicharset"); + ExpectCorrect("mar"); +} + +TEST_F(UnicharcompressTest, DoesEnglish) { + LOG(INFO) << "Testing eng"; + LoadUnicharset("eng.unicharset"); + ExpectCorrect("eng"); +} + +// Tests that a unicharset that contains double-letter ligatures (eg ff) has +// no null char in the encoding at all. 
+TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) { + LOG(INFO) << "Testing por with ligatures"; + LoadUnicharset("por.unicharset"); + ExpectCorrect("por"); + // Check that any unichar-id that is encoded with multiple codes has the + // correct encoded_nulll_char_ in between. + for (int u = 0; u <= unicharset_.size(); ++u) { + RecodedCharID code; + int len = compressed_.EncodeUnichar(u, &code); + if (len > 1) { + // The should not be any null char in the code. + for (int i = 0; i < len; ++i) { + EXPECT_NE(encoded_null_char_, code(i)); + } + } + } +} + +// Tests that GetEncodingAsString returns the right result for a trivial +// unicharset. +TEST_F(UnicharcompressTest, GetEncodingAsString) { + LoadUnicharset("trivial.unicharset"); + ExpectCorrect("trivial"); + STRING encoding = compressed_.GetEncodingAsString(unicharset_); + std::string encoding_str(&encoding[0], encoding.length()); + std::vector<std::string> lines = + absl::StrSplit(encoding_str, "\n", absl::SkipEmpty()); + EXPECT_EQ(5, lines.size()); + // The first line is always space. + EXPECT_EQ("0\t ", lines[0]); + // Next we have i. + EXPECT_EQ("1\ti", lines[1]); + // Next we have f. + EXPECT_EQ("2\tf", lines[2]); + // Next we have the fi ligature: fi. There are no nulls in it, as there are no + // repeated letter ligatures in this unicharset, unlike por.unicharset above. + EXPECT_EQ("2,1\tfi", lines[3]); + // Finally the null character. + EXPECT_EQ("3\t<nul>", lines[4]); +} + +} // namespace tesseract diff --git a/tesseract/unittest/unicharset_test.cc b/tesseract/unittest/unicharset_test.cc new file mode 100644 index 00000000..401a34c1 --- /dev/null +++ b/tesseract/unittest/unicharset_test.cc @@ -0,0 +1,161 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> +#include "log.h" // for LOG +#include "unicharset.h" +#include "gmock/gmock.h" // for testing::ElementsAreArray +#include "include_gunit.h" + +using testing::ElementsAreArray; + +namespace tesseract { + +class UnicharsetTest : public ::testing::Test { + protected: + void SetUp() override { + std::locale::global(std::locale("")); + } +}; + +TEST(UnicharsetTest, Basics) { + // This test verifies basic insertion, unichar_to_id, and encode. + UNICHARSET u; + u.unichar_insert("a"); + EXPECT_EQ(u.size(), 4); + u.unichar_insert("f"); + EXPECT_EQ(u.size(), 5); + u.unichar_insert("i"); + EXPECT_EQ(u.size(), 6); + // The fi ligature is NOT added because it can be encoded with a cleanup as f + // then i. + u.unichar_insert("\ufb01"); + EXPECT_EQ(u.size(), 6); + u.unichar_insert("e"); + EXPECT_EQ(u.size(), 7); + u.unichar_insert("n"); + EXPECT_EQ(u.size(), 8); + EXPECT_EQ(u.unichar_to_id("f"), 4); + EXPECT_EQ(u.unichar_to_id("i"), 5); + // The fi ligature has no valid id. + EXPECT_EQ(u.unichar_to_id("\ufb01"), INVALID_UNICHAR_ID); + // The fi pair has no valid id. 
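+ // (encode_string below still succeeds on ligature text because CleanupString
+ // first rewrites U+FB01 as the two letters f and i, which do have ids.)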
+ EXPECT_EQ(u.unichar_to_id("fi"), INVALID_UNICHAR_ID); + std::vector<int> labels; + EXPECT_TRUE(u.encode_string("affine", true, &labels, nullptr, nullptr)); + std::vector<int> v(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6})); + // With the fi ligature encoding fails without a pre-cleanup. + std::string lig_str = "af\ufb01ne"; + EXPECT_FALSE( + u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); + lig_str = u.CleanupString(lig_str.c_str()); + EXPECT_TRUE( + u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr)); + v = std::vector<int>(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6})); +} + +TEST(UnicharsetTest, Multibyte) { + // This test verifies basic insertion, unichar_to_id, and encode. + // The difference from Basic above is that now we are testing multi-byte + // unicodes instead of single byte. + UNICHARSET u; + // Insert some Arabic letters. + u.unichar_insert("\u0627"); + EXPECT_EQ(u.size(), 4); + u.unichar_insert("\u062c"); + EXPECT_EQ(u.size(), 5); + u.unichar_insert("\u062f"); + EXPECT_EQ(u.size(), 6); + u.unichar_insert("\ufb01"); // fi ligature is added as fi pair. + EXPECT_EQ(u.size(), 7); + u.unichar_insert("\u062b"); + EXPECT_EQ(u.size(), 8); + u.unichar_insert("\u0635"); + EXPECT_EQ(u.size(), 9); + EXPECT_EQ(u.unichar_to_id("\u0627"), 3); + EXPECT_EQ(u.unichar_to_id("\u062c"), 4); + // The first two bytes of this string is \u0627, which matches id 3; + EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3); + EXPECT_EQ(u.unichar_to_id("\u062f"), 5); + // Individual f and i are not present, but they are there as a pair. + EXPECT_EQ(u.unichar_to_id("f"), INVALID_UNICHAR_ID); + EXPECT_EQ(u.unichar_to_id("i"), INVALID_UNICHAR_ID); + EXPECT_EQ(u.unichar_to_id("fi"), 6); + // The fi ligature is findable. + EXPECT_EQ(u.unichar_to_id("\ufb01"), 6); + std::vector<int> labels; + EXPECT_TRUE(u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true, + &labels, nullptr, nullptr)); + std::vector<int> v(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7})); + // With the fi ligature the fi is picked out. + std::vector<char> lengths; + int encoded_length; + std::string src_str = "\u0627\u062c\ufb01\u0635\u062b"; + // src_str has to be pre-cleaned for lengths to be correct. + std::string cleaned = u.CleanupString(src_str.c_str()); + EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, + &encoded_length)); + EXPECT_EQ(encoded_length, cleaned.size()); + std::string len_str(&lengths[0], lengths.size()); + EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002"); + v = std::vector<int>(&labels[0], &labels[0] + labels.size()); + EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7})); +} + +TEST(UnicharsetTest, MultibyteBigrams) { + // This test verifies basic insertion, unichar_to_id, and encode. + // The difference from Basic above is that now we are testing multi-byte + // unicodes instead of single byte. + UNICHARSET u; + // Insert some Arabic letters. + u.unichar_insert("\u0c9c"); + EXPECT_EQ(u.size(), 4); + u.unichar_insert("\u0cad"); + EXPECT_EQ(u.size(), 5); + u.unichar_insert("\u0ccd\u0c9c"); + EXPECT_EQ(u.size(), 6); + u.unichar_insert("\u0ccd"); + EXPECT_EQ(u.size(), 7); + // By default the encodable bigram is NOT added. + u.unichar_insert("\u0ccd\u0cad"); + EXPECT_EQ(u.size(), 7); + // It is added if we force it to be. 
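+ // Passing OldUncleanUnichars::kTrue keeps the entry even though it is
+ // encodable from existing unichars, mirroring old-style unicharsets such as
+ // the one loaded in the OldStyle test below.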
+ u.unichar_insert("\u0ccd\u0cad", OldUncleanUnichars::kTrue); + EXPECT_EQ(u.size(), 8); + std::vector<char> data; + tesseract::TFile fp; + fp.OpenWrite(&data); + u.save_to_file(&fp); + fp.Open(&data[0], data.size()); + UNICHARSET v; + v.load_from_file(&fp, false); + EXPECT_EQ(v.unichar_to_id("\u0c9c"), 3); + EXPECT_EQ(v.unichar_to_id("\u0cad"), 4); + EXPECT_EQ(v.unichar_to_id("\u0ccd\u0c9c"), 5); + EXPECT_EQ(v.unichar_to_id("\u0ccd"), 6); + EXPECT_EQ(v.unichar_to_id("\u0ccd\u0cad"), 7); +} + +TEST(UnicharsetTest, OldStyle) { + // This test verifies an old unicharset that contains fi/fl ligatures loads + // and keeps all the entries. + std::string filename = + file::JoinPath(TESTDATA_DIR, "eng.unicharset"); + UNICHARSET u; + LOG(INFO) << "Filename=" << filename; + EXPECT_TRUE(u.load_from_file(filename.c_str())); + EXPECT_EQ(u.size(), 111); +} + +} // namespace diff --git a/tesseract/unittest/util/utf8/unicodetext.cc b/tesseract/unittest/util/utf8/unicodetext.cc new file mode 100644 index 00000000..1a884dd1 --- /dev/null +++ b/tesseract/unittest/util/utf8/unicodetext.cc @@ -0,0 +1,507 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/utf8/unicodetext.h" + +#include <string.h> // for memcpy, NULL, memcmp, etc +#include <algorithm> // for max + +//#include "base/logging.h" // for operator<<, CHECK, etc +//#include "base/stringprintf.h" // for StringPrintf, StringAppendF +//#include "strings/stringpiece.h" // for StringPiece, etc + +#include "third_party/utf/utf.h" // for isvalidcharntorune, etc +#include "util/utf8/unilib.h" // for IsInterchangeValid, etc +#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen + +static int CodepointDistance(const char* start, const char* end) { + int n = 0; + // Increment n on every non-trail-byte. + for (const char* p = start; p < end; ++p) { + n += (*reinterpret_cast<const signed char*>(p) >= -0x40); + } + return n; +} + +static int CodepointCount(const char* utf8, int len) { + return CodepointDistance(utf8, utf8 + len); +} + +UnicodeText::const_iterator::difference_type +distance(const UnicodeText::const_iterator& first, + const UnicodeText::const_iterator& last) { + return CodepointDistance(first.it_, last.it_); +} + +// ---------- Utility ---------- + +static int ConvertToInterchangeValid(char* start, int len) { + // This routine is called only when we've discovered that a UTF-8 buffer + // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 + // was not interchange valid. This indicates a bug in the caller, and + // a LOG(WARNING) is done in that case. + // This is similar to CoerceToInterchangeValid, but it replaces each + // structurally valid byte with a space, and each non-interchange + // character with a space, even when that character requires more + // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is + // structurally valid UTF8, but U+FDD0 is not an interchange-valid + // code point. The result should contain one space, not three. 
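+ // For example, the 7-byte input "ab\xEF\xB7\x90cd" comes out as the 5-byte
+ // string "ab cd": the three bytes of U+FDD0 collapse to a single space.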
+ // + // Since the conversion never needs to write more data than it + // reads, it is safe to change the buffer in place. It returns the + // number of bytes written. + char* const in = start; + char* out = start; + char* const end = start + len; + while (start < end) { + int good = UniLib::SpanInterchangeValid(start, end - start); + if (good > 0) { + if (out != start) { + memmove(out, start, good); + } + out += good; + start += good; + if (start == end) { + break; + } + } + // Is the current string invalid UTF8 or just non-interchange UTF8? + char32 rune; + int n; + if (isvalidcharntorune(start, end - start, &rune, &n)) { + // structurally valid UTF8, but not interchange valid + start += n; // Skip over the whole character. + } else { // bad UTF8 + start += 1; // Skip over just one byte + } + *out++ = ' '; + } + return out - in; +} + + +// *************** Data representation ********** + +// Note: the copy constructor is undefined. + +// After reserve(), resize(), or clear(), we're an owner, not an alias. + +void UnicodeText::Repr::reserve(int new_capacity) { + // If there's already enough capacity, and we're an owner, do nothing. + if (capacity_ >= new_capacity && ours_) return; + + // Otherwise, allocate a new buffer. + capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20); + char* new_data = new char[capacity_]; + + // If there is an old buffer, copy it into the new buffer. + if (data_) { + memcpy(new_data, data_, size_); + if (ours_) delete[] data_; // If we owned the old buffer, free it. + } + data_ = new_data; + ours_ = true; // We own the new buffer. + // size_ is unchanged. +} + +void UnicodeText::Repr::resize(int new_size) { + if (new_size == 0) { + clear(); + } else { + if (!ours_ || new_size > capacity_) reserve(new_size); + // Clear the memory in the expanded part. + if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); + size_ = new_size; + ours_ = true; + } +} + +// This implementation of clear() deallocates the buffer if we're an owner. +// That's not strictly necessary; we could just set size_ to 0. +void UnicodeText::Repr::clear() { + if (ours_) delete[] data_; + data_ = nullptr; + size_ = capacity_ = 0; + ours_ = true; +} + +void UnicodeText::Repr::Copy(const char* data, int size) { + resize(size); + memcpy(data_, data, size); +} + +void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { + if (data == data_) return; // We already own this memory. (Weird case.) + if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. + data_ = data; + size_ = size; + capacity_ = capacity; + ours_ = true; +} + +void UnicodeText::Repr::PointTo(const char* data, int size) { + if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. + data_ = const_cast<char*>(data); + size_ = size; + capacity_ = size; + ours_ = false; +} + +void UnicodeText::Repr::append(const char* bytes, int byte_length) { + reserve(size_ + byte_length); + memcpy(data_ + size_, bytes, byte_length); + size_ += byte_length; +} + +string UnicodeText::Repr::DebugString() const { + return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", + this, + data_, size_, capacity_, + ours_ ? 
"Owned" : "Alias"); +} + + + +// *************** UnicodeText ****************** + +// ----- Constructors ----- + +// Default constructor +UnicodeText::UnicodeText() { +} + +// Copy constructor +UnicodeText::UnicodeText(const UnicodeText& src) { + Copy(src); +} + +// Substring constructor +UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, + const UnicodeText::const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.append(first.it_, last.it_ - first.it_); +} + +string UnicodeText::UTF8Substring(const const_iterator& first, + const const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + return string(first.it_, last.it_ - first.it_); +} + + +// ----- Copy ----- + +UnicodeText& UnicodeText::operator=(const UnicodeText& src) { + if (this != &src) { + Copy(src); + } + return *this; +} + +UnicodeText& UnicodeText::Copy(const UnicodeText& src) { + repr_.Copy(src.repr_.data_, src.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { + repr_.Copy(buffer, byte_length); + if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, + int byte_length) { + repr_.Copy(buffer, byte_length); + return *this; +} + +// ----- TakeOwnershipOf ----- + +UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, + int byte_length, + int byte_capacity) { + repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); + if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, + int byte_length, + int byte_capacity) { + repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); + return *this; +} + +// ----- PointTo ----- + +UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { + if (UniLib:: IsInterchangeValid(buffer, byte_length)) { + repr_.PointTo(buffer, byte_length); + } else { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.Copy(buffer, byte_length); + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, + int byte_length) { + repr_.PointTo(buffer, byte_length); + return *this; +} + +UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { + repr_.PointTo(src.repr_.data_, src.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::PointTo(const const_iterator &first, + const const_iterator &last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); + return *this; +} + +// ----- Append ----- + +UnicodeText& UnicodeText::append(const UnicodeText& u) { + repr_.append(u.repr_.data_, u.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::append(const const_iterator& first, + const const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.append(first.it_, last.it_ - first.it_); + return *this; +} + +UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { + repr_.append(utf8, len); + return *this; +} + +// ----- substring searching ----- + +UnicodeText::const_iterator UnicodeText::find(const 
UnicodeText& look, + const_iterator start_pos) const { + CHECK_GE(start_pos.utf8_data(), utf8_data()); + CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length()); + return UnsafeFind(look, start_pos); +} + +UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { + return UnsafeFind(look, begin()); +} + +UnicodeText::const_iterator UnicodeText::UnsafeFind( + const UnicodeText& look, const_iterator start_pos) const { + // Due to the magic of the UTF8 encoding, searching for a sequence of + // letters is equivalent to substring search. + StringPiece searching(utf8_data(), utf8_length()); + StringPiece look_piece(look.utf8_data(), look.utf8_length()); + LOG(FATAL) << "Not implemented"; + //StringPiece::size_type found = + // searching.find(look_piece, start_pos.utf8_data() - utf8_data()); + StringPiece::size_type found = StringPiece::npos; + if (found == StringPiece::npos) return end(); + return const_iterator(utf8_data() + found); +} + +bool UnicodeText::HasReplacementChar() const { + // Equivalent to: + // UnicodeText replacement_char; + // replacement_char.push_back(0xFFFD); + // return find(replacement_char) != end(); + StringPiece searching(utf8_data(), utf8_length()); + StringPiece looking_for("\xEF\xBF\xBD", 3); + LOG(FATAL) << "Not implemented"; + //return searching.find(looking_for) != StringPiece::npos; + return false; +} + +// ----- other methods ----- + +// Clear operator +void UnicodeText::clear() { + repr_.clear(); +} + +// Destructor +UnicodeText::~UnicodeText() {} + + +void UnicodeText::push_back(char32 c) { + if (UniLib::IsValidCodepoint(c)) { + char buf[UTFmax]; + int len = runetochar(buf, &c); + if (UniLib::IsInterchangeValid(buf, len)) { + repr_.append(buf, len); + } else { + LOG(WARNING) << "Unicode value 0x" << std::hex << c + << " is not valid for interchange"; + repr_.append(" ", 1); + } + } else { + LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c; + repr_.append(" ", 1); + } +} + +int UnicodeText::size() const { + return CodepointCount(repr_.data_, repr_.size_); +} + +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { + if (&lhs == &rhs) return true; + if (lhs.repr_.size_ != rhs.repr_.size_) return false; + return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; +} + +string UnicodeText::DebugString() const { + return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", + this, + size(), + repr_.DebugString().c_str()); +} + + +// ******************* UnicodeText::const_iterator ********************* + +// The implementation of const_iterator would be nicer if it +// inherited from boost::iterator_facade +// (http://boost.org/libs/iterator/doc/iterator_facade.html). 
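+
+// A short usage sketch (illustrative only): building a text one codepoint at
+// a time and comparing the codepoint count with the UTF-8 byte count.
+//
+//   UnicodeText text;
+//   text.push_back(0x61);    // 'a', 1 byte in UTF-8
+//   text.push_back(0x4E2D);  // U+4E2D, 3 bytes in UTF-8
+//   // text.size() == 2, text.utf8_length() == 4
+//   for (UnicodeText::const_iterator it = text.begin(); it != text.end(); ++it) {
+//     char32 c = *it;  // decoded directly from the UTF-8 bytes, as below
+//   }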
+ +UnicodeText::const_iterator::const_iterator() : it_(nullptr) {} + +UnicodeText::const_iterator::const_iterator(const const_iterator& other) + : it_(other.it_) { +} + +UnicodeText::const_iterator& +UnicodeText::const_iterator::operator=(const const_iterator& other) { + if (&other != this) + it_ = other.it_; + return *this; +} + +UnicodeText::const_iterator UnicodeText::begin() const { + return const_iterator(repr_.data_); +} + +UnicodeText::const_iterator UnicodeText::end() const { + return const_iterator(repr_.data_ + repr_.size_); +} + +bool operator<(const UnicodeText::const_iterator& lhs, + const UnicodeText::const_iterator& rhs) { + return lhs.it_ < rhs.it_; +} + +char32 UnicodeText::const_iterator::operator*() const { + // (We could call chartorune here, but that does some + // error-checking, and we're guaranteed that our data is valid + // UTF-8. Also, we expect this routine to be called very often. So + // for speed, we do the calculation ourselves.) + + // Convert from UTF-8 + unsigned char byte1 = it_[0]; + if (byte1 < 0x80) + return byte1; + + unsigned char byte2 = it_[1]; + if (byte1 < 0xE0) + return ((byte1 & 0x1F) << 6) + | (byte2 & 0x3F); + + unsigned char byte3 = it_[2]; + if (byte1 < 0xF0) + return ((byte1 & 0x0F) << 12) + | ((byte2 & 0x3F) << 6) + | (byte3 & 0x3F); + + unsigned char byte4 = it_[3]; + return ((byte1 & 0x07) << 18) + | ((byte2 & 0x3F) << 12) + | ((byte3 & 0x3F) << 6) + | (byte4 & 0x3F); +} + +UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { + it_ += UniLib::OneCharLen(it_); + return *this; +} + +UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { + while (UniLib::IsTrailByte(*--it_)); + return *this; +} + +int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { + utf8_output[0] = it_[0]; if ((it_[0] & 0xff) < 0x80) return 1; + utf8_output[1] = it_[1]; if ((it_[0] & 0xff) < 0xE0) return 2; + utf8_output[2] = it_[2]; if ((it_[0] & 0xff) < 0xF0) return 3; + utf8_output[3] = it_[3]; + return 4; +} + +string UnicodeText::const_iterator::get_utf8_string() const { + return string(utf8_data(), utf8_length()); +} + +int UnicodeText::const_iterator::utf8_length() const { + if ((it_[0] & 0xff) < 0x80) { + return 1; + } else if ((it_[0] & 0xff) < 0xE0) { + return 2; + } else if ((it_[0] & 0xff) < 0xF0) { + return 3; + } else { + return 4; + } +} + +UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { + CHECK(p != nullptr); + const char* start = utf8_data(); + int len = utf8_length(); + const char* end = start + len; + CHECK(p >= start); + CHECK(p <= end); + CHECK(p == end || !UniLib::IsTrailByte(*p)); + return const_iterator(p); +} + +string UnicodeText::const_iterator::DebugString() const { + return tensorflow::strings::Printf("{iter %p}", it_); +} + + +// *************************** Utilities ************************* + +string CodepointString(const UnicodeText& t) { + string s; + UnicodeText::const_iterator it = t.begin(), end = t.end(); + while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++); + return s; +} diff --git a/tesseract/unittest/util/utf8/unicodetext.h b/tesseract/unittest/util/utf8/unicodetext.h new file mode 100644 index 00000000..4e25d3ee --- /dev/null +++ b/tesseract/unittest/util/utf8/unicodetext.h @@ -0,0 +1,477 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_ +#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_ + +#include <stddef.h> // for NULL, ptrdiff_t +#include <iterator> // for bidirectional_iterator_tag, etc +#include <string> // for string +#include <utility> // for pair + +#include "syntaxnet/base.h" + +// ***************************** UnicodeText ************************** +// +// A UnicodeText object is a container for a sequence of Unicode +// codepoint values. It has default, copy, and assignment constructors. +// Data can be appended to it from another UnicodeText, from +// iterators, or from a single codepoint. +// +// The internal representation of the text is UTF-8. Since UTF-8 is a +// variable-width format, UnicodeText does not provide random access +// to the text, and changes to the text are permitted only at the end. +// +// The UnicodeText class defines a const_iterator. The dereferencing +// operator (*) returns a codepoint (char32). The iterator is a +// bidirectional, read-only iterator. It becomes invalid if the text +// is changed. +// +// There are methods for appending and retrieving UTF-8 data directly. +// The 'utf8_data' method returns a const char* that contains the +// UTF-8-encoded version of the text; 'utf8_length' returns the number +// of bytes in the UTF-8 data. An iterator's 'get' method stores up to +// 4 bytes of UTF-8 data in a char array and returns the number of +// bytes that it stored. +// +// Codepoints are integers in the range [0, 0xD7FF] or [0xE000, +// 0x10FFFF], but UnicodeText has the additional restriction that it +// can contain only those characters that are valid for interchange on +// the Web. This excludes all of the control codes except for carriage +// return, line feed, and horizontal tab. It also excludes +// non-characters, but codepoints that are in the Private Use regions +// are allowed, as are codepoints that are unassigned. (See the +// Unicode reference for details.) The function UniLib::IsInterchangeValid +// can be used as a test for this property. +// +// UnicodeTexts are safe. Every method that constructs or modifies a +// UnicodeText tests for interchange-validity, and will substitute a +// space for the invalid data. Such cases are reported via +// LOG(WARNING). +// +// MEMORY MANAGEMENT: copy, take ownership, or point to +// +// A UnicodeText is either an "owner", meaning that it owns the memory +// for the data buffer and will free it when the UnicodeText is +// destroyed, or it is an "alias", meaning that it does not. +// +// There are three methods for storing UTF-8 data in a UnicodeText: +// +// CopyUTF8(buffer, len) copies buffer. +// +// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. +// +// PointToUTF8(buffer, size) creates an alias pointing to buffer. +// +// All three methods perform a validity check on the buffer. There are +// private, "unsafe" versions of these functions that bypass the +// validity check. They are used internally and by friend-functions +// that are handling UTF-8 data that has already been validated. 
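+//
+// For example (illustrative):
+//
+//   const char* buf = "abc";
+//   UnicodeText owner;
+//   owner.CopyUTF8(buf, 3);      // copies the three bytes; owner frees them
+//   UnicodeText alias;
+//   alias.PointToUTF8(buf, 3);   // no copy; alias points at buf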
+// +// The purpose of an alias is to avoid making an unnecessary copy of a +// UTF-8 buffer while still providing access to the Unicode values +// within that text through iterators or the fast scanners that are +// based on UTF-8 state tables. The lifetime of an alias must not +// exceed the lifetime of the buffer from which it was constructed. +// +// The semantics of an alias might be described as "copy on write or +// repair." The source data is never modified. If push_back() or +// append() is called on an alias, a copy of the data will be created, +// and the UnicodeText will become an owner. If clear() is called on +// an alias, it becomes an (empty) owner. +// +// The copy constructor and the assignment operator produce an owner. +// That is, after direct initialization ("UnicodeText x(y);") or copy +// initialization ("UnicodeText x = y;") x will be an owner, even if y +// was an alias. The assignment operator ("x = y;") also produces an +// owner unless x and y are the same object and y is an alias. +// +// Aliases should be used with care. If the source from which an alias +// was created is freed, or if the contents are changed, while the +// alias is still in use, fatal errors could result. But it can be +// quite useful to have a UnicodeText "window" through which to see a +// UTF-8 buffer without having to pay the price of making a copy. +// +// UTILITIES +// +// The interfaces in util/utf8/public/textutils.h provide higher-level +// utilities for dealing with UnicodeTexts, including routines for +// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or +// strings, creating strings from UnicodeTexts, normalizing text for +// efficient matching or display, and others. + +class UnicodeText { + public: + class const_iterator; + + typedef char32 value_type; + + // Constructors. These always produce owners. + UnicodeText(); // Create an empty text. + UnicodeText(const UnicodeText& src); // copy constructor + // Construct a substring (copies the data). + UnicodeText(const const_iterator& first, const const_iterator& last); + + // Assignment operator. This copies the data and produces an owner + // unless this == &src, e.g., "x = x;", which is a no-op. + UnicodeText& operator=(const UnicodeText& src); + + // x.Copy(y) copies the data from y into x. + UnicodeText& Copy(const UnicodeText& src); + inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } + + // x.PointTo(y) changes x so that it points to y's data. + // It does not copy y or take ownership of y's data. + UnicodeText& PointTo(const UnicodeText& src); + UnicodeText& PointTo(const const_iterator& first, + const const_iterator& last); + + ~UnicodeText(); + + void clear(); // Clear text. + bool empty() const { return repr_.size_ == 0; } // Test if text is empty. + + // Add a codepoint to the end of the text. + // If the codepoint is not interchange-valid, add a space instead + // and log a warning. + void push_back(char32 codepoint); + + // Generic appending operation. + // iterator_traits<ForwardIterator>::value_type must be implicitly + // convertible to char32. 
Typical uses of this method might include: + // char32 chars[] = {0x1, 0x2, ...}; + // vector<char32> more_chars = ...; + // utext.append(chars, chars+arraysize(chars)); + // utext.append(more_chars.begin(), more_chars.end()); + template<typename ForwardIterator> + UnicodeText& append(ForwardIterator first, const ForwardIterator last) { + while (first != last) { push_back(*first++); } + return *this; + } + + // A specialization of the generic append() method. + UnicodeText& append(const const_iterator& first, const const_iterator& last); + + // An optimization of append(source.begin(), source.end()). + UnicodeText& append(const UnicodeText& source); + + int size() const; // the number of Unicode characters (codepoints) + + friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); + + class const_iterator { + typedef const_iterator CI; + public: + typedef std::bidirectional_iterator_tag iterator_category; + typedef char32 value_type; + typedef ptrdiff_t difference_type; + typedef void pointer; // (Not needed.) + typedef const char32 reference; // (Needed for const_reverse_iterator) + + // Iterators are default-constructible. + const_iterator(); + + // It's safe to make multiple passes over a UnicodeText. + const_iterator(const const_iterator& other); + const_iterator& operator=(const const_iterator& other); + + char32 operator*() const; // Dereference + + const_iterator& operator++(); // Advance (++iter) + const_iterator operator++(int) { // (iter++) + const_iterator result(*this); + ++*this; + return result; + } + + const_iterator& operator--(); // Retreat (--iter) + const_iterator operator--(int) { // (iter--) + const_iterator result(*this); + --*this; + return result; + } + + // We love relational operators. + friend bool operator==(const CI& lhs, const CI& rhs) { + return lhs.it_ == rhs.it_; } + friend bool operator!=(const CI& lhs, const CI& rhs) { + return !(lhs == rhs); } + friend bool operator<(const CI& lhs, const CI& rhs); + friend bool operator>(const CI& lhs, const CI& rhs) { + return rhs < lhs; } + friend bool operator<=(const CI& lhs, const CI& rhs) { + return !(rhs < lhs); } + friend bool operator>=(const CI& lhs, const CI& rhs) { + return !(lhs < rhs); } + + friend difference_type distance(const CI& first, const CI& last); + + // UTF-8-specific methods + // Store the UTF-8 encoding of the current codepoint into buf, + // which must be at least 4 bytes long. Return the number of + // bytes written. + int get_utf8(char* buf) const; + // Return the UTF-8 character that the iterator points to. + string get_utf8_string() const; + // Return the byte length of the UTF-8 character the iterator points to. + int utf8_length() const; + // Return the iterator's pointer into the UTF-8 data. 
+ const char* utf8_data() const { return it_; } + + string DebugString() const; + + private: + friend class UnicodeText; + friend class UnicodeTextUtils; + friend class UTF8StateTableProperty; + explicit const_iterator(const char* it) : it_(it) {} + + const char* it_; + }; + + const_iterator begin() const; + const_iterator end() const; + + class const_reverse_iterator : public std::reverse_iterator<const_iterator> { + public: + explicit const_reverse_iterator(const_iterator it) : + std::reverse_iterator<const_iterator>(it) {} + const char* utf8_data() const { + const_iterator tmp_it = base(); + return (--tmp_it).utf8_data(); + } + int get_utf8(char* buf) const { + const_iterator tmp_it = base(); + return (--tmp_it).get_utf8(buf); + } + string get_utf8_string() const { + const_iterator tmp_it = base(); + return (--tmp_it).get_utf8_string(); + } + int utf8_length() const { + const_iterator tmp_it = base(); + return (--tmp_it).utf8_length(); + } + }; + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + // Substring searching. Returns the beginning of the first + // occurrence of "look", or end() if not found. + const_iterator find(const UnicodeText& look, const_iterator start_pos) const; + // Equivalent to find(look, begin()) + const_iterator find(const UnicodeText& look) const; + + // Returns whether this contains the character U+FFFD. This can + // occur, for example, if the input to Encodings::Decode() had byte + // sequences that were invalid in the source encoding. + bool HasReplacementChar() const; + + // UTF-8-specific methods + // + // Return the data, length, and capacity of UTF-8-encoded version of + // the text. Length and capacity are measured in bytes. + const char* utf8_data() const { return repr_.data_; } + int utf8_length() const { return repr_.size_; } + int utf8_capacity() const { return repr_.capacity_; } + + // Return the UTF-8 data as a string. + static string UTF8Substring(const const_iterator& first, + const const_iterator& last); + + // There are three methods for initializing a UnicodeText from UTF-8 + // data. They vary in details of memory management. In all cases, + // the data is tested for interchange-validity. If it is not + // interchange-valid, a LOG(WARNING) is issued, and each + // structurally invalid byte and each interchange-invalid codepoint + // is replaced with a space. + + // x.CopyUTF8(buf, len) copies buf into x. + UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); + + // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of + // buf. buf is not copied. + UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, + int byte_length, + int byte_capacity); + + // x.PointToUTF8(buf,len) changes x so that it points to buf + // ("becomes an alias"). It does not take ownership or copy buf. + // If the buffer is not valid, this has the same effect as + // CopyUTF8(utf8_buffer, byte_length). + UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); + + // Occasionally it is necessary to use functions that operate on the + // pointer returned by utf8_data(). MakeIterator(p) provides a way + // to get back to the UnicodeText level. It uses CHECK to ensure + // that p is a pointer within this object's UTF-8 data, and that it + // points to the beginning of a character. 
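+ // For example (illustrative): if n is a byte offset that falls on a
+ // character boundary, MakeIterator(utf8_data() + n) returns an iterator
+ // positioned on the character that starts at that byte.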
+ const_iterator MakeIterator(const char* p) const; + + string DebugString() const; + + private: + friend class const_iterator; + friend class UnicodeTextUtils; + + class Repr { // A byte-string. + public: + char* data_; + int size_; + int capacity_; + bool ours_; // Do we own data_? + + Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {} + ~Repr() { if (ours_) delete[] data_; } + + void clear(); + void reserve(int capacity); + void resize(int size); + + void append(const char* bytes, int byte_length); + void Copy(const char* data, int size); + void TakeOwnershipOf(char* data, int size, int capacity); + void PointTo(const char* data, int size); + + string DebugString() const; + + private: + Repr& operator=(const Repr&); + Repr(const Repr& other); + }; + + Repr repr_; + + // UTF-8-specific private methods. + // These routines do not perform a validity check when compiled + // in opt mode. + // It is an error to call these methods with UTF-8 data that + // is not interchange-valid. + // + UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeTakeOwnershipOfUTF8( + char* utf8_buffer, int byte_length, int byte_capacity); + UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); + const_iterator UnsafeFind(const UnicodeText& look, + const_iterator start_pos) const; +}; + +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + +inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { + return !(lhs == rhs); +} + +// UnicodeTextRange is a pair of iterators, useful for specifying text +// segments. If the iterators are ==, the segment is empty. +typedef pair<UnicodeText::const_iterator, + UnicodeText::const_iterator> UnicodeTextRange; + +inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { + return r.first == r.second; +} + + +// *************************** Utilities ************************* + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It +// is an "owner.") +// +// Each byte that is structurally invalid will be replaced with a +// space. Each codepoint that is interchange-invalid will also be +// replaced with a space, even if the codepoint was represented with a +// multibyte sequence in the UTF-8 data. +// +inline UnicodeText MakeUnicodeTextAcceptingOwnership( + char* utf8_buffer, int byte_length, int byte_capacity) { + return UnicodeText().TakeOwnershipOfUTF8( + utf8_buffer, byte_length, byte_capacity); +} + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText does not take ownership of the +// buffer. (It is an "alias.") +// +inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( + const char* utf8_buffer, int byte_length) { + return UnicodeText().PointToUTF8(utf8_buffer, byte_length); +} + +// Create a UnicodeText from a UTF-8 string or buffer. +// +// If do_copy is true, then a copy of the string is made. The copy is +// owned by the resulting UnicodeText object and will be freed when +// the object is destroyed. This UnicodeText object is referred to +// as an "owner." +// +// If do_copy is false, then no copy is made. The resulting +// UnicodeText object does NOT take ownership of the string; in this +// case, the lifetime of the UnicodeText object must not exceed the +// lifetime of the string. This Unicodetext object is referred to as +// an "alias." 
This is the same as MakeUnicodeTextWithoutAcceptingOwnership. +// +// If the input string does not contain valid UTF-8, then a copy is +// made (as if do_copy were true) and coerced to valid UTF-8 by +// replacing each invalid byte with a space. +// +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, + bool do_copy) { + UnicodeText t; + if (do_copy) { + t.CopyUTF8(utf8_buf, len); + } else { + t.PointToUTF8(utf8_buf, len); + } + return t; +} + +inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { + return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); +} + +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { + return UTF8ToUnicodeText(utf8_buf, len, true); +} +inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { + return UTF8ToUnicodeText(utf8_string, true); +} + +// Return a string containing the UTF-8 encoded version of all the +// Unicode characters in t. +inline string UnicodeTextToUTF8(const UnicodeText& t) { + return string(t.utf8_data(), t.utf8_length()); +} + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template <typename T, size_t N> +char (&ArraySizeHelper(T (&array)[N]))[N]; +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// For debugging. Return a string of integers, written in uppercase +// hex (%X), corresponding to the codepoints within the text. Each +// integer is followed by a space. E.g., "61 62 6A 3005 ". +string CodepointString(const UnicodeText& t); + +#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_ diff --git a/tesseract/unittest/util/utf8/unilib.cc b/tesseract/unittest/util/utf8/unilib.cc new file mode 100644 index 00000000..c00759ae --- /dev/null +++ b/tesseract/unittest/util/utf8/unilib.cc @@ -0,0 +1,58 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#include "util/utf8/unilib.h" + +#include "syntaxnet/base.h" +#include "third_party/utf/utf.h" + +namespace UniLib { + +// Codepoints not allowed for interchange are: +// C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020), +// Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A), +// Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D) +// C1 controls: U+007F to U+009F +// Surrogates: U+D800 to U+DFFF +// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx +bool IsInterchangeValid(char32 c) { + return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) || + (c >= 0x7F && c <= 0x9F) || + (c >= 0xD800 && c <= 0xDFFF) || + (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE); +} + +int SpanInterchangeValid(const char* begin, int byte_length) { + char32 rune; + const char* p = begin; + const char* end = begin + byte_length; + while (p < end) { + int bytes_consumed = charntorune(&rune, p, end - p); + // We want to accept Runeerror == U+FFFD as a valid char, but it is used + // by chartorune to indicate error. Luckily, the real codepoint is size 3 + // while errors return bytes_consumed <= 1. + if ((rune == Runeerror && bytes_consumed <= 1) || + !IsInterchangeValid(rune)) { + break; // Found + } + p += bytes_consumed; + } + return p - begin; +} + +} // namespace UniLib diff --git a/tesseract/unittest/util/utf8/unilib.h b/tesseract/unittest/util/utf8/unilib.h new file mode 100644 index 00000000..e99895a2 --- /dev/null +++ b/tesseract/unittest/util/utf8/unilib.h @@ -0,0 +1,63 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Routines to do manipulation of Unicode characters or text +// +// The StructurallyValid routines accept buffers of arbitrary bytes. +// For CoerceToStructurallyValid(), the input buffer and output buffers may +// point to exactly the same memory. +// +// In all other cases, the UTF-8 string must be structurally valid and +// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. +// Debug builds take a fatal error for invalid UTF-8 input. +// The input and output buffers may not overlap at all. +// +// The char32 routines are here only for convenience; they convert to UTF-8 +// internally and use the UTF-8 routines. + +#ifndef UTIL_UTF8_UNILIB_H__ +#define UTIL_UTF8_UNILIB_H__ + +#include <string> +#include "syntaxnet/base.h" + +// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here, +// but they are defined in unilib_utf8_utils.h. 
+//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export + +namespace UniLib { + +// Returns the length in bytes of the prefix of src that is all +// interchange valid UTF-8 +int SpanInterchangeValid(const char* src, int byte_length); +inline int SpanInterchangeValid(const std::string& src) { + return SpanInterchangeValid(src.data(), src.size()); +} + +// Returns true if the source is all interchange valid UTF-8 +// "Interchange valid" is a stronger than structurally valid -- +// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. +bool IsInterchangeValid(char32 codepoint); +inline bool IsInterchangeValid(const char* src, int byte_length) { + return (byte_length == SpanInterchangeValid(src, byte_length)); +} +inline bool IsInterchangeValid(const std::string& src) { + return IsInterchangeValid(src.data(), src.size()); +} + +} // namespace UniLib + +#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ diff --git a/tesseract/unittest/util/utf8/unilib_utf8_utils.h b/tesseract/unittest/util/utf8/unilib_utf8_utils.h new file mode 100644 index 00000000..a9c10166 --- /dev/null +++ b/tesseract/unittest/util/utf8/unilib_utf8_utils.h @@ -0,0 +1,66 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ +#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ + +// These definitions are self-contained and have no dependencies. +// They are also exported from unilib.h for legacy reasons. + +#include "syntaxnet/base.h" +#include "third_party/utf/utf.h" + +namespace UniLib { + +// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF] +// (i.e., is not a surrogate codepoint). See also +// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h. +inline bool IsValidCodepoint(char32 c) { + return (static_cast<uint32>(c) < 0xD800) + || (c >= 0xE000 && c <= 0x10FFFF); +} + +// Returns true if 'str' is the start of a structurally valid UTF-8 +// sequence and is not a surrogate codepoint. Returns false if str.empty() +// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function +// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]). +inline bool IsUTF8ValidCodepoint(StringPiece str) { + char32 c; + int consumed; + // It's OK if str.length() > consumed. + return !str.empty() + && isvalidcharntorune(str.data(), str.size(), &c, &consumed) + && IsValidCodepoint(c); +} + +// Returns the length (number of bytes) of the Unicode code point +// starting at src, based on inspecting just that one byte. This +// requires that src point to a well-formed UTF-8 string; the result +// is undefined otherwise. 
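+// For example (illustrative): OneCharLen("a") == 1, OneCharLen("\xC3\xA9") == 2
+// (U+00E9), OneCharLen("\xE4\xB8\xAD") == 3 (U+4E2D), and a lead byte in the
+// range 0xF0..0xF4 gives 4.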
+inline int OneCharLen(const char* src) { + return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; +} + +// Returns true if this byte is a trailing UTF-8 byte (10xx xxxx) +inline bool IsTrailByte(char x) { + // return (x & 0xC0) == 0x80; + // Since trail bytes are always in [0x80, 0xBF], we can optimize: + return static_cast<signed char>(x) < -0x40; +} + +} // namespace UniLib + +#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ diff --git a/tesseract/unittest/validate_grapheme_test.cc b/tesseract/unittest/validate_grapheme_test.cc new file mode 100644 index 00000000..54e2f490 --- /dev/null +++ b/tesseract/unittest/validate_grapheme_test.cc @@ -0,0 +1,179 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) { + std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E. + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + // It made 3 graphemes. + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f")); + EXPECT_EQ(glyphs[1], std::string("\u0c15")); + EXPECT_EQ(glyphs[2], std::string("\u0c0e")); +} + +TEST(ValidateGraphemeTest, SingleConsonantOK) { + std::string str = "\u0cb9"; // HA + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); +} + +TEST(ValidateGraphemeTest, SimpleCV) { + std::string str = "\u0cb9\u0cbf"; // HA I + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); +} + +TEST(ValidateGraphemeTest, SubscriptConjunct) { + std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95")); +} + +TEST(ValidateGraphemeTest, HalfFormJoiner) { + std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + 
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d")); +} + +TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) { + std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d")); +} + +TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) { + std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d")); + // Malaylam only, so not allowed in Telugu. 
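+ // (The same sequence of joiners, but with the consonants taken from the
+ // Telugu block U+0C00..U+0C7F instead of Malayalam U+0D00..U+0D7F, so the
+ // validator rejects it below.)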
+ str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); +} + +TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) { + std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 2); + EXPECT_EQ(glyphs[1], std::string("\u0d24")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c")); +} + +TEST(ValidateGraphemeTest, ThaiGraphemes) { + // This is a single grapheme unless in glyph split mode + std::string str = "\u0e14\u0e38\u0e4a"; + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 1); + EXPECT_EQ(glyphs[0], str); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[0], std::string("\u0e14")); +} + +TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) { + std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d"; + std::vector<std::string> glyphs; + // Returns true, but the joiner is gone. + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(glyphs.size(), 5); + EXPECT_EQ(glyphs[0], std::string("'")); + EXPECT_EQ(glyphs[1], std::string("\u0d24")); + EXPECT_EQ(glyphs[2], std::string("\u0d23")); + EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c")); + EXPECT_EQ(glyphs[4], std::string("'")); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validate_indic_test.cc b/tesseract/unittest/validate_indic_test.cc new file mode 100644 index 00000000..d317198b --- /dev/null +++ b/tesseract/unittest/validate_indic_test.cc @@ -0,0 +1,231 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +// Though the unicode example for Telugu in section 12.7: +// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf +// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to +// suppress a conjugate that would otherwise occur. 
If a consonant is followed +// by a virama and then by a non-Indic character, OpenType will presume that +// the user simply meant to suppress the inherent vowel of the consonant +// and render it as the consonant with an explicit virama, the same as if +// a ZWNJ had followed. Since this is confusing to an OCR engine, the +// normalizer always puts a terminating ZWNJ on the end if not present, +// and accepts the string as valid. +TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) { + std::string str = "\u0c15\u0c4d"; // KA - virama + std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str); + // Same result if we started with the normalized string. + ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, + target_str); +} + +// Only one dependent vowel is allowed. +TEST(ValidateIndicTest, OnlyOneDependentVowel) { + std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); +} + +// [c26][c4d][c01] +// A consonant (DA) followed by the virama followed by a bindu. +// Syllable modifiers [c01][c02][c03] all modify the pronunciation of +// the vowel in a syllable, as does the virama [c4d]. You can only +// have one of these on a syllable. +// +// References: +// http://www.omniglot.com/writing/telugu.htm +TEST(ValidateIndicTest, OnlyOneVowelModifier) { + std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu + std::string result; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // It made 1 grapheme of 4 chars, by terminating the explicit virama. + EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result); + + str = "\u0995\u0983\u0981"; // KA visarga candrabindu + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + + // Exception: Malayalam allows multiple anusvara. + str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + EXPECT_EQ(str, result); +} + +// [c28][c02][c3f] +// A consonant (NA) followed by the Anusvara/sunna and another matra (I). +// The anusvara [c02] is a pronunciation directive +// for a whole syllable and only appears at the end of the syllable. +// References: +// + Unicode v9, 12.1 "Modifier Mark Rules R10," +// and the Microsoft page +// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx +TEST(ValidateIndicTest, VowelModifierMustBeLast) { + std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); + // Swap c02/c3f and all is ok. + str = "\u0c28\u0c3f\u0c02"; // NA I Sunna + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +// [c05][c47] +// A Vowel (A) followed by a combining vowel/matra (EE). +// In Telugu, matras are only put on consonants, not independent +// vowels. +// References: +// + Unicode v9, 12.1: +// Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
+// + http://varamozhi.sourceforge.net/iscii91.pdf +TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) { + std::string str = "\u0c05\u0c47"; // A EE + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); + str = "\u0c1e\u0c3e"; // NYA AA + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +// Sub-graphemes are allowed if GraphemeNorm is turned off. +TEST(ValidateIndicTest, SubGraphemes) { + std::string str = "\u0d3e"; // AA + std::string dest; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &dest)) + << PrintString32WithUnicodes(str); + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNone, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +TEST(ValidateIndicTest, Nukta) { + std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9")); + // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it. + std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA + ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str); +} + +// Sinhala has some of its own specific rules. See www.macciato.com/sinhala +TEST(ValidateIndicTest, SinhalaRakaransaya) { + std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna + std::string dest; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 2); + EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb")); + // Can be followed by a dependent vowel. + str += "\u0dd9"; // E + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); +} + +TEST(ValidateIndicTest, SinhalaYansaya) { + std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna + std::string dest; + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); + // Can be followed by a dependent vowel. 
+ str += "\u0ddd"; // OO + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), &dest)) + << PrintString32WithUnicodes(str); + EXPECT_EQ(dest, str); + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba")); +} + +TEST(ValidateIndicTest, SinhalaRepaya) { + std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 2); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8")); + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 3); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); +} + +TEST(ValidateIndicTest, SinhalaSpecials) { + // Sinhala has some exceptions from the usual rules. + std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d"; + std::vector<std::string> glyphs; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(glyphs[0], std::string("\u0dc0")); + EXPECT_EQ(glyphs[1], std::string("\u0d9c")); + EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb")); + EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d")); + EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d")); + str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf"; + EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs); + EXPECT_EQ(glyphs[0], std::string("\u0dc3")); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[3], std::string("\u0dcf")); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validate_khmer_test.cc b/tesseract/unittest/validate_khmer_test.cc new file mode 100644 index 00000000..74b87e61 --- /dev/null +++ b/tesseract/unittest/validate_khmer_test.cc @@ -0,0 +1,50 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +// Test some random Khmer words. 
+TEST(ValidateKhmerTest, GoodKhmerWords) { + std::string str = "ព័ត៏មានប្លែកៗ"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 13, 12, 7, str); + str = "ទំនុកច្រៀង"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 10, 9, 5, str); + str = "កាលីហ្វូញ៉ា"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 4, str); + str = "ចាប់ពីផ្លូវ"; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 10, 5, str); +} + +// Test some random Khmer words with dotted circles. +TEST(ValidateKhmerTest, BadKhmerWords) { + std::string result; + // Multiple dependent vowels not allowed + std::string str = "\u1796\u17b6\u17b7"; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // Multiple shifters not allowed + str = "\u1798\u17c9\u17ca"; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // Multiple signs not allowed + str = "\u1780\u17b6\u17cb\u17cd"; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validate_myanmar_test.cc b/tesseract/unittest/validate_myanmar_test.cc new file mode 100644 index 00000000..262e04b6 --- /dev/null +++ b/tesseract/unittest/validate_myanmar_test.cc @@ -0,0 +1,54 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" + +namespace tesseract { + +// Test some random Myanmar words. +TEST(ValidateMyanmarTest, GoodMyanmarWords) { + std::string str = "လျှာကသိသည် "; // No viramas in this one. + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 11, 5, str); + str = "တုန္လႈပ္မႈ "; + ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 9, 4, str); +} + +// Test some random Myanmar words with dotted circles. +TEST(ValidateMyanmarTest, BadMyanmarWords) { + std::string str = "က်န္းမာေရး"; + std::vector<std::string> glyphs; + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, + str.c_str(), &glyphs)); + std::string result; + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // It works if the grapheme normalization is turned off. + EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNone, str.c_str(), &result)); + EXPECT_EQ(str, result); + str = "ခုႏွစ္"; + EXPECT_FALSE(NormalizeCleanAndSegmentUTF8( + UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, + true, str.c_str(), &glyphs)); + EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNormalize, str.c_str(), + &result)); + // It works if the grapheme normalization is turned off. 
+ EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, + GraphemeNorm::kNone, str.c_str(), &result)); + EXPECT_EQ(str, result); +} + +} // namespace tesseract diff --git a/tesseract/unittest/validator_test.cc b/tesseract/unittest/validator_test.cc new file mode 100644 index 00000000..84cb42af --- /dev/null +++ b/tesseract/unittest/validator_test.cc @@ -0,0 +1,76 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "validator.h" + +#include "gmock/gmock.h" // for testing::ElementsAreArray +#include "include_gunit.h" + +namespace tesseract { + +class TestableValidator : public Validator { + public: + static ViramaScript TestableMostFrequentViramaScript( + const std::vector<char32>& utf32) { + return MostFrequentViramaScript(utf32); + } +}; + +// The majority of Validator is tested by the script-specific tests of its +// subclasses, but the MostFrequentViramaScript function is worth a unittest. +TEST(ValidatorTest, MostFrequentViramaScript) { + // The most frequent virama script should come out correct, despite + // distractions from other scripts. + EXPECT_EQ(ViramaScript::kTelugu, + TestableValidator::TestableMostFrequentViramaScript({0xc05})); + // It is still Telugu surrounded by Latin. + EXPECT_EQ(ViramaScript::kTelugu, + TestableValidator::TestableMostFrequentViramaScript( + {'a', 0xc05, 'b', 'c'})); + // But not still Telugu surrounded by Devanagari. + EXPECT_EQ(ViramaScript::kDevanagari, + TestableValidator::TestableMostFrequentViramaScript( + {0x905, 0xc05, 0x906, 0x907})); + EXPECT_EQ(ViramaScript::kKannada, + TestableValidator::TestableMostFrequentViramaScript( + {0xc85, 0xc05, 0xc86, 0xc87})); + EXPECT_EQ(ViramaScript::kBengali, + TestableValidator::TestableMostFrequentViramaScript( + {0x985, 0xc05, 0x986, 0x987})); + // Danda and double Danda don't count as Devanagari, as they are common. + EXPECT_EQ(ViramaScript::kTelugu, + TestableValidator::TestableMostFrequentViramaScript( + {0x964, 0xc05, 0x965, 0x965})); +} + +// ValidateCleanAndSegment doesn't modify the input by much, but its +// transformation should be idempotent. (Doesn't change again if re-applied.) 
+TEST(ValidatorTest, Idempotency) { + std::vector<char32> str1( + {0xd24, 0xd23, 0xd32, 0xd4d, '\'', 0x200d, 0x200c, 0x200d, 0x200c}); + std::vector<char32> str2( + {0xd24, 0xd23, 0xd32, 0xd4d, 0x200c, 0x200d, 0x200c, 0x200d, '\''}); + std::vector<std::vector<char32>> result1, result2, result3, result4; + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, str1, &result1)); + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, result1[0], &result2)); + EXPECT_EQ(result1.size(), result2.size()); + EXPECT_THAT(result2[0], testing::ElementsAreArray(result1[0])); + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, str2, &result3)); + EXPECT_TRUE(Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, true, result3[0], &result4)); + EXPECT_EQ(result3.size(), result4.size()); + EXPECT_THAT(result4[0], testing::ElementsAreArray(result3[0])); +} + +} // namespace tesseract
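
The validator test files above all drive the same small surface of normstrngs.h. As a reader's key, the sketch below (illustrative only, not part of the patch; the test name is hypothetical) shows the three entry points they exercise: NormalizeUTF8String, NormalizeCleanAndSegmentUTF8, and the ExpectGraphemeModeResults helper from normstrngs_test.h. The strings and expected values are copied from the Telugu terminal-virama cases asserted in validate_indic_test.cc, so the sketch claims nothing that the tests in this diff do not already check.

```
// Illustrative sketch; not part of the patch above. Test name is hypothetical,
// all calls and expected values mirror the existing validate_indic_test.cc.
#include <string>
#include <vector>

#include "include_gunit.h"
#include "normstrngs.h"
#include "normstrngs_test.h"

namespace tesseract {

TEST(NormstrngsUsageSketch, TeluguTerminalVirama) {
  // Telugu KA followed by a bare virama; the grapheme normalizer appends a
  // terminating ZWNJ (see AddsJoinerToTerminalVirama above).
  std::string str = "\u0c15\u0c4d";

  // 1. Whole-string normalization: returns true for a valid grapheme stream
  //    and writes the normalized text to the output parameter.
  std::string normalized;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &normalized));
  EXPECT_EQ(std::string("\u0c15\u0c4d\u200c"), normalized);  // KA virama ZWNJ

  // 2. Normalization plus segmentation into pieces; kCombined keeps whole
  //    graphemes together, kGlyphSplit breaks them into renderable glyphs.
  std::vector<std::string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
      str.c_str(), &glyphs));

  // 3. Helper from normstrngs_test.h that checks the piece count in each
  //    grapheme mode plus the normalized target; the 3/2/1 counts are the
  //    ones asserted by AddsJoinerToTerminalVirama.
  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1,
                            "\u0c15\u0c4d\u200c");
}

}  // namespace tesseract
```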