diff options
Diffstat (limited to 'tesseract/unittest/lang_model_test.cc')
-rw-r--r-- | tesseract/unittest/lang_model_test.cc | 217 |
1 files changed, 217 insertions, 0 deletions
diff --git a/tesseract/unittest/lang_model_test.cc b/tesseract/unittest/lang_model_test.cc new file mode 100644 index 00000000..b059c18c --- /dev/null +++ b/tesseract/unittest/lang_model_test.cc @@ -0,0 +1,217 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <string> // for std::string + +#include "absl/strings/str_cat.h" + +#include "gmock/gmock.h" // for testing::ElementsAreArray + +#include "include_gunit.h" +#include "lang_model_helpers.h" +#include "log.h" // for LOG +#include "lstmtrainer.h" +#include "unicharset_training_utils.h" + +namespace tesseract { + +std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); +} + +// This is an integration test that verifies that CombineLangModel works to +// the extent that an LSTMTrainer can be initialized with the result, and it +// can encode strings. More importantly, the test verifies that adding an extra +// character to the unicharset does not change the encoding of strings. +TEST(LangModelTest, AddACharacter) { + constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&"; + constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹"; + // Setup the arguments. + std::string script_dir = LANGDATA_DIR; + std::string eng_dir = file::JoinPath(script_dir, "eng"); + std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset"); + UNICHARSET unicharset; + EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str())); + std::string version_str = "TestVersion"; + file::MakeTmpdir(); + std::string output_dir = FLAGS_test_tmpdir; + LOG(INFO) << "Output dir=" << output_dir << "\n"; + std::string lang1 = "eng"; + bool pass_through_recoder = false; + std::vector<STRING> words, puncs, numbers; + // If these reads fail, we get a warning message and an empty list of words. + ReadFile(file::JoinPath(eng_dir, "eng.wordlist"), nullptr) + .split('\n', &words); + EXPECT_GT(words.size(), 0); + ReadFile(file::JoinPath(eng_dir, "eng.punc"), nullptr).split('\n', &puncs); + EXPECT_GT(puncs.size(), 0); + ReadFile(file::JoinPath(eng_dir, "eng.numbers"), nullptr) + .split('\n', &numbers); + EXPECT_GT(numbers.size(), 0); + bool lang_is_rtl = false; + // Generate the traineddata file. + EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, + lang1, pass_through_recoder, words, puncs, + numbers, lang_is_rtl, nullptr, nullptr)); + // Init a trainer with it, and encode kTestString. + std::string traineddata1 = + file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata")); + LSTMTrainer trainer1; + trainer1.InitCharSet(traineddata1); + std::vector<int> labels1; + EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1)); + STRING test1_decoded = trainer1.DecodeLabels(labels1); + std::string test1_str(&test1_decoded[0], test1_decoded.length()); + LOG(INFO) << "Labels1=" << test1_str << "\n"; + + // Add a new character to the unicharset and try again. + int size_before = unicharset.size(); + unicharset.unichar_insert("₹"); + SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, + &unicharset); + EXPECT_EQ(size_before + 1, unicharset.size()); + // Generate the traineddata file. + std::string lang2 = "extended"; + EXPECT_EQ(EXIT_SUCCESS, + CombineLangModel(unicharset, script_dir, version_str, output_dir, + lang2, pass_through_recoder, words, puncs, numbers, + lang_is_rtl, nullptr, nullptr)); + // Init a trainer with it, and encode kTestString. + std::string traineddata2 = + file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata")); + LSTMTrainer trainer2; + trainer2.InitCharSet(traineddata2); + std::vector<int> labels2; + EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2)); + STRING test2_decoded = trainer2.DecodeLabels(labels2); + std::string test2_str(&test2_decoded[0], test2_decoded.length()); + LOG(INFO) << "Labels2=" << test2_str << "\n"; + // encode kTestStringRupees. + std::vector<int> labels3; + EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3)); + STRING test3_decoded = trainer2.DecodeLabels(labels3); + std::string test3_str(&test3_decoded[0], test3_decoded.length()); + LOG(INFO) << "labels3=" << test3_str << "\n"; + // Copy labels1 to a std::vector, renumbering the null char to match trainer2. + // Since Tensor Flow's CTC implementation insists on having the null be the + // last label, and we want to be compatible, null has to be renumbered when + // we add a class. + int null1 = trainer1.null_char(); + int null2 = trainer2.null_char(); + EXPECT_EQ(null1 + 1, null2); + std::vector<int> labels1_v(labels1.size()); + for (int i = 0; i < labels1.size(); ++i) { + if (labels1[i] == null1) + labels1_v[i] = null2; + else + labels1_v[i] = labels1[i]; + } + EXPECT_THAT(labels1_v, + testing::ElementsAreArray(&labels2[0], labels2.size())); + // To make sure we we are not cheating somehow, we can now encode the Rupee + // symbol, which we could not do before. + EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1)); + EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2)); +} + +// Same as above test, for hin instead of eng +TEST(LangModelTest, AddACharacterHindi) { + constexpr char kTestString[] = "हिन्दी में एक लाइन लिखें"; + constexpr char kTestStringRupees[] = "हिंदी में रूपये का चिन्ह प्रयोग करें ₹१००.००"; + // Setup the arguments. + std::string script_dir = LANGDATA_DIR; + std::string hin_dir = file::JoinPath(script_dir, "hin"); + std::string unicharset_path = TestDataNameToPath("hin_beam.unicharset"); + UNICHARSET unicharset; + EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str())); + std::string version_str = "TestVersion"; + file::MakeTmpdir(); + std::string output_dir = FLAGS_test_tmpdir; + LOG(INFO) << "Output dir=" << output_dir << "\n"; + std::string lang1 = "hin"; + bool pass_through_recoder = false; + std::vector<STRING> words, puncs, numbers; + // If these reads fail, we get a warning message and an empty list of words. + ReadFile(file::JoinPath(hin_dir, "hin.wordlist"), nullptr) + .split('\n', &words); + EXPECT_GT(words.size(), 0); + ReadFile(file::JoinPath(hin_dir, "hin.punc"), nullptr).split('\n', &puncs); + EXPECT_GT(puncs.size(), 0); + ReadFile(file::JoinPath(hin_dir, "hin.numbers"), nullptr) + .split('\n', &numbers); + EXPECT_GT(numbers.size(), 0); + bool lang_is_rtl = false; + // Generate the traineddata file. + EXPECT_EQ(0, CombineLangModel(unicharset, script_dir, version_str, output_dir, + lang1, pass_through_recoder, words, puncs, + numbers, lang_is_rtl, nullptr, nullptr)); + // Init a trainer with it, and encode kTestString. + std::string traineddata1 = + file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata")); + LSTMTrainer trainer1; + trainer1.InitCharSet(traineddata1); + std::vector<int> labels1; + EXPECT_TRUE(trainer1.EncodeString(kTestString, &labels1)); + STRING test1_decoded = trainer1.DecodeLabels(labels1); + std::string test1_str(&test1_decoded[0], test1_decoded.length()); + LOG(INFO) << "Labels1=" << test1_str << "\n"; + + // Add a new character to the unicharset and try again. + int size_before = unicharset.size(); + unicharset.unichar_insert("₹"); + SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, + &unicharset); + EXPECT_EQ(size_before + 1, unicharset.size()); + // Generate the traineddata file. + std::string lang2 = "extendedhin"; + EXPECT_EQ(EXIT_SUCCESS, + CombineLangModel(unicharset, script_dir, version_str, output_dir, + lang2, pass_through_recoder, words, puncs, numbers, + lang_is_rtl, nullptr, nullptr)); + // Init a trainer with it, and encode kTestString. + std::string traineddata2 = + file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata")); + LSTMTrainer trainer2; + trainer2.InitCharSet(traineddata2); + std::vector<int> labels2; + EXPECT_TRUE(trainer2.EncodeString(kTestString, &labels2)); + STRING test2_decoded = trainer2.DecodeLabels(labels2); + std::string test2_str(&test2_decoded[0], test2_decoded.length()); + LOG(INFO) << "Labels2=" << test2_str << "\n"; + // encode kTestStringRupees. + std::vector<int> labels3; + EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels3)); + STRING test3_decoded = trainer2.DecodeLabels(labels3); + std::string test3_str(&test3_decoded[0], test3_decoded.length()); + LOG(INFO) << "labels3=" << test3_str << "\n"; + // Copy labels1 to a std::vector, renumbering the null char to match trainer2. + // Since Tensor Flow's CTC implementation insists on having the null be the + // last label, and we want to be compatible, null has to be renumbered when + // we add a class. + int null1 = trainer1.null_char(); + int null2 = trainer2.null_char(); + EXPECT_EQ(null1 + 1, null2); + std::vector<int> labels1_v(labels1.size()); + for (int i = 0; i < labels1.size(); ++i) { + if (labels1[i] == null1) + labels1_v[i] = null2; + else + labels1_v[i] = labels1[i]; + } + EXPECT_THAT(labels1_v, + testing::ElementsAreArray(&labels2[0], labels2.size())); + // To make sure we we are not cheating somehow, we can now encode the Rupee + // symbol, which we could not do before. + EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1)); + EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2)); +} + +} // namespace tesseract |