summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'tesseract/unittest/util/utf8/unilib.h')
-rw-r--r--tesseract/unittest/util/utf8/unilib.h63
1 files changed, 63 insertions, 0 deletions
diff --git a/tesseract/unittest/util/utf8/unilib.h b/tesseract/unittest/util/utf8/unilib.h
new file mode 100644
index 00000000..e99895a2
--- /dev/null
+++ b/tesseract/unittest/util/utf8/unilib.h
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Routines to do manipulation of Unicode characters or text
+//
+// The StructurallyValid routines accept buffers of arbitrary bytes.
+// For CoerceToStructurallyValid(), the input buffer and output buffers may
+// point to exactly the same memory.
+//
+// In all other cases, the UTF-8 string must be structurally valid and
+// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
+// Debug builds take a fatal error for invalid UTF-8 input.
+// The input and output buffers may not overlap at all.
+//
+// The char32 routines are here only for convenience; they convert to UTF-8
+// internally and use the UTF-8 routines.
+
+#ifndef UTIL_UTF8_UNILIB_H__
+#define UTIL_UTF8_UNILIB_H__
+
+#include <string>
+#include "syntaxnet/base.h"
+
+// OneCharLen, IsValidCodepoint, and IsTrailByte are defined in
+// unilib_utf8_utils.h; the include that would re-export them is commented out.
+//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
+
+namespace UniLib {
+
+// Returns the length in bytes of the prefix of src that is all interchange
+// valid UTF-8. The std::string overload narrows size() to int explicitly.
+int SpanInterchangeValid(const char* src, int byte_length);
+inline int SpanInterchangeValid(const std::string& src) {
+ return SpanInterchangeValid(src.data(), static_cast<int>(src.size()));
+}
+
+// Returns true if the source is all interchange valid UTF-8.
+// "Interchange valid" is stronger than structurally valid --
+// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
+bool IsInterchangeValid(char32 codepoint);
+inline bool IsInterchangeValid(const char* src, int byte_length) {
+ return SpanInterchangeValid(src, byte_length) == byte_length;
+}
+inline bool IsInterchangeValid(const std::string& src) {
+ return IsInterchangeValid(src.data(), static_cast<int>(src.size()));
+}
+
+} // namespace UniLib
+
+#endif // UTIL_UTF8_UNILIB_H__