diff options
Diffstat (limited to 'tesseract/unittest/util/utf8/unilib.h')
-rw-r--r-- | tesseract/unittest/util/utf8/unilib.h | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/tesseract/unittest/util/utf8/unilib.h b/tesseract/unittest/util/utf8/unilib.h new file mode 100644 index 00000000..e99895a2 --- /dev/null +++ b/tesseract/unittest/util/utf8/unilib.h @@ -0,0 +1,63 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Routines to do manipulation of Unicode characters or text +// +// The StructurallyValid routines accept buffers of arbitrary bytes. +// For CoerceToStructurallyValid(), the input buffer and output buffers may +// point to exactly the same memory. +// +// In all other cases, the UTF-8 string must be structurally valid and +// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. +// Debug builds take a fatal error for invalid UTF-8 input. +// The input and output buffers may not overlap at all. +// +// The char32 routines are here only for convenience; they convert to UTF-8 +// internally and use the UTF-8 routines. + +#ifndef UTIL_UTF8_UNILIB_H__ +#define UTIL_UTF8_UNILIB_H__ + +#include <string> +#include "syntaxnet/base.h" + +// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here, +// but they are defined in unilib_utf8_utils.h. +//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export + +namespace UniLib { + +// Returns the length in bytes of the prefix of src that is all +// interchange valid UTF-8 +int SpanInterchangeValid(const char* src, int byte_length); +inline int SpanInterchangeValid(const std::string& src) { + return SpanInterchangeValid(src.data(), src.size()); +} + +// Returns true if the source is all interchange valid UTF-8 +// "Interchange valid" is a stronger than structurally valid -- +// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. +bool IsInterchangeValid(char32 codepoint); +inline bool IsInterchangeValid(const char* src, int byte_length) { + return (byte_length == SpanInterchangeValid(src, byte_length)); +} +inline bool IsInterchangeValid(const std::string& src) { + return IsInterchangeValid(src.data(), src.size()); +} + +} // namespace UniLib + +#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ |