microsoft/onnxruntime-extensions
Public · mirrored from https://github.com/microsoft/onnxruntime-extensions · Available
tokenizer/unicode.h
101 lines · mode: code
| 1 | // This file is part of UniLib <http://github.com/ufal/unilib/>. |
| 2 | // |
| 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
| 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. |
| 5 | // |
| 6 | // This Source Code Form is subject to the terms of the Mozilla Public |
| 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this |
| 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 9 | // |
| 10 | // UniLib version: 3.2.1-devel |
| 11 | // Unicode version: 13.0.0 |
| 12 | |
| 13 | #pragma once |
| 14 | |
| 15 | #include <cstddef> |
| 16 | #include <cstdint> |
| 17 | #include <string> |
| 18 | |
| 19 | namespace ufal { |
| 20 | namespace unilib { |
| 21 | |
// Per-code-point Unicode properties: category lookup plus lowercase,
// uppercase and titlecase mapping of a single char32_t value.
class unicode {
  // Bit positions of the individual categories. Every single-category mask
  // in the public enum below is `1 << position`.
  enum : uint8_t {
    // Letters.
    _Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5,
    // Marks.
    _Mn = 6, _Mc = 7, _Me = 8,
    // Numbers.
    _Nd = 9, _Nl = 10, _No = 11,
    // Punctuation.
    _Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18,
    // Symbols.
    _Sm = 19, _Sc = 20, _Sk = 21, _So = 22,
    // Separators.
    _Zs = 23, _Zl = 24, _Zp = 25,
    // Other.
    _Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30
  };

 public:
  // Bit mask of categories; masks can be combined with `|` and tested with `&`.
  using category_t = uint32_t;

  enum : category_t {
    // Letters: single-category masks, then the combined masks.
    Lu = 1 << _Lu,
    Ll = 1 << _Ll,
    Lt = 1 << _Lt,
    Lm = 1 << _Lm,
    Lo = 1 << _Lo,
    Lut = Lu | Lt,
    LC = Lu | Ll | Lt,
    L = Lu | Ll | Lt | Lm | Lo,

    // Marks.
    Mn = 1 << _Mn,
    Mc = 1 << _Mc,
    Me = 1 << _Me,
    M = Mn | Mc | Me,

    // Numbers.
    Nd = 1 << _Nd,
    Nl = 1 << _Nl,
    No = 1 << _No,
    N = Nd | Nl | No,

    // Punctuation.
    Pc = 1 << _Pc,
    Pd = 1 << _Pd,
    Ps = 1 << _Ps,
    Pe = 1 << _Pe,
    Pi = 1 << _Pi,
    Pf = 1 << _Pf,
    Po = 1 << _Po,
    P = Pc | Pd | Ps | Pe | Pi | Pf | Po,

    // Symbols.
    Sm = 1 << _Sm,
    Sc = 1 << _Sc,
    Sk = 1 << _Sk,
    So = 1 << _So,
    S = Sm | Sc | Sk | So,

    // Separators.
    Zs = 1 << _Zs,
    Zl = 1 << _Zl,
    Zp = 1 << _Zp,
    Z = Zs | Zl | Zp,

    // Other.
    Cc = 1 << _Cc,
    Cf = 1 << _Cf,
    Cs = 1 << _Cs,
    Co = 1 << _Co,
    Cn = 1 << _Cn,
    C = Cc | Cf | Cs | Co | Cn
  };

  // Category mask of `chr`; DEFAULT_CAT for values not below CHARS.
  static inline category_t category(char32_t chr);

  // Case mappings of a single code point. Each returns `chr` unchanged when
  // the tables hold no applicable mapping or `chr` is not below CHARS.
  static inline char32_t lowercase(char32_t chr);
  static inline char32_t uppercase(char32_t chr);
  static inline char32_t titlecase(char32_t chr);

 private:
  // Exclusive upper bound of the code points covered by the tables.
  static const char32_t CHARS = 0x110000;
  // Category reported for values outside the covered range.
  static const int32_t DEFAULT_CAT = Cn;

  // Two-level tables: `*_index[chr >> 8]` selects a 256-entry block, which is
  // then indexed by `chr & 0xFF`.
  static const uint8_t category_index[CHARS >> 8];
  static const uint8_t category_block[][256];
  static const uint8_t othercase_index[CHARS >> 8];
  static const char32_t othercase_block[][256];

  // Tag stored in the low 8 bits of an othercase_block entry; the remaining
  // high bits hold a code point (`entry >> 8`).
  enum othercase_type {
    LOWER_ONLY = 1,
    UPPERTITLE_ONLY = 2,
    UPPER_ONLY = 3,
    LOWER_THEN_UPPER = 4,
    UPPER_THEN_TITLE = 5,
    TITLE_THEN_LOWER = 6
  };
};
| 64 | |
| 65 | unicode::category_t unicode::category(char32_t chr) { |
| 66 | return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
| 67 | } |
| 68 | |
| 69 | char32_t unicode::lowercase(char32_t chr) { |
| 70 | if (chr < CHARS) { |
| 71 | char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
| 72 | if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8; |
| 73 | if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8; |
| 74 | if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
| 75 | } |
| 76 | return chr; |
| 77 | } |
| 78 | |
| 79 | char32_t unicode::uppercase(char32_t chr) { |
| 80 | if (chr < CHARS) { |
| 81 | char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
| 82 | if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
| 83 | if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8; |
| 84 | if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8; |
| 85 | if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
| 86 | } |
| 87 | return chr; |
| 88 | } |
| 89 | |
| 90 | char32_t unicode::titlecase(char32_t chr) { |
| 91 | if (chr < CHARS) { |
| 92 | char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
| 93 | if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
| 94 | if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase >> 8; |
| 95 | if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
| 96 | } |
| 97 | return chr; |
| 98 | } |
| 99 | |
| 100 | } // namespace unilib |
| 101 | } // namespace ufal |