microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
92f6b51106c9e9143c452e537cb5e41d2dcaa266

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

tokenizer/unicode.h

101lines · modecode

// This file is part of UniLib <http://github.com/ufal/unilib/>.
//
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
// Mathematics and Physics, Charles University in Prague, Czech Republic.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// UniLib version: 3.2.1-devel
// Unicode version: 13.0.0
12
13#pragma once
14
15#include <cstddef>
16#include <cstdint>
17#include <string>
18
19namespace ufal {
20namespace unilib {
21
// Static-only helper for per-code-point category lookup and single code point
// case mapping. All data comes from two-level lookup tables that are declared
// here and defined outside this header.
class unicode {
  // Shift amounts: each value is the bit position that the corresponding
  // public category mask occupies inside a category_t.
  enum : uint8_t {
    _Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5,
    _Mn = 6, _Mc = 7, _Me = 8,
    _Nd = 9, _Nl = 10, _No = 11,
    _Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18,
    _Sm = 19, _Sc = 20, _Sk = 21, _So = 22,
    _Zs = 23, _Zl = 24, _Zp = 25,
    _Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30
  };

 public:
  // Bit mask of categories. Two-letter names own exactly one bit; the
  // remaining names (Lut, LC, L, M, N, P, S, Z, C) are unions of those bits,
  // so membership tests are written as `category(chr) & unicode::L`.
  using category_t = uint32_t;
  enum : category_t {
    Lu = 1 << _Lu, Ll = 1 << _Ll, Lt = 1 << _Lt, Lut = Lu | Lt, LC = Lu | Ll | Lt,
    Lm = 1 << _Lm, Lo = 1 << _Lo, L = Lu | Ll | Lt | Lm | Lo,
    Mn = 1 << _Mn, Mc = 1 << _Mc, Me = 1 << _Me, M = Mn | Mc | Me,
    Nd = 1 << _Nd, Nl = 1 << _Nl, No = 1 << _No, N = Nd | Nl | No,
    Pc = 1 << _Pc, Pd = 1 << _Pd, Ps = 1 << _Ps, Pe = 1 << _Pe, Pi = 1 << _Pi,
    Pf = 1 << _Pf, Po = 1 << _Po, P = Pc | Pd | Ps | Pe | Pi | Pf | Po,
    Sm = 1 << _Sm, Sc = 1 << _Sc, Sk = 1 << _Sk, So = 1 << _So, S = Sm | Sc | Sk | So,
    Zs = 1 << _Zs, Zl = 1 << _Zl, Zp = 1 << _Zp, Z = Zs | Zl | Zp,
    Cc = 1 << _Cc, Cf = 1 << _Cf, Cs = 1 << _Cs, Co = 1 << _Co, Cn = 1 << _Cn,
    C = Cc | Cf | Cs | Co | Cn
  };

  // Single-bit category mask of chr; DEFAULT_CAT for values >= CHARS.
  static inline category_t category(char32_t chr);

  // Case mappings; each returns chr unchanged when the tables hold no
  // applicable mapping or when chr >= CHARS.
  static inline char32_t lowercase(char32_t chr);
  static inline char32_t uppercase(char32_t chr);
  static inline char32_t titlecase(char32_t chr);

 private:
  // Exclusive upper bound of the values covered by the tables.
  static const char32_t CHARS = 0x110000;
  // Mask returned by category() for values outside the tables.
  static const int32_t DEFAULT_CAT = Cn;

  // Two-level tables: *_index is addressed by (chr >> 8) and selects a
  // 256-entry row of *_block, which is addressed by (chr & 0xFF).
  static const uint8_t category_index[CHARS >> 8];
  static const uint8_t category_block[][256];
  static const uint8_t othercase_index[CHARS >> 8];
  static const char32_t othercase_block[][256];

  // Tag kept in the low 8 bits of an othercase_block entry; the bits above it
  // hold a code point. For the *_THEN_* tags the second mapping is read from
  // the table entry of that stored code point.
  enum othercase_type {
    LOWER_ONLY = 1,
    UPPERTITLE_ONLY = 2,
    UPPER_ONLY = 3,
    LOWER_THEN_UPPER = 4,
    UPPER_THEN_TITLE = 5,
    TITLE_THEN_LOWER = 6
  };
};
64
65unicode::category_t unicode::category(char32_t chr) {
66 return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
67}
68
69char32_t unicode::lowercase(char32_t chr) {
70 if (chr < CHARS) {
71 char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
72 if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8;
73 if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8;
74 if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
75 }
76 return chr;
77}
78
79char32_t unicode::uppercase(char32_t chr) {
80 if (chr < CHARS) {
81 char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
82 if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8;
83 if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8;
84 if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8;
85 if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
86 }
87 return chr;
88}
89
90char32_t unicode::titlecase(char32_t chr) {
91 if (chr < CHARS) {
92 char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
93 if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8;
94 if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase >> 8;
95 if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
96 }
97 return chr;
98}
99
100} // namespace unilib
101} // namespace ufal