microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
2c3d6f7976130aeacd8c2de4f27569d3ef08520a

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

base/string_utils.cc

232lines · modecode

1#ifdef ENABLE_TF_STRING
2#include "farmhash.h"
3#endif
4
5#include "string_utils.h"
6
// Splits `str` at every character that appears in `seps` and returns views
// into `str` (no copies are made, so `str`'s storage must outlive the result).
//
//   str                   - text to split.
//   seps                  - set of single-character separators.
//   remove_empty_entries  - when true, empty fields produced by a leading
//                           separator or by adjacent separators are skipped.
//
// NOTE: the text after the last separator is appended only when it is
// non-empty, regardless of `remove_empty_entries`. A trailing separator (or an
// empty input) therefore never yields a trailing empty field.
// TODO: the original code carried a bare "TODO: bug fix" marker here; it may
// refer to this asymmetry - confirm before changing behaviour.
std::vector<std::string_view> SplitString(const std::string_view& str, const std::string_view& seps, bool remove_empty_entries) {
  std::vector<std::string_view> tokens;
  std::string_view::size_type field_begin = 0;

  for (auto sep_pos = str.find_first_of(seps, field_begin);
       sep_pos != std::string_view::npos;
       sep_pos = str.find_first_of(seps, field_begin)) {
    const bool field_is_empty = (sep_pos == field_begin);
    if (!(field_is_empty && remove_empty_entries)) {
      tokens.push_back(str.substr(field_begin, sep_pos - field_begin));
    }

    // Resume scanning right after the separator that was just consumed.
    field_begin = sep_pos + 1;
  }

  // Whatever follows the last separator (or the whole input if none matched).
  const std::string_view tail = str.substr(field_begin);
  if (!tail.empty()) {
    tokens.push_back(tail);
  }

  return tokens;
}
35
// Returns true when `c` lies in one of the code point ranges listed below,
// which this file treats as CJK ideographs.
bool IsCJK(char32_t c) {
  struct CodePointRange {
    char32_t first;
    char32_t last;  // inclusive
  };

  static constexpr CodePointRange kCjkRanges[] = {
      {0x4E00, 0x9FFF},
      {0x3400, 0x4DBF},
      {0x20000, 0x2A6DF},
      {0x2A700, 0x2B73F},
      {0x2B740, 0x2B81F},
      {0x2B820, 0x2CEAF},
      {0xF900, 0xFAFF},
      {0x2F800, 0x2FA1F},
  };

  for (const auto& range : kCjkRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }

  return false;
}
46
47// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this file classifies as
// whitespace: 9, 10, 13, 32, 160, 8192..8202, 8239, 8287 and 12288.
bool IsSpace(char32_t c) {
  switch (c) {
    case 9:      // U+0009
    case 10:     // U+000A
    case 13:     // U+000D
    case 32:     // U+0020
    case 160:    // U+00A0
    case 8239:   // U+202F
    case 8287:   // U+205F
    case 12288:  // U+3000
      return true;
    default:
      break;
  }

  // The single contiguous run of space code points, U+2000..U+200A.
  return c >= 8192 && c <= 8202;
}
59
60// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this file classifies as
// punctuation. The values are kept in decimal so they can be compared directly
// with the output of the generator script named above this function.
bool IsPunct(char32_t c) {
  // Isolated code points.
  switch (c) {
    case 161:
    case 167:
    case 171:
    case 187:
    case 191:
    case 894:
    case 903:
    case 12336:
    case 12349:
      return true;
    default:
      break;
  }

  struct CodePointRange {
    char32_t first;
    char32_t last;  // inclusive
  };

  // Contiguous runs of code points.
  static constexpr CodePointRange kPunctRanges[] = {
      {33, 47},
      {58, 64},
      {91, 96},
      {123, 126},
      {182, 183},
      {8208, 8231},
      {8240, 8259},
      {8261, 8273},
      {8275, 8286},
      {12289, 12291},
      {12296, 12305},
      {12308, 12319},
  };

  for (const auto& range : kPunctRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }

  return false;
}
75
76// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this file classifies as a
// control character. The values are kept in decimal so they can be compared
// directly with the output of the generator script named above this function.
// Note that 9, 10 and 13 are deliberately absent from the ranges below.
bool IsControl(char32_t c) {
  // Isolated code points.
  switch (c) {
    case 173:
    case 907:
    case 909:
    case 930:
    case 11930:
    case 173790:
    case 195102:
      return true;
    default:
      break;
  }

  struct CodePointRange {
    char32_t first;
    char32_t last;  // inclusive
  };

  // Contiguous runs of code points.
  static constexpr CodePointRange kControlRanges[] = {
      {0, 8},
      {11, 12},
      {14, 31},
      {128, 159},
      {888, 889},
      {896, 899},
      {8203, 8207},
      {8234, 8238},
      {8288, 8302},
      {12020, 12030},
      {40957, 40958},
      {64110, 64111},
      {64218, 64254},
      {177973, 177982},
      {178206, 178207},
      {183970, 183982},
  };

  for (const auto& range : kControlRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }

  return false;
}
93
// Returns true when `c` falls in U+0300..U+036F.
// Only this subset of accent marks is recognised.
// [TODO] support more accent
bool IsAccent(char32_t c) {
  constexpr char32_t kFirstAccent = 0x300;
  constexpr char32_t kLastAccent = 0x36F;

  if (c < kFirstAccent || c > kLastAccent) {
    return false;
  }

  return true;
}
100
// Maps an uppercase code point to its lowercase counterpart for the scripts
// handled below; every other code point is returned unchanged.
//
// Covered ranges (all written as code points so the result does not depend on
// the encoding the compiler assumes for this source file):
//   - ASCII                 U+0041..U+005A  -> +32
//   - Latin-1 Supplement    U+00C0..U+00DE  -> +32 (U+00D7, the multiplication
//                                              sign, is left alone)
//   - Greek and Coptic      U+0391..U+03A9  -> +32 (U+03A2 is unassigned)
//   - Cyrillic              U+0410..U+042F  -> +32
//   - Greek Extended        U+1F08..U+1F6F  -> -8 for the capital letters
//
// The previous implementation tested U+1F08..U+1F68 (Greek Extended) under the
// "Greek and Coptic" label and added 32, which turned capitals into unrelated
// capitals and also shifted lowercase letters in that range. Basic Greek was
// only lowered for seven letters listed in a switch; that switch is now
// subsumed by the U+0391..U+03A9 range.
//
// TODO: Latin Extended-A (U+0100..U+017F) is still not handled.
char32_t ToLower(char32_t c) {
  // Basic ASCII uppercase to lowercase.
  if (c >= U'A' && c <= U'Z') {
    return c + 32;
  }

  // Latin-1 Supplement capitals.
  if (c >= 0x00C0 && c <= 0x00DE) {
    // U+00D7 sits inside the range but is a symbol, not a letter.
    return c == 0x00D7 ? c : c + 32;
  }

  // Greek and Coptic capitals (Alpha..Omega).
  if (c >= 0x0391 && c <= 0x03A9) {
    // U+03A2 is an unassigned hole in the capital row.
    return c == 0x03A2 ? c : c + 32;
  }

  // Cyrillic capitals.
  if (c >= 0x0410 && c <= 0x042F) {
    return c + 32;
  }

  // Greek Extended: within each 16-code-point row the lowercase letters occupy
  // columns 0..7 and the matching capitals columns 8..F, so lowercase = c - 8.
  if (c >= 0x1F08 && c <= 0x1F6F && (c & 0x8) != 0) {
    const char32_t row = c & 0xFFF0;
    const char32_t column = c & 0x000F;  // 0x8..0xF here

    bool is_capital = true;
    if (row == 0x1F10 || row == 0x1F40) {
      // Epsilon and omicron rows have only six forms; xE and xF are unassigned.
      is_capital = column <= 0xD;
    } else if (row == 0x1F50) {
      // Upsilon row: only U+1F59, U+1F5B, U+1F5D and U+1F5F are assigned.
      is_capital = (column & 0x1) != 0;
    }

    return is_capital ? static_cast<char32_t>(c - 8) : c;
  }

  // If no conversion rule applies, return the original character.
  return c;
}
152
// Replaces an accented Latin-1 letter (U+00C0..U+00FF) with its unaccented
// base letter. Code points outside that range are returned unchanged, as are
// the entries in the range that have no plain base letter in the table
// (U+00C6, U+00D0, U+00D7, U+00D8, U+00DE, U+00DF and their lowercase
// counterparts U+00E6, U+00F0, U+00F7, U+00F8, U+00FE).
//
// The table is spelled out one entry per code point, with non-ASCII values
// written as escapes, so that it always holds exactly 64 entries regardless of
// how the source file is encoded. A table one entry short would shift every
// mapping after the gap and make U+00FF read the string terminator.
char32_t StripAccent(char32_t c) {
  constexpr char32_t kFirst = 0x00C0;
  constexpr char32_t kLast = 0x00FF;

  static constexpr char32_t kStripped[] = {
      // U+00C0..U+00CF
      U'A', U'A', U'A', U'A', U'A', U'A', U'\u00C6', U'C',
      U'E', U'E', U'E', U'E', U'I', U'I', U'I', U'I',
      // U+00D0..U+00DF
      U'\u00D0', U'N', U'O', U'O', U'O', U'O', U'O', U'\u00D7',
      U'\u00D8', U'U', U'U', U'U', U'U', U'Y', U'\u00DE', U'\u00DF',
      // U+00E0..U+00EF
      U'a', U'a', U'a', U'a', U'a', U'a', U'\u00E6', U'c',
      U'e', U'e', U'e', U'e', U'i', U'i', U'i', U'i',
      // U+00F0..U+00FF
      U'\u00F0', U'n', U'o', U'o', U'o', U'o', U'o', U'\u00F7',
      U'\u00F8', U'u', U'u', U'u', U'u', U'y', U'\u00FE', U'y',
  };
  static_assert(sizeof(kStripped) / sizeof(kStripped[0]) == kLast - kFirst + 1,
                "StripAccent table must cover U+00C0..U+00FF exactly");

  if (c < kFirst || c > kLast) {
    return c;
  }

  return kStripped[c - kFirst];
}
163
164#ifdef ENABLE_TF_STRING
165// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L28
// Widens one byte to 64 bits without sign extension: the result is always in
// 0..255 even where `char` is signed.
static inline uint64_t ByteAs64(char c) {
  const auto unsigned_byte = static_cast<unsigned char>(c);
  return static_cast<uint64_t>(unsigned_byte);
}
167
168// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L41
// Reads the four bytes at `ptr` as a little-endian unsigned 32-bit value and
// returns it widened to 64 bits (the upper 32 bits are always zero).
// `ptr` must point to at least four readable bytes.
uint64_t DecodeFixed32(const char* ptr) {
  uint64_t value = 0;

  for (int i = 0; i < 4; ++i) {
    // Go through unsigned char first so a negative `char` is not sign-extended.
    const auto byte = static_cast<unsigned char>(ptr[i]);
    value |= static_cast<uint64_t>(byte) << (8 * i);
  }

  return value;
}
175
176// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L55
177static uint64_t DecodeFixed64(const char* ptr) {
178 uint64_t lo = DecodeFixed32(ptr);
179 uint64_t hi = DecodeFixed32(ptr + 4);
180 return (hi << 32) | lo;
181}
182
183// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L79
184uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
185 const uint64_t m = 0xc6a4a7935bd1e995;
186 const int r = 47;
187
188 uint64_t h = seed ^ (n * m);
189
190 while (n >= 8) {
191 uint64_t k = DecodeFixed64(data);
192 data += 8;
193 n -= 8;
194
195 k *= m;
196 k ^= k >> r;
197 k *= m;
198
199 h ^= k;
200 h *= m;
201 }
202
203 switch (n) {
204 case 7:
205 h ^= ByteAs64(data[6]) << 48;
206 case 6:
207 h ^= ByteAs64(data[5]) << 40;
208 case 5:
209 h ^= ByteAs64(data[4]) << 32;
210 case 4:
211 h ^= ByteAs64(data[3]) << 24;
212 case 3:
213 h ^= ByteAs64(data[2]) << 16;
214 case 2:
215 h ^= ByteAs64(data[1]) << 8;
216 default: // case 1: make some code analyzer be happier.
217 h ^= ByteAs64(data[0]);
218 h *= m;
219 }
220
221 h ^= h >> r;
222 h *= m;
223 h ^= h >> r;
224
225 return h;
226}
227
228uint64_t Hash64Fast(const char* data, size_t n) {
229 return static_cast<int64_t>(util::Fingerprint64(data, n));
230}
231
232#endif // ENABLE_TF_STRING
233