microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
main

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

base/string_utils.cc

229lines · modecode

1#include "string_utils.h"
2#include <functional>
3
// Splits `str` at every character that appears in `seps` and returns views
// into `str` (no copies are made, so `str`'s storage must outlive the result).
//
// - Empty fields that are followed by a separator are kept only when
//   `remove_empty_entries` is false.
// - The final field (the text after the last separator) is appended only when
//   it is non-empty, regardless of `remove_empty_entries`.
std::vector<std::string_view> SplitString(const std::string_view& str, const std::string_view& seps, bool remove_empty_entries) {
  std::vector<std::string_view> tokens;
  std::string_view::size_type field_begin = 0;

  // Walk separator to separator; each hit closes the field that started at field_begin.
  for (auto sep_at = str.find_first_of(seps, field_begin);
       sep_at != std::string_view::npos;
       sep_at = str.find_first_of(seps, field_begin)) {
    const bool field_is_empty = (sep_at == field_begin);
    if (!(field_is_empty && remove_empty_entries)) {
      tokens.push_back(str.substr(field_begin, sep_at - field_begin));
    }
    field_begin = sep_at + 1;
  }

  // Whatever follows the last separator; empty when the string ends with a separator.
  const std::string_view tail = str.substr(field_begin);
  if (!tail.empty()) {
    tokens.push_back(tail);
  }

  return tokens;
}
32
// Returns true when `c` lies in one of the CJK ideograph ranges listed below.
bool IsCJK(char32_t c) {
  struct CodePointRange {
    char32_t first;
    char32_t last;
  };
  static constexpr CodePointRange kIdeographRanges[] = {
      {0x4E00, 0x9FFF},    // CJK Unified Ideographs
      {0x3400, 0x4DBF},    // Extension A
      {0x20000, 0x2A6DF},  // Extension B
      {0x2A700, 0x2B73F},  // Extension C
      {0x2B740, 0x2B81F},  // Extension D
      {0x2B820, 0x2CEAF},  // Extension E
      {0xF900, 0xFAFF},    // CJK Compatibility Ideographs
      {0x2F800, 0x2FA1F},  // CJK Compatibility Ideographs Supplement
  };

  for (const auto& range : kIdeographRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }
  return false;
}
43
44// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this file treats as whitespace.
bool IsSpace(char32_t c) {
  switch (c) {
    // Individually listed code points (9 and 10 are the original 9..10 range).
    case 9:
    case 10:
    case 13:
    case 32:
    case 160:
    case 8239:
    case 8287:
    case 12288:
      return true;
    default:
      // The only remaining multi-code-point range.
      return c >= 8192 && c <= 8202;
  }
}
56
57// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this file treats as punctuation.
// Membership is a lookup in a table of inclusive [first, last] ranges; single
// code points are stored as ranges of length one.
bool IsPunct(char32_t c) {
  struct CodePointRange {
    char32_t first;
    char32_t last;
  };
  static constexpr CodePointRange kPunctRanges[] = {
      {33, 47},       {58, 64},       {91, 96},       {123, 126},
      {161, 161},     {167, 167},     {171, 171},     {182, 183},
      {187, 187},     {191, 191},     {894, 894},     {903, 903},
      {8208, 8231},   {8240, 8259},   {8261, 8273},   {8275, 8286},
      {12289, 12291}, {12296, 12305}, {12308, 12319}, {12336, 12336},
      {12349, 12349},
  };

  for (const auto& range : kPunctRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }
  return false;
}
72
73// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this file treats as a
// control character. Note that tab (9), line feed (10) and carriage return (13)
// are deliberately absent from the table.
bool IsControl(char32_t c) {
  struct CodePointRange {
    char32_t first;
    char32_t last;
  };
  // Inclusive [first, last] ranges; single code points are ranges of length one.
  static constexpr CodePointRange kControlRanges[] = {
      {0, 8},           {11, 12},         {14, 31},         {128, 159},
      {173, 173},       {888, 889},       {896, 899},       {907, 907},
      {909, 909},       {930, 930},       {8203, 8207},     {8234, 8238},
      {8288, 8302},     {11930, 11930},   {12020, 12030},   {40957, 40958},
      {64110, 64111},   {64218, 64254},   {173790, 173790}, {177973, 177982},
      {178206, 178207}, {183970, 183982}, {195102, 195102},
  };

  for (const auto& range : kControlRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }
  return false;
}
90
// Returns true when `c` lies in U+0300..U+036F (the Combining Diacritical
// Marks range). Accents outside that range are not recognised yet.
// [TODO] support more accent
bool IsAccent(char32_t c) {
  constexpr char32_t kFirstCombiningMark = 0x300;
  constexpr char32_t kLastCombiningMark = 0x36F;

  if (c < kFirstCombiningMark) {
    return false;
  }
  return c <= kLastCombiningMark;
}
97
// Maps an uppercase code point to its lowercase counterpart for a limited set
// of scripts; every other code point is returned unchanged.
//
// Covered ranges:
//   - ASCII                     U+0041..U+005A  (lower = upper + 32)
//   - Latin-1 Supplement        U+00C0..U+00DE  (lower = upper + 32, U+00D7 excluded)
//   - Greek capitals            U+0391..U+03A9  (lower = upper + 32, U+03A2 excluded)
//   - Cyrillic capitals         U+0410..U+042F  (lower = upper + 32)
//   - Greek Extended capitals   U+1F08..U+1F6F  (lower = upper - 8)
//
// Latin Extended-A and other scripts are not handled yet.
char32_t ToLower(char32_t c) {
  // Basic ASCII uppercase to lowercase.
  if (c >= U'A' && c <= U'Z') {
    return c + 32;
  }

  // Latin-1 Supplement capitals (U+00C0 .. U+00DE).
  if (c >= 0x00C0 && c <= 0x00DE) {
    // U+00D7 is the multiplication sign, not a letter.
    return c == 0x00D7 ? c : c + 32;
  }

  // Greek capitals Alpha .. Omega (U+0391 .. U+03A9).
  if (c >= 0x0391 && c <= 0x03A9) {
    // U+03A2 is unassigned (its would-be lowercase slot holds final sigma).
    return c == 0x03A2 ? c : c + 32;
  }

  // Cyrillic capitals (U+0410 .. U+042F).
  if (c >= 0x0410 && c <= 0x042F) {
    return c + 32;
  }

  // Greek Extended (U+1F08 .. U+1F6F): within each 16-code-point row the
  // capitals occupy the upper 8 slots (bit 3 set) and the matching lowercase
  // letter sits exactly 8 below. Lowercase letters in the range are untouched.
  if (c >= 0x1F08 && c <= 0x1F6F && (c & 0x8) != 0) {
    switch (c) {
      // Unassigned slots among the capital positions.
      case 0x1F1E:
      case 0x1F1F:
      case 0x1F4E:
      case 0x1F4F:
      case 0x1F58:
      case 0x1F5A:
      case 0x1F5C:
      case 0x1F5E:
        return c;
      default:
        return c - 8;
    }
  }

  // If no conversion rule applies, return the original character.
  return c;
}
149
// Replaces an accented Latin-1 Supplement letter (U+00C0 .. U+00FF) with its
// unaccented base letter. Code points outside that range are returned
// unchanged, as are the in-range characters that have no base letter in the
// table (U+00C6, U+00D0, U+00D7, U+00D8, U+00DE, U+00DF and their lowercase
// counterparts U+00E6, U+00F0, U+00F7, U+00F8, U+00FE).
char32_t StripAccent(char32_t c) {
  constexpr char32_t kFirst = 0x00C0;
  constexpr char32_t kLast = 0x00FF;

  // One entry per code point in [kFirst, kLast], in code-point order. Written
  // with universal character names so the table cannot be corrupted by a
  // source-encoding mix-up (a missing entry shifts every later mapping).
  static constexpr char32_t kBaseLetters[] =
      U"AAAAAA\u00C6CEEEEIIII\u00D0NOOOOO\u00D7\u00D8UUUUY\u00DE\u00DF"
      U"aaaaaa\u00E6ceeeeiiii\u00F0nooooo\u00F7\u00F8uuuuy\u00FEy";
  static_assert(sizeof(kBaseLetters) / sizeof(kBaseLetters[0]) == (kLast - kFirst + 1) + 1,
                "table must hold exactly one entry per code point plus the terminator");

  if (c < kFirst || c > kLast) {
    return c;
  }

  return kBaseLetters[c - kFirst];
}
160
161#ifdef ENABLE_TF_STRING
162// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L28
// Widens one byte to 64 bits without sign extension: the result is always in
// [0, 255] even when `char` is signed and `c` is negative.
static inline uint64_t ByteAs64(char c) {
  return static_cast<uint64_t>(static_cast<unsigned char>(c));
}
164
165// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L41
// Reads four bytes starting at `ptr` as a little-endian unsigned 32-bit value
// (ptr[0] is the least significant byte) and returns it widened to 64 bits.
// The caller must guarantee that ptr[0..3] are readable.
uint64_t DecodeFixed32(const char* ptr) {
  uint64_t value = 0;
  // Fold from the most significant byte down so each step shifts in 8 more bits.
  for (int index = 3; index >= 0; --index) {
    value = (value << 8) | static_cast<uint64_t>(static_cast<unsigned char>(ptr[index]));
  }
  return value;
}
172
173// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L55
174static uint64_t DecodeFixed64(const char* ptr) {
175 uint64_t lo = DecodeFixed32(ptr);
176 uint64_t hi = DecodeFixed32(ptr + 4);
177 return (hi << 32) | lo;
178}
179
180// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L79
181uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
182 const uint64_t m = 0xc6a4a7935bd1e995;
183 const int r = 47;
184
185 uint64_t h = seed ^ (n * m);
186
187 while (n >= 8) {
188 uint64_t k = DecodeFixed64(data);
189 data += 8;
190 n -= 8;
191
192 k *= m;
193 k ^= k >> r;
194 k *= m;
195
196 h ^= k;
197 h *= m;
198 }
199
200 switch (n) {
201 case 7:
202 h ^= ByteAs64(data[6]) << 48;
203 case 6:
204 h ^= ByteAs64(data[5]) << 40;
205 case 5:
206 h ^= ByteAs64(data[4]) << 32;
207 case 4:
208 h ^= ByteAs64(data[3]) << 24;
209 case 3:
210 h ^= ByteAs64(data[2]) << 16;
211 case 2:
212 h ^= ByteAs64(data[1]) << 8;
213 default: // case 1: make some code analyzer be happier.
214 h ^= ByteAs64(data[0]);
215 h *= m;
216 }
217
218 h ^= h >> r;
219 h *= m;
220 h ^= h >> r;
221
222 return h;
223}
224
// Hashes the `n` bytes at `data` with the standard library's string_view hash.
// The value is whatever std::hash produces on the current platform.
uint64_t Hash64Fast(const char* data, size_t n) {
  const std::string_view bytes(data, n);
  const std::hash<std::string_view> hasher{};
  return hasher(bytes);
}
228
229#endif // ENABLE_TF_STRING
230