microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
rel-0.5

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/string_utils.cc

194lines · modeblame

4eaa5ac4Wenbing Li4 years ago1#ifdef ENABLE_TF_STRING
c891e5d7Wenbing Li5 years ago2#include "farmhash.h"
4eaa5ac4Wenbing Li4 years ago3#endif
4
c891e5d7Wenbing Li5 years ago5#include "string_utils.h"
6
// Splits `str` at every character that appears in `seps` and returns the pieces
// as views into `str` (no characters are copied).
//
// An empty piece found before a separator is skipped only when
// `remove_empty_entries` is true. The piece after the last separator is
// appended only when it is non-empty, regardless of `remove_empty_entries`.
std::vector<std::string_view> SplitString(const std::string_view& str, const std::string_view& seps, bool remove_empty_entries) {
  std::vector<std::string_view> pieces;
  std::string_view::size_type begin = 0;

  //TODO: bug fix
  for (auto sep_at = str.find_first_of(seps, begin);
       sep_at != std::string_view::npos;
       sep_at = str.find_first_of(seps, begin)) {
    const bool piece_is_empty = (begin == sep_at);
    if (!(piece_is_empty && remove_empty_entries)) {
      pieces.push_back(str.substr(begin, sep_at - begin));
    }

    // Resume scanning right after the separator that was just consumed.
    begin = sep_at + 1;
  }

  // Whatever follows the last separator; empty when the input ends with a separator.
  const std::string_view tail = str.substr(begin);
  if (!tail.empty()) {
    pieces.push_back(tail);
  }

  return pieces;
}
35
// Returns true when `c` falls inside one of the code point ranges listed below.
bool IsCJK(char32_t c) {
  struct Span {
    char32_t lo;
    char32_t hi;
  };
  // Inclusive [lo, hi] ranges, kept in the order the checks were originally written.
  static constexpr Span kSpans[] = {
      {0x4E00, 0x9FFF},
      {0x3400, 0x4DBF},
      {0x20000, 0x2A6DF},
      {0x2A700, 0x2B73F},
      {0x2B740, 0x2B81F},
      {0x2B820, 0x2CEAF},
      {0xF900, 0xFAFF},
      {0x2F800, 0x2FA1F},
  };

  for (const Span& span : kSpans) {
    if (c >= span.lo && c <= span.hi) {
      return true;
    }
  }
  return false;
}
46
// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this table treats as whitespace.
bool IsSpace(char32_t c) {
  switch (c) {
    case 9:
    case 10:
    case 13:
    case 32:
    case 160:
    case 8239:
    case 8287:
    case 12288:
      return true;
    default:
      // Remaining entries form one contiguous run: 8192..8202 inclusive.
      return c >= 8192 && c <= 8202;
  }
}
59
// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this table treats as punctuation.
bool IsPunct(char32_t c) {
  struct Span {
    char32_t lo;
    char32_t hi;
  };
  // Inclusive [lo, hi] ranges; isolated code points are stored as lo == hi.
  static constexpr Span kSpans[] = {
      {33, 47},       {58, 64},       {91, 96},       {123, 126},
      {161, 161},     {167, 167},     {171, 171},     {182, 183},
      {187, 187},     {191, 191},     {894, 894},     {903, 903},
      {8208, 8231},   {8240, 8259},   {8261, 8273},   {8275, 8286},
      {12289, 12291}, {12296, 12305}, {12308, 12319}, {12336, 12336},
      {12349, 12349},
  };

  for (const Span& span : kSpans) {
    if (c >= span.lo && c <= span.hi) {
      return true;
    }
  }
  return false;
}
75
// Generated by tools/generate_unicode_category_table.py
// Returns true when `c` is one of the code points this table treats as control.
bool IsControl(char32_t c) {
  // Isolated code points.
  switch (c) {
    case 173:
    case 907:
    case 909:
    case 930:
    case 11930:
    case 173790:
    case 195102:
      return true;
    default:
      break;
  }

  struct Span {
    char32_t lo;
    char32_t hi;
  };
  // Inclusive [lo, hi] ranges. char32_t is unsigned, so the first range needs
  // no explicit lower-bound check beyond the generic comparison.
  static constexpr Span kSpans[] = {
      {0, 8},           {11, 12},         {14, 31},         {128, 159},
      {888, 889},       {896, 899},       {8203, 8207},     {8234, 8238},
      {8288, 8302},     {12020, 12030},   {40957, 40958},   {64110, 64111},
      {64218, 64254},   {177973, 177982}, {178206, 178207}, {183970, 183982},
  };

  for (const Span& span : kSpans) {
    if (c >= span.lo && c <= span.hi) {
      return true;
    }
  }
  return false;
}
93
// Returns true when `c` lies in the inclusive range 0x300..0x36F.
// only support part of accent
// [TODO] support more accent
bool IsAccent(char32_t c) {
  constexpr char32_t kFirstAccent = 0x300;
  constexpr char32_t kLastAccent = 0x36F;
  return !(c < kFirstAccent || c > kLastAccent);
}
100
// only support latin now
//
// Maps an upper-case letter to its lower-case counterpart and returns every
// other code point unchanged. Two ranges are handled:
//   * 'A'..'Z'
//   * U+00C0..U+00DE, whose lower-case forms sit 0x20 above
//
// U+00D7 (multiplication sign) lies inside the second range but is not a
// letter, so it is excluded; shifting it by 0x20 would turn it into U+00F7
// (division sign). Numeric code points are used so the result does not depend
// on how the compiler decodes this source file.
char32_t ToLower(char32_t c) {
  constexpr char32_t kCaseOffset = 0x20;  // distance between the upper- and lower-case forms

  if (c >= U'A' && c <= U'Z') {
    return c + kCaseOffset;
  }

  constexpr char32_t kLatin1UpperFirst = 0xC0;
  constexpr char32_t kLatin1UpperLast = 0xDE;
  constexpr char32_t kMultiplicationSign = 0xD7;
  if (c >= kLatin1UpperFirst && c <= kLatin1UpperLast && c != kMultiplicationSign) {
    return c + kCaseOffset;
  }

  return c;
}
113
// Replaces a code point in U+00C0..U+00FF with the entry stored for it in the
// table below; any other code point is returned unchanged.
//
// The table holds exactly one entry per code point in that range (64 entries).
// Accented A/C/E/I/N/O/U/Y letters map to the plain ASCII letter of the same
// case; every other code point in the range maps to itself.
//
// Entries are written as \u escapes so the table cannot be damaged by a source
// encoding mismatch, and the static_assert guards against an entry going
// missing (which would shift every later mapping and make the last index read
// the string terminator).
char32_t StripAccent(char32_t c) {
  constexpr char32_t kFirst = 0xC0;
  constexpr char32_t kLast = 0xFF;

  static constexpr char32_t kStripped[] =
      U"AAAAAA\u00C6CEEEEIIII"                           // U+00C0..U+00CF
      U"\u00D0NOOOOO\u00D7\u00D8UUUUY\u00DE\u00DF"       // U+00D0..U+00DF
      U"aaaaaa\u00E6ceeeeiiii"                           // U+00E0..U+00EF
      U"\u00F0nooooo\u00F7\u00F8uuuuy\u00FEy";           // U+00F0..U+00FF
  static_assert(sizeof(kStripped) / sizeof(kStripped[0]) == (kLast - kFirst + 1) + 1,
                "table must hold one entry per code point plus the terminator");

  if (c < kFirst || c > kLast) {
    return c;
  }

  return kStripped[c - kFirst];
}
124
4eaa5ac4Wenbing Li4 years ago125#ifdef ENABLE_TF_STRING
// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L28
// Widens one byte to 64 bits without sign extension, so the result is always 0..255.
static inline uint64_t ByteAs64(char c) {
  return static_cast<uint64_t>(static_cast<unsigned char>(c));
}
128
// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L41
// Reads the four bytes at ptr[0..3] and assembles them into an integer with
// ptr[0] as the least significant byte. Each byte is treated as unsigned.
// The result is returned widened to 64 bits; only the low 32 bits can be set.
uint64_t DecodeFixed32(const char* ptr) {
  uint64_t value = 0;
  // Walk from the most significant byte down so each shift makes room for the next one.
  for (int index = 3; index >= 0; --index) {
    value = (value << 8) | static_cast<uint64_t>(static_cast<unsigned char>(ptr[index]));
  }
  return value;
}
136
137// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L55
138static uint64_t DecodeFixed64(const char* ptr) {
139uint64_t lo = DecodeFixed32(ptr);
140uint64_t hi = DecodeFixed32(ptr + 4);
141return (hi << 32) | lo;
142}
143
144// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L79
145uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
146const uint64_t m = 0xc6a4a7935bd1e995;
147const int r = 47;
148
149uint64_t h = seed ^ (n * m);
150
151while (n >= 8) {
152uint64_t k = DecodeFixed64(data);
153data += 8;
154n -= 8;
155
156k *= m;
157k ^= k >> r;
158k *= m;
159
160h ^= k;
161h *= m;
162}
163
164switch (n) {
165case 7:
166h ^= ByteAs64(data[6]) << 48;
167case 6:
168h ^= ByteAs64(data[5]) << 40;
169case 5:
170h ^= ByteAs64(data[4]) << 32;
171case 4:
172h ^= ByteAs64(data[3]) << 24;
173case 3:
174h ^= ByteAs64(data[2]) << 16;
175case 2:
176h ^= ByteAs64(data[1]) << 8;
5320af1eWenbing Li3 years ago177default: // case 1: make some code analyzer be happier.
c891e5d7Wenbing Li5 years ago178h ^= ByteAs64(data[0]);
179h *= m;
180}
181
182h ^= h >> r;
183h *= m;
184h ^= h >> r;
185
186return h;
187}
188
189uint64_t Hash64Fast(const char* data, size_t n) {
190return static_cast<int64_t>(util::Fingerprint64(data, n));
191}
4eaa5ac4Wenbing Li4 years ago192
46d096f1Mojimi4 years ago193
aef5ef1eMojimi4 years ago194#endif // ENABLE_TF_STRING