microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
9eef22cb81d762f6c093a4740c992582267a783f

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/string_utils.cc

133lines · modecode

1#ifdef ENABLE_TF_STRING
2#include "farmhash.h"
3#endif
4
5#include "string_utils.h"
6
// Splits `str` at every character contained in `seps` and returns the pieces
// as views into `str`; no characters are copied.
//
// When `remove_empty_entries` is true, empty pieces found between separators
// (or before the first one) are dropped. An empty piece after the final
// separator -- including the case of an empty input -- is never emitted,
// whatever the value of `remove_empty_entries`.
//
// TODO: the original implementation carried an unspecified "bug fix" note
// here; the behaviour is kept exactly as it was.
std::vector<std::string_view> SplitString(const std::string_view& str, const std::string_view& seps, bool remove_empty_entries) {
  std::vector<std::string_view> tokens;
  std::string_view::size_type begin = 0;

  for (auto end = str.find_first_of(seps, begin);
       end != std::string_view::npos;
       end = str.find_first_of(seps, begin)) {
    const bool piece_is_empty = (end == begin);
    if (!(piece_is_empty && remove_empty_entries)) {
      tokens.push_back(str.substr(begin, end - begin));
    }
    // Resume scanning right after the separator that was just consumed.
    begin = end + 1;
  }

  // Whatever follows the last separator; empty means the input ended on a
  // separator (or was empty), in which case nothing is appended.
  const std::string_view tail = str.substr(begin);
  if (!tail.empty()) {
    tokens.push_back(tail);
  }

  return tokens;
}
35
// Returns true when the code point `c` lies in one of the inclusive ranges
// listed below.
// NOTE(review): the ranges look like the Unicode CJK ideograph blocks
// (unified, extensions and compatibility) -- confirm against the Unicode
// block chart before extending the table.
bool IsCJK(char32_t c) {
  struct Range {
    char32_t first;
    char32_t last;
  };
  static constexpr Range kRanges[] = {
      {0x4E00, 0x9FFF},
      {0x3400, 0x4DBF},
      {0x20000, 0x2A6DF},
      {0x2A700, 0x2B73F},
      {0x2B740, 0x2B81F},
      {0x2B820, 0x2CEAF},
      {0xF900, 0xFAFF},
      {0x2F800, 0x2FA1F},
  };

  for (const Range& range : kRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }
  return false;
}
46
// Returns true when `c` falls inside the inclusive range [0x300, 0x36F].
// Only part of the accent code points are recognised.
// [TODO] support more accent
bool IsAccent(char32_t c) {
  constexpr char32_t kFirstAccent = 0x300;
  constexpr char32_t kLastAccent = 0x36F;

  if (c < kFirstAccent) {
    return false;
  }
  return c <= kLastAccent;
}
53
// Maps an accented Latin letter in the range U+00C0..U+00FF to its unaccented
// ASCII base letter; every other code point is returned unchanged.
//
// Code points in that range without an ASCII base letter in the table
// (U+00C6, U+00D0, U+00D7, U+00D8, U+00DE, U+00DF and their lowercase
// counterparts U+00E6, U+00F0, U+00F7, U+00F8, U+00FE) map to themselves.
//
// The table is a char32_t array written with \u escapes so that it holds
// exactly one element per code point regardless of the source-file encoding.
// A narrow (char) literal with raw non-ASCII characters stores each of them as
// several bytes, which misaligns the index and yields garbage for most inputs.
char32_t StripAccent(char32_t c) {
  constexpr char32_t kFirst = 0xC0;  // 192
  constexpr char32_t kLast = 0xFF;   // 255

  // One entry per code point, kFirst through kLast, in order.
  static constexpr char32_t kTable[] =
      // U+00C0 .. U+00DF
      U"AAAAAA\u00C6CEEEEIIII\u00D0NOOOOO\u00D7\u00D8UUUUY\u00DE\u00DF"
      // U+00E0 .. U+00FF
      U"aaaaaa\u00E6ceeeeiiii\u00F0nooooo\u00F7\u00F8uuuuy\u00FEy";
  static_assert(sizeof(kTable) / sizeof(kTable[0]) == (kLast - kFirst + 1) + 1,
                "table must hold exactly one entry per code point plus the terminator");

  if (c < kFirst || c > kLast) {
    return c;
  }

  return kTable[c - kFirst];
}
64
65#ifdef ENABLE_TF_STRING
66// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L28
// Widens a single char to uint64_t keeping only its low eight bits, so a
// negative (signed) char never sign-extends into the upper bits.
static inline uint64_t ByteAs64(char c) {
  const unsigned char raw_byte = static_cast<unsigned char>(c);
  return static_cast<uint64_t>(raw_byte);
}
68
69// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L41
// Reads the four bytes at `ptr` as a little-endian unsigned 32-bit value
// (ptr[0] is the least significant byte) and returns it widened to uint64_t.
// `ptr` must point at no fewer than four readable bytes.
uint64_t DecodeFixed32(const char* ptr) {
  uint64_t value = 0;
  // Walk from the most significant byte down so each step shifts the
  // accumulated value up by one byte before adding the next.
  for (int index = 3; index >= 0; --index) {
    const unsigned char byte = static_cast<unsigned char>(ptr[index]);
    value = (value << 8) | static_cast<uint64_t>(byte);
  }
  return value;
}
76
77// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/raw_coding.h#L55
78static uint64_t DecodeFixed64(const char* ptr) {
79 uint64_t lo = DecodeFixed32(ptr);
80 uint64_t hi = DecodeFixed32(ptr + 4);
81 return (hi << 32) | lo;
82}
83
84// Source: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hash.cc#L79
85uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
86 const uint64_t m = 0xc6a4a7935bd1e995;
87 const int r = 47;
88
89 uint64_t h = seed ^ (n * m);
90
91 while (n >= 8) {
92 uint64_t k = DecodeFixed64(data);
93 data += 8;
94 n -= 8;
95
96 k *= m;
97 k ^= k >> r;
98 k *= m;
99
100 h ^= k;
101 h *= m;
102 }
103
104 switch (n) {
105 case 7:
106 h ^= ByteAs64(data[6]) << 48;
107 case 6:
108 h ^= ByteAs64(data[5]) << 40;
109 case 5:
110 h ^= ByteAs64(data[4]) << 32;
111 case 4:
112 h ^= ByteAs64(data[3]) << 24;
113 case 3:
114 h ^= ByteAs64(data[2]) << 16;
115 case 2:
116 h ^= ByteAs64(data[1]) << 8;
117 case 1:
118 h ^= ByteAs64(data[0]);
119 h *= m;
120 }
121
122 h ^= h >> r;
123 h *= m;
124 h ^= h >> r;
125
126 return h;
127}
128
129uint64_t Hash64Fast(const char* data, size_t n) {
130 return static_cast<int64_t>(util::Fingerprint64(data, n));
131}
132
133#endif // ENABLE_TF_STRING
134