microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions Available

CodeCommitsIssuesPull requestsActionsInsightsSecurity
a761fc59b7ace6392d8782fbb2a74888353ecf65

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

base/ustring.h

127lines · modecode

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3#pragma once
4
#include <cstddef>
#include <string>
#include <string_view>
#include <vector>
7
// ustring needs its own implementation because std::codecvt is deprecated.
// It wraps std::u32string so that a different implementation can be swapped in later.
// A UTF-32 string (one char32_t per code point) with explicit conversions
// from and to UTF-8 encoded std::string / std::string_view / const char*.
//
// All converting constructors are explicit, so UTF-8 <-> UTF-32 transcoding
// never happens silently.
class ustring : public std::u32string {
 public:
  ustring() = default;

  // Decodes the NUL-terminated UTF-8 string `str`.
  explicit ustring(const char* str) : std::u32string(FromUTF8(str)) {}

  // Decodes the UTF-8 bytes held in `str` (embedded NULs are preserved).
  explicit ustring(const std::string& str) : std::u32string(FromUTF8(str)) {}

  // Decodes the UTF-8 bytes viewed by `str` (embedded NULs are preserved).
  explicit ustring(const std::string_view& str) : std::u32string(FromUTF8(str)) {}

  // Copies the NUL-terminated UTF-32 string `str` as-is.
  explicit ustring(const char32_t* str) : std::u32string(str) {}

  // Copies the UTF-32 code points viewed by `str` as-is.
  explicit ustring(const std::u32string_view& str) : std::u32string(str) {}

  // Encodes this string as UTF-8.
  explicit operator std::string() const { return ToUTF8(*this); }

  // Writes the UTF-8 encoding of the code point `utf8_char` into `buffer`
  // and returns the number of bytes written (1 to 4).
  //
  // `buffer` must have room for at least 4 bytes; no terminator is written.
  // The code point is not validated: surrogates and values above U+10FFFF
  // are encoded mechanically rather than rejected.
  static size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
    if (utf8_char <= 0x7F) {
      *buffer = static_cast<char>(utf8_char);
      return 1;
    } else if (utf8_char <= 0x7FF) {
      buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[0] = static_cast<char>(0xC0 | utf8_char);
      return 2;
    } else if (utf8_char <= 0xFFFF) {
      buffer[2] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[0] = static_cast<char>(0xE0 | utf8_char);
      return 3;
    } else {
      buffer[3] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[2] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[0] = static_cast<char>(0xF0 | utf8_char);
      return 4;
    }
  }

  // Returns the UTF-8 encoding of the code point `utf8_char` as a string of
  // 1 to 4 bytes.
  static std::string EncodeUTF8Char(char32_t utf8_char) {
    char utf8_buf[4];
    const size_t clen = EncodeUTF8Char(utf8_buf, utf8_char);
    // Construct from an explicit length: U+0000 encodes to a single 0x00 byte,
    // which a C-string constructor would mistake for an empty string.
    return std::string(utf8_buf, clen);
  }

  // Returns true when `data` is structurally well-formed UTF-8: every lead
  // byte announces 0-3 continuation bytes and exactly that many bytes of the
  // form 10xxxxxx follow it.
  //
  // This is a structural check only; overlong encodings, surrogates and code
  // points above U+10FFFF are not rejected.
  static bool ValidateUTF8(const std::string& data) {
    int pending = 0;  // continuation bytes still expected for the current sequence
    for (const char ch : data) {
      // Work on the unsigned byte value; a plain (possibly signed) char would
      // sign-extend bytes >= 0x80 and break the shifted comparisons below.
      const unsigned int x = static_cast<unsigned char>(ch);
      if (pending == 0) {
        if ((x >> 5) == 0b110) {
          pending = 1;
        } else if ((x >> 4) == 0b1110) {
          pending = 2;
        } else if ((x >> 3) == 0b11110) {
          pending = 3;
        } else if ((x >> 7) != 0) {
          return false;  // stray continuation byte or invalid lead byte
        }
      } else {
        if ((x >> 6) != 0b10) return false;
        --pending;
      }
    }
    return pending == 0;  // a trailing, unfinished sequence is invalid
  }

 private:
  using u32string = std::u32string;

  // Decodes UTF-8 bytes into UTF-32 code points.
  //
  // Well-formed input is decoded exactly. A byte that cannot start a sequence
  // (a stray continuation byte or 0xF8-0xFF), or a sequence cut short by the
  // end of the input, yields U+FFFD (REPLACEMENT CHARACTER) and decoding
  // resumes at the next byte. Continuation bytes are not checked for the
  // 10xxxxxx form; call ValidateUTF8 first when strict validation is needed.
  static u32string FromUTF8(const std::string_view& utf8) {
    constexpr char32_t kReplacementChar = 0xFFFD;

    u32string ucs32;
    ucs32.reserve(utf8.length() / 2);  // a rough estimation for less memory allocation.

    const size_t size = utf8.size();
    for (size_t i = 0; i < size;) {
      const unsigned char lead = static_cast<unsigned char>(utf8[i]);

      // Classify the lead byte: total sequence length and its payload bits.
      size_t seq_len = 0;
      char32_t codepoint = 0;
      if (lead < 0x80) {
        seq_len = 1;
        codepoint = lead;
      } else if ((lead & 0xE0) == 0xC0) {
        seq_len = 2;
        codepoint = lead & 0x1F;
      } else if ((lead & 0xF0) == 0xE0) {
        seq_len = 3;
        codepoint = lead & 0x0F;
      } else if ((lead & 0xF8) == 0xF0) {
        seq_len = 4;
        codepoint = lead & 0x07;
      }

      // Invalid lead byte, or the sequence would run past the end of input:
      // never read out of bounds, substitute and resynchronize instead.
      if (seq_len == 0 || seq_len > size - i) {
        ucs32.push_back(kReplacementChar);
        ++i;
        continue;
      }

      // Fold in 6 payload bits from each continuation byte.
      for (size_t k = 1; k < seq_len; ++k) {
        codepoint = (codepoint << 6) | (static_cast<unsigned char>(utf8[i + k]) & 0x3F);
      }
      ucs32.push_back(codepoint);
      i += seq_len;
    }
    return ucs32;
  }

  // Encodes UTF-32 code points as UTF-8, preserving embedded U+0000.
  static std::string ToUTF8(const u32string& ucs32) {
    std::string utf8;
    utf8.reserve(ucs32.length() * 4);  // worst case: 4 bytes per code point
    char buf[4];
    for (const char32_t codepoint : ucs32) {
      const size_t len = EncodeUTF8Char(buf, codepoint);
      utf8.append(buf, len);
    }
    return utf8;
  }
};
118
119namespace std {
120template <>
121struct hash<ustring> {
122 size_t operator()(const ustring& __str) const noexcept {
123 hash<u32string> standard_hash;
124 return standard_hash(static_cast<u32string>(__str));
125 }
126};
127} // namespace std
128