microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
68b9d1dc47663a9017c55d136c804417c8efec7d

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

base/ustring.h

128lines · modecode

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3#pragma once
4
5#include "ocos.h"
6#include <vector>
7#include <string_view>
8
9// ustring needs a new implementation, due to the std::codecvt deprecation.
10// Wrap u32string with ustring, in case we will use other implementation in the future
// UTF-32 string with explicit conversions from and to UTF-8.
//
// The class derives publicly from std::u32string, so all u32string operations
// are available on it. Conversions are explicit in both directions:
//   - construct from const char* / std::string / std::string_view (UTF-8 in),
//   - static_cast<std::string>(ustr) (UTF-8 out).
class ustring : public std::u32string {
 public:
  ustring() = default;

  // Decodes a NUL-terminated UTF-8 C string. `str` must not be null.
  explicit ustring(const char* str) { assign(FromUTF8(str)); }

  // Decodes a UTF-8 encoded std::string (embedded NUL bytes are preserved).
  explicit ustring(const std::string& str) { assign(FromUTF8(str)); }

  // Decodes a UTF-8 encoded string view.
  explicit ustring(const std::string_view& str) { assign(FromUTF8(str)); }

  // Copies an already decoded, NUL-terminated UTF-32 string.
  explicit ustring(const char32_t* str) : std::u32string(str) {}

  // Copies an already decoded UTF-32 string view.
  explicit ustring(const std::u32string_view& str) : std::u32string(str) {}

  // Encodes the whole string as UTF-8.
  explicit operator std::string() const { return ToUTF8(*this); }

  // Writes the UTF-8 encoding of `utf8_char` into `buffer` and returns the
  // number of bytes written (1 to 4). No terminator is written.
  //
  // `buffer` must have room for at least 4 bytes. The code point is not
  // validated: surrogates and values above 0x10FFFF are encoded as-is.
  static size_t EncodeUTF8Char(char* buffer, char32_t utf8_char) {
    if (utf8_char <= 0x7F) {
      *buffer = static_cast<char>(utf8_char);
      return 1;
    } else if (utf8_char <= 0x7FF) {
      // Fill from the last byte backwards, consuming 6 payload bits per byte.
      buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[0] = static_cast<char>(0xC0 | utf8_char);
      return 2;
    } else if (utf8_char <= 0xFFFF) {
      buffer[2] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[0] = static_cast<char>(0xE0 | utf8_char);
      return 3;
    } else {
      buffer[3] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[2] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[1] = static_cast<char>(0x80 | (utf8_char & 0x3F));
      utf8_char >>= 6;
      buffer[0] = static_cast<char>(0xF0 | utf8_char);
      return 4;
    }
  }

  // Returns the UTF-8 encoding of a single code point as a std::string.
  static std::string EncodeUTF8Char(char32_t utf8_char) {
    char utf8_buf[4];
    const size_t clen = EncodeUTF8Char(utf8_buf, utf8_char);
    // Construct from (pointer, length): relying on a NUL terminator would turn
    // U+0000 into an empty string instead of a single '\0' byte.
    return std::string(utf8_buf, clen);
  }

  // Returns true when `data` is structurally valid UTF-8: every lead byte is
  // followed by the right number of continuation bytes (10xxxxxx) and the
  // input does not end in the middle of a sequence.
  //
  // Only the byte structure is checked; overlong encodings, surrogates and
  // code points above 0x10FFFF are not rejected.
  static bool ValidateUTF8(const std::string& data) {
    int pending = 0;  // continuation bytes still expected for the current sequence
    for (const char ch : data) {
      // Inspect the byte as unsigned: where plain char is signed, bytes
      // >= 0x80 are negative and the right shifts below would never match.
      const unsigned int x = static_cast<unsigned char>(ch);
      if (pending == 0) {
        if ((x >> 5) == 0b110) {
          pending = 1;
        } else if ((x >> 4) == 0b1110) {
          pending = 2;
        } else if ((x >> 3) == 0b11110) {
          pending = 3;
        } else if ((x >> 7) != 0) {
          return false;  // stray continuation byte or invalid lead byte
        }
      } else {
        if ((x >> 6) != 0b10) return false;
        --pending;
      }
    }
    return pending == 0;
  }

 private:
  using u32string = std::u32string;

  // Decodes UTF-8 into UTF-32.
  //
  // The sequence length is taken from the lead byte; continuation bytes are
  // not validated (use ValidateUTF8 first when the input is untrusted). If the
  // input ends in the middle of a sequence, U+FFFD is appended in its place
  // and decoding stops, so the view is never read past its end.
  static u32string FromUTF8(const std::string_view& utf8) {
    constexpr char32_t kReplacementChar = 0xFFFD;
    const size_t size = utf8.size();

    u32string ucs32;
    ucs32.reserve(utf8.length() / 2);  // a rough estimation for less memory allocation.
    for (size_t i = 0; i < size;) {
      // Number of bytes the lead byte announces. Anything that is not a 1-,
      // 2- or 3-byte lead is treated as a 4-byte lead.
      size_t seq_len = 4;
      if ((utf8[i] & 0x80) == 0) {
        seq_len = 1;
      } else if ((utf8[i] & 0xE0) == 0xC0) {
        seq_len = 2;
      } else if ((utf8[i] & 0xF0) == 0xE0) {
        seq_len = 3;
      }

      if (size - i < seq_len) {
        // Truncated trailing sequence: do not index beyond the view.
        ucs32.push_back(kReplacementChar);
        break;
      }

      char32_t codepoint = 0;
      switch (seq_len) {
        case 1:
          codepoint = utf8[i];
          break;
        case 2:
          codepoint = ((utf8[i] & 0x1F) << 6) | (utf8[i + 1] & 0x3F);
          break;
        case 3:
          codepoint = ((utf8[i] & 0x0F) << 12) | ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F);
          break;
        default:
          codepoint = ((utf8[i] & 0x07) << 18) | ((utf8[i + 1] & 0x3F) << 12) |
                      ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F);
          break;
      }
      i += seq_len;
      ucs32.push_back(codepoint);
    }
    return ucs32;
  }

  // Encodes UTF-32 as UTF-8, one code point at a time.
  static std::string ToUTF8(const u32string& ucs32) {
    std::string utf8;
    utf8.reserve(ucs32.length() * 4);  // worst case: 4 bytes per code point
    char encoded[4];
    for (const char32_t codepoint : ucs32) {
      // Append by length so U+0000 is kept and no temporary string is built.
      const size_t len = EncodeUTF8Char(encoded, codepoint);
      utf8.append(encoded, len);
    }

    return utf8;
  }
};
119
120namespace std {
121template <>
122struct hash<ustring> {
123 size_t operator()(const ustring& __str) const noexcept {
124 hash<u32string> standard_hash;
125 return standard_hash(static_cast<u32string>(__str));
126 }
127};
128} // namespace std
129