microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions · Available

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

rel-0.7

Find a branch or tag

Branches

rel-0.7

Clone

HTTPS

Download ZIP

onnxruntime-extensions/operators/tokenizer

operators/tokenizer/bpetokenizer.hpp

456lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`// Licensed under the MIT License.`
2	`// Partial code comes from other Microsoft employee.`
3	`#pragma once`
4	`#include "ocos.h"`
5	`#include "ustring.h"`
6
7	`#include <regex>`
8	`#include <list>`
9	`#include <unordered_map>`
10
11	`#include "unicode.h"`
12	`#include "nlohmann/json.hpp"`
13	`#include "string_utils.h"`
14	`#include "string_tensor.h"`
15
16	`// Note: the following logic comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)`
// Returns true when `ch` is one of the code points listed below as whitespace.
// The accepted set is: U+0009..U+000D, U+001C..U+0020, U+0085, U+00A0, U+1680,
// U+2000..U+200A, U+2028, U+2029, U+202F, U+205F and U+3000.
inline bool IsUnicodeSpace(char32_t ch) {
  // Contiguous runs are tested as ranges; everything else is an isolated code point.
  if (ch >= 0x0009 && ch <= 0x000D) return true;  // TAB, LF, VT, FF, CR
  if (ch >= 0x001C && ch <= 0x0020) return true;  // FS, GS, RS, US, SPACE
  if (ch >= 0x2000 && ch <= 0x200A) return true;  // U+2000 .. U+200A

  return ch == 0x0085 || ch == 0x00A0 || ch == 0x1680 ||
         ch == 0x2028 || ch == 0x2029 || ch == 0x202F ||
         ch == 0x205F || ch == 0x3000;
}
52
53	`inline bool IsEmptyUString(const ustring& str) {`
54	`return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });`
55	`}`
56
57	`class SpecialTokenMap {`
58	`public:`
59	`void Add(ustring p_str, int p_id) {`
60	`auto it = token_map_.find(p_str);`
61	`if (it != token_map_.end()) {`
62	`if (it->second != p_id) {`
63	`ORTX_CXX_API_THROW("Duplicate special tokens.", ORT_INVALID_ARGUMENT);`
64	`}`
65	`} else {`
66	`token_map_[p_str] = p_id;`
67	`token_list_.push_back(SpecialTokenInfo(std::move(p_str), p_id));`
68	`}`
69	`}`
70
71	`std::list<std::pair<ustring, int>> SplitBySpecialTokens(ustring input) const {`
72	`std::list<std::pair<ustring, int>> res;`
73	`res.emplace_back(std::move(input), -1);`
74	`for (const auto& st : token_list_) {`
75	`std::list<std::pair<ustring, int>> new_split_res;`
76	`for (auto& str : res) {`
77	`if (str.second != -1) {`
78	`new_split_res.push_back(std::move(str));`
79	`continue;`
80	`}`
81	`auto it = str.first.begin();`
82	`size_t search_pos = 0;`
83	`while (it != str.first.end()) {`
84	`// works fine for all clang-based platform: Mac OS, Android, WebAssembly`
85	`#if defined(__clang__)`
86	`auto search_it = std::search(it, str.first.end(), st.str.begin(), st.str.end());`
87	`#else`
88	`auto search_it = std::search(it, str.first.end(),`
89	`std::boyer_moore_searcher(st.str.begin(), st.str.end()));`
90	`#endif`
91	`if (search_it == str.first.end()) {`
92	`new_split_res.emplace_back(str.first.substr(search_pos), -1);`
93	`break;`
94	`}`
95	`auto prefixLen = search_it - it;`
96	`if (prefixLen != 0) {`
97	`new_split_res.emplace_back(str.first.substr(search_pos, prefixLen), -1);`
98	`search_pos += prefixLen;`
99	`}`
100	`new_split_res.emplace_back(str.first.substr(search_pos, st.str.size()), st.id);`
101	`it = search_it + st.str.size();`
102	`search_pos += st.str.size();`
103	`}`
104	`}`
105	`std::swap(new_split_res, res);`
106	`}`
107	`return res;`
108	`}`
109
110	`private:`
111	`struct SpecialTokenInfo {`
112	`ustring str;`
113	`int id;`
114
115	`SpecialTokenInfo(ustring p_str, int p_id)`
116	`: str(std::move(p_str)), id(p_id) {`
117	`if (str.empty()) {`
118	`ORTX_CXX_API_THROW("Empty special token.", ORT_INVALID_ARGUMENT);`
119	`}`
120	`}`
121	`};`
122
123	`std::list<SpecialTokenInfo> token_list_;`
124	`std::unordered_map<ustring, int> token_map_;`
125	`};`
126
127	`using json = nlohmann::json;`
128	`class VocabData {`
129	`public:`
130	`VocabData()`
131	`: unk_id_(-1) {`
132	`}`
133
134	`struct BpeNode {`
135	`int id;`
136	`int value;`
137	`};`
138
139	`void Load(std::istream& vocab_stream, std::istream& merges_stream, const char* unk_token, const char* special_tokens) {`
140	`json tok_json;`
141	`vocab_stream >> tok_json;`
142	`vocab_map_ = std::move(tok_json.get<std::unordered_map<std::string, int>>());`
143
144	`auto it = vocab_map_.find(unk_token);`
145	`if (it != vocab_map_.end()) {`
146	`unk_id_ = it->second;`
147	`} else {`
148	`int id = static_cast<int>(vocab_map_.size());`
149	`vocab_map_[unk_token] = id;`
150	`}`
151
152	`std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;`
153	`for (auto i = 33; i <= 126; ++i) {`
154	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
155	`}`
156	`for (auto i = 161; i <= 172; ++i) {`
157	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
158	`}`
159	`for (auto i = 174; i <= 255; ++i) {`
160	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
161	`}`
162
163	`int index = 256;`
164	`for (auto i = 0; i < 33; ++i) {`
165	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
166	`}`
167	`for (auto i = 127; i < 161; ++i) {`
168	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
169	`}`
170	`byte_encoder_[173] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
171
172	`index = 0;`
173	`std::string line;`
174	`while (std::getline(merges_stream, line)) {`
175	`line = std::regex_replace(line, std::regex("\r"), "");`
176	`if (line.empty()) continue;`
177	`if ((line[0] == '#') && (index == 0)) continue;`
178	`auto pos = line.find(' ');`
179	`if (pos == std::string::npos) {`
180	`ORTX_CXX_API_THROW("Cannot know how to parse line: " + line, ORT_INVALID_ARGUMENT);`
181	`}`
182	`std::string w1 = line.substr(0, pos);`
183	`std::string w2 = line.substr(pos + 1);`
184	`int iw1 = GetVocabIndex(w1);`
185	`int iw2 = GetVocabIndex(w2);`
186	`int iww = GetVocabIndex(w1 + w2);`
187	`std::pair<int, int> key{iw1, iw2};`
188	`BpeNode value{iww, index++};`
189	`bpe_map_[key] = value;`
190	`}`
191
192	`if (special_tokens != nullptr) {`
193	`std::istringstream istrea(special_tokens);`
194
195	`while (istrea >> line) {`
196	`if (line.empty()) continue;`
197	`line = std::regex_replace(line, std::regex("\r"), "");`
198	`ustring line_32(line);`
199	`int id = static_cast<int>(vocab_map_.size());`
200	`if (auto nestedIt = vocab_map_.find(line); nestedIt != vocab_map_.end()) {`
201	`id = nestedIt->second;`
202	`} else {`
203	`vocab_map_[line] = id;`
204	`}`
205	`special_tokens_.Add(std::move(line_32), id);`
206	`}`
207	`}`
208
209	`id2token_map_.resize(vocab_map_.size());`
210	`for (const auto& [t, i] : vocab_map_) {`
211	`id2token_map_[i] = t;`
212	`}`
213	`}`
214
215	`void bpe(std::list<int>& vals) const {`
216	`while (vals.size() >= 2) {`
217	`auto pos_it = vals.end();`
218	`int minval = std::numeric_limits<int>::max();`
219	`int ori_id1 = 0, ori_id2 = 0;`
220	`int aim_id = 0;`
221	`for (auto it = vals.begin(); it != vals.end(); ++it) {`
222	`auto it2 = it;`
223	`++it2;`
224	`if (it2 == vals.end()) break;`
225	`auto map_it = bpe_map_.find({it, it2});`
226	`if (map_it == bpe_map_.end()) continue;`
227	`if (minval > map_it->second.value) {`
228	`ori_id1 = *it;`
229	`ori_id2 = *it2;`
230	`minval = map_it->second.value;`
231	`pos_it = it;`
232	`aim_id = map_it->second.id;`
233	`}`
234	`}`
235	`if (pos_it == vals.end()) break;`
236
237	`pos_it = vals.erase(pos_it);`
238	`*pos_it = aim_id;`
239	`for (++pos_it; pos_it != vals.end(); ++pos_it) {`
240	`if (*pos_it != ori_id1) continue;`
241	`auto it2 = pos_it;`
242	`++it2;`
243	`if (it2 == vals.end()) break;`
244	`if (*it2 != ori_id2) continue;`
245	`pos_it = vals.erase(pos_it);`
246	`*pos_it = aim_id;`
247	`}`
248	`}`
249	`}`
250
251	`const auto& ByteEncoder() const {`
252	`return byte_encoder_;`
253	`}`
254
255	`auto SplitBySpecialTokens(const ustring& input) const {`
256	`return special_tokens_.SplitBySpecialTokens(input);`
257	`}`
258
259	`int GetEncoding(const std::string& key) {`
260	`auto it = vocab_map_.find(key);`
261	`return it->second;`
262	`}`
263
264	`size_t VocabSize() const { return vocab_map_.size(); }`
265
266	`int TokenToID(const std::string& input) const {`
267	`auto it = vocab_map_.find(input);`
268	`if (it == vocab_map_.end()) {`
269	`ORTX_CXX_API_THROW("Token not found: " + input, ORT_INVALID_ARGUMENT);`
270	`}`
271	`return it->second;`
272	`}`
273
274	`const std::string& IdToToken(int id) const {`
275	`if ((id < 0) \|\| (static_cast<size_t>(id) >= id2token_map_.size())) {`
276	`ORTX_CXX_API_THROW("Invalid ID: " + std::to_string(id), ORT_INVALID_ARGUMENT);`
277	`}`
278	`return id2token_map_[id];`
279	`}`
280
281	`private:`
282	`int GetVocabIndex(const std::string& str) {`
283	`auto it = vocab_map_.find(str);`
284	`if (it == vocab_map_.end()) {`
285	`ORTX_CXX_API_THROW("Cannot find word in vocabulary: " + str, ORT_INVALID_ARGUMENT);`
286	`}`
287	`return it->second;`
288	`}`
289
290	`private:`
291	`struct hash_pair {`
292	`template <class T1, class T2>`
293	`size_t operator()(const std::pair<T1, T2>& p) const {`
294	`auto hash1 = std::hash<T1>{}(p.first);`
295	`auto hash2 = std::hash<T2>{}(p.second);`
296	`return hash1 ^ (hash2 << 16);`
297	`}`
298	`};`
299	`std::unordered_map<std::pair<int, int>, BpeNode, hash_pair> bpe_map_;`
300
301	`int byte_encoder_[256] = {};`
302	`std::unordered_map<std::string, int> vocab_map_;`
303	`std::vector<std::string> id2token_map_;`
304
305	`int unk_id_;`
306	`SpecialTokenMap special_tokens_;`
307	`};`
308
309	`class TokenWithRegularExp {`
310	`public:`
311	`void Set(std::u32string_view val) {`
312	`m_text = val;`
313	`}`
314
315	`std::pair<bool, std::u32string_view> GetNextToken() {`
316	`while (!m_text.empty()) {`
317	`auto res = TryMatch();`
318	`if (res.empty()) {`
319	`m_text = m_text.substr(1);`
320	`continue;`
321	`}`
322	`return {true, res};`
323	`}`
324	`return {false, {}};`
325	`}`
326
327	`private:`
328	`std::u32string_view TryMatch() {`
329	`// python pattern:`
330	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\| ?\p{L}+\| ?\p{N}+\| ?[^\s\p{L}\p{N}]+\|\s+(?!\S)\|\s+`
331
332	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\|`
333	`// Note: the sequencial of the following if should not be switched, which follows the python regex's syntax`
334	`if ((m_text[0] == U'\'') && (m_text.size() > 1)) {`
335	`if ((m_text[1] == U's') \|\| (m_text[1] == U't') \|\|`
336	`(m_text[1] == U'm') \|\| (m_text[1] == U'd')) {`
337	`std::u32string_view res = m_text.substr(0, 2);`
338	`m_text = m_text.substr(2);`
339	`return res;`
340	`}`
341
342	`if (m_text.size() > 2) {`
343	`if (((m_text[1] == U'r') && (m_text[2] == U'e')) \|\|`
344	`((m_text[1] == U'v') && (m_text[2] == U'e')) \|\|`
345	`((m_text[1] == U'l') && (m_text[2] == U'l'))) {`
346	`std::u32string_view res = m_text.substr(0, 3);`
347	`m_text = m_text.substr(3);`
348	`return res;`
349	`}`
350	`}`
351	`}`
352
353	`// ?\p{L}+`
354	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::L)) {`
355	`size_t i = 2;`
356	`for (; i < m_text.size(); ++i) {`
357	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
358	`break;`
359	`}`
360	`std::u32string_view res = m_text.substr(0, i);`
361	`m_text = m_text.substr(i);`
362	`return res;`
363	`}`
364	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::L) {`
365	`size_t i = 1;`
366	`for (; i < m_text.size(); ++i) {`
367	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
368	`break;`
369	`}`
370	`std::u32string_view res = m_text.substr(0, i);`
371	`m_text = m_text.substr(i);`
372	`return res;`
373	`}`
374
375	`// ?\p{N}+`
376	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::N)) {`
377	`size_t i = 2;`
378	`for (; i < m_text.size(); ++i) {`
379	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
380	`break;`
381	`}`
382	`std::u32string_view res = m_text.substr(0, i);`
383	`m_text = m_text.substr(i);`
384	`return res;`
385	`}`
386	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::N) {`
387	`size_t i = 1;`
388	`for (; i < m_text.size(); ++i) {`
389	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
390	`break;`
391	`}`
392	`std::u32string_view res = m_text.substr(0, i);`
393	`m_text = m_text.substr(i);`
394	`return res;`
395	`}`
396
397	`// ?[^\s\p{L}\p{N}]+`
398	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (NotLNZ(m_text[1]))) {`
399	`size_t i = 2;`
400	`for (; i < m_text.size(); ++i) {`
401	`if (!NotLNZ(m_text[i]))`
402	`break;`
403	`}`
404	`std::u32string_view res = m_text.substr(0, i);`
405	`m_text = m_text.substr(i);`
406	`return res;`
407	`}`
408	`if (NotLNZ(m_text[0])) {`
409	`size_t i = 1;`
410	`for (; i < m_text.size(); ++i) {`
411	`if (!NotLNZ(m_text[i]))`
412	`break;`
413	`}`
414	`std::u32string_view res = m_text.substr(0, i);`
415	`m_text = m_text.substr(i);`
416	`return res;`
417	`}`
418
419	`// \s+(?!\S)\|\s+`
420	`if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {`
421	`size_t i = 1;`
422	`for (; i < m_text.size(); ++i) {`
423	`if (!IsZ(m_text[i])) break;`
424	`}`
425	`if ((i > 1) && (i != m_text.size())) //\s+(?!\S)`
426	`{`
427	`i--;`
428	`std::u32string_view res = m_text.substr(0, i);`
429	`m_text = m_text.substr(i);`
430	`return res;`
431	`}`
432	`// \s+`
433	`std::u32string_view res = m_text.substr(0, i);`
434	`m_text = m_text.substr(i);`
435	`return res;`
436	`}`
437
438	`return std::u32string_view{};`
439	`}`
440
441	`static bool IsZ(char32_t ch) {`
442	`auto category = ufal::unilib::unicode::category(ch);`
443	`return (category & ufal::unilib::unicode::Z) != 0;`
444	`}`
445
446	`static bool NotLNZ(char32_t ch) {`
447	`auto category = ufal::unilib::unicode::category(ch);`
448	`if (category & ufal::unilib::unicode::L) return false;`
449	`if (category & ufal::unilib::unicode::N) return false;`
450	`if (category & ufal::unilib::unicode::Z) return false;`
451	`return true;`
452	`}`
453
454	`private:`
455	`std::u32string_view m_text;`
456	`};`
457

microsoft/onnxruntime-extensions

Branches

Tags

Clone