microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions · Available

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

e0d48e255f28e5465f63e7fc141df1e1d533cc40

Find a branch or tag

Branches

e0d48e255f28e5465f63e7fc141df1e1d533cc40

Clone

HTTPS

Download ZIP

onnxruntime-extensions/operators/tokenizer

operators/tokenizer/bpetokenizer.hpp

426lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`// Licensed under the MIT License.`
2	`// Partial code comes from other Microsoft employee.`
3	`#pragma once`
4	`#include <string>`
5	`#include <vector>`
6	`#include <fstream>`
7	`#include <sstream>`
8	`#include <iostream>`
9	`#include <algorithm>`
10	`#include <list>`
11	`#include <memory>`
12	`#include <regex>`
13	`#include <sstream>`
14	`#include <stdexcept>`
15	`#include <unordered_map>`
16	`#include <functional>`
17	`#include <codecvt>`
18	`#include <mutex>`
19
20	`#include "nlohmann/json.hpp"`
21	`#include "clip_tokenizer.hpp"`
22	`#include "gpt2_tokenizer.hpp"`
23	`#include "string_tensor.h"`
24	`#include "unicode.h"`
25
26	`class SpecialTokenMap {`
27	`public:`
28	`void Add(ustring p_str, int p_id) {`
29	`auto it = token_map_.find(p_str);`
30	`if (it != token_map_.end()) {`
31	`if (it->second != p_id) {`
32	`ORTX_CXX_API_THROW("Duplicate special tokens.", ORT_INVALID_ARGUMENT);`
33	`}`
34	`} else {`
35	`token_map_[p_str] = p_id;`
36	`token_list_.push_back(SpecialTokenInfo(std::move(p_str), p_id));`
37	`}`
38	`}`
39
40	`std::list<std::pair<ustring, int>> SplitBySpecialTokens(ustring input) const {`
41	`std::list<std::pair<ustring, int>> res;`
42	`res.emplace_back(std::move(input), -1);`
43	`for (const auto& st : token_list_) {`
44	`std::list<std::pair<ustring, int>> new_split_res;`
45	`for (auto& str : res) {`
46	`if (str.second != -1) {`
47	`new_split_res.push_back(std::move(str));`
48	`continue;`
49	`}`
50	`auto it = str.first.begin();`
51	`size_t search_pos = 0;`
52	`while (it != str.first.end()) {`
53	`// works fine for all clang-based platform: Mac OS, Android, WebAssembly`
54	`#if defined(__clang__)`
55	`auto search_it = std::search(it, str.first.end(), st.str.begin(), st.str.end());`
56	`#else`
57	`auto search_it = std::search(it, str.first.end(),`
58	`std::boyer_moore_searcher(st.str.begin(), st.str.end()));`
59	`#endif`
60	`if (search_it == str.first.end()) {`
61	`new_split_res.emplace_back(str.first.substr(search_pos), -1);`
62	`break;`
63	`}`
64	`auto prefixLen = search_it - it;`
65	`if (prefixLen != 0) {`
66	`new_split_res.emplace_back(str.first.substr(search_pos, prefixLen), -1);`
67	`search_pos += prefixLen;`
68	`}`
69	`new_split_res.emplace_back(str.first.substr(search_pos, st.str.size()), st.id);`
70	`it = search_it + st.str.size();`
71	`search_pos += st.str.size();`
72	`}`
73	`}`
74	`std::swap(new_split_res, res);`
75	`}`
76	`return res;`
77	`}`
78
79	`private:`
80	`struct SpecialTokenInfo {`
81	`ustring str;`
82	`int id;`
83
84	`SpecialTokenInfo(ustring p_str, int p_id)`
85	`: str(std::move(p_str)), id(p_id) {`
86	`if (str.empty()) {`
87	`ORTX_CXX_API_THROW("Empty special token.", ORT_INVALID_ARGUMENT);`
88	`}`
89	`}`
90	`};`
91
92	`std::list<SpecialTokenInfo> token_list_;`
93	`std::unordered_map<ustring, int> token_map_;`
94	`};`
95
96	`using json = nlohmann::json;`
97	`class VocabData {`
98	`public:`
99	`VocabData()`
100	`: unk_id_(-1) {`
101	`}`
102
103	`struct BpeNode {`
104	`int id;`
105	`int value;`
106	`};`
107
108	`void Load(std::istream& vocab_stream, std::istream& merges_stream, const char* unk_token, const char* special_tokens) {`
109	`json tok_json;`
110	`vocab_stream >> tok_json;`
111	`vocab_map_ = std::move(tok_json.get<std::unordered_map<std::string, int>>());`
112
113	`auto it = vocab_map_.find(unk_token);`
114	`if (it != vocab_map_.end()) {`
115	`unk_id_ = it->second;`
116	`} else {`
117	`int id = static_cast<int>(vocab_map_.size());`
118	`vocab_map_[unk_token] = id;`
119	`std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl;`
120	`}`
121
122	`std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;`
123	`for (auto i = 33; i <= 126; ++i) {`
124	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
125	`}`
126	`for (auto i = 161; i <= 172; ++i) {`
127	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
128	`}`
129	`for (auto i = 174; i <= 255; ++i) {`
130	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
131	`}`
132
133	`int index = 256;`
134	`for (auto i = 0; i < 33; ++i) {`
135	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
136	`}`
137	`for (auto i = 127; i < 161; ++i) {`
138	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
139	`}`
140	`byte_encoder_[173] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
141
142	`index = 0;`
143	`std::string line;`
144	`while (std::getline(merges_stream, line)) {`
145	`line = std::regex_replace(line, std::regex("\r"), "");`
146	`if (line.empty()) continue;`
147	`if ((line[0] == '#') && (index == 0)) continue;`
148	`auto pos = line.find(' ');`
149	`if (pos == std::string::npos) {`
150	`ORTX_CXX_API_THROW("Cannot know how to parse line: " + line, ORT_INVALID_ARGUMENT);`
151	`}`
152	`std::string w1 = line.substr(0, pos);`
153	`std::string w2 = line.substr(pos + 1);`
154	`int iw1 = GetVocabIndex(w1);`
155	`int iw2 = GetVocabIndex(w2);`
156	`int iww = GetVocabIndex(w1 + w2);`
157	`std::pair<int, int> key{iw1, iw2};`
158	`BpeNode value{iww, index++};`
159	`bpe_map_[key] = value;`
160	`}`
161
162	`if (special_tokens != nullptr) {`
163	`std::istringstream istrea(special_tokens);`
164
165	`while (istrea >> line) {`
166	`if (line.empty()) continue;`
167	`line = std::regex_replace(line, std::regex("\r"), "");`
168	`ustring line_32(line);`
169	`int id = static_cast<int>(vocab_map_.size());`
170	`if (auto nestedIt = vocab_map_.find(line); nestedIt != vocab_map_.end()) {`
171	`id = nestedIt->second;`
172	`} else {`
173	`vocab_map_[line] = id;`
174	`}`
175	`special_tokens_.Add(std::move(line_32), id);`
176	`}`
177	`}`
178
179	`id2token_map_.resize(vocab_map_.size());`
180	`for (const auto& [t, i] : vocab_map_) {`
181	`id2token_map_[i] = t;`
182	`}`
183	`}`
184
185	`void bpe(std::list<int>& vals) const {`
186	`while (vals.size() >= 2) {`
187	`auto pos_it = vals.end();`
188	`int minval = std::numeric_limits<int>::max();`
189	`int ori_id1 = 0, ori_id2 = 0;`
190	`int aim_id = 0;`
191	`for (auto it = vals.begin(); it != vals.end(); ++it) {`
192	`auto it2 = it;`
193	`++it2;`
194	`if (it2 == vals.end()) break;`
195	`auto map_it = bpe_map_.find({it, it2});`
196	`if (map_it == bpe_map_.end()) continue;`
197	`if (minval > map_it->second.value) {`
198	`ori_id1 = *it;`
199	`ori_id2 = *it2;`
200	`minval = map_it->second.value;`
201	`pos_it = it;`
202	`aim_id = map_it->second.id;`
203	`}`
204	`}`
205	`if (pos_it == vals.end()) break;`
206
207	`pos_it = vals.erase(pos_it);`
208	`*pos_it = aim_id;`
209	`for (++pos_it; pos_it != vals.end(); ++pos_it) {`
210	`if (*pos_it != ori_id1) continue;`
211	`auto it2 = pos_it;`
212	`++it2;`
213	`if (it2 == vals.end()) break;`
214	`if (*it2 != ori_id2) continue;`
215	`pos_it = vals.erase(pos_it);`
216	`*pos_it = aim_id;`
217	`}`
218	`}`
219	`}`
220
221	`const auto& ByteEncoder() const {`
222	`return byte_encoder_;`
223	`}`
224
225	`auto SplitBySpecialTokens(const ustring& input) const {`
226	`return special_tokens_.SplitBySpecialTokens(input);`
227	`}`
228
229	`int GetEncoding(const std::string& key) {`
230	`auto it = vocab_map_.find(key);`
231	`return it->second;`
232	`}`
233
234	`size_t VocabSize() const { return vocab_map_.size(); }`
235
236	`int TokenToID(const std::string& input) const {`
237	`auto it = vocab_map_.find(input);`
238	`if (it == vocab_map_.end()) {`
239	`ORTX_CXX_API_THROW("Token not found: " + input, ORT_INVALID_ARGUMENT);`
240	`}`
241	`return it->second;`
242	`}`
243
244	`const std::string& IdToToken(int id) const {`
245	`if ((id < 0) \|\| (static_cast<size_t>(id) >= id2token_map_.size())) {`
246	`ORTX_CXX_API_THROW("Invalid ID: " + std::to_string(id), ORT_INVALID_ARGUMENT);`
247	`}`
248	`return id2token_map_[id];`
249	`}`
250
251	`private:`
252	`int GetVocabIndex(const std::string& str) {`
253	`auto it = vocab_map_.find(str);`
254	`if (it == vocab_map_.end()) {`
255	`ORTX_CXX_API_THROW("Cannot find word in vocabulary: " + str, ORT_INVALID_ARGUMENT);`
256	`}`
257	`return it->second;`
258	`}`
259
260	`private:`
261	`struct hash_pair {`
262	`template <class T1, class T2>`
263	`size_t operator()(const std::pair<T1, T2>& p) const {`
264	`auto hash1 = std::hash<T1>{}(p.first);`
265	`auto hash2 = std::hash<T2>{}(p.second);`
266	`return hash1 ^ (hash2 << 16);`
267	`}`
268	`};`
269	`std::unordered_map<std::pair<int, int>, BpeNode, hash_pair> bpe_map_;`
270
271	`int byte_encoder_[256] = {};`
272	`std::unordered_map<std::string, int> vocab_map_;`
273	`std::vector<std::string> id2token_map_;`
274
275	`int unk_id_;`
276	`SpecialTokenMap special_tokens_;`
277	`};`
278
279	`class TokenWithRegularExp {`
280	`public:`
281	`void Set(std::u32string_view val) {`
282	`m_text = val;`
283	`}`
284
285	`std::pair<bool, std::u32string_view> GetNextToken() {`
286	`while (!m_text.empty()) {`
287	`auto res = TryMatch();`
288	`if (res.empty()) {`
289	`m_text = m_text.substr(1);`
290	`continue;`
291	`}`
292	`return {true, res};`
293	`}`
294	`return {false, {}};`
295	`}`
296
297	`private:`
298	`std::u32string_view TryMatch() {`
299	`// python pattern:`
300	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\| ?\p{L}+\| ?\p{N}+\| ?[^\s\p{L}\p{N}]+\|\s+(?!\S)\|\s+`
301
302	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\|`
303	`// Note: the sequencial of the following if should not be switched, which follows the python regex's syntax`
304	`if ((m_text[0] == U'\'') && (m_text.size() > 1)) {`
305	`if ((m_text[1] == U's') \|\| (m_text[1] == U't') \|\|`
306	`(m_text[1] == U'm') \|\| (m_text[1] == U'd')) {`
307	`std::u32string_view res = m_text.substr(0, 2);`
308	`m_text = m_text.substr(2);`
309	`return res;`
310	`}`
311
312	`if (m_text.size() > 2) {`
313	`if (((m_text[1] == U'r') && (m_text[2] == U'e')) \|\|`
314	`((m_text[1] == U'v') && (m_text[2] == U'e')) \|\|`
315	`((m_text[1] == U'l') && (m_text[2] == U'l'))) {`
316	`std::u32string_view res = m_text.substr(0, 3);`
317	`m_text = m_text.substr(3);`
318	`return res;`
319	`}`
320	`}`
321	`}`
322
323	`// ?\p{L}+`
324	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::L)) {`
325	`size_t i = 2;`
326	`for (; i < m_text.size(); ++i) {`
327	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
328	`break;`
329	`}`
330	`std::u32string_view res = m_text.substr(0, i);`
331	`m_text = m_text.substr(i);`
332	`return res;`
333	`}`
334	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::L) {`
335	`size_t i = 1;`
336	`for (; i < m_text.size(); ++i) {`
337	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
338	`break;`
339	`}`
340	`std::u32string_view res = m_text.substr(0, i);`
341	`m_text = m_text.substr(i);`
342	`return res;`
343	`}`
344
345	`// ?\p{N}+`
346	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::N)) {`
347	`size_t i = 2;`
348	`for (; i < m_text.size(); ++i) {`
349	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
350	`break;`
351	`}`
352	`std::u32string_view res = m_text.substr(0, i);`
353	`m_text = m_text.substr(i);`
354	`return res;`
355	`}`
356	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::N) {`
357	`size_t i = 1;`
358	`for (; i < m_text.size(); ++i) {`
359	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
360	`break;`
361	`}`
362	`std::u32string_view res = m_text.substr(0, i);`
363	`m_text = m_text.substr(i);`
364	`return res;`
365	`}`
366
367	`// ?[^\s\p{L}\p{N}]+`
368	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (NotLNZ(m_text[1]))) {`
369	`size_t i = 2;`
370	`for (; i < m_text.size(); ++i) {`
371	`if (!NotLNZ(m_text[i]))`
372	`break;`
373	`}`
374	`std::u32string_view res = m_text.substr(0, i);`
375	`m_text = m_text.substr(i);`
376	`return res;`
377	`}`
378	`if (NotLNZ(m_text[0])) {`
379	`size_t i = 1;`
380	`for (; i < m_text.size(); ++i) {`
381	`if (!NotLNZ(m_text[i]))`
382	`break;`
383	`}`
384	`std::u32string_view res = m_text.substr(0, i);`
385	`m_text = m_text.substr(i);`
386	`return res;`
387	`}`
388
389	`// \s+(?!\S)\|\s+`
390	`if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {`
391	`size_t i = 1;`
392	`for (; i < m_text.size(); ++i) {`
393	`if (!IsZ(m_text[i])) break;`
394	`}`
395	`if ((i > 1) && (i != m_text.size())) //\s+(?!\S)`
396	`{`
397	`i--;`
398	`std::u32string_view res = m_text.substr(0, i);`
399	`m_text = m_text.substr(i);`
400	`return res;`
401	`}`
402	`// \s+`
403	`std::u32string_view res = m_text.substr(0, i);`
404	`m_text = m_text.substr(i);`
405	`return res;`
406	`}`
407
408	`return std::u32string_view{};`
409	`}`
410
411	`static bool IsZ(char32_t ch) {`
412	`auto category = ufal::unilib::unicode::category(ch);`
413	`return (category & ufal::unilib::unicode::Z) != 0;`
414	`}`
415
416	`static bool NotLNZ(char32_t ch) {`
417	`auto category = ufal::unilib::unicode::category(ch);`
418	`if (category & ufal::unilib::unicode::L) return false;`
419	`if (category & ufal::unilib::unicode::N) return false;`
420	`if (category & ufal::unilib::unicode::Z) return false;`
421	`return true;`
422	`}`
423
424	`private:`
425	`std::u32string_view m_text;`
426	`};`
427

microsoft/onnxruntime-extensions

Branches

Tags

Clone