microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

skottmckay/BuildInfra_AndTestImageLibs

Find a branch or tag

Branches

skottmckay/BuildInfra_AndTestImageLibs

Clone

HTTPS

Download ZIP

onnxruntime-extensions/operators/tokenizer

operators/tokenizer/gpt2_tokenizer.cc

620lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`// Copyright (c) Microsoft Corporation. All rights reserved.`
2	`// Licensed under the MIT License.`
3	`// Partial code comes from other Microsoft employee.`
4
5	`#include <string>`
6	`#include <vector>`
7	`#include <fstream>`
8	`#include <sstream>`
9	`#include <iostream>`
10	`#include <list>`
11	`#include <memory>`
12	`#include <regex>`
13	`#include <sstream>`
14	`#include <stdexcept>`
15	`#include <unordered_map>`
16	`#include <functional>`
17	`#include <codecvt>`
18	`#include <mutex>`
19
20	`#include "nlohmann/json.hpp"`
21	`#include "gpt2_tokenizer.hpp"`
22	`#include "string_tensor.h"`
23	`#include "unicode.h"`
24
25
26	`class SpecialTokenMap {`
27	`public:`
28	`void Add(ustring p_str, int p_id) {`
29	`auto it = token_map_.find(p_str);`
30	`if (it != token_map_.end()) {`
31	`if (it->second != p_id) {`
32	`ORT_CXX_API_THROW("Duplicate special tokens.", ORT_INVALID_ARGUMENT);`
33	`}`
34	`} else {`
35	`token_map_[p_str] = p_id;`
36	`token_list_.push_back(SpecialTokenInfo(std::move(p_str), p_id));`
37	`}`
38	`}`
39
40	`std::list<std::pair<ustring, int>> SplitBySpeicalTokens(ustring input) const {`
41	`std::list<std::pair<ustring, int>> res;`
42	`res.emplace_back(std::move(input), -1);`
43	`for (const auto& st : token_list_) {`
44	`std::list<std::pair<ustring, int>> new_split_res;`
45	`for (auto& str : res) {`
46	`if (str.second != -1) {`
47	`new_split_res.push_back(std::move(str));`
48	`continue;`
49	`}`
50	`auto it = str.first.begin();`
51	`size_t search_pos = 0;`
52	`while (it != str.first.end()) {`
53	`// work fine for all clang-based platform: Mac OS, Android, WebAssembly`
54	`#if defined(__clang__)`
55	`auto search_it = std::search(it, str.first.end(), st.str.begin(), st.str.end());`
56	`#else`
57	`auto search_it = std::search(it, str.first.end(),`
58	`std::boyer_moore_searcher(st.str.begin(), st.str.end()));`
59	`#endif`
60	`if (search_it == str.first.end()) {`
61	`new_split_res.emplace_back(str.first.substr(search_pos), -1);`
62	`break;`
63	`}`
64	`auto prefixLen = search_it - it;`
65	`if (prefixLen != 0) {`
66	`new_split_res.emplace_back(str.first.substr(search_pos, prefixLen), -1);`
67	`search_pos += prefixLen;`
68	`}`
69	`new_split_res.emplace_back(str.first.substr(search_pos, st.str.size()), st.id);`
70	`it = search_it + st.str.size();`
71	`search_pos += st.str.size();`
72	`}`
73	`}`
74	`std::swap(new_split_res, res);`
75	`}`
76	`return res;`
77	`}`
78
79	`private:`
80	`struct SpecialTokenInfo {`
81	`ustring str;`
82	`int id;`
83
84	`SpecialTokenInfo(ustring p_str, int p_id)`
85	`: str(std::move(p_str)), id(p_id) {`
86	`if (str.empty()) {`
87	`ORT_CXX_API_THROW("Empty special token.", ORT_INVALID_ARGUMENT);`
88	`}`
89	`}`
90	`};`
91
92	`std::list<SpecialTokenInfo> token_list_;`
93	`std::unordered_map<ustring, int> token_map_;`
94	`};`
95
96	`using json = nlohmann::json;`
97	`class VocabData {`
98	`public:`
99	`VocabData()`
100	`: unk_id_(-1) {`
101	`}`
102
103	`struct BpeNode {`
104	`int id;`
105	`int value;`
106	`};`
107
108	`void Load(std::istream& vocab_stream, std::istream& merges_stream, const char* unk_token, const char* special_tokens) {`
109	`json tok_json;`
110	`vocab_stream >> tok_json;`
111	`vocab_map_ = std::move(tok_json.get<std::unordered_map<std::string, int>>());`
112
113	`auto it = vocab_map_.find(unk_token);`
114	`if (it != vocab_map_.end()) {`
115	`unk_id_ = it->second;`
116	`} else {`
117	`int id = static_cast<int>(vocab_map_.size());`
118	`vocab_map_[unk_token] = id;`
119	`std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl;`
120	`}`
121
122	`std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;`
123	`for (auto i = 33; i <= 126; ++i) {`
124	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
125	`}`
126	`for (auto i = 161; i <= 172; ++i) {`
127	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
128	`}`
129	`for (auto i = 174; i <= 255; ++i) {`
130	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
131	`}`
132
133	`int index = 256;`
134	`for (auto i = 0; i < 33; ++i) {`
135	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
136	`}`
137	`for (auto i = 127; i < 161; ++i) {`
138	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
139	`}`
140	`byte_encoder_[173] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
141
142	`index = 0;`
143	`std::string line;`
144	`while (std::getline(merges_stream, line)) {`
145	`line = std::regex_replace(line, std::regex("\r"), "");`
146	`if (line.empty()) continue;`
147	`if ((line[0] == '#') && (index == 0)) continue;`
148	`auto pos = line.find(' ');`
149	`if (pos == std::string::npos) {`
150	`ORT_CXX_API_THROW("Cannot know how to parse line: " + line, ORT_INVALID_ARGUMENT);`
151	`}`
152	`std::string w1 = line.substr(0, pos);`
153	`std::string w2 = line.substr(pos + 1);`
154	`int iw1 = GetVocabIndex(w1);`
155	`int iw2 = GetVocabIndex(w2);`
156	`int iww = GetVocabIndex(w1 + w2);`
157	`std::pair<int, int> key{iw1, iw2};`
158	`BpeNode value{iww, index++};`
159	`bpe_map_[key] = value;`
160	`}`
161
162	`if (special_tokens != nullptr) {`
163	`std::istringstream istrea(special_tokens);`
164
165	`while (istrea >> line) {`
166	`if (line.empty()) continue;`
167	`line = std::regex_replace(line, std::regex("\r"), "");`
168	`ustring line_32(line);`
169	`int id = static_cast<int>(vocab_map_.size());`
170	`if (auto nestedIt = vocab_map_.find(line); nestedIt != vocab_map_.end()) {`
171	`id = nestedIt->second;`
172	`} else {`
173	`vocab_map_[line] = id;`
174	`}`
175	`special_tokens_.Add(std::move(line_32), id);`
176	`}`
177	`}`
178
179	`id2token_map_.resize(vocab_map_.size());`
180	`for (const auto& [t, i] : vocab_map_) {`
181	`id2token_map_[i] = t;`
182	`}`
183	`}`
184
185	`void bpe(std::list<int>& vals) const {`
186	`while (vals.size() >= 2) {`
187	`auto pos_it = vals.end();`
188	`int minval = std::numeric_limits<int>::max();`
189	`int ori_id1 = 0, ori_id2 = 0;`
190	`int aim_id = 0;`
191	`for (auto it = vals.begin(); it != vals.end(); ++it) {`
192	`auto it2 = it;`
193	`++it2;`
194	`if (it2 == vals.end()) break;`
195	`auto map_it = bpe_map_.find({it, it2});`
196	`if (map_it == bpe_map_.end()) continue;`
197	`if (minval > map_it->second.value) {`
198	`ori_id1 = *it;`
199	`ori_id2 = *it2;`
200	`minval = map_it->second.value;`
201	`pos_it = it;`
202	`aim_id = map_it->second.id;`
203	`}`
204	`}`
205	`if (pos_it == vals.end()) break;`
206
207	`pos_it = vals.erase(pos_it);`
208	`*pos_it = aim_id;`
209	`for (++pos_it; pos_it != vals.end(); ++pos_it) {`
210	`if (*pos_it != ori_id1) continue;`
211	`auto it2 = pos_it;`
212	`++it2;`
213	`if (it2 == vals.end()) break;`
214	`if (*it2 != ori_id2) continue;`
215	`pos_it = vals.erase(pos_it);`
216	`*pos_it = aim_id;`
217	`}`
218	`}`
219	`}`
220
221	`const auto& ByteEncoder() const {`
222	`return byte_encoder_;`
223	`}`
224
225	`auto SplitBySpeicalTokens(const ustring& input) const {`
226	`return special_tokens_.SplitBySpeicalTokens(input);`
227	`}`
228
229	`size_t VocabSize() const { return vocab_map_.size(); }`
230
231	`int TokenToID(const std::string& input) const {`
232	`auto it = vocab_map_.find(input);`
233	`if (it == vocab_map_.end()) {`
234	`ORT_CXX_API_THROW("Token not found: " + input, ORT_INVALID_ARGUMENT);`
235	`}`
236	`return it->second;`
237	`}`
238
239	`const std::string& IdToToken(int id) const {`
240	`if ((id < 0) \|\| (static_cast<size_t>(id) >= id2token_map_.size())) {`
241	`ORT_CXX_API_THROW("Invalid ID: " + std::to_string(id), ORT_INVALID_ARGUMENT);`
242	`}`
243	`return id2token_map_[id];`
244	`}`
245
246	`private:`
247	`int GetVocabIndex(const std::string& str) {`
248	`auto it = vocab_map_.find(str);`
249	`if (it == vocab_map_.end()) {`
250	`ORT_CXX_API_THROW("Cannot find word in vocabulary: " + str, ORT_INVALID_ARGUMENT);`
251	`}`
252	`return it->second;`
253	`}`
254
255	`private:`
256	`struct hash_pair {`
257	`template <class T1, class T2>`
258	`size_t operator()(const std::pair<T1, T2>& p) const {`
259	`auto hash1 = std::hash<T1>{}(p.first);`
260	`auto hash2 = std::hash<T2>{}(p.second);`
261	`return hash1 ^ (hash2 << 16);`
262	`}`
263	`};`
264	`std::unordered_map<std::pair<int, int>, BpeNode, hash_pair> bpe_map_;`
265
266	`int byte_encoder_[256] = {};`
267	`std::unordered_map<std::string, int> vocab_map_;`
268	`std::vector<std::string> id2token_map_;`
269
270	`int unk_id_;`
271	`SpecialTokenMap special_tokens_;`
272	`};`
273
274	`class TokenWithRegularExp {`
275	`public:`
276	`void Set(std::u32string_view val) {`
277	`m_text = val;`
278	`}`
279
280	`std::pair<bool, std::u32string_view> GetNextToken() {`
281	`while (!m_text.empty()) {`
282	`auto res = TryMatch();`
283	`if (res.empty()) {`
284	`m_text = m_text.substr(1);`
285	`continue;`
286	`}`
287	`return {true, res};`
288	`}`
289	`return {false, {}};`
290	`}`
291
292	`private:`
293	`std::u32string_view TryMatch() {`
294	`// python pattern:`
295	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\| ?\p{L}+\| ?\p{N}+\| ?[^\s\p{L}\p{N}]+\|\s+(?!\S)\|\s+`
296
297	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\|`
298	`// Note: the sequencial of the following if should not be switched, which follows the python regex's syntax`
299	`if ((m_text[0] == U'\'') && (m_text.size() > 1)) {`
300	`if ((m_text[1] == U's') \|\| (m_text[1] == U't') \|\|`
301	`(m_text[1] == U'm') \|\| (m_text[1] == U'd')) {`
302	`std::u32string_view res = m_text.substr(0, 2);`
303	`m_text = m_text.substr(2);`
304	`return res;`
305	`}`
306
307	`if (m_text.size() > 2) {`
308	`if (((m_text[1] == U'r') && (m_text[2] == U'e')) \|\|`
309	`((m_text[1] == U'v') && (m_text[2] == U'e')) \|\|`
310	`((m_text[1] == U'l') && (m_text[2] == U'l'))) {`
311	`std::u32string_view res = m_text.substr(0, 3);`
312	`m_text = m_text.substr(3);`
313	`return res;`
314	`}`
315	`}`
316	`}`
317
318	`// ?\p{L}+`
319	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::L)) {`
320	`size_t i = 2;`
321	`for (; i < m_text.size(); ++i) {`
322	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
323	`break;`
324	`}`
325	`std::u32string_view res = m_text.substr(0, i);`
326	`m_text = m_text.substr(i);`
327	`return res;`
328	`}`
329	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::L) {`
330	`size_t i = 1;`
331	`for (; i < m_text.size(); ++i) {`
332	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
333	`break;`
334	`}`
335	`std::u32string_view res = m_text.substr(0, i);`
336	`m_text = m_text.substr(i);`
337	`return res;`
338	`}`
339
340	`// ?\p{N}+`
341	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::N)) {`
342	`size_t i = 2;`
343	`for (; i < m_text.size(); ++i) {`
344	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
345	`break;`
346	`}`
347	`std::u32string_view res = m_text.substr(0, i);`
348	`m_text = m_text.substr(i);`
349	`return res;`
350	`}`
351	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::N) {`
352	`size_t i = 1;`
353	`for (; i < m_text.size(); ++i) {`
354	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
355	`break;`
356	`}`
357	`std::u32string_view res = m_text.substr(0, i);`
358	`m_text = m_text.substr(i);`
359	`return res;`
360	`}`
361
362	`// ?[^\s\p{L}\p{N}]+`
363	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (NotLNZ(m_text[1]))) {`
364	`size_t i = 2;`
365	`for (; i < m_text.size(); ++i) {`
366	`if (!NotLNZ(m_text[i]))`
367	`break;`
368	`}`
369	`std::u32string_view res = m_text.substr(0, i);`
370	`m_text = m_text.substr(i);`
371	`return res;`
372	`}`
373	`if (NotLNZ(m_text[0])) {`
374	`size_t i = 1;`
375	`for (; i < m_text.size(); ++i) {`
376	`if (!NotLNZ(m_text[i]))`
377	`break;`
378	`}`
379	`std::u32string_view res = m_text.substr(0, i);`
380	`m_text = m_text.substr(i);`
381	`return res;`
382	`}`
383
384	`// \s+(?!\S)\|\s+`
385	`if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {`
386	`size_t i = 1;`
387	`for (; i < m_text.size(); ++i) {`
388	`if (!IsZ(m_text[i])) break;`
389	`}`
390	`if ((i > 1) && (i != m_text.size())) //\s+(?!\S)`
391	`{`
392	`i--;`
393	`std::u32string_view res = m_text.substr(0, i);`
394	`m_text = m_text.substr(i);`
395	`return res;`
396	`}`
397	`// \s+`
398	`std::u32string_view res = m_text.substr(0, i);`
399	`m_text = m_text.substr(i);`
400	`return res;`
401	`}`
402
403	`return std::u32string_view{};`
404	`}`
405
406	`static bool IsZ(char32_t ch) {`
407	`auto category = ufal::unilib::unicode::category(ch);`
408	`return (category & ufal::unilib::unicode::Z) != 0;`
409	`}`
410
411	`static bool NotLNZ(char32_t ch) {`
412	`auto category = ufal::unilib::unicode::category(ch);`
413	`if (category & ufal::unilib::unicode::L) return false;`
414	`if (category & ufal::unilib::unicode::N) return false;`
415	`if (category & ufal::unilib::unicode::Z) return false;`
416	`return true;`
417	`}`
418
419	`private:`
420	`std::u32string_view m_text;`
421	`};`
422
// Note: the code point set below comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
// Returns true when `ch` is one of the listed whitespace code points.
bool IsUnicodeSpace(char32_t ch) {
  // Contiguous runs are tested as ranges, the stragglers individually.
  if ((ch >= 0x0009) && (ch <= 0x000D)) return true;  // TAB, LF, VT, FF, CR
  if ((ch >= 0x001C) && (ch <= 0x0020)) return true;  // FS, GS, RS, US, SPACE
  if ((ch >= 0x2000) && (ch <= 0x200A)) return true;  // 0x2000 .. 0x200A

  return (ch == 0x0085) || (ch == 0x00A0) || (ch == 0x1680) ||
         (ch == 0x2028) || (ch == 0x2029) || (ch == 0x202F) ||
         (ch == 0x205F) || (ch == 0x3000);
}
459
460	`bool IsEmptyUString(const ustring& str) {`
461	`return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });`
462	`}`
463
464
465
466	`KernelBpeTokenizer::KernelBpeTokenizer(const OrtApi& api, const OrtKernelInfo* info)`
467	`: BaseKernel(api, info) {`
468	`std::string vocab = ort_.KernelInfoGetAttribute<std::string>(info, "vocab");`
469	`if (vocab.empty()) {`
470	`ORT_CXX_API_THROW("vocabulary shouldn't be empty.", ORT_INVALID_ARGUMENT);`
471	`}`
472
473	`std::string merges = ort_.KernelInfoGetAttribute<std::string>(info, "merges");`
474	`if (merges.empty()) {`
475	`ORT_CXX_API_THROW("merges shouldn't be empty.", ORT_INVALID_ARGUMENT);`
476	`}`
477
478	`if (!TryToGetAttribute<int64_t>("padding_length", padding_length_)) {`
479	`padding_length_ = -1;`
480	`}`
481
482	`if (padding_length_ != -1 && padding_length_ <= 0) {`
483	`ORT_CXX_API_THROW("padding_length should be more than 0 or equal -1", ORT_INVALID_ARGUMENT);`
484	`}`
485
486	`std::stringstream vocabu_stream(vocab);`
487	`std::stringstream merges_stream(merges);`
488	`bbpe_tokenizer_ = std::make_shared<VocabData>();`
489	`bbpe_tokenizer_->Load(vocabu_stream, merges_stream, "<\|endoftext\|>", "<\|endoftext\|>");`
490	`}`
491
492	`std::vector<int64_t> KernelBpeTokenizer::Tokenize(const ustring& input, int64_t max_length) {`
493	`std::vector<int64_t> res;`
494
495	`if (IsEmptyUString(input)) {`
496	`return res;`
497	`}`
498
499	`auto special_token_split_res = bbpe_tokenizer_->SplitBySpeicalTokens(input);`
500	`TokenWithRegularExp regcmp;`
501
502	`for (auto& seg_id : special_token_split_res) {`
503	`if (static_cast<int64_t>(res.size()) >= max_length) break;`
504
505	`if (seg_id.second != -1) {`
506	`res.push_back(seg_id.second);`
507	`continue;`
508	`}`
509
510	`auto cur_input = std::move(seg_id.first);`
511	`// Note: keep ptr to make sure the string_view is valid in the following process`
512	`const char32_t* ptr = cur_input.c_str();`
513	`regcmp.Set(ptr);`
514
515	`while (static_cast<int64_t>(res.size()) < max_length) {`
516	`auto [b, tok] = regcmp.GetNextToken();`
517	`if (!b) break;`
518
519	`std::string utf8_token = std::string(ustring(tok));`
520
521	`byte_list_.clear();`
522	`for (char& cp : utf8_token) {`
523	`byte_list_.push_back(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)]);`
524	`}`
525
526	`bbpe_tokenizer_->bpe(byte_list_);`
527
528	`for (auto p : byte_list_) {`
529	`if (static_cast<int64_t>(res.size()) >= max_length) {`
530	`break;`
531	`}`
532
533	`res.push_back(p);`
534	`}`
535	`}`
536	`}`
537
538	`return res;`
539	`}`
540
541	`void KernelBpeTokenizer::Compute(OrtKernelContext* context) {`
542	`// Setup inputs`
543	`const OrtValue* input = ort_.KernelContext_GetInput(context, 0);`
544	`std::vector<std::string> str_input;`
545	`GetTensorMutableDataString(api_, ort_, context, input, str_input);`
546	`OrtTensorDimensions input_dim(ort_, input);`
547
548	`std::vector<std::vector<int64_t>> tokenize_results;`
549	`for (auto& str : str_input) {`
550	`tokenize_results.emplace_back(Tokenize(ustring(str), padding_length_ < 0 ? INT64_MAX : padding_length_));`
551	`}`
552
553	`size_t max_length = 0;`
554	`if (padding_length_ == -1) {`
555	`for (auto& res : tokenize_results) {`
556	`max_length = std::max(max_length, res.size());`
557	`}`
558	`} else {`
559	`max_length = static_cast<size_t>(padding_length_);`
560	`}`
561
562	`OrtTensorDimensions output_dim = input_dim;`
563	`output_dim.push_back(max_length);`
564	`OrtValue* tokenize_output = ort_.KernelContext_GetOutput(context, 0, output_dim.data(), output_dim.size());`
565	`OrtValue* attention_mask = ort_.KernelContext_GetOutput(context, 1, output_dim.data(), output_dim.size());`
566	`auto* token = ort_.GetTensorMutableData<int64_t>(tokenize_output);`
567	`auto* mask = ort_.GetTensorMutableData<int64_t>(attention_mask);`
568
569	`int idx = 0;`
570	`for (auto& res : tokenize_results) {`
571	`for (int64_t id : res) {`
572	`token[idx] = id;`
573	`mask[idx] = 1;`
574	`idx++;`
575	`}`
576
577	`for (size_t i = res.size(); i < max_length; i++) {`
578	`token[idx] = 0;`
579	`mask[idx] = 0;`
580	`idx++;`
581	`}`
582	`}`
583	`}`
584
585	`void* CustomOpBpeTokenizer::CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {`
586	`return CreateKernelImpl(api, info);`
587	`}`
588
589	`const char* CustomOpBpeTokenizer::GetName() const {`
590	`return "GPT2Tokenizer";`
591	`}`
592
593	`size_t CustomOpBpeTokenizer::GetInputTypeCount() const {`
594	`return 1;`
595	`}`
596
597	`ONNXTensorElementDataType CustomOpBpeTokenizer::GetInputType(size_t /index/) const {`
598	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
599	`}`
600	`size_t CustomOpBpeTokenizer::GetOutputTypeCount() const {`
601	`return 2;`
602	`}`
603
604	`ONNXTensorElementDataType CustomOpBpeTokenizer::GetOutputType(size_t /index/) const {`
605	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;`
606	`}`
607
608	`const OrtCustomOp** LoadTokenizerSchemaList() {`
609	`// create the global objects here to let the ORT catch the expection if any`
610	`static std::unique_ptr<CustomOpBpeTokenizer> p_CoBpeTokenizer;`
611	`static const OrtCustomOp* c_CustomOpList[2] = {nullptr}; // {&c_CoBpeTokenizer, nullptr};`
612	`static std::mutex mtx_loaded;`
613	`std::lock_guard<std::mutex> lck(mtx_loaded);`
614	`if (p_CoBpeTokenizer.get() == nullptr) {`
615	`p_CoBpeTokenizer = std::make_unique<CustomOpBpeTokenizer>();`
616	`c_CustomOpList[0] = p_CoBpeTokenizer.get();`
617	`}`
618
619	`return c_CustomOpList;`
620	`}`
621

microsoft/onnxruntime-extensions

Branches

Tags

Clone