microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions · Available

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

1ae69c0f7aeaab9911cf8ebf86ee92b34dadd26e

Find a branch or tag

Branches

1ae69c0f7aeaab9911cf8ebf86ee92b34dadd26e

Clone

HTTPS

Download ZIP

onnxruntime-extensions/operators/tokenizer

operators/tokenizer/gpt2_tokenizer.cc

638lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`// Copyright (c) Microsoft Corporation. All rights reserved.`
2	`// Licensed under the MIT License.`
3	`// Partial code comes from other Microsoft employee.`
4
5	`#include <string>`
6	`#include <vector>`
7	`#include <fstream>`
8	`#include <sstream>`
9	`#include <iostream>`
10	`#include <list>`
11	`#include <memory>`
12	`#include <regex>`
13	`#include <sstream>`
14	`#include <stdexcept>`
15	`#include <unordered_map>`
16	`#include <functional>`
17	`#include <codecvt>`
18	`#include <mutex>`
19
20	`#include "nlohmann/json.hpp"`
21	`#include "ocos.h"`
22	`#include "string_tensor.h"`
23	`#include "unicode.h"`
24
25	`class SpecialTokenMap {`
26	`public:`
27	`void Add(ustring p_str, int p_id) {`
28	`auto it = token_map_.find(p_str);`
29	`if (it != token_map_.end()) {`
30	`if (it->second != p_id) {`
31	`throw std::runtime_error("Duplicate special tokens");`
32	`}`
33	`} else {`
34	`token_map_[p_str] = p_id;`
35	`token_list_.push_back(SpecialTokenInfo(std::move(p_str), p_id));`
36	`}`
37	`}`
38
39	`std::list<std::pair<ustring, int>> SplitBySpeicalTokens(ustring input) const {`
40	`std::list<std::pair<ustring, int>> res;`
41	`res.emplace_back(std::move(input), -1);`
42	`for (const auto& st : token_list_) {`
43	`std::list<std::pair<ustring, int>> new_split_res;`
44	`for (auto& str : res) {`
45	`if (str.second != -1) {`
46	`new_split_res.push_back(std::move(str));`
47	`continue;`
48	`}`
49	`auto it = str.first.begin();`
50	`size_t search_pos = 0;`
51	`while (it != str.first.end()) {`
52	`// work fine for all clang-based platform: Mac OS, Android, WebAssembly`
53	`#if defined(__clang__)`
54	`auto search_it = std::search(it, str.first.end(), st.str.begin(), st.str.end());`
55	`#else`
56	`auto search_it = std::search(it, str.first.end(),`
57	`std::boyer_moore_searcher(st.str.begin(), st.str.end()));`
58	`#endif`
59	`if (search_it == str.first.end()) {`
60	`new_split_res.emplace_back(str.first.substr(search_pos), -1);`
61	`break;`
62	`}`
63	`auto prefixLen = search_it - it;`
64	`if (prefixLen != 0) {`
65	`new_split_res.emplace_back(str.first.substr(search_pos, prefixLen), -1);`
66	`search_pos += prefixLen;`
67	`}`
68	`new_split_res.emplace_back(str.first.substr(search_pos, st.str.size()), st.id);`
69	`it = search_it + st.str.size();`
70	`search_pos += st.str.size();`
71	`}`
72	`}`
73	`std::swap(new_split_res, res);`
74	`}`
75	`return res;`
76	`}`
77
78	`private:`
79	`struct SpecialTokenInfo {`
80	`ustring str;`
81	`int id;`
82
83	`SpecialTokenInfo(ustring p_str, int p_id)`
84	`: str(std::move(p_str)), id(p_id) {`
85	`if (str.empty()) {`
86	`throw std::runtime_error("Empty special token.");`
87	`}`
88	`}`
89	`};`
90
91	`std::list<SpecialTokenInfo> token_list_;`
92	`std::unordered_map<ustring, int> token_map_;`
93	`};`
94
95	`using json = nlohmann::json;`
96	`class VocabData {`
97	`public:`
98	`VocabData()`
99	`: unk_id_(-1) {`
100	`}`
101
102	`struct BpeNode {`
103	`int id;`
104	`int value;`
105	`};`
106
107	`void Load(std::istream& vocab_stream, std::istream& merges_stream, const char* unk_token, const char* special_tokens) {`
108	`json tok_json;`
109	`vocab_stream >> tok_json;`
110	`vocab_map_ = std::move(tok_json.get<std::unordered_map<std::string, int>>());`
111
112	`auto it = vocab_map_.find(unk_token);`
113	`if (it != vocab_map_.end()) {`
114	`unk_id_ = it->second;`
115	`} else {`
116	`int id = static_cast<int>(vocab_map_.size());`
117	`vocab_map_[unk_token] = id;`
118	`std::cerr << "Special token (" << unk_token << ") have been added in the vocabulary." << std::endl;`
119	`}`
120
121	`std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> str_convert;`
122	`for (auto i = 33; i <= 126; ++i) {`
123	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
124	`}`
125	`for (auto i = 161; i <= 172; ++i) {`
126	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
127	`}`
128	`for (auto i = 174; i <= 255; ++i) {`
129	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)i));`
130	`}`
131
132	`int index = 256;`
133	`for (auto i = 0; i < 33; ++i) {`
134	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
135	`}`
136	`for (auto i = 127; i < 161; ++i) {`
137	`byte_encoder_[i] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
138	`}`
139	`byte_encoder_[173] = GetVocabIndex(str_convert.to_bytes((char32_t)(index++)));`
140
141	`index = 0;`
142	`std::string line;`
143	`while (std::getline(merges_stream, line)) {`
144	`line = std::regex_replace(line, std::regex("\r"), "");`
145	`if (line.empty()) continue;`
146	`if ((line[0] == '#') && (index == 0)) continue;`
147	`auto pos = line.find(' ');`
148	`if (pos == std::string::npos) {`
149	`throw std::runtime_error("Cannot know how to parse line: " + line);`
150	`}`
151	`std::string w1 = line.substr(0, pos);`
152	`std::string w2 = line.substr(pos + 1);`
153	`int iw1 = GetVocabIndex(w1);`
154	`int iw2 = GetVocabIndex(w2);`
155	`int iww = GetVocabIndex(w1 + w2);`
156	`std::pair<int, int> key{iw1, iw2};`
157	`BpeNode value{iww, index++};`
158	`bpe_map_[key] = value;`
159	`}`
160
161	`if (special_tokens != nullptr) {`
162	`std::istringstream istrea(special_tokens);`
163
164	`while (istrea >> line) {`
165	`if (line.empty()) continue;`
166	`line = std::regex_replace(line, std::regex("\r"), "");`
167	`ustring line_32(line);`
168	`int id = static_cast<int>(vocab_map_.size());`
169	`if (auto it = vocab_map_.find(line); it != vocab_map_.end()) {`
170	`id = it->second;`
171	`} else {`
172	`vocab_map_[line] = id;`
173	`}`
174	`special_tokens_.Add(std::move(line_32), id);`
175	`}`
176	`}`
177
178	`id2token_map_.resize(vocab_map_.size());`
179	`for (const auto& [t, i] : vocab_map_) {`
180	`id2token_map_[i] = t;`
181	`}`
182	`}`
183
184	`void bpe(std::list<int>& vals) const {`
185	`while (vals.size() >= 2) {`
186	`auto pos_it = vals.end();`
187	`int minval = std::numeric_limits<int>::max();`
188	`int ori_id1 = 0, ori_id2 = 0;`
189	`int aim_id = 0;`
190	`for (auto it = vals.begin(); it != vals.end(); ++it) {`
191	`auto it2 = it;`
192	`++it2;`
193	`if (it2 == vals.end()) break;`
194	`auto map_it = bpe_map_.find({it, it2});`
195	`if (map_it == bpe_map_.end()) continue;`
196	`if (minval > map_it->second.value) {`
197	`ori_id1 = *it;`
198	`ori_id2 = *it2;`
199	`minval = map_it->second.value;`
200	`pos_it = it;`
201	`aim_id = map_it->second.id;`
202	`}`
203	`}`
204	`if (pos_it == vals.end()) break;`
205
206	`pos_it = vals.erase(pos_it);`
207	`*pos_it = aim_id;`
208	`for (++pos_it; pos_it != vals.end(); ++pos_it) {`
209	`if (*pos_it != ori_id1) continue;`
210	`auto it2 = pos_it;`
211	`++it2;`
212	`if (it2 == vals.end()) break;`
213	`if (*it2 != ori_id2) continue;`
214	`pos_it = vals.erase(pos_it);`
215	`*pos_it = aim_id;`
216	`}`
217	`}`
218	`}`
219
220	`const auto& ByteEncoder() const {`
221	`return byte_encoder_;`
222	`}`
223
224	`auto SplitBySpeicalTokens(const ustring& input) const {`
225	`return special_tokens_.SplitBySpeicalTokens(input);`
226	`}`
227
228	`size_t VocabSize() const { return vocab_map_.size(); }`
229
230	`int TokenToID(const std::string& input) const {`
231	`auto it = vocab_map_.find(input);`
232	`if (it == vocab_map_.end()) {`
233	`throw std::runtime_error("Token not found: " + input);`
234	`}`
235	`return it->second;`
236	`}`
237
238	`const std::string& IdToToken(int id) const {`
239	`if ((id < 0) \|\| (id >= id2token_map_.size())) {`
240	`throw std::runtime_error("Invalid ID: " + std::to_string(id));`
241	`}`
242	`return id2token_map_[id];`
243	`}`
244
245	`private:`
246	`int GetVocabIndex(const std::string& str) {`
247	`auto it = vocab_map_.find(str);`
248	`if (it == vocab_map_.end()) {`
249	`throw std::runtime_error("Cannot find word in vocabulary: " + str);`
250	`}`
251	`return it->second;`
252	`}`
253
254	`private:`
255	`struct hash_pair {`
256	`template <class T1, class T2>`
257	`size_t operator()(const std::pair<T1, T2>& p) const {`
258	`auto hash1 = std::hash<T1>{}(p.first);`
259	`auto hash2 = std::hash<T2>{}(p.second);`
260	`return hash1 ^ (hash2 << 16);`
261	`}`
262	`};`
263	`std::unordered_map<std::pair<int, int>, BpeNode, hash_pair> bpe_map_;`
264
265	`int byte_encoder_[256] = {};`
266	`std::unordered_map<std::string, int> vocab_map_;`
267	`std::vector<std::string> id2token_map_;`
268
269	`int unk_id_;`
270	`SpecialTokenMap special_tokens_;`
271	`};`
272
273	`class TokenWithRegularExp {`
274	`public:`
275	`void Set(std::u32string_view val) {`
276	`m_text = val;`
277	`}`
278
279	`std::pair<bool, std::u32string_view> GetNextToken() {`
280	`while (!m_text.empty()) {`
281	`auto res = TryMatch();`
282	`if (res.empty()) {`
283	`m_text = m_text.substr(1);`
284	`continue;`
285	`}`
286	`return {true, res};`
287	`}`
288	`return {false, {}};`
289	`}`
290
291	`private:`
292	`std::u32string_view TryMatch() {`
293	`// python pattern:`
294	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\| ?\p{L}+\| ?\p{N}+\| ?[^\s\p{L}\p{N}]+\|\s+(?!\S)\|\s+`
295
296	`// 's\|'t\|'re\|'ve\|'m\|'ll\|'d\|`
297	`// Note: the sequencial of the following if should not be switched, which follows the python regex's syntax`
298	`if ((m_text[0] == U'\'') && (m_text.size() > 1)) {`
299	`if ((m_text[1] == U's') \|\| (m_text[1] == U't') \|\|`
300	`(m_text[1] == U'm') \|\| (m_text[1] == U'd')) {`
301	`std::u32string_view res = m_text.substr(0, 2);`
302	`m_text = m_text.substr(2);`
303	`return res;`
304	`}`
305
306	`if (m_text.size() > 2) {`
307	`if (((m_text[1] == U'r') && (m_text[2] == U'e')) \|\|`
308	`((m_text[1] == U'v') && (m_text[2] == U'e')) \|\|`
309	`((m_text[1] == U'l') && (m_text[2] == U'l'))) {`
310	`std::u32string_view res = m_text.substr(0, 3);`
311	`m_text = m_text.substr(3);`
312	`return res;`
313	`}`
314	`}`
315	`}`
316
317	`// ?\p{L}+`
318	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::L)) {`
319	`size_t i = 2;`
320	`for (; i < m_text.size(); ++i) {`
321	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
322	`break;`
323	`}`
324	`std::u32string_view res = m_text.substr(0, i);`
325	`m_text = m_text.substr(i);`
326	`return res;`
327	`}`
328	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::L) {`
329	`size_t i = 1;`
330	`for (; i < m_text.size(); ++i) {`
331	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::L) == 0)`
332	`break;`
333	`}`
334	`std::u32string_view res = m_text.substr(0, i);`
335	`m_text = m_text.substr(i);`
336	`return res;`
337	`}`
338
339	`// ?\p{N}+`
340	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (ufal::unilib::unicode::category(m_text[1]) & ufal::unilib::unicode::N)) {`
341	`size_t i = 2;`
342	`for (; i < m_text.size(); ++i) {`
343	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
344	`break;`
345	`}`
346	`std::u32string_view res = m_text.substr(0, i);`
347	`m_text = m_text.substr(i);`
348	`return res;`
349	`}`
350	`if (ufal::unilib::unicode::category(m_text[0]) & ufal::unilib::unicode::N) {`
351	`size_t i = 1;`
352	`for (; i < m_text.size(); ++i) {`
353	`if ((ufal::unilib::unicode::category(m_text[i]) & ufal::unilib::unicode::N) == 0)`
354	`break;`
355	`}`
356	`std::u32string_view res = m_text.substr(0, i);`
357	`m_text = m_text.substr(i);`
358	`return res;`
359	`}`
360
361	`// ?[^\s\p{L}\p{N}]+`
362	`if ((m_text[0] == U' ') && (m_text.size() > 1) && (NotLNZ(m_text[1]))) {`
363	`size_t i = 2;`
364	`for (; i < m_text.size(); ++i) {`
365	`if (!NotLNZ(m_text[i]))`
366	`break;`
367	`}`
368	`std::u32string_view res = m_text.substr(0, i);`
369	`m_text = m_text.substr(i);`
370	`return res;`
371	`}`
372	`if (NotLNZ(m_text[0])) {`
373	`size_t i = 1;`
374	`for (; i < m_text.size(); ++i) {`
375	`if (!NotLNZ(m_text[i]))`
376	`break;`
377	`}`
378	`std::u32string_view res = m_text.substr(0, i);`
379	`m_text = m_text.substr(i);`
380	`return res;`
381	`}`
382
383	`// \s+(?!\S)\|\s+`
384	`if ((m_text.size() >= 1) && (IsZ(m_text[0]))) {`
385	`size_t i = 1;`
386	`for (; i < m_text.size(); ++i) {`
387	`if (!IsZ(m_text[i])) break;`
388	`}`
389	`if ((i > 1) && (i != m_text.size())) //\s+(?!\S)`
390	`{`
391	`i--;`
392	`std::u32string_view res = m_text.substr(0, i);`
393	`m_text = m_text.substr(i);`
394	`return res;`
395	`}`
396	`// \s+`
397	`std::u32string_view res = m_text.substr(0, i);`
398	`m_text = m_text.substr(i);`
399	`return res;`
400	`}`
401
402	`return std::u32string_view{};`
403	`}`
404
405	`static bool IsZ(char32_t ch) {`
406	`auto category = ufal::unilib::unicode::category(ch);`
407	`return (category & ufal::unilib::unicode::Z) != 0;`
408	`}`
409
410	`static bool NotLNZ(char32_t ch) {`
411	`auto category = ufal::unilib::unicode::category(ch);`
412	`if (category & ufal::unilib::unicode::L) return false;`
413	`if (category & ufal::unilib::unicode::N) return false;`
414	`if (category & ufal::unilib::unicode::Z) return false;`
415	`return true;`
416	`}`
417
418	`private:`
419	`std::u32string_view m_text;`
420	`};`
421
// Whitespace test for a single code point.
// Note: the set of code points comes from CPython: unicodetype_db.h (_PyUnicode_IsWhitespace)
bool IsUnicodeSpace(char32_t ch) {
  // Contiguous runs first ...
  if ((ch >= 0x0009) && (ch <= 0x000D)) return true;
  if ((ch >= 0x001C) && (ch <= 0x0020)) return true;
  if ((ch >= 0x2000) && (ch <= 0x200A)) return true;

  // ... then the isolated code points.
  return (ch == 0x0085) || (ch == 0x00A0) || (ch == 0x1680) || (ch == 0x2028) ||
         (ch == 0x2029) || (ch == 0x202F) || (ch == 0x205F) || (ch == 0x3000);
}
458
459	`bool IsEmptyUString(const ustring& str) {`
460	`return std::all_of(str.begin(), str.end(), [](char32_t ch) { return IsUnicodeSpace(ch); });`
461	`}`
462
463	`struct KernelBpeTokenizer : BaseKernel {`
464	`KernelBpeTokenizer(OrtApi api, const OrtKernelInfo* info);`
465	`void Compute(OrtKernelContext* context);`
466
467	`private:`
468	`std::vector<int64_t> Tokenize(const ustring& input, int64_t max_length);`
469
470	`int64_t padding_length_;`
471	`std::list<int> byte_list_;`
472	`std::shared_ptr<VocabData> bbpe_tokenizer_;`
473	`};`
474
475	`struct CustomOpBpeTokenizer : Ort::CustomOpBase<CustomOpBpeTokenizer, KernelBpeTokenizer> {`
476	`void* CreateKernel(OrtApi api, const OrtKernelInfo* info) const;`
477	`const char* GetName() const;`
478	`size_t GetInputTypeCount() const;`
479	`ONNXTensorElementDataType GetInputType(size_t index) const;`
480	`size_t GetOutputTypeCount() const;`
481	`ONNXTensorElementDataType GetOutputType(size_t index) const;`
482	`};`
483
484	`KernelBpeTokenizer::KernelBpeTokenizer(OrtApi api, const OrtKernelInfo* info)`
485	`: BaseKernel(api, info) {`
486	`std::string vocab = ort_.KernelInfoGetAttribute<std::string>(info, "vocab");`
487	`if (vocab.empty()) {`
488	`throw std::runtime_error("vocabulary shouldn't be empty.");`
489	`}`
490
491	`std::string merges = ort_.KernelInfoGetAttribute<std::string>(info, "merges");`
492	`if (merges.empty()) {`
493	`throw std::runtime_error("merges shouldn't be empty.");`
494	`}`
495
496	`if (!TryToGetAttribute<int64_t>("padding_length", padding_length_)) {`
497	`padding_length_ = -1;`
498	`}`
499
500	`if (padding_length_ != -1 && padding_length_ <= 0) {`
501	`throw std::runtime_error("padding_length should be more than 0 or equal -1");`
502	`}`
503
504	`std::stringstream vocabu_stream(vocab);`
505	`std::stringstream merges_stream(merges);`
506	`bbpe_tokenizer_ = std::make_shared<VocabData>();`
507	`bbpe_tokenizer_->Load(vocabu_stream, merges_stream, "<\|endoftext\|>", "<\|endoftext\|>");`
508	`}`
509
510	`std::vector<int64_t> KernelBpeTokenizer::Tokenize(const ustring& input, int64_t max_length) {`
511	`std::vector<int64_t> res;`
512
513	`if (IsEmptyUString(input)) {`
514	`return res;`
515	`}`
516
517	`auto special_token_split_res = bbpe_tokenizer_->SplitBySpeicalTokens(input);`
518	`TokenWithRegularExp regcmp;`
519
520	`for (auto& seg_id : special_token_split_res) {`
521	`if (res.size() >= max_length) break;`
522
523	`if (seg_id.second != -1) {`
524	`res.push_back(seg_id.second);`
525	`continue;`
526	`}`
527
528	`auto cur_input = std::move(seg_id.first);`
529	`// Note: keep ptr to make sure the string_view is valid in the following process`
530	`const char32_t* ptr = cur_input.c_str();`
531	`regcmp.Set(ptr);`
532
533	`while (res.size() < max_length) {`
534	`auto [b, tok] = regcmp.GetNextToken();`
535	`if (!b) break;`
536
537	`std::string utf8_token = std::string(ustring(tok));`
538
539	`byte_list_.clear();`
540	`for (char& cp : utf8_token) {`
541	`byte_list_.push_back(bbpe_tokenizer_->ByteEncoder()[static_cast<unsigned char>(cp)]);`
542	`}`
543
544	`bbpe_tokenizer_->bpe(byte_list_);`
545
546	`for (auto p : byte_list_) {`
547	`if (res.size() >= max_length) {`
548	`break;`
549	`}`
550
551	`res.push_back(p);`
552	`}`
553	`}`
554	`}`
555
556	`return std::move(res);`
557	`}`
558
559	`void KernelBpeTokenizer::Compute(OrtKernelContext* context) {`
560	`// Setup inputs`
561	`const OrtValue* input = ort_.KernelContext_GetInput(context, 0);`
562	`std::vector<std::string> str_input;`
563	`GetTensorMutableDataString(api_, ort_, context, input, str_input);`
564	`OrtTensorDimensions input_dim(ort_, input);`
565
566	`std::vector<std::vector<int64_t>> tokenize_results;`
567	`for (auto& str : str_input) {`
568	`tokenize_results.emplace_back(Tokenize(ustring(str), padding_length_ < 0 ? INT64_MAX : padding_length_));`
569	`}`
570
571	`size_t max_length = 0;`
572	`if (padding_length_ == -1) {`
573	`for (auto& res : tokenize_results) {`
574	`max_length = std::max(max_length, res.size());`
575	`}`
576	`} else {`
577	`max_length = padding_length_;`
578	`}`
579
580	`OrtTensorDimensions output_dim = input_dim;`
581	`output_dim.push_back(max_length);`
582	`OrtValue* tokenize_output = ort_.KernelContext_GetOutput(context, 0, output_dim.data(), output_dim.size());`
583	`OrtValue* attention_mask = ort_.KernelContext_GetOutput(context, 1, output_dim.data(), output_dim.size());`
584	`auto* token = ort_.GetTensorMutableData<int64_t>(tokenize_output);`
585	`auto* mask = ort_.GetTensorMutableData<int64_t>(attention_mask);`
586
587	`int idx = 0;`
588	`for (auto& res : tokenize_results) {`
589	`for (int64_t id : res) {`
590	`token[idx] = id;`
591	`mask[idx] = 1;`
592	`idx++;`
593	`}`
594
595	`for (int i = res.size(); i < max_length; i++) {`
596	`token[idx] = 0;`
597	`mask[idx] = 0;`
598	`idx++;`
599	`}`
600	`}`
601	`}`
602
603	`void* CustomOpBpeTokenizer::CreateKernel(OrtApi api, const OrtKernelInfo* info) const {`
604	`return new KernelBpeTokenizer(api, info);`
605	`}`
606
607	`const char* CustomOpBpeTokenizer::GetName() const {`
608	`return "GPT2Tokenizer";`
609	`}`
610
611	`size_t CustomOpBpeTokenizer::GetInputTypeCount() const {`
612	`return 1;`
613	`}`
614
615	`ONNXTensorElementDataType CustomOpBpeTokenizer::GetInputType(size_t index) const {`
616	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
617	`}`
618	`size_t CustomOpBpeTokenizer::GetOutputTypeCount() const {`
619	`return 2;`
620	`}`
621
622	`ONNXTensorElementDataType CustomOpBpeTokenizer::GetOutputType(size_t index) const {`
623	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;`
624	`}`
625
626	`const OrtCustomOp** LoadTokenizerSchemaList() {`
627	`// create the global objects here to let the ORT catch the expection if any`
628	`static std::unique_ptr<CustomOpBpeTokenizer> p_CoBpeTokenizer;`
629	`static const OrtCustomOp* c_CustomOpList[2] = {nullptr}; // {&c_CoBpeTokenizer, nullptr};`
630	`static std::mutex mtx_loaded;`
631	`std::lock_guard<std::mutex> lck(mtx_loaded);`
632	`if (p_CoBpeTokenizer.get() == nullptr) {`
633	`p_CoBpeTokenizer = std::make_unique<CustomOpBpeTokenizer>();`
634	`c_CustomOpList[0] = p_CoBpeTokenizer.get();`
635	`}`
636
637	`return c_CustomOpList;`
638	`}`
639

microsoft/onnxruntime-extensions

Branches

Tags

Clone