microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

rel-0.4

Find a branch or tag

Branches

rel-0.4

Clone

HTTPS

Download ZIP

onnxruntime-extensions/operators/tokenizer

operators/tokenizer/basic_tokenizer.cc

130lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`// Copyright (c) Microsoft Corporation. All rights reserved.`
2	`// Licensed under the MIT License.`
3
4	`#include "string_utils.h"`
5	`#include "basic_tokenizer.hpp"`
6	`#include "string_tensor.h"`
7	`#include <vector>`
8	`#include <locale>`
9	`#include <codecvt>`
10	`#include <algorithm>`
11
12	`BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, bool tokenize_punctuation, bool remove_control_chars):`
13	`do_lower_case_(do_lower_case), tokenize_chinese_chars_(tokenize_chinese_chars), strip_accents_(strip_accents), tokenize_punctuation_(tokenize_punctuation),`
14	`remove_control_chars_(remove_control_chars){}`
15
16	`std::vector<ustring> BasicTokenizer::Tokenize(ustring text) {`
17	`std::vector<ustring> result;`
18	`ustring token;`
19	`auto push_current_token_and_clear = [&result, &token]() {`
20	`if (!token.empty()) {`
21	`result.push_back(token);`
22	`token.clear();`
23	`}`
24	`};`
25
26	`auto push_single_char_and_clear = [&result, &token](char32_t c) {`
27	`token.push_back(c);`
28	`result.push_back(token);`
29	`token.clear();`
30	`};`
31
32	`// strip accent first`
33	`if (strip_accents_) {`
34	`for (auto& c : text) {`
35	`c = StripAccent(c);`
36	`}`
37	`}`
38
39	`if (do_lower_case_) {`
40	`for (auto& c : text) {`
41	`c = ::tolower(c);`
42	`}`
43	`}`
44
45	`for (auto c : text) {`
46	`if (tokenize_chinese_chars_ && IsCJK(c)) {`
47	`push_current_token_and_clear();`
48	`push_single_char_and_clear(c);`
49	`continue;`
50	`}`
51
52	`if (strip_accents_ && IsAccent(c)) {`
53	`continue;`
54	`}`
55
56	`// 0x2019 unicode is not punctuation in some Linux platform,`
57	`// to be consistent, take it as punctatuation always.`
58	`if (tokenize_punctuation_ && (::iswpunct(c) \|\| c == wint_t(0x2019))) {`
59	`push_current_token_and_clear();`
60	`push_single_char_and_clear(c);`
61	`continue;`
62	`}`
63
64	`// split by space`
65	`if (::iswspace(c)) {`
66	`push_current_token_and_clear();`
67	`continue;`
68	`}`
69
70	`// iscntrl will judge \t\f\n\r as control char`
71	`// but it has been filter by isspace(c)`
72	`if (remove_control_chars_ && ::iswcntrl(c)) {`
73	`continue;`
74	`}`
75
76	`token.push_back(c);`
77	`}`
78
79	`push_current_token_and_clear();`
80	`return result;`
81	`}`
82
83	`KernelBasicTokenizer::KernelBasicTokenizer(OrtApi api, const OrtKernelInfo* info) : BaseKernel(api, info) {`
84	`bool do_lower_case = TryToGetAttributeWithDefault("do_lower_case", true);`
85	`bool tokenize_chinese_chars = TryToGetAttributeWithDefault("tokenize_chinese_chars", true);`
86	`bool strip_accents = TryToGetAttributeWithDefault("strip_accents", false);`
87	`bool tokenize_punctuation = TryToGetAttributeWithDefault("tokenize_punctuation", false);`
88	`bool remove_control_chars = TryToGetAttributeWithDefault("strip_accents", true);`
89
90	`tokenizer_ = std::make_shared<BasicTokenizer>(do_lower_case, tokenize_chinese_chars, strip_accents, tokenize_punctuation, remove_control_chars);`
91	`}`
92
93	`void KernelBasicTokenizer::Compute(OrtKernelContext* context) {`
94	`// Setup inputs`
95	`const OrtValue* input = ort_.KernelContext_GetInput(context, 0);`
96	`std::vector<std::string> input_data;`
97	`GetTensorMutableDataString(api_, ort_, context, input, input_data);`
98
99	`OrtTensorDimensions dimensions(ort_, input);`
100	`if (dimensions.size() != 1 && dimensions[0] != 1) {`
101	`ORT_CXX_API_THROW("[BasicTokenizer]: only support string scalar.", ORT_INVALID_GRAPH);`
102	`}`
103
104	`OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size());`
105	`std::vector<ustring> result = tokenizer_->Tokenize(ustring(input_data[0]));`
106
107	`FillTensorDataString(api_, ort_, context, result, output);`
108	`}`
109
110	`void* CustomOpBasicTokenizer::CreateKernel(OrtApi api, const OrtKernelInfo* info) const {`
111	`return new KernelBasicTokenizer(api, info);`
112	`};`
113
114	`const char* CustomOpBasicTokenizer::GetName() const { return "BasicTokenizer"; };`
115
116	`size_t CustomOpBasicTokenizer::GetInputTypeCount() const {`
117	`return 1;`
118	`};`
119
120	`ONNXTensorElementDataType CustomOpBasicTokenizer::GetInputType(size_t /index/) const {`
121	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
122	`};`
123
124	`size_t CustomOpBasicTokenizer::GetOutputTypeCount() const {`
125	`return 1;`
126	`};`
127
128	`ONNXTensorElementDataType CustomOpBasicTokenizer::GetOutputType(size_t /index/) const {`
129	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
130	`};`
131

microsoft/onnxruntime-extensions

Branches

Tags

Clone