microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions Available

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

rel-0.7

Find a branch or tag

Branches

rel-0.7

Clone

HTTPS

Download ZIP

onnxruntime-extensions/operators/tokenizer

operators/tokenizer/bert_tokenizer_decoder.cc

196 lines · mode: code

Raw Download

Latest commit unavailable.

unknown

1	`#include "bert_tokenizer_decoder.hpp"`
2
3	`BertTokenizerDecoder::BertTokenizerDecoder(`
4	`std::string vocab,`
5	`std::string unk_token,`
6	`std::string sep_token,`
7	`std::string pad_token,`
8	`std::string cls_token,`
9	`std::string mask_token,`
10	`std::string suffix_indicator) : unk_token_(unk_token),`
11	`suffix_indicator_(suffix_indicator),`
12	`raw_vocab_(vocab) {`
13	`auto tokens = SplitString(raw_vocab_, "\n", true);`
14	`vocab_.reserve(tokens.size());`
15	`for (size_t i = 0; i < tokens.size(); i++) {`
16	`auto& token = tokens[i];`
17	`if (token == unk_token) {`
18	`unk_token_id_ = static_cast<int32_t>(i);`
19	`}`
20	`if (token == sep_token) {`
21	`sep_token_id_ = static_cast<int32_t>(i);`
22	`}`
23	`if (token == pad_token) {`
24	`sep_token_id_ = static_cast<int32_t>(i);`
25	`}`
26	`if (token == cls_token) {`
27	`cls_token_id_ = static_cast<int32_t>(i);`
28	`}`
29	`if (token == mask_token) {`
30	`mask_token_id_ = static_cast<int32_t>(i);`
31	`}`
32
33	`if (token.rfind(suffix_indicator_, 0) == 0) {`
34	`vocab_.emplace_back(token.substr(suffix_indicator.size(), token.size() - suffix_indicator.size()));`
35	`is_substr_.push_back(true);`
36	`} else {`
37	`vocab_.push_back(token);`
38	`is_substr_.push_back(false);`
39	`}`
40	`}`
41	`}`
42
43	`std::string BertTokenizerDecoder::Decode(const std::vector<int64_t>& ids, bool skip_special_tokens, bool clean_up_tokenization_spaces) {`
44	`std::string result;`
45	`int64_t pre_token = -1;`
46
47	`for (auto id : ids) {`
48	`if (skip_special_tokens && (id == sep_token_id_ \|\| id == pad_token_id_ \|\| id == cls_token_id_ \|\| id == mask_token_id_)) {`
49	`continue;`
50	`}`
51
52	`// deal with unk ids`
53	`if (id < 0 \|\| static_cast<size_t>(id) >= vocab_.size()) {`
54	`if (!result.empty()) {`
55	`result.push_back(' ');`
56	`}`
57	`result.append(unk_token_);`
58	`continue;`
59	`}`
60
61	`// skip first substr`
62	`if (result.empty() && is_substr_[static_cast<size_t>(id)]) {`
63	`continue;`
64	`}`
65
66	`// At following situations, we needn't add space`
67	`// we needn't add a space at the beginning of the output`
68	`// we needn't add a space when the token is a substr (such as ##ing)`
69	`// we needn't add a space at the left or right of punctuation (such as client-side shouldn't be client - side), when clean_up_tokenization_spaces is true`
70	`if (!(result.empty() \|\| is_substr_[static_cast<size_t>(id)] \|\| (clean_up_tokenization_spaces && RemoveTokenizeSpace(pre_token, id)))) {`
71	`result.push_back(' ');`
72	`}`
73
74	`result.append(vocab_[static_cast<size_t>(id)]);`
75	`pre_token = id;`
76	`}`
77
78	`return result;`
79	`}`
80
81	`bool BertTokenizerDecoder::RemoveTokenizeSpace(int64_t pre_token_id, int64_t new_token_id) {`
82	`if (pre_token_id < 0) {`
83	`return true;`
84	`}`
85
86	`auto pre_char = ustring(vocab_[static_cast<size_t>(pre_token_id)]).back();`
87	`auto cur_char = ustring(vocab_[static_cast<size_t>(new_token_id)])[0];`
88
89	`// normal punctuation`
90	`if (cur_char == U'!' \|\| cur_char == U'.' \|\| cur_char == U'?' \|\| cur_char == U',' \|\| cur_char == '~' \|\| cur_char == ':') {`
91	`return true;`
92	`}`
93
94	`// only remove left side space`
95	`if (cur_char == U'}' \|\| cur_char == U']' \|\| cur_char == U'>' \|\| cur_char == ')') {`
96	`return true;`
97	`}`
98
99	`// only remove right side space`
100	`if (pre_char == U'{' \|\| pre_char == U'[' \|\| pre_char == U'<' \|\| pre_char == '(' \|\| pre_char == '$') {`
101	`return true;`
102	`}`
103
104	`// remove both side space`
105	`if (pre_char == U'-' \|\| pre_char == U'\'' \|\| pre_char == U'"' \|\| pre_char == U'/' \|\| pre_char == U'@' \|\| pre_char == U'\\' \|\|`
106	`cur_char == U'-' \|\| cur_char == U'\'' \|\| cur_char == U'"' \|\| cur_char == U'/' \|\| cur_char == U'@' \|\| cur_char == U'\\') {`
107	`return true;`
108	`}`
109
110	`// remove both space beside unicode punctuation`
111	`if (pre_char > 128 && IsPunct(pre_char)) {`
112	`return true;`
113	`}`
114
115	`if (cur_char > 128 && IsPunct(cur_char)) {`
116	`return true;`
117	`}`
118
119	`return false;`
120	`}`
121
122	`KernelBertTokenizerDecoder::KernelBertTokenizerDecoder(const OrtApi& api, const OrtKernelInfo& info) : BaseKernel(api, info) {`
123	`std::string vocab = ort_.KernelInfoGetAttribute<std::string>(&info, "vocab_file");`
124	`std::string unk_token = TryToGetAttributeWithDefault("unk_token", std::string("[UNK]"));`
125	`std::string sep_token = TryToGetAttributeWithDefault("sep_token", std::string("[SEP]"));`
126	`std::string pad_token = TryToGetAttributeWithDefault("pad_token", std::string("[PAD]"));`
127	`std::string cls_token = TryToGetAttributeWithDefault("cls_token", std::string("[CLS]"));`
128	`std::string mask_token = TryToGetAttributeWithDefault("mask_token", std::string("[MASK]"));`
129	`std::string suffix_indicator = TryToGetAttributeWithDefault("suffix_indicator", std::string("##"));`
130
131	`use_indices_ = TryToGetAttributeWithDefault("use_indices", false);`
132	`skip_special_tokens_ = TryToGetAttributeWithDefault("skip_special_tokens", false);`
133	`clean_up_tokenization_spaces_ = TryToGetAttributeWithDefault("clean_up_tokenization_spaces", true);`
134
135	`decoder_ = std::make_shared<BertTokenizerDecoder>(vocab, unk_token, sep_token, pad_token,`
136	`cls_token, mask_token, suffix_indicator);`
137	`}`
138
139	`void KernelBertTokenizerDecoder::Compute(OrtKernelContext* context) {`
140	`const OrtValue* ids = ort_.KernelContext_GetInput(context, 0);`
141	`const int64_t* p_ids = ort_.GetTensorData<int64_t>(ids);`
142	`OrtTensorDimensions ids_dim(ort_, ids);`
143
144	`if (!((ids_dim.size() == 1) \|\| (ids_dim.size() == 2 && ids_dim[0] == 1))) {`
145	`ORTX_CXX_API_THROW("[BertTokenizerDecoder]: Expect ids dimension [n] or [1,n].", ORT_INVALID_GRAPH);`
146	`}`
147
148	`// const int64_t* p_row_indices = ort_row_indices_dim.empty() ? nullptr : ort_.GetTensorData<int64_t>(ort_row_indices);`
149	`const OrtValue* positions = ort_.KernelContext_GetInput(context, 1);`
150	`OrtTensorDimensions positions_dim(ort_, positions);`
151	`if (use_indices_ &&`
152	`(!((positions_dim.Size() == 0) \|\|`
153	`(positions_dim.size() == 2 && positions_dim[1] == 2)))) {`
154	`ORTX_CXX_API_THROW("[BertTokenizerDecoder]: Expect positions empty or a [n, 2] matrix when use indices", ORT_INVALID_GRAPH);`
155	`}`
156
157	`const int64_t* p_positions = positions_dim.Size() == 0 ? nullptr : ort_.GetTensorData<int64_t>(positions);`
158
159	`std::vector<std::string> result;`
160	`std::vector<int64_t> output_dim(1);`
161	`if (!use_indices_) {`
162	`result.push_back(decoder_->Decode(std::vector<int64_t>(p_ids, p_ids + ids_dim.Size()), skip_special_tokens_, clean_up_tokenization_spaces_));`
163	`output_dim[0] = 1;`
164	`} else {`
165	`if (p_positions != nullptr) {`
166	`for (int i = 0; i < positions_dim[0]; i++) {`
167	`int64_t start = p_positions[2 * i];`
168	`int64_t end = p_positions[2 * i + 1];`
169
170	`result.push_back(decoder_->Decode(std::vector<int64_t>(p_ids + start, p_ids + end), skip_special_tokens_, clean_up_tokenization_spaces_));`
171	`}`
172	`output_dim[0] = positions_dim[0];`
173	`}`
174	`}`
175	`OrtValue* output = ort_.KernelContext_GetOutput(context, 0, output_dim.data(), output_dim.size());`
176
177	`FillTensorDataString(api_, ort_, context, result, output);`
178	`}`
179
180	`const char* CustomOpBertTokenizerDecoder::GetName() const { return "BertTokenizerDecoder"; };`
181
182	`size_t CustomOpBertTokenizerDecoder::GetInputTypeCount() const {`
183	`return 2;`
184	`};`
185
186	`ONNXTensorElementDataType CustomOpBertTokenizerDecoder::GetInputType(size_t /index/) const {`
187	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;`
188	`};`
189
190	`size_t CustomOpBertTokenizerDecoder::GetOutputTypeCount() const {`
191	`return 1;`
192	`};`
193
194	`ONNXTensorElementDataType CustomOpBertTokenizerDecoder::GetOutputType(size_t /index/) const {`
195	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
196	`};`
197

microsoft/onnxruntime-extensions

Branches

Tags

Clone