microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
rel-0.7

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

onnxruntime_extensions/tools/pre_post_processing/steps/nlp.py

337lines · modecode

1# Copyright (c) Microsoft Corporation. All rights reserved.
2# Licensed under the MIT License.
3
4import onnx
5from collections import OrderedDict
6from pathlib import Path
7
8from typing import Optional, Union, Dict
9from ..step import Step
10
11
class TokenizerParam(object):
    """Settings consumed by the tokenizer steps defined in this module.

    Every attribute starts with a default value and can be overridden by a
    keyword argument carrying the attribute's name. Keyword arguments that do
    not name an existing attribute, or whose value is None, are ignored.
    """

    def __init__(self, vocab_or_file: Union[Path, dict], **kwargs):
        """
        Args:
            vocab_or_file: vocabulary dict, or path of the vocabulary/model file.
            **kwargs: optional overrides for `tweaked_bos_id`, `strip_accents`,
                `do_lower_case` and `is_sentence_pair`.
        """
        self.vocab_or_file = vocab_or_file
        # Defaults; 0/1 are used in place of False/True.
        self.tweaked_bos_id = 1
        self.strip_accents = 0
        self.do_lower_case = 0
        self.is_sentence_pair = 0
        self.__assigned_with_kwargs(**kwargs)

    def __assigned_with_kwargs(self, **kwargs):
        """Apply each non-None keyword that matches an already defined attribute."""
        overrides = {
            attr_name: kwargs[attr_name]
            for attr_name in vars(self)
            if kwargs.get(attr_name) is not None
        }
        for attr_name, value in overrides.items():
            setattr(self, attr_name, value)
25
26
class SentencePieceTokenizer(Step):
    """Step that tokenizes a string input with the SentencepieceTokenizer custom op."""

    def __init__(
        self,
        tokenizer_param: TokenizerParam,
        nbest_size=0,
        alpha=1.0,
        reverse=False,
        add_bos=False,
        add_eos=False,
        name: Optional[str] = None,
    ):
        """
        Brief:
            SentencePieceTokenizer has actually 6 inputs in definition, but we allow user to provide only text input,
            and make the others, "nbest_size", "alpha", "add_bos", "add_eos", "reverse" optional.
        Args:
            tokenizer_param: some essential infos to build a tokenizer
                you can create a TokenizerParam object like:
                    tokenizer_param = TokenizerParam(vocab_or_file=sentencepiece_model_path,
                                                     tweaked_bos_id=tokenizer.tweaked_bos_id)
                vocab_or_file must be a file path here: the file is read as bytes and stored
                in the "model" attribute of the tokenizer node.

            nbest_size: int, optional (default = 0)
            alpha: float, optional (default = 1.0)
            reverse: bool, optional (default = False)
            add_bos: bool, optional (default = False)
            add_eos: bool, optional (default = False)
                Please see more detail explanation in
                https://www.tensorflow.org/text/api_docs/python/text/SentencepieceTokenizer#args

            name: Optional name of step. Defaults to 'SentencePieceTokenizer'

        """
        super().__init__(
            ["input_text", "nbest_size", "alpha", "add_bos", "add_eos", "reverse"], ["input_ids", "attention_mask"], name
        )
        self._tokenizer_param = tokenizer_param
        # python bool value (True/False) is not supported in c++, so we use 0/1 to represent bool
        self._optional_kwargs = dict(
            nbest_size=nbest_size, alpha=alpha, add_bos=int(add_bos), add_eos=int(add_eos), reverse=int(reverse)
        )

    def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
        """Build the ONNX graph of this step.

        The graph adds a batch dimension to a 1-D input, runs the tokenizer op with the
        optional arguments baked in as constants, and emits `input_ids` and
        `attention_mask` as int64 tensors.

        Returns:
            onnx.GraphProto whose tokenizer node carries the model file bytes in its "model" attribute.
        """
        # input text
        input_type_str0, input_shape_str0 = self._get_input_type_and_shape_strs(graph, 0)
        input_shape_0 = input_shape_str0.split(",")
        # ideally, we should support batch input, each batch has different length and output a token
        # !!! But, the implementation of SentencePieceTokenizer is not batch supported, inputs will be flatten to 1D
        # in the sentence-piece kernel
        assert input_type_str0 == "string"

        # Work on a copy of the optional arguments. The bos hack below rewrites "add_bos";
        # doing that on self._optional_kwargs would make a second graph build of the same
        # step silently drop the BOS token.
        optional_kwargs = dict(self._optional_kwargs)

        # we have to do this hack here, because some models tweaked bos_id to 0, but we have still 1
        # as default value in model file.
        # it is only a temporary solution, we will remove it in the future.
        tweak_bos_id = False
        if self._tokenizer_param.tweaked_bos_id != 1 and optional_kwargs["add_bos"]:
            # The op must not add its own BOS; the tweaked id is prepended in the graph instead.
            optional_kwargs["add_bos"] = 0
            tweak_bos_id = True

        batch_dim = input_shape_0[0] if len(input_shape_0) > 1 else "1"
        prefix_ = f'step_{self.step_num}'
        output_shape_str = f"{batch_dim}, {prefix_}__num_ids"

        def build_input_declare():
            # Only the text input is exposed as a graph input.
            input_base = f"{input_type_str0}[{input_shape_str0}] {self.input_names[0]}"
            return input_base

        def build_call_para():
            # Argument list of the tokenizer node: the text followed by the constant tensors.
            para_base = [
                "input_with_batch",
                "i64_nbest_size",
                "f32_alpha",
                "bool_add_bos",
                "bool_add_eos",
                "bool_reverse",
            ]
            return ",".join(para_base)

        def build_forward_declare():
            # default values for nbest_size, alpha, add_bos, add_eos, reverse
            declare_base = [
                f"i64_nbest_size = Constant <value = int64[1] {{{optional_kwargs['nbest_size']}}}> ()",
                f"f32_alpha = Constant <value = float[1] {{ {optional_kwargs['alpha']} }}> ()",
                f"bool_add_bos = Constant <value = bool[1] {{{optional_kwargs['add_bos']}}}> ()",
                f"bool_add_eos = Constant <value = bool[1] {{{optional_kwargs['add_eos']}}}> ()",
                f"bool_reverse = Constant <value = bool[1] {{{optional_kwargs['reverse']}}}> ()",
            ]

            return "\n".join(declare_base)

        # TODO Camembert and XLMRoberta tokenizers has a different bos_token_id (0) from the default value (1)
        # Now, we are hacking it.

        def hack_bos_id():
            if tweak_bos_id:
                # Prepend the tweaked BOS id to the tokens, then add the batch dimension.
                return f'''
                k_start = Constant <value = int32[1] {{{self._tokenizer_param.tweaked_bos_id}}}> ()
                input_ids_concat02 = Concat <axis = 0> (k_start, token)
                input_ids_bdim = Unsqueeze(input_ids_concat02, i64_0)
                '''
            else:
                return '''
                input_ids_bdim = Unsqueeze(token, i64_0)
                '''

        def build_unsqueeze():
            # A 1-D input gets a leading batch dimension; otherwise the input is passed through.
            if len(input_shape_0) == 1:
                return f"""
                input_with_batch = Unsqueeze({self.input_names[0]}, i64_0)
                """
            else:
                return f"""
                input_with_batch = Identity({self.input_names[0]})
                """

        # Both outputs are cast with `to = 7`, matching their int64 declarations.
        converter_graph = onnx.parser.parse_graph(
            f"""\
            SentencePiecetokenizer ({build_input_declare()})
                => (int64[{output_shape_str}] {self.output_names[0]},int64[{output_shape_str}] {self.output_names[1]})
            {{
                {build_forward_declare()}
                i64_neg1 = Constant <value = int64[1] {{-1}}> ()
                i64_0 = Constant <value = int64[1] {{0}}> ()
                {build_unsqueeze()}
                token,idx = com.microsoft.extensions.SentencepieceTokenizer ({build_call_para()})
                {hack_bos_id()}
                {self.output_names[0]} = Cast <to = 7> (input_ids_bdim)
                attention_mask_i32=Greater({self.output_names[0]}, i64_neg1)
                {self.output_names[1]} = Cast <to = 7> (attention_mask_i32)
            }}
            """
        )

        # The model file is embedded verbatim as a bytes attribute of the tokenizer node.
        with open(self._tokenizer_param.vocab_or_file, "rb") as f:
            content = f.read()

        token_model_attr = onnx.helper.make_attribute("model", content)
        node_idx = next(i for i, v in enumerate(converter_graph.node) if v.op_type == "SentencepieceTokenizer")
        converter_graph.node[node_idx].attribute.append(token_model_attr)

        return converter_graph
165
166
def _vocab_to_dict(vocab_or_file: Union[Dict[str, int], Path, str]):
    """Turn a token -> id vocabulary into the `vocab_file` attribute value.

    Args:
        vocab_or_file: mapping of token to id, or the path (Path or str) of a
            JSON file containing such a mapping.

    Returns:
        A dict with the single key "vocab_file". Its value is all tokens joined
        by newlines, ordered by ascending id (ids are converted with int()).
    """
    if isinstance(vocab_or_file, (Path, str)):
        # read from file
        import json

        # Read explicitly as UTF-8: relying on the platform default encoding breaks
        # vocabularies with non-ASCII tokens on systems whose default is not UTF-8.
        with open(vocab_or_file, "r", encoding="utf-8") as f:
            vocab = json.load(f)
    else:
        vocab = vocab_or_file

    # sorted() is stable, so tokens sharing an id keep their original relative order.
    ordered_tokens = sorted(vocab, key=lambda token: int(vocab[token]))
    return dict(vocab_file="\n".join(ordered_tokens))
180
181
class BertTokenizer(Step):
    def __init__(self, tokenizer_param: TokenizerParam, name: Optional[str] = None):
        """
        Brief: This step is used to convert the input text into the input_ids, attention_mask, token_type_ids.
            It supports an input of a single string for classification models, or two strings for QA models.
        Args:
            tokenizer_param: some essential infos to build a tokenizer,
                You can create a TokenizerParam like this:
                    tokenizer_param = TokenizerParam(vocab_or_file=tokenizer.vocab,  # dict or file path
                                                     strip_accents=True or False (Optional),
                                                     do_lower_case=True or False (Optional)
                                                     )
                When `is_sentence_pair` is set, the HfBertTokenizer op is used instead of BertTokenizer.

            name: Optional name of step. Defaults to 'BertTokenizer'

        """
        super().__init__(["input_text"], ["input_ids", "attention_mask", "token_type_ids"], name)
        self._tokenizer_param = tokenizer_param

    def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
        """Build the graph that runs the BERT tokenizer op and returns three int64 outputs."""
        text_type, text_shape = self._get_input_type_and_shape_strs(graph, 0)

        shape_dims = text_shape.split(",")
        step_prefix = f'step_{self.step_num}'
        # only batch size 1 is supported until the tokenizer op supports batch size > 1
        batch_dim = shape_dims[0] if len(shape_dims) > 1 else "1"
        ids_shape = f"{batch_dim}, _{step_prefix}__num_ids"
        assert text_type == "string"

        param = self._tokenizer_param
        onnx_tokenizer_impl = "HfBertTokenizer" if param.is_sentence_pair else "BertTokenizer"
        is_hf = onnx_tokenizer_impl == "HfBertTokenizer"

        # Graph signature: one string input, every output declared as int64 with the same shape.
        input_decl = f"{text_type}[{text_shape}] {self.input_names[0]}"
        output_decl = ",".join([f"int64[{ids_shape}] {out}" for out in self.output_names])

        if is_hf:
            # HfBertTokenizer results are bound directly to the step outputs.
            tokenizer_results = ",".join(self.output_names)
            output_nodes = ""
        else:
            # different output orders for BertTokenizer and HfBertTokenizer
            tokenizer_results = "ids,types,mask"
            # BertTokenizer has different output dimensions, so each result is unsqueezed;
            # the list below is the result order matching the step's output names.
            reordered = ["ids", "mask", "types"]
            output_nodes = "\n".join(
                [f"{out} = Unsqueeze({reordered[idx]}, i64_0)" for idx, out in enumerate(self.output_names)]
            )

        # A 1-D input gets a leading batch dimension; otherwise it is passed through unchanged.
        if len(shape_dims) == 1:
            batch_node = f"""
            input_with_batch = Unsqueeze({self.input_names[0]}, i64_0)
            """
        else:
            batch_node = f"""
            input_with_batch = Identity({self.input_names[0]})
            """

        converter_graph = onnx.parser.parse_graph(
            f"""\
            {onnx_tokenizer_impl} ({input_decl})
                => ({output_decl})
            {{
                i64_0 = Constant <value = int64[1] {{0}}> ()
                {batch_node}
                {tokenizer_results} = com.microsoft.extensions.{onnx_tokenizer_impl} (input_with_batch)
                {output_nodes}
            }}
            """
        )

        # Node attributes: the vocabulary plus the two normalisation flags.
        attrs = _vocab_to_dict(param.vocab_or_file)
        attrs["strip_accents"] = param.strip_accents
        attrs["do_lower_case"] = param.do_lower_case
        token_model_attr = [onnx.helper.make_attribute(key, value) for key, value in attrs.items()]

        tokenizer_node = next(node for node in converter_graph.node if node.op_type == onnx_tokenizer_impl)
        tokenizer_node.attribute.extend(token_model_attr)

        return converter_graph
281
282
class BertTokenizerQADecoder(Step):
    def __init__(self, tokenizer_param: TokenizerParam, name: Optional[str] = None):
        """
        Brief:
            Decode the input_ids to text
        Args:
            tokenizer_param: some essential info to build a tokenizer.
                you can create a TokenizerParam object like:
                    tokenizer_param = TokenizerParam(vocab_or_file=tokenizer.vocab)  # dict or file path
            name: Optional name of step. Defaults to 'BertTokenizerQADecoder'
        """
        super().__init__(["start_logits", "end_logits", "input_ids"], ["text"], name)
        self._tokenizer_param = tokenizer_param

    def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
        """Build the graph that slices the answer span out of input_ids and decodes it to text."""

        def declare_input(position, input_name):
            # "<type>[<shape>] <name>" for the step input at the given position.
            type_str, shape_str = self._get_input_type_and_shape_strs(graph, position)
            return f"{type_str}[{shape_str}] {input_name}"

        # A unique name for output shape
        step_prefix = f'step_{self.step_num}'
        text_shape = f"_{step_prefix}_any_len"
        inputs_decl = ",".join([declare_input(pos, inp) for pos, inp in enumerate(self.input_names)])

        # ArgMax over the last axis gives the start/end positions. One is added to the end
        # position before it is used as the Slice end, and the slice runs along axis -1.
        # NOTE(review): i64_em is an empty int64 tensor passed as the decoder's second input;
        # its meaning is defined by the custom op and is not visible here.
        converter_graph = onnx.parser.parse_graph(
            f"""\
            tokenizer_decoder ({inputs_decl})
                => (string[{text_shape}] {self.output_names[0]})
            {{
                i64_em = Constant <value = int64[0] {{}}> ()
                i64_1 = Constant <value = int64[1] {{1}}> ()
                i64_0 = Constant <value = int64[1] {{0}}> ()
                i64_neg1 = Constant <value = int64[1] {{-1}}> ()

                s_position = ArgMax<axis = -1, keepdims = 0>({self.input_names[0]})
                e_position = ArgMax<axis = -1, keepdims = 0>({self.input_names[1]})
                ee_position = Add(e_position,i64_1)
                u_i64_neg1 = Unsqueeze(i64_neg1, i64_0)
                slice_ids= Slice({self.input_names[2]}, s_position, ee_position, i64_neg1)
                {self.output_names[0]} = com.microsoft.extensions.BertTokenizerDecoder (slice_ids, i64_em)
            }}
            """
        )

        # The decoder node receives the vocabulary as its attribute.
        vocab_attrs = _vocab_to_dict(self._tokenizer_param.vocab_or_file)
        token_model_attr = [onnx.helper.make_attribute(key, value) for key, value in vocab_attrs.items()]

        decoder_node = next(node for node in converter_graph.node if node.op_type == "BertTokenizerDecoder")
        decoder_node.attribute.extend(token_model_attr)

        return converter_graph
338