microsoft/onnxruntime-extensions
Public · mirrored from https://github.com/microsoft/onnxruntime-extensions · Available
onnxruntime_extensions/tools/pre_post_processing/steps/nlp.py
337 lines · mode: code
| 1 | # Copyright (c) Microsoft Corporation. All rights reserved. |
| 2 | # Licensed under the MIT License. |
| 3 | |
| 4 | import onnx |
| 5 | from collections import OrderedDict |
| 6 | from pathlib import Path |
| 7 | |
| 8 | from typing import Optional, Union, Dict |
| 9 | from ..step import Step |
| 10 | |
| 11 | |
class TokenizerParam(object):
    """Plain settings holder handed to the tokenizer steps in this module.

    Attributes (all settable through keyword arguments of the same name):
        vocab_or_file: the vocabulary, given either as a dict or as a file path.
        tweaked_bos_id: defaults to 1.
        strip_accents: defaults to 0.
        do_lower_case: defaults to 0.
        is_sentence_pair: defaults to 0.
    """

    def __init__(self, vocab_or_file: Union[Path, dict], **kwargs):
        """Store `vocab_or_file`, set the defaults, then apply keyword overrides.

        Keyword arguments that do not name an existing attribute, or whose
        value is None, are ignored.
        """
        self.vocab_or_file = vocab_or_file
        self.tweaked_bos_id = 1
        self.strip_accents = 0
        self.do_lower_case = 0
        self.is_sentence_pair = 0
        self.__apply_overrides(kwargs)

    def __apply_overrides(self, overrides):
        """Copy every non-None entry of `overrides` that matches a known attribute."""
        known_fields = vars(self)
        accepted = {
            field: value
            for field, value in overrides.items()
            if field in known_fields and value is not None
        }
        for field, value in accepted.items():
            setattr(self, field, value)
| 25 | |
| 26 | |
class SentencePieceTokenizer(Step):
    """Step that tokenizes a string input with the SentencepieceTokenizer custom op."""

    def __init__(
        self,
        tokenizer_param: TokenizerParam,
        nbest_size=0,
        alpha=1.0,
        reverse=False,
        add_bos=False,
        add_eos=False,
        name: Optional[str] = None,
    ):
        """
        Brief:
            SentencePieceTokenizer has actually 6 inputs in definition, but we allow user to provide only text input,
            and make the others, "nbest_size", "alpha", "add_bos", "add_eos", "reverse" optional.
        Args:
            tokenizer_param: some essential infos to build a tokenizer
                you can create a TokenizerParam object like:
                    tokenizer_param = TokenizerParam(vocab_or_file=path_to_model_file,
                                                     tweaked_bos_id=tokenizer.tweaked_bos_id)
                `vocab_or_file` must be a path here: the file is read as bytes and attached
                to the tokenizer node as its "model" attribute.

            nbest_size: int, optional (default = 0)
            alpha: float, optional (default = 1.0)
            reverse: bool, optional (default = False)
            add_bos: bool, optional (default = False)
            add_eos: bool, optional (default = False)
                Please see more detail explanation in
                https://www.tensorflow.org/text/api_docs/python/text/SentencepieceTokenizer#args

            name: Optional name of step. Defaults to 'SentencePieceTokenizer'

        """
        super().__init__(
            ["input_text", "nbest_size", "alpha", "add_bos", "add_eos", "reverse"], ["input_ids", "attention_mask"], name
        )
        self._tokenizer_param = tokenizer_param
        # python bool value (True/False) is not supported in c++, so we use 0/1 to represent bool
        self._optional_kwargs = dict(
            nbest_size=nbest_size, alpha=alpha, add_bos=int(add_bos), add_eos=int(add_eos), reverse=int(reverse)
        )

    def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
        """Build the ONNX graph for this step.

        The graph takes the text input, feeds it together with constant
        nbest_size/alpha/add_bos/add_eos/reverse values to the
        SentencepieceTokenizer custom op, and produces int64 `input_ids` and
        `attention_mask` outputs with a leading batch dimension.

        Returns:
            The parsed onnx.GraphProto with the model file bytes attached to the
            tokenizer node as its "model" attribute.
        """
        # input text
        input_type_str0, input_shape_str0 = self._get_input_type_and_shape_strs(graph, 0)
        input_shape_0 = input_shape_str0.split(",")
        # ideally, we should support batch input, each batch has different length and output a token
        # !!! But, the implementation of SentencePieceTokenizer is not batch supported, inputs will be flatten to 1D
        # in the sentence-piece kernel
        assert input_type_str0 == "string"

        # Work on a copy: the hack below must not modify the step's own settings, otherwise a
        # second call on the same instance would see add_bos == 0 and drop the BOS token entirely.
        op_args = dict(self._optional_kwargs)

        # we have to do this hack here, because some models tweaked bos_id to 0, but we have still 1
        # as default value in model file.
        # it is only a temporary solution, we will remove it in the future.
        # TODO Camembert and XLMRoberta tokenizers has a different bos_token_id (0) from the default value (1)
        tweak_bos_id = False
        if self._tokenizer_param.tweaked_bos_id != 1 and op_args["add_bos"]:
            # the op is told not to add BOS; the tweaked id is prepended with a Concat instead
            op_args["add_bos"] = 0
            tweak_bos_id = True

        batch_dim = input_shape_0[0] if len(input_shape_0) > 1 else "1"
        prefix_ = f'step_{self.step_num}'
        output_shape_str = f"{batch_dim}, {prefix_}__num_ids"

        input_declare = f"{input_type_str0}[{input_shape_str0}] {self.input_names[0]}"
        call_para = ",".join(
            ["input_with_batch", "i64_nbest_size", "f32_alpha", "bool_add_bos", "bool_add_eos", "bool_reverse"]
        )

        # constant nodes providing the values for nbest_size, alpha, add_bos, add_eos, reverse
        forward_declare = "\n".join(
            [
                f"i64_nbest_size = Constant <value = int64[1] {{{op_args['nbest_size']}}}> ()",
                f"f32_alpha = Constant <value = float[1] {{ {op_args['alpha']} }}> ()",
                f"bool_add_bos = Constant <value = bool[1] {{{op_args['add_bos']}}}> ()",
                f"bool_add_eos = Constant <value = bool[1] {{{op_args['add_eos']}}}> ()",
                f"bool_reverse = Constant <value = bool[1] {{{op_args['reverse']}}}> ()",
            ]
        )

        if tweak_bos_id:
            # prepend the tweaked BOS id to the tokens before adding the batch dimension
            bos_and_batch = f'''
            k_start = Constant <value = int32[1] {{{self._tokenizer_param.tweaked_bos_id}}}> ()
            input_ids_concat02 = Concat <axis = 0> (k_start, token)
            input_ids_bdim = Unsqueeze(input_ids_concat02, i64_0)
            '''
        else:
            bos_and_batch = '''
            input_ids_bdim = Unsqueeze(token, i64_0)
            '''

        # a rank-1 input gets a leading dimension added; anything else is passed through unchanged
        if len(input_shape_0) == 1:
            add_batch = f"""
            input_with_batch = Unsqueeze({self.input_names[0]}, i64_0)
            """
        else:
            add_batch = f"""
            input_with_batch = Identity({self.input_names[0]})
            """

        converter_graph = onnx.parser.parse_graph(
            f"""\
            SentencePiecetokenizer ({input_declare})
            => (int64[{output_shape_str}] {self.output_names[0]},int64[{output_shape_str}] {self.output_names[1]})
            {{
                {forward_declare}
                i64_neg1 = Constant <value = int64[1] {{-1}}> ()
                i64_0 = Constant <value = int64[1] {{0}}> ()
                {add_batch}
                token,idx = com.microsoft.extensions.SentencepieceTokenizer ({call_para})
                {bos_and_batch}
                {self.output_names[0]} = Cast <to = 7> (input_ids_bdim)
                attention_mask_i32=Greater({self.output_names[0]}, i64_neg1)
                {self.output_names[1]} = Cast <to = 7> (attention_mask_i32)
            }}
            """
        )

        # the model file is embedded verbatim as the "model" attribute of the tokenizer node
        with open(self._tokenizer_param.vocab_or_file, "rb") as f:
            content = f.read()

        token_model_attr = onnx.helper.make_attribute("model", content)
        node_idx = next(i for i, v in enumerate(converter_graph.node) if v.op_type == "SentencepieceTokenizer")
        converter_graph.node[node_idx].attribute.append(token_model_attr)

        return converter_graph
| 165 | |
| 166 | |
| 167 | def _vocab_to_dict(vocab_or_file: Union[Dict[str, int], Path, str]): |
| 168 | if isinstance(vocab_or_file, (Path, str)): |
| 169 | # read from file |
| 170 | import json |
| 171 | with open(vocab_or_file, "r") as f: |
| 172 | vocab = json.load(f) |
| 173 | else: |
| 174 | vocab = vocab_or_file |
| 175 | |
| 176 | ordered_vocab = OrderedDict(sorted(vocab.items(), key=lambda item: int(item[1]))) |
| 177 | |
| 178 | vocab = "\n".join(ordered_vocab.keys()) |
| 179 | return dict(vocab_file=vocab) |
| 180 | |
| 181 | |
class BertTokenizer(Step):
    """Step that turns text into input_ids, attention_mask and token_type_ids."""

    def __init__(self, tokenizer_param: TokenizerParam, name: Optional[str] = None):
        """
        Brief: This step is used to convert the input text into the input_ids, attention_mask, token_type_ids.
               It supports an input of a single string for classification models, or two strings for QA models.
        Args:
            tokenizer_param: some essential infos to build a tokenizer,
            You can create a TokenizerParam like this:
                tokenizer_param = TokenizerParam(vocab_or_file=tokenizer.vocab,  # a dict or a file path
                                                 strip_accents = True or False (Optional),
                                                 do_lower_case = True or False (Optional)
                                                 )

            name: Optional name of step. Defaults to 'BertTokenizer'

        """
        super().__init__(["input_text"], ["input_ids", "attention_mask", "token_type_ids"], name)
        self._tokenizer_param = tokenizer_param

    def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
        """Build the ONNX graph that wraps the BertTokenizer / HfBertTokenizer custom op.

        HfBertTokenizer is selected when `is_sentence_pair` is set on the tokenizer
        parameters, BertTokenizer otherwise. The vocabulary plus the strip_accents and
        do_lower_case settings are attached to the tokenizer node as attributes.
        """
        text_type, text_shape = self._get_input_type_and_shape_strs(graph, 0)
        shape_parts = text_shape.split(",")

        # only support batch size 1 until tokenizer op supports batch size > 1
        batch_dim = shape_parts[0] if len(shape_parts) > 1 else "1"
        step_prefix = f'step_{self.step_num}'
        output_shape_str = f"{batch_dim}, _{step_prefix}__num_ids"
        assert text_type == "string"

        use_hf = bool(self._tokenizer_param.is_sentence_pair)
        onnx_tokenizer_impl = "HfBertTokenizer" if use_hf else "BertTokenizer"

        input_declare = f"{text_type}[{text_shape}] {self.input_names[0]}"
        output_declare = ",".join(f"int64[{output_shape_str}] {out}" for out in self.output_names)

        if use_hf:
            # HfBertTokenizer results are bound straight to the step outputs
            tokenizer_ret = ",".join(self.output_names)
            output_imp = ""
        else:
            # different output orders for BertTokenizer and HfBertTokenizer:
            # BertTokenizer yields ids, types, mask while the step outputs are ids, mask, types.
            # BertTokenizer has different output dimensions, so each result is unsqueezed.
            tokenizer_ret = "ids,types,mask"
            reordered = ("ids", "mask", "types")
            output_imp = "\n".join(
                f"{out} = Unsqueeze({src}, i64_0)" for out, src in zip(self.output_names, reordered)
            )

        # a rank-1 input gets a leading dimension added; anything else is passed through unchanged
        if len(shape_parts) == 1:
            add_batch = f"""
            input_with_batch = Unsqueeze({self.input_names[0]}, i64_0)
            """
        else:
            add_batch = f"""
            input_with_batch = Identity({self.input_names[0]})
            """

        converter_graph = onnx.parser.parse_graph(
            f"""\
            {onnx_tokenizer_impl} ({input_declare})
            => ({output_declare})
            {{
                i64_0 = Constant <value = int64[1] {{0}}> ()
                {add_batch}
                {tokenizer_ret} = com.microsoft.extensions.{onnx_tokenizer_impl} (input_with_batch)
                {output_imp}
            }}
            """
        )

        param = self._tokenizer_param
        attrs = _vocab_to_dict(param.vocab_or_file)
        attrs["strip_accents"] = param.strip_accents
        attrs["do_lower_case"] = param.do_lower_case
        node_attrs = [onnx.helper.make_attribute(key, value) for key, value in attrs.items()]

        node_idx = next(i for i, v in enumerate(converter_graph.node) if v.op_type == onnx_tokenizer_impl)
        converter_graph.node[node_idx].attribute.extend(node_attrs)

        return converter_graph
| 281 | |
| 282 | |
class BertTokenizerQADecoder(Step):
    """Step that decodes the answer span of a QA model back into text."""

    def __init__(self, tokenizer_param: TokenizerParam, name: Optional[str] = None):
        """
        Brief:
            Decode the input_ids to text
        Args:
            tokenizer_param: some essential info to build a tokenizer.
                you can create a TokenizerParam object like:
                    tokenizer_param = TokenizerParam(vocab_or_file=tokenizer.vocab)  # a dict or a file path
            name: Optional name of step. Defaults to 'BertTokenizerQADecoder'
        """
        super().__init__(
            ["start_logits", "end_logits", "input_ids"], ["text"], name)
        self._tokenizer_param = tokenizer_param

    def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
        """Build the ONNX graph that slices input_ids by the arg-max logits and decodes the slice.

        The start and end positions are the ArgMax over the last axis of the two
        logits inputs. input_ids is sliced on its last axis from start to end + 1 and
        the slice is passed to the BertTokenizerDecoder custom op, which receives the
        vocabulary as its "vocab_file" attribute.
        """
        input_declare = []
        for idx, inp in enumerate(self.input_names):
            input_type_str_x, input_shape_str_x = self._get_input_type_and_shape_strs(graph, idx)
            input_declare.append(f"{input_type_str_x}[{input_shape_str_x}] {inp}")
        input_declare = ",".join(input_declare)

        # A unique name for output shape
        prefix_ = f'step_{self.step_num}'
        output_shape_str = f"_{prefix_}_any_len"

        # NOTE: the previously emitted `u_i64_neg1 = Unsqueeze(i64_neg1, i64_0)` node and the
        # `i64_0` constant that only fed it were never consumed, so they are no longer generated.
        # i64_em is an empty int64 tensor passed as the decoder's second input
        # (presumably "no extra positions" - confirm against the op schema).
        converter_graph = onnx.parser.parse_graph(
            f"""\
            tokenizer_decoder ({input_declare})
            => (string[{output_shape_str}] {self.output_names[0]})
            {{
                i64_em = Constant <value = int64[0] {{}}> ()
                i64_1 = Constant <value = int64[1] {{1}}> ()
                i64_neg1 = Constant <value = int64[1] {{-1}}> ()

                s_position = ArgMax<axis = -1, keepdims = 0>({self.input_names[0]})
                e_position = ArgMax<axis = -1, keepdims = 0>({self.input_names[1]})
                ee_position = Add(e_position,i64_1)
                slice_ids= Slice({self.input_names[2]}, s_position, ee_position, i64_neg1)
                {self.output_names[0]} = com.microsoft.extensions.BertTokenizerDecoder (slice_ids, i64_em)
            }}
            """
        )

        attrs = _vocab_to_dict(self._tokenizer_param.vocab_or_file)
        token_model_attr = [onnx.helper.make_attribute(key, value) for key, value in attrs.items()]

        node_idx = next(i for i, v in enumerate(converter_graph.node) if v.op_type == "BertTokenizerDecoder")
        converter_graph.node[node_idx].attribute.extend(token_model_attr)

        return converter_graph
| 338 | |