microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions Available

CodeCommitsIssuesPull requestsActionsInsightsSecurity
v0.4.2

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

onnxruntime_extensions/_cuops.py

248lines · modecode

1# Copyright (c) Microsoft Corporation. All rights reserved.
2# Licensed under the MIT License. See License.txt in the project root for
3# license information.
4###############################################################################
5
6import onnx
7from onnx import onnx_pb as onnx_proto
8from ._ocos import default_opset_domain
9
10
class CustomOp:
    """Base class describing the ONNX-facing signature of a custom operator.

    Subclasses override ``get_inputs`` and ``get_outputs`` to declare their
    tensors with ``io_def`` and, where needed, ``serialize_attr`` to convert
    Python attribute values before they are attached to an ONNX node.
    """

    @classmethod
    def op_type(cls):
        """
        Return the operator type name for ``cls``.

        The name is taken from the class whose ``__base__`` is ``CustomOp``,
        so classes derived from an operator class report the same type as
        that operator class.
        :return: the class name used as the operator type; ``'CustomOp'``
            when called on ``CustomOp`` itself
        """
        rcls = cls
        # Also stop when cls is CustomOp itself; otherwise the walk would run
        # past ``object`` to ``None`` and fail with AttributeError.
        while rcls is not CustomOp and rcls.__base__ is not CustomOp:
            rcls = rcls.__base__
        return rcls.__name__

    @classmethod
    def get_inputs(cls):
        """
        Describe the operator inputs.
        :return: None in the base class; subclasses return a list of value
            infos built with ``io_def``
        """
        return None

    @classmethod
    def get_outputs(cls):
        """
        Describe the operator outputs.

        This is the hook that subclasses override and that graph builders
        call; the base class declares it so the two names agree.
        :return: None in the base class; subclasses return a list of value
            infos built with ``io_def``
        """
        return None

    @classmethod
    def get_output(cls):
        """
        Backward-compatible alias of ``get_outputs``.
        :return: whatever ``get_outputs`` returns for ``cls``
        """
        return cls.get_outputs()

    @classmethod
    def serialize_attr(cls, attrs):
        """
        Serialize the operator attributes.

        Only basic Python types such as list or dict are supported here;
        any other type has to be serialized by the user beforehand.
        :param attrs: the dict of attributes
        :return: the dict of serialized data (the base class returns
            ``attrs`` unchanged)
        """
        return attrs

    # staticmethod keeps the helper unbound when it is reached through an
    # instance as well as through the class.
    io_def = staticmethod(onnx.helper.make_tensor_value_info)
36
37
class GPT2Tokenizer(CustomOp):
    """Signature of the GPT2Tokenizer op: one STRING input, two INT64 outputs."""

    @classmethod
    def get_inputs(cls):
        """
        :return: a one-element list with the STRING value info
            ``input_text`` of shape ``[None]``
        """
        input_text = cls.io_def('input_text', onnx_proto.TensorProto.STRING, [None])
        return [input_text]

    @classmethod
    def get_outputs(cls):
        """
        :return: the INT64 value infos ``input_ids`` and ``attention_mask``,
            both of shape ``[None, None]``
        """
        input_ids = cls.io_def("input_ids", onnx.TensorProto.INT64, [None, None])
        attention_mask = cls.io_def('attention_mask', onnx.TensorProto.INT64, [None, None])
        return [input_ids, attention_mask]
47
48
class VectorToString(CustomOp):
    """Signature and attribute serialization of the VectorToString op."""

    @classmethod
    def get_inputs(cls):
        """
        :return: a one-element list with the INT64 value info ``token_ids``
            declared with shape ``[]``
        """
        token_ids = cls.io_def("token_ids", onnx.TensorProto.INT64, [])
        return [token_ids]

    @classmethod
    def get_outputs(cls):
        """
        :return: a one-element list with the STRING value info ``text``
            declared with shape ``[]``
        """
        text = cls.io_def('text', onnx_proto.TensorProto.STRING, [])
        return [text]

    @classmethod
    def serialize_attr(cls, attrs):
        """
        Flatten a dict-valued ``map`` attribute into text.

        Each dict entry becomes one line ``key<TAB>v1 v2 ...`` (the values
        are converted with ``str`` and separated by spaces); lines are
        joined with a newline.  Every other attribute, including a ``map``
        that is already a string, is copied unchanged.
        :param attrs: the dict of attributes
        :return: a new dict with the serialized data
        """
        serialized = {}
        for name, value in attrs.items():
            if name == 'map' and isinstance(value, dict):
                rows = []
                for key, items in value.items():
                    rows.append(key + "\t" + " ".join(map(str, items)))
                value = '\n'.join(rows)
            serialized[name] = value
        return serialized
69
70
class StringMapping(CustomOp):
    """Signature and attribute serialization of the StringMapping op."""

    @classmethod
    def get_inputs(cls):
        """
        :return: a one-element list with the STRING value info ``input``
            declared with shape ``[]``
        """
        source = cls.io_def("input", onnx.TensorProto.STRING, [])
        return [source]

    @classmethod
    def get_outputs(cls):
        """
        :return: a one-element list with the STRING value info ``output``
            declared with shape ``[]``
        """
        target = cls.io_def('output', onnx_proto.TensorProto.STRING, [])
        return [target]

    @classmethod
    def serialize_attr(cls, attrs):
        """
        Flatten a dict-valued ``map`` attribute into text.

        Each dict entry becomes one line ``key<TAB>value``; lines are joined
        with a newline.  Every other attribute, including a ``map`` that is
        already a string, is copied unchanged.
        :param attrs: the dict of attributes
        :return: a new dict with the serialized data
        """
        serialized = {}
        for name, value in attrs.items():
            if name == 'map' and isinstance(value, dict):
                rows = [src + "\t" + dst for src, dst in value.items()]
                value = '\n'.join(rows)
            serialized[name] = value
        return serialized
91
92
class StringToVector(CustomOp):
    """Signature and attribute serialization of the StringToVector op."""

    @classmethod
    def get_inputs(cls):
        """
        :return: a one-element list with the STRING value info ``text`` of
            shape ``[None]``
        """
        text = cls.io_def("text", onnx.TensorProto.STRING, [None])
        return [text]

    @classmethod
    def get_outputs(cls):
        """
        :return: a one-element list with the INT64 value info ``token_ids``
            declared with shape ``[]``
        """
        token_ids = cls.io_def('token_ids', onnx_proto.TensorProto.INT64, [])
        return [token_ids]

    @classmethod
    def serialize_attr(cls, attrs):
        """
        Flatten the ``map`` and ``unk`` attributes into text.

        A dict-valued ``map`` becomes newline-joined lines of the form
        ``key<TAB>v1 v2 ...``; a list-valued ``unk`` becomes its items
        converted with ``str`` and joined by single spaces.  Every other
        attribute, including a ``map`` that is already a string, is copied
        unchanged.
        :param attrs: the dict of attributes
        :return: a new dict with the serialized data
        """
        serialized = {}
        for name, value in attrs.items():
            if name == 'map' and isinstance(value, dict):
                rows = []
                for key, items in value.items():
                    rows.append(key + "\t" + " ".join(map(str, items)))
                value = '\n'.join(rows)
            elif name == 'unk' and isinstance(value, list):
                value = ' '.join(map(str, value))
            serialized[name] = value
        return serialized
115
116
class BlingFireSentenceBreaker(CustomOp):
    """Signature and attribute serialization of the BlingFireSentenceBreaker op."""

    @classmethod
    def get_inputs(cls):
        """
        :return: a one-element list with the STRING value info ``text`` of
            shape ``[None]``
        """
        text = cls.io_def("text", onnx.TensorProto.STRING, [None])
        return [text]

    @classmethod
    def get_outputs(cls):
        """
        :return: a one-element list with the STRING value info ``sentence``
            declared with shape ``[]``
        """
        sentence = cls.io_def('sentence', onnx_proto.TensorProto.STRING, [])
        return [sentence]

    @classmethod
    def serialize_attr(cls, attrs):
        """
        Replace the ``model`` attribute by the bytes of the file it names.

        The value of ``model`` is passed to ``open`` in binary mode and the
        whole file content is stored in its place; every other attribute is
        copied unchanged.
        :param attrs: the dict of attributes
        :return: a new dict with the serialized data
        """
        serialized = {}
        for name, value in attrs.items():
            if name != 'model':
                serialized[name] = value
                continue
            with open(value, "rb") as stream:
                serialized[name] = stream.read()
        return serialized
136
137
class SegmentExtraction(CustomOp):
    """Signature of the SegmentExtraction op: one INT64 input, two INT64 outputs."""

    @classmethod
    def get_inputs(cls):
        """
        :return: a one-element list with the INT64 value info ``input`` of
            shape ``[None, None]``
        """
        source = cls.io_def("input", onnx.TensorProto.INT64, [None, None])
        return [source]

    @classmethod
    def get_outputs(cls):
        """
        :return: the INT64 value infos ``position`` (shape ``[None, 2]``)
            and ``value`` (shape ``[None]``)
        """
        position = cls.io_def('position', onnx_proto.TensorProto.INT64, [None, 2])
        value = cls.io_def('value', onnx_proto.TensorProto.INT64, [None])
        return [position, value]
147
148
class BertTokenizer(CustomOp):
    """Signature and attribute serialization of the BertTokenizer op."""

    @classmethod
    def get_inputs(cls):
        """
        :return: a one-element list with the STRING value info ``text`` of
            shape ``[None]``
        """
        return [cls.io_def("text", onnx.TensorProto.STRING, [None])]

    @classmethod
    def get_outputs(cls):
        """
        :return: the INT64 value infos ``input_ids``, ``token_type_ids`` and
            ``attention_mask``, each of shape ``[None]``
        """
        return [cls.io_def('input_ids', onnx_proto.TensorProto.INT64, [None]),
                cls.io_def('token_type_ids', onnx_proto.TensorProto.INT64, [None]),
                cls.io_def('attention_mask', onnx_proto.TensorProto.INT64, [None])]

    @classmethod
    def serialize_attr(cls, attrs):
        """
        Replace the ``vocab_file`` attribute by the text of the file it names.

        The value of ``vocab_file`` is opened as UTF-8 text and the file
        content is stored in its place; every other attribute is copied
        unchanged.
        :param attrs: the dict of attributes
        :return: a new dict with the serialized data
        """
        attrs_data = {}
        for k_, v_ in attrs.items():
            if k_ == 'vocab_file':
                with open(v_, "r", encoding='utf-8') as vocab_stream:
                    # Store the text as read.  Joining readlines() with '\n'
                    # would double every newline, because each line returned
                    # by readlines() still ends with its own terminator.
                    attrs_data[k_] = vocab_stream.read()
            else:
                attrs_data[k_] = v_
        return attrs_data
171
172
class BertTokenizerDecoder(CustomOp):
    """Signature and attribute serialization of the BertTokenizerDecoder op."""

    @classmethod
    def get_inputs(cls):
        """
        :return: the INT64 value infos ``ids`` (shape ``[None]``) and
            ``position`` (shape ``[None, None]``)
        """
        return [cls.io_def("ids", onnx.TensorProto.INT64, [None]),
                cls.io_def("position", onnx.TensorProto.INT64, [None, None])]

    @classmethod
    def get_outputs(cls):
        """
        :return: a one-element list with the STRING value info ``str`` of
            shape ``[None]``
        """
        return [cls.io_def('str', onnx_proto.TensorProto.STRING, [None])]

    @classmethod
    def serialize_attr(cls, attrs):
        """
        Replace the ``vocab_file`` attribute by the text of the file it names.

        The value of ``vocab_file`` is opened as UTF-8 text and the file
        content is stored in its place; every other attribute is copied
        unchanged.
        :param attrs: the dict of attributes
        :return: a new dict with the serialized data
        """
        attrs_data = {}
        for k_, v_ in attrs.items():
            if k_ == 'vocab_file':
                with open(v_, "r", encoding='utf-8') as vocab_stream:
                    # Store the text as read.  Joining readlines() with '\n'
                    # would double every newline, because each line returned
                    # by readlines() still ends with its own terminator.
                    attrs_data[k_] = vocab_stream.read()
            else:
                attrs_data[k_] = v_
        return attrs_data
194
195
class SentencepieceTokenizer(CustomOp):
    """Signature of the SentencepieceTokenizer op: six inputs, two outputs."""

    @classmethod
    def get_inputs(cls):
        """
        :return: the value infos ``inputs`` (STRING), ``nbest_size`` (INT64),
            ``alpha`` (FLOAT), ``add_bos``, ``add_eos`` and ``reverse``
            (BOOL), in that order, each of shape ``[None]``
        """
        elem = onnx_proto.TensorProto
        layout = (
            ('inputs', elem.STRING),
            ('nbest_size', elem.INT64),
            ('alpha', elem.FLOAT),
            ('add_bos', elem.BOOL),
            ('add_eos', elem.BOOL),
            ('reverse', elem.BOOL),
        )
        # A fresh shape list is created for every value info.
        return [cls.io_def(name, dtype, [None]) for name, dtype in layout]

    @classmethod
    def get_outputs(cls):
        """
        :return: the value infos ``tokens`` (INT32) and ``indices`` (INT64),
            each of shape ``[None]``
        """
        tokens = cls.io_def('tokens', onnx_proto.TensorProto.INT32, [None])
        indices = cls.io_def('indices', onnx_proto.TensorProto.INT64, [None])
        return [tokens, indices]
215
216
class SingleOpGraph:
    """Builds an ONNX graph that contains a single custom-op node."""

    @classmethod
    def get_next_id(cls):
        """
        Return the next value of a counter stored on the class.

        The counter is created on first use, so the first call returns 1.
        :return: the incremented counter value
        """
        upcoming = getattr(cls, '_id_counter', 0) + 1
        cls._id_counter = upcoming
        return upcoming

    @classmethod
    def build_my_graph(cls, op_class, *args, **kwargs):
        """
        Build a graph holding one node of the given operator class.

        :param op_class: an operator class, or its name as a string, which
            is resolved through ``get_op_class``
        :param args: accepted but not used
        :param kwargs: the node attributes; they are passed through
            ``op_class.serialize_attr`` before being attached to the node
        :return: the graph produced by ``onnx.helper.make_graph``
        """
        if isinstance(op_class, str):
            op_class = cls.get_op_class(op_class)

        kind = op_class.op_type()
        graph_inputs = op_class.get_inputs()
        graph_outputs = op_class.get_outputs()
        node_attrs = op_class.serialize_attr(kwargs)

        input_names = [value_info.name for value_info in graph_inputs]
        output_names = [value_info.name for value_info in graph_outputs]
        # The node and the graph each draw their own id from the counter.
        node_name = "{}_{}".format(kind, cls.get_next_id())
        node = onnx.helper.make_node(
            kind,
            input_names,
            output_names,
            node_name,
            **node_attrs,
            domain=default_opset_domain())

        graph_name = "og_{}_{}".format(kind, cls.get_next_id())
        return onnx.helper.make_graph([node], graph_name, graph_inputs, graph_outputs)

    @staticmethod
    def get_op_class(op_type):
        """
        Look up an object by name in this module's global namespace.

        :param op_type: the name to look up
        :return: the module-level object bound to that name
        :raises KeyError: when the module has no global of that name
        """
        module_scope = globals()
        return module_scope[op_type]
249