microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensionsAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

v0.4.0

Find a branch or tag

Branches

v0.4.0

Clone

HTTPS

Download ZIP

onnxruntime-extensions/operators/tokenizer

operators/tokenizer/basic_tokenizer.cc

130lines · modeblame

Raw Download

Latest commit unavailable.

unknown

`aef5ef1e`Mojimi4 years ago	1	`// Copyright (c) Microsoft Corporation. All rights reserved.`
	2	`// Licensed under the MIT License.`
	3
	4	`#include "string_utils.h"`
	5	`#include "basic_tokenizer.hpp"`
	6	`#include "string_tensor.h"`
	7	`#include <vector>`
	8	`#include <locale>`
	9	`#include <codecvt>`
	10	`#include <algorithm>`
	11
	12	`BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, bool tokenize_punctuation, bool remove_control_chars):`
	13	`do_lower_case_(do_lower_case), tokenize_chinese_chars_(tokenize_chinese_chars), strip_accents_(strip_accents), tokenize_punctuation_(tokenize_punctuation),`
	14	`remove_control_chars_(remove_control_chars){}`
	15
	16	`std::vector<ustring> BasicTokenizer::Tokenize(ustring text) {`
	17	`std::vector<ustring> result;`
	18	`ustring token;`
	19	`auto push_current_token_and_clear = [&result, &token]() {`
	20	`if (!token.empty()) {`
	21	`result.push_back(token);`
	22	`token.clear();`
	23	`}`
	24	`};`
	25
	26	`auto push_single_char_and_clear = [&result, &token](char32_t c) {`
	27	`token.push_back(c);`
	28	`result.push_back(token);`
	29	`token.clear();`
	30	`};`
	31
	32	`// strip accent first`
	33	`if (strip_accents_) {`
	34	`for (auto& c : text) {`
	35	`c = StripAccent(c);`
	36	`}`
	37	`}`
	38
	39	`if (do_lower_case_) {`
	40	`for (auto& c : text) {`
	41	`c = ::tolower(c);`
	42	`}`
	43	`}`
	44
	45	`for (auto c : text) {`
	46	`if (tokenize_chinese_chars_ && IsCJK(c)) {`
	47	`push_current_token_and_clear();`
	48	`push_single_char_and_clear(c);`
	49	`continue;`
	50	`}`
	51
	52	`if (strip_accents_ && IsAccent(c)) {`
	53	`continue;`
	54	`}`
	55
`9f3abe20`Wenbing Li4 years ago	56	`// 0x2019 unicode is not punctuation in some Linux platform,`
	57	`// to be consistent, take it as punctatuation always.`
	58	`if (tokenize_punctuation_ && (::iswpunct(c) \|\| c == wint_t(0x2019))) {`
`aef5ef1e`Mojimi4 years ago	59	`push_current_token_and_clear();`
	60	`push_single_char_and_clear(c);`
	61	`continue;`
	62	`}`
	63
	64	`// split by space`
`cce66310`Mojimi4 years ago	65	`if (::iswspace(c)) {`
`aef5ef1e`Mojimi4 years ago	66	`push_current_token_and_clear();`
	67	`continue;`
	68	`}`
	69
	70	`// iscntrl will judge \t\f\n\r as control char`
	71	`// but it has been filter by isspace(c)`
`cce66310`Mojimi4 years ago	72	`if (remove_control_chars_ && ::iswcntrl(c)) {`
`aef5ef1e`Mojimi4 years ago	73	`continue;`
	74	`}`
	75
	76	`token.push_back(c);`
	77	`}`
	78
	79	`push_current_token_and_clear();`
	80	`return result;`
	81	`}`
	82
	83	`KernelBasicTokenizer::KernelBasicTokenizer(OrtApi api, const OrtKernelInfo* info) : BaseKernel(api, info) {`
	84	`bool do_lower_case = TryToGetAttributeWithDefault("do_lower_case", true);`
	85	`bool tokenize_chinese_chars = TryToGetAttributeWithDefault("tokenize_chinese_chars", true);`
	86	`bool strip_accents = TryToGetAttributeWithDefault("strip_accents", false);`
	87	`bool tokenize_punctuation = TryToGetAttributeWithDefault("tokenize_punctuation", false);`
	88	`bool remove_control_chars = TryToGetAttributeWithDefault("strip_accents", true);`
	89
	90	`tokenizer_ = std::make_shared<BasicTokenizer>(do_lower_case, tokenize_chinese_chars, strip_accents, tokenize_punctuation, remove_control_chars);`
	91	`}`
	92
	93	`void KernelBasicTokenizer::Compute(OrtKernelContext* context) {`
	94	`// Setup inputs`
	95	`const OrtValue* input = ort_.KernelContext_GetInput(context, 0);`
	96	`std::vector<std::string> input_data;`
	97	`GetTensorMutableDataString(api_, ort_, context, input, input_data);`
	98
	99	`OrtTensorDimensions dimensions(ort_, input);`
	100	`if (dimensions.size() != 1 && dimensions[0] != 1) {`
	101	`ORT_CXX_API_THROW("[BasicTokenizer]: only support string scalar.", ORT_INVALID_GRAPH);`
	102	`}`
	103
	104	`OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size());`
	105	`std::vector<ustring> result = tokenizer_->Tokenize(ustring(input_data[0]));`
	106
	107	`FillTensorDataString(api_, ort_, context, result, output);`
	108	`}`
	109
	110	`void* CustomOpBasicTokenizer::CreateKernel(OrtApi api, const OrtKernelInfo* info) const {`
	111	`return new KernelBasicTokenizer(api, info);`
	112	`};`
	113
	114	`const char* CustomOpBasicTokenizer::GetName() const { return "BasicTokenizer"; };`
	115
	116	`size_t CustomOpBasicTokenizer::GetInputTypeCount() const {`
	117	`return 1;`
	118	`};`
	119
	120	`ONNXTensorElementDataType CustomOpBasicTokenizer::GetInputType(size_t /index/) const {`
	121	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
	122	`};`
	123
	124	`size_t CustomOpBasicTokenizer::GetOutputTypeCount() const {`
	125	`return 1;`
	126	`};`
	127
	128	`ONNXTensorElementDataType CustomOpBasicTokenizer::GetOutputType(size_t /index/) const {`
	129	`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
	130	`};`

microsoft/onnxruntime-extensions

Branches

Tags

Clone