microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
rel-0.4

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/tokenizer/basic_tokenizer.cc

130 lines · mode: code

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3
4#include "string_utils.h"
5#include "basic_tokenizer.hpp"
6#include "string_tensor.h"
7#include <vector>
8#include <locale>
9#include <codecvt>
10#include <algorithm>
11
12BasicTokenizer::BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, bool tokenize_punctuation, bool remove_control_chars):
13 do_lower_case_(do_lower_case), tokenize_chinese_chars_(tokenize_chinese_chars), strip_accents_(strip_accents), tokenize_punctuation_(tokenize_punctuation),
14 remove_control_chars_(remove_control_chars){}
15
16std::vector<ustring> BasicTokenizer::Tokenize(ustring text) {
17 std::vector<ustring> result;
18 ustring token;
19 auto push_current_token_and_clear = [&result, &token]() {
20 if (!token.empty()) {
21 result.push_back(token);
22 token.clear();
23 }
24 };
25
26 auto push_single_char_and_clear = [&result, &token](char32_t c) {
27 token.push_back(c);
28 result.push_back(token);
29 token.clear();
30 };
31
32 // strip accent first
33 if (strip_accents_) {
34 for (auto& c : text) {
35 c = StripAccent(c);
36 }
37 }
38
39 if (do_lower_case_) {
40 for (auto& c : text) {
41 c = ::tolower(c);
42 }
43 }
44
45 for (auto c : text) {
46 if (tokenize_chinese_chars_ && IsCJK(c)) {
47 push_current_token_and_clear();
48 push_single_char_and_clear(c);
49 continue;
50 }
51
52 if (strip_accents_ && IsAccent(c)) {
53 continue;
54 }
55
56 // 0x2019 unicode is not punctuation in some Linux platform,
57 // to be consistent, take it as punctatuation always.
58 if (tokenize_punctuation_ && (::iswpunct(c) || c == wint_t(0x2019))) {
59 push_current_token_and_clear();
60 push_single_char_and_clear(c);
61 continue;
62 }
63
64 // split by space
65 if (::iswspace(c)) {
66 push_current_token_and_clear();
67 continue;
68 }
69
70 // iscntrl will judge \t\f\n\r as control char
71 // but it has been filter by isspace(c)
72 if (remove_control_chars_ && ::iswcntrl(c)) {
73 continue;
74 }
75
76 token.push_back(c);
77 }
78
79 push_current_token_and_clear();
80 return result;
81}
82
83KernelBasicTokenizer::KernelBasicTokenizer(OrtApi api, const OrtKernelInfo* info) : BaseKernel(api, info) {
84 bool do_lower_case = TryToGetAttributeWithDefault("do_lower_case", true);
85 bool tokenize_chinese_chars = TryToGetAttributeWithDefault("tokenize_chinese_chars", true);
86 bool strip_accents = TryToGetAttributeWithDefault("strip_accents", false);
87 bool tokenize_punctuation = TryToGetAttributeWithDefault("tokenize_punctuation", false);
88 bool remove_control_chars = TryToGetAttributeWithDefault("strip_accents", true);
89
90 tokenizer_ = std::make_shared<BasicTokenizer>(do_lower_case, tokenize_chinese_chars, strip_accents, tokenize_punctuation, remove_control_chars);
91}
92
93void KernelBasicTokenizer::Compute(OrtKernelContext* context) {
94 // Setup inputs
95 const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
96 std::vector<std::string> input_data;
97 GetTensorMutableDataString(api_, ort_, context, input, input_data);
98
99 OrtTensorDimensions dimensions(ort_, input);
100 if (dimensions.size() != 1 && dimensions[0] != 1) {
101 ORT_CXX_API_THROW("[BasicTokenizer]: only support string scalar.", ORT_INVALID_GRAPH);
102 }
103
104 OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size());
105 std::vector<ustring> result = tokenizer_->Tokenize(ustring(input_data[0]));
106
107 FillTensorDataString(api_, ort_, context, result, output);
108}
109
110void* CustomOpBasicTokenizer::CreateKernel(OrtApi api, const OrtKernelInfo* info) const {
111 return new KernelBasicTokenizer(api, info);
112};
113
114const char* CustomOpBasicTokenizer::GetName() const { return "BasicTokenizer"; };
115
116size_t CustomOpBasicTokenizer::GetInputTypeCount() const {
117 return 1;
118};
119
120ONNXTensorElementDataType CustomOpBasicTokenizer::GetInputType(size_t /*index*/) const {
121 return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
122};
123
124size_t CustomOpBasicTokenizer::GetOutputTypeCount() const {
125 return 1;
126};
127
128ONNXTensorElementDataType CustomOpBasicTokenizer::GetOutputType(size_t /*index*/) const {
129 return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
130};
131