microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
rel-0.7

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/tokenizer/basic_tokenizer.hpp

38lines · modecode

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3
4#pragma once
5
6#include "ocos.h"
7#include "string_utils.h"
8#include "ustring.h"
9
10class BasicTokenizer {
11 public:
12 BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, bool tokenize_punctuation,
13 bool remove_control_chars);
14 std::vector<ustring> Tokenize(ustring text);
15
16 private:
17 bool do_lower_case_;
18 bool strip_accents_;
19 bool tokenize_chinese_chars_;
20 bool tokenize_punctuation_;
21 bool remove_control_chars_;
22};
23
24struct KernelBasicTokenizer : BaseKernel {
25 KernelBasicTokenizer(const OrtApi& api, const OrtKernelInfo& info);
26 void Compute(OrtKernelContext* context);
27
28 private:
29 std::shared_ptr<BasicTokenizer> tokenizer_;
30};
31
32struct CustomOpBasicTokenizer : OrtW::CustomOpBase<CustomOpBasicTokenizer, KernelBasicTokenizer> {
33 const char* GetName() const;
34 size_t GetInputTypeCount() const;
35 ONNXTensorElementDataType GetInputType(size_t index) const;
36 size_t GetOutputTypeCount() const;
37 ONNXTensorElementDataType GetOutputType(size_t index) const;
38};
39