microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
rel-0.4

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/tokenizer/basic_tokenizer.hpp

37lines · modeblame

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
3
4#pragma once
5
6#include "ocos.h"
7#include "string_utils.h"
8#include "ustring.h"
9
10class BasicTokenizer {
11public:
12BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, bool tokenize_punctuation, bool remove_control_chars);
13std::vector<ustring> Tokenize(ustring text);
14
15private:
16bool do_lower_case_;
17bool strip_accents_;
18bool tokenize_chinese_chars_;
19bool tokenize_punctuation_;
20bool remove_control_chars_;
21};
22
23struct KernelBasicTokenizer : BaseKernel {
24KernelBasicTokenizer(OrtApi api, const OrtKernelInfo* info);
25void Compute(OrtKernelContext* context);
26private:
27std::shared_ptr<BasicTokenizer> tokenizer_;
28};
29
30struct CustomOpBasicTokenizer : Ort::CustomOpBase<CustomOpBasicTokenizer, KernelBasicTokenizer> {
31void* CreateKernel(OrtApi api, const OrtKernelInfo* info) const;
32const char* GetName() const;
33size_t GetInputTypeCount() const;
34ONNXTensorElementDataType GetInputType(size_t index) const;
35size_t GetOutputTypeCount() const;
36ONNXTensorElementDataType GetOutputType(size_t index) const;
37};