microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
13d9e27ccd8a0de9a1225756fbf6860a1931484f

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/tokenizer/basic_tokenizer.hpp

37lines · modecode

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
3
4#pragma once
5
6#include "ocos.h"
7#include "string_utils.h"
8#include "ustring.h"
9
10class BasicTokenizer {
11 public:
12 BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, bool tokenize_punctuation, bool remove_control_chars);
13 std::vector<ustring> Tokenize(ustring text);
14
15 private:
16 bool do_lower_case_;
17 bool strip_accents_;
18 bool tokenize_chinese_chars_;
19 bool tokenize_punctuation_;
20 bool remove_control_chars_;
21};
22
23struct KernelBasicTokenizer : BaseKernel {
24 KernelBasicTokenizer(const OrtApi& api, const OrtKernelInfo* info);
25 void Compute(OrtKernelContext* context);
26 private:
27 std::shared_ptr<BasicTokenizer> tokenizer_;
28};
29
30struct CustomOpBasicTokenizer : OrtW::CustomOpBase<CustomOpBasicTokenizer, KernelBasicTokenizer> {
31 void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const;
32 const char* GetName() const;
33 size_t GetInputTypeCount() const;
34 ONNXTensorElementDataType GetInputType(size_t index) const;
35 size_t GetOutputTypeCount() const;
36 ONNXTensorElementDataType GetOutputType(size_t index) const;
37};
38