microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensions · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
f74770feed077546874ed7e66d1aba9e2509fea9

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/tokenizer/wordpiece_tokenizer.hpp

46 lines · mode: code

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3
4#pragma once
5
6#include <unordered_map>
7#include <vector>
8#include "ocos.h"
9#include "ustring.h"
10#include "string_utils.h"
11#include "string_tensor.h"
12
// Kernel object for the wordpiece tokenizer operator; derives from BaseKernel.
// Only declarations appear in this header: the constructor and Compute are
// defined elsewhere.
// NOTE(review): the digits fused to the start of each line look like viewer
// line numbers left behind by text extraction, not C++ tokens — confirm
// against the upstream header before compiling.
13struct KernelWordpieceTokenizer : BaseKernel {
  // Receives the OrtApi by value and a pointer to the kernel info.
  // Presumably the private members below are populated from `info` — TODO
  // confirm in the implementation file.
14 KernelWordpieceTokenizer(OrtApi api, const OrtKernelInfo* info);
  // Presumably invoked by the runtime for each execution of the operator,
  // with `context` giving access to inputs/outputs — TODO confirm.
15 void Compute(OrtKernelContext* context);
16
17 private:
  // The members below share their names with parameters of
  // KernelWordpieceTokenizer_Tokenizer (declared later in this header), so
  // they are presumably forwarded to it from Compute — verify in the .cc file.

  // Per-word length limit; the free function's matching parameter defaults
  // to 200. Not initialized in-class, so the constructor must set it.
18 int64_t max_input_chars_per_word_;
  // UTF-32 marker string; presumably the prefix for continuation pieces
  // (e.g. "##") — TODO confirm.
19 std::u32string suffix_indicator_;
  // Token stored as a project `ustring`; presumably emitted for words that
  // cannot be tokenized — TODO confirm.
20 ustring unk_token_;
  // Lookup from UTF-32 string to a 32-bit integer; presumably token -> id.
21 std::unordered_map<std::u32string, int32_t> vocab_;
22};
23
// Custom-op descriptor for the wordpiece tokenizer. It derives from
// Ort::CustomOpBase, passing itself and KernelWordpieceTokenizer as the
// template arguments. All member functions are only declared here; their
// return values are set in a definition that is not part of this header.
24struct CustomOpWordpieceTokenizer : Ort::CustomOpBase<CustomOpWordpieceTokenizer, KernelWordpieceTokenizer> {
  // Returns an untyped pointer; presumably a newly created
  // KernelWordpieceTokenizer built from `api` and `info` — TODO confirm
  // who owns and releases it.
25 void* CreateKernel(OrtApi api, const OrtKernelInfo* info) const;
  // Returns the operator name as a C string (value not visible here).
26 const char* GetName() const;
  // Number of inputs, and the element type of the input at `index`.
27 size_t GetInputTypeCount() const;
28 ONNXTensorElementDataType GetInputType(size_t index) const;
  // Number of outputs, and the element type of the output at `index`.
29 size_t GetOutputTypeCount() const;
30 ONNXTensorElementDataType GetOutputType(size_t index) const;
31};
32
// Free helper declared for the wordpiece tokenizer; the definition is not in
// this header.
//   suffix_indicator  read-only UTF-32 marker string.
//   text              read-only UTF-32 input text.
//   words             non-const reference; presumably the output list of
//                     pieces split from `text` — TODO confirm whether it is
//                     cleared or appended to.
33void KernelWordpieceTokenizer_Split(const std::u32string& suffix_indicator,
34 const std::u32string& text,
35 std::vector<std::u32string>& words);
36
// Free tokenization routine declared for the wordpiece tokenizer; the
// definition is not in this header.
// Read-only inputs:
//   vocab             map from UTF-32 string to int32_t (presumably token -> id).
//   suffix_indicator  UTF-32 marker string.
//   unk_token         project `ustring`; presumably used for unknown words.
//   texts             the input strings to process.
// Non-const references (presumably outputs — TODO confirm whether they are
// cleared or appended to):
//   tokens, indices, rows
//   NOTE(review): `rows` looks like per-input offsets into `tokens` — confirm.
// Optional trailing arguments, with the defaults shown in the declaration:
//   existing_rows             pointer to int64_t values, default nullptr.
//   n_existing_rows           count paired with existing_rows, default 0.
//   max_input_chars_per_word  default 200.
37void KernelWordpieceTokenizer_Tokenizer(const std::unordered_map<std::u32string, int32_t>& vocab,
38 const std::u32string& suffix_indicator,
39 const ustring& unk_token,
40 const std::vector<ustring>& texts,
41 std::vector<ustring>& tokens,
42 std::vector<int32_t>& indices,
43 std::vector<int64_t>& rows,
44 const int64_t* existing_rows = nullptr,
45 int64_t n_existing_rows = 0,
46 int64_t max_input_chars_per_word = 200);
47