microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
v0.4.0

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

operators/tokenizer/basic_tokenizer.hpp

37lines · modepreview

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "ocos.h"
#include "string_utils.h"
#include "ustring.h"

class BasicTokenizer {
 public:
  BasicTokenizer(bool do_lower_case, bool tokenize_chinese_chars, bool strip_accents, bool tokenize_punctuation, bool remove_control_chars);
  std::vector<ustring> Tokenize(ustring text);

 private:
  bool do_lower_case_;
  bool strip_accents_;
  bool tokenize_chinese_chars_;
  bool tokenize_punctuation_;
  bool remove_control_chars_;
};

struct KernelBasicTokenizer : BaseKernel {
  KernelBasicTokenizer(OrtApi api,  const OrtKernelInfo* info);
  void Compute(OrtKernelContext* context);
 private:
  std::shared_ptr<BasicTokenizer> tokenizer_;
};

struct CustomOpBasicTokenizer : Ort::CustomOpBase<CustomOpBasicTokenizer, KernelBasicTokenizer> {
  void* CreateKernel(OrtApi api, const OrtKernelInfo* info) const;
  const char* GetName() const;
  size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(size_t index) const;
  size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(size_t index) const;
};