microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
wechi/changes-to-bridge-torch

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

include/ortx_tokenizer.h

170lines · modecode

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3
4// C ABI header file for the onnxruntime-extensions tokenization module
5
6#pragma once
7
8#include "ortx_types.h"
9
// Kind tag for objects handed out by the library.
// Real kinds do not start at a small integer: they begin at kOrtxKindBegin,
// a deliberately unusual number chosen to help validate the object.
typedef enum {
  kOrtxKindUnknown = 0,  // zero: no/unknown kind

  kOrtxKindBegin            = 0x7788,
  kOrtxKindTokenizer        = kOrtxKindBegin,  // same value as kOrtxKindBegin
  kOrtxKindStringArray      = 0x7789,
  kOrtxKindTokenId2DArray   = 0x778A,
  kOrtxKindDetokenizerCache = 0x778B,
  kOrtxKindEnd              = 0x7999  // end marker; NOTE(review): not contiguous with the last kind above
} extObjectKind_t;
20
// All objects managed by the library should be 'derived' from this struct.
// They are eventually released by OrtxDispose if C++, or by the ORTX_DISPOSE
// macro if C.
typedef struct {
  int ext_kind_;  // kind tag; presumably an extObjectKind_t value — confirm against the implementation
} OrtxObject;
26
// Version constant of this C API (presumably the value reported by
// OrtxGetAPIVersion — confirm against the implementation).
// It is 'static' on purpose: in C a file-scope 'const int' without 'static'
// has external linkage, so defining it in a header produces duplicate-symbol
// link errors as soon as two C translation units include the header.
// In C++ a namespace-scope const already has internal linkage, so nothing
// changes for C++ users.
static const int API_VERSION = 1;
28
29// typedefs to create/dispose function flood, and to make the API more C++ friendly with less casting
30typedef OrtxObject OrtxTokenizer;
31typedef OrtxObject OrtxStringArray;
32typedef OrtxObject OrtxTokenId2DArray;
33typedef OrtxObject OrtxDetokenizerCache;
34
// C, unlike C++, doesn't cast automatically, so this macro takes the address
// of the handle variable and casts it to OrtxObject** before calling
// OrtxDispose.
// The argument is wrapped in parentheses (standard macro hygiene) so that a
// compound lvalue expression is addressed as a whole instead of having '&'
// bind to only its first operand.
#define ORTX_DISPOSE(obj) OrtxDispose((OrtxObject**)&(obj))
38
// Integer type used for token ids throughout this API (unsigned, 32 bits wide).
typedef uint32_t extTokenId_t;
40
41#ifdef __cplusplus
42extern "C" {
43#endif
44
45/** \brief Get the current C ABI version of this library
46 *
47 * \snippet{doc} snippets.dox int Return Value
48 */
49int ORTX_API_CALL OrtxGetAPIVersion(void);
50
51/** \brief Get the last error message generated by the library
52 *
53 * \param message Pointer to store the last error message
54 * \return Pointer to the last error message
55 */
56const char* ORTX_API_CALL OrtxGetLastErrorMessage(void);
57
58/** \brief Create a new object of the specified kind
59 *
60 * \param kind The kind of object to create
61 * \param object Pointer to store the created object
62 * \param ... Additional arguments based on the kind of object
63 * \return Error code indicating the success or failure of the operation
64 */
65extError_t ORTX_API_CALL OrtxCreate(extObjectKind_t kind, OrtxObject** object, ...);
66
67/** \brief Dispose the specified object
68 *
69 * \param object Pointer to the object to dispose
70 * \return Error code indicating the success or failure of the operation
71 */
72extError_t ORTX_API_CALL OrtxDispose(OrtxObject** object);
73
74/** \brief Dispose the specified object
75 *
76 * \param object Pointer to the object to dispose
77 * \return Error code indicating the success or failure of the operation
78 */
79extError_t ORTX_API_CALL OrtxDisposeOnly(OrtxObject* object);
80
81/** \brief Create a tokenizer object with the specified tokenizer path
82 *
83 * \param tokenizer Pointer to store the created tokenizer object
84 * \param tokenizer_path The path to the tokenizer
85 * \return Error code indicating the success or failure of the operation
86 */
87extError_t ORTX_API_CALL OrtxCreateTokenizer(OrtxTokenizer** tokenizer, const char* tokenizer_path);
88
89/** \brief Tokenize the input using the specified tokenizer
90 *
91 * \param tokenizer Pointer to the tokenizer object
92 * \param input Array of input strings
93 * \param batch_size Number of input strings in the batch
94 * \param output Pointer to store the tokenized result
95 * \return Error code indicating the success or failure of the operation
96 */
97extError_t ORTX_API_CALL OrtxTokenize(
98 const OrtxTokenizer* tokenizer, const char* input[], size_t batch_size, OrtxTokenId2DArray** output);
99
100/** \brief Detokenize the input using the specified tokenizer
101 *
102 * \param tokenizer Pointer to the tokenizer object
103 * \param input Pointer to the input token IDs
104 * \param output Pointer to store the detokenized result
105 * \return Error code indicating the success or failure of the operation
106 */
107extError_t ORTX_API_CALL OrtxDetokenize(
108 const OrtxTokenizer* tokenizer, const OrtxTokenId2DArray* input, OrtxStringArray** output);
109
110/** \brief Detokenize the input using the specified tokenizer (1D version)
111 *
112 * \param tokenizer Pointer to the tokenizer object
113 * \param input Pointer to the input token IDs
114 * \param len Length of the input token IDs array
115 * \param output Pointer to store the detokenized result
116 * \return Error code indicating the success or failure of the operation
117 */
118extError_t ORTX_API_CALL OrtxDetokenize1D(
119 const OrtxTokenizer* tokenizer, const extTokenId_t* input, size_t len, OrtxStringArray** output);
120
121/** \brief Detokenize the input using the specified tokenizer with caching
122 *
123 * \param tokenizer Pointer to the tokenizer object
124 * \param cache Pointer to the detokenizer cache
125 * \param next_id Next token ID to detokenize
126 * \param text_out Pointer to store the detokenized text
127 * \return Error code indicating the success or failure of the operation
128 */
129extError_t ORTX_API_CALL OrtxDetokenizeCached(
130 const OrtxTokenizer* tokenizer, OrtxDetokenizerCache* cache, extTokenId_t next_id, const char** text_out);
131
132/** \brief Get the length of the string array
133 *
134 * \param string_array Pointer to the string array
135 * \param length Pointer to store the length of the string array
136 * \return Error code indicating the success or failure of the operation
137 */
138extError_t ORTX_API_CALL OrtxStringArrayGetBatch(const OrtxStringArray* string_array, size_t* length);
139
140/** \brief Get the item at the specified index from the string array
141 *
142 * \param string_array Pointer to the string array
143 * \param index Index of the item to retrieve
144 * \param item Pointer to store the retrieved item
145 * \return Error code indicating the success or failure of the operation
146 */
147extError_t ORTX_API_CALL OrtxStringArrayGetItem(const OrtxStringArray* string_array, size_t index, const char** item);
148
149/** \brief Get the batch size of the token ID 2D array
150 *
151 * \param token_id_2d_array Pointer to the token ID 2D array
152 * \param length Pointer to store the batch size
153 * \return Error code indicating the success or failure of the operation
154 */
155extError_t ORTX_API_CALL OrtxTokenId2DArrayGetBatch(const OrtxTokenId2DArray* token_id_2d_array, size_t* length);
156
157/** \brief Get the item at the specified index from the token ID 2D array
158 *
159 * \param token_id_2d_array Pointer to the token ID 2D array
160 * \param index Index of the item to retrieve
161 * \param item Pointer to store the retrieved item
162 * \param length Pointer to store the length of the item
163 * \return Error code indicating the success or failure of the operation
164 */
165extError_t ORTX_API_CALL OrtxTokenId2DArrayGetItem(
166 const OrtxTokenId2DArray* token_id_2d_array, size_t index, const extTokenId_t** item, size_t* length);
167
168#ifdef __cplusplus
169}
170#endif
171