microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
f9290e8bac2758dba8279d7ebc10e7027ffe0503

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

include/ortx_tokenizer.h

171lines · modecode

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// C ABI header file for the onnxruntime-extensions tokenization module

#pragma once

#include <stddef.h>
#include <stdint.h>

#include "ortx_types.h"

// Kind tags used to identify which type of object an OrtxObject handle refers to.
typedef enum {
  kOrtxKindUnknown = 0,

  // Valid kinds start from a distinctive non-zero number to help validate the object.
  kOrtxKindBegin = 0x7788,
  kOrtxKindTokenizer = kOrtxKindBegin,
  kOrtxKindStringArray = 0x7789,
  kOrtxKindTokenId2DArray = 0x778A,
  kOrtxKindDetokenizerCache = 0x778B,
  // End marker of the kind range (NOTE(review): presumably an exclusive upper bound - confirm).
  kOrtxKindEnd = 0x7999
} extObjectKind_t;

// All objects managed by the library should be 'derived' from this struct,
// and are eventually released through OrtxDispose (or the ORTX_DISPOSE macro,
// which adds the cast needed when calling from C).
typedef struct {
  int ext_kind_;  // object kind tag (NOTE(review): presumably an extObjectKind_t value - confirm)
} OrtxObject;

// Version number of the C ABI declared in this header.
// Declared `static` on purpose: in C a plain file-scope `const int` has external
// linkage, so defining it in a header would produce a duplicate definition in
// every translation unit that includes the header. With `static`, each
// translation unit gets its own private copy (C++ already treats it that way).
static const int API_VERSION = 1;

29// typedefs to create/dispose function flood, and to make the API more C++ friendly with less casting
30typedef OrtxObject OrtxTokenizer;
31typedef OrtxObject OrtxStringArray;
32typedef OrtxObject OrtxTokenId2DArray;
33typedef OrtxObject OrtxDetokenizerCache;
34
// C, unlike C++, doesn't cast automatically, so this macro casts the address of
// the handle to OrtxObject** before passing it to OrtxDispose.
// The argument is parenthesized so that the address-of operator applies to the
// whole expression the caller passes, not just to its first operand.
#define ORTX_DISPOSE(obj) OrtxDispose((OrtxObject**)&(obj))

// Integer type used for token IDs throughout this API (unsigned, 32 bits wide).
typedef uint32_t extTokenId_t;

// Give the declarations that follow C linkage when this header is compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif

45/** \brief Get the current C ABI version of this library
46 *
47 * \snippet{doc} snippets.dox int Return Value
48 */
49int ORTX_API_CALL OrtxGetAPIVersion(void);
50
51/** \brief Get the last error message generated by the library
52 *
53 * \param message Pointer to store the last error message
54 * \return Pointer to the last error message
55 */
56const char* ORTX_API_CALL OrtxGetLastErrorMessage(void);
57
58/** \brief Create a new object of the specified kind
59 *
60 * \param kind The kind of object to create
61 * \param object Pointer to store the created object
62 * \param ... Additional arguments based on the kind of object
63 * \return Error code indicating the success or failure of the operation
64 */
65extError_t ORTX_API_CALL OrtxCreate(extObjectKind_t kind, OrtxObject** object, ...);
66
67/** \brief Dispose the specified object
68 *
69 * \param object Pointer to the object to dispose
70 * \return Error code indicating the success or failure of the operation
71 */
72extError_t ORTX_API_CALL OrtxDispose(OrtxObject** object);
73
74/** \brief Dispose the specified object
75 *
76 * \param object Pointer to the object to dispose
77 * \return Error code indicating the success or failure of the operation
78 */
79extError_t ORTX_API_CALL OrtxDisposeOnly(OrtxObject* object);
80
81
82/** \brief Create a tokenizer object with the specified tokenizer path
83 *
84 * \param tokenizer Pointer to store the created tokenizer object
85 * \param tokenizer_path The path to the tokenizer
86 * \return Error code indicating the success or failure of the operation
87 */
88extError_t ORTX_API_CALL OrtxCreateTokenizer(OrtxTokenizer** tokenizer, const char* tokenizer_path);
89
90/** \brief Tokenize the input using the specified tokenizer
91 *
92 * \param tokenizer Pointer to the tokenizer object
93 * \param input Array of input strings
94 * \param batch_size Number of input strings in the batch
95 * \param output Pointer to store the tokenized result
96 * \return Error code indicating the success or failure of the operation
97 */
98extError_t ORTX_API_CALL OrtxTokenize(
99 const OrtxTokenizer* tokenizer, const char* input[], size_t batch_size, OrtxTokenId2DArray** output);
100
101/** \brief Detokenize the input using the specified tokenizer
102 *
103 * \param tokenizer Pointer to the tokenizer object
104 * \param input Pointer to the input token IDs
105 * \param output Pointer to store the detokenized result
106 * \return Error code indicating the success or failure of the operation
107 */
108extError_t ORTX_API_CALL OrtxDetokenize(
109 const OrtxTokenizer* tokenizer, const OrtxTokenId2DArray* input, OrtxStringArray** output);
110
111/** \brief Detokenize the input using the specified tokenizer (1D version)
112 *
113 * \param tokenizer Pointer to the tokenizer object
114 * \param input Pointer to the input token IDs
115 * \param len Length of the input token IDs array
116 * \param output Pointer to store the detokenized result
117 * \return Error code indicating the success or failure of the operation
118 */
119extError_t ORTX_API_CALL OrtxDetokenize1D(
120 const OrtxTokenizer* tokenizer, const extTokenId_t* input, size_t len, OrtxStringArray** output);
121
122/** \brief Detokenize the input using the specified tokenizer with caching
123 *
124 * \param tokenizer Pointer to the tokenizer object
125 * \param cache Pointer to the detokenizer cache
126 * \param next_id Next token ID to detokenize
127 * \param text_out Pointer to store the detokenized text
128 * \return Error code indicating the success or failure of the operation
129 */
130extError_t ORTX_API_CALL OrtxDetokenizeCached(
131 const OrtxTokenizer* tokenizer, OrtxDetokenizerCache* cache, extTokenId_t next_id, const char** text_out);
132
133/** \brief Get the length of the string array
134 *
135 * \param string_array Pointer to the string array
136 * \param length Pointer to store the length of the string array
137 * \return Error code indicating the success or failure of the operation
138 */
139extError_t ORTX_API_CALL OrtxStringArrayGetBatch(const OrtxStringArray* string_array, size_t* length);
140
141/** \brief Get the item at the specified index from the string array
142 *
143 * \param string_array Pointer to the string array
144 * \param index Index of the item to retrieve
145 * \param item Pointer to store the retrieved item
146 * \return Error code indicating the success or failure of the operation
147 */
148extError_t ORTX_API_CALL OrtxStringArrayGetItem(const OrtxStringArray* string_array, size_t index, const char** item);
149
150/** \brief Get the batch size of the token ID 2D array
151 *
152 * \param token_id_2d_array Pointer to the token ID 2D array
153 * \param length Pointer to store the batch size
154 * \return Error code indicating the success or failure of the operation
155 */
156extError_t ORTX_API_CALL OrtxTokenId2DArrayGetBatch(const OrtxTokenId2DArray* token_id_2d_array, size_t* length);
157
158/** \brief Get the item at the specified index from the token ID 2D array
159 *
160 * \param token_id_2d_array Pointer to the token ID 2D array
161 * \param index Index of the item to retrieve
162 * \param item Pointer to store the retrieved item
163 * \param length Pointer to store the length of the item
164 * \return Error code indicating the success or failure of the operation
165 */
166extError_t ORTX_API_CALL OrtxTokenId2DArrayGetItem(
167 const OrtxTokenId2DArray* token_id_2d_array, size_t index, const extTokenId_t** item, size_t* length);
168
// Close the extern "C" linkage block opened for C++ compilation.
#ifdef __cplusplus
}
#endif