microsoft/onnxruntime-extensions
Publicmirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable
include/ortx_tokenizer.h
171lines · modecode
| 1 | // Copyright (c) Microsoft Corporation. All rights reserved. |
| 2 | // Licensed under the MIT License. |
| 3 | |
| 4 | // C ABI header file for the onnxruntime-extensions tokenization module |
| 5 | |
| 6 | #pragma once |
| 7 | |
| 8 | #include "ortx_types.h" |
| 9 | |
| 10 | typedef enum { /* kind tags for the objects this library creates (see the kind parameter of OrtxCreate) */ |
| 11 | kOrtxKindUnknown = 0, /* unknown kind */ |
| 12 | |
| 13 | kOrtxKindBegin = 0x7788, // first kind value; starting from a distinctive number to help validate the object |
| 14 | kOrtxKindTokenizer = kOrtxKindBegin, /* same value as kOrtxKindBegin (0x7788) */ |
| 15 | kOrtxKindStringArray = 0x7789, |
| 16 | kOrtxKindTokenId2DArray = 0x778A, |
| 17 | kOrtxKindDetokenizerCache = 0x778B, |
| 18 | kOrtxKindEnd = 0x7999 /* end marker; NOTE(review): not contiguous with 0x778B -- confirm the gap is intended */ |
| 19 | } extObjectKind_t; |
| 20 | |
| 21 | // all objects managed by the library should be 'derived' from this struct, |
| 22 | // which eventually will be released by OrtxDispose if C++, or ORTX_DISPOSE if C |
| 23 | typedef struct { |
| 24 | int ext_kind_; /* presumably holds one of the extObjectKind_t values -- TODO confirm */ |
| 25 | } OrtxObject; |
| 26 | |
| 27 | static const int API_VERSION = 1; /* API version constant; 'static' gives internal linkage, because a plain 'const int' in a header is an external definition in every C translation unit that includes it (duplicate symbols at link time) */ |
| 28 | |
| 29 | // aliases of OrtxObject: they avoid a flood of per-type create/dispose functions and make the API more C++ friendly with less casting (all four name the same type, so the compiler does not tell them apart) |
| 30 | typedef OrtxObject OrtxTokenizer; |
| 31 | typedef OrtxObject OrtxStringArray; |
| 32 | typedef OrtxObject OrtxTokenId2DArray; |
| 33 | typedef OrtxObject OrtxDetokenizerCache; |
| 34 | |
| 35 | // C, unlike C++, doesn't cast automatically, so this macro casts the address of the object pointer to OrtxObject** and passes it to OrtxDispose. |
| 36 | // The argument is parenthesized so that a non-lvalue argument such as p + 1 is rejected by the compiler instead of silently expanding to pointer arithmetic on &p. |
| 37 | #define ORTX_DISPOSE(obj) OrtxDispose((OrtxObject**)&(obj)) |
| 38 | |
| 39 | typedef uint32_t extTokenId_t; /* token ID type used by the tokenizer functions: unsigned 32-bit integer */ |
| 40 | |
| 41 | #ifdef __cplusplus |
| 42 | extern "C" { |
| 43 | #endif |
| 44 | |
| 45 | /** \brief Get the current C ABI version of this library |
| 46 | * Takes no arguments; the version is returned as an int. |
| 47 | * \snippet{doc} snippets.dox int Return Value |
| 48 | */ |
| 49 | int ORTX_API_CALL OrtxGetAPIVersion(void); |
| 50 | |
| 51 | /** \brief Get the last error message generated by the library |
| 52 | * |
| 53 | * Takes no arguments; the message is returned directly rather than through an output parameter. |
| 54 | * \return Pointer to the last error message (NOTE(review): ownership and lifetime of the string are not stated here -- confirm) |
| 55 | */ |
| 56 | const char* ORTX_API_CALL OrtxGetLastErrorMessage(void); |
| 57 | |
| 58 | /** \brief Create a new object of the specified kind |
| 59 | * |
| 60 | * \param kind The kind of object to create (an extObjectKind_t value) |
| 61 | * \param object Address of an OrtxObject* that receives the created object |
| 62 | * \param ... Additional arguments based on the kind of object (variadic, so not type-checked by the compiler) |
| 63 | * \return Error code indicating the success or failure of the operation |
| 64 | */ |
| 65 | extError_t ORTX_API_CALL OrtxCreate(extObjectKind_t kind, OrtxObject** object, ...); |
| 66 | |
| 67 | /** \brief Dispose the specified object, given the address of the caller's object pointer |
| 68 | * |
| 69 | * \param object Address of the pointer to the object to dispose (NOTE(review): presumably the pointer is reset on success -- confirm) |
| 70 | * \return Error code indicating the success or failure of the operation |
| 71 | */ |
| 72 | extError_t ORTX_API_CALL OrtxDispose(OrtxObject** object); |
| 73 | |
| 74 | /** \brief Dispose the specified object, given the object pointer itself |
| 75 | * |
| 76 | * \param object Pointer to the object to dispose (passed by value, not by address as in OrtxDispose, so the caller's pointer variable cannot be modified by this call) |
| 77 | * \return Error code indicating the success or failure of the operation |
| 78 | */ |
| 79 | extError_t ORTX_API_CALL OrtxDisposeOnly(OrtxObject* object); |
| 80 | |
| 81 | |
| 82 | /** \brief Create a tokenizer object with the specified tokenizer path |
| 83 | * |
| 84 | * \param tokenizer Address of an OrtxTokenizer* that receives the created tokenizer object |
| 85 | * \param tokenizer_path The path to the tokenizer (NOTE(review): whether this is a file or a directory is not stated here -- confirm) |
| 86 | * \return Error code indicating the success or failure of the operation |
| 87 | */ |
| 88 | extError_t ORTX_API_CALL OrtxCreateTokenizer(OrtxTokenizer** tokenizer, const char* tokenizer_path); |
| 89 | |
| 90 | /** \brief Tokenize a batch of input strings using the specified tokenizer |
| 91 | * |
| 92 | * \param tokenizer Pointer to the tokenizer object (const) |
| 93 | * \param input Array of input strings |
| 94 | * \param batch_size Number of input strings in the batch |
| 95 | * \param output Address of an OrtxTokenId2DArray* that receives the tokenized result |
| 96 | * \return Error code indicating the success or failure of the operation |
| 97 | */ |
| 98 | extError_t ORTX_API_CALL OrtxTokenize( |
| 99 | const OrtxTokenizer* tokenizer, const char* input[], size_t batch_size, OrtxTokenId2DArray** output); |
| 100 | |
| 101 | /** \brief Detokenize a token ID 2D array using the specified tokenizer |
| 102 | * |
| 103 | * \param tokenizer Pointer to the tokenizer object (const) |
| 104 | * \param input Pointer to the token ID 2D array object holding the input token IDs |
| 105 | * \param output Address of an OrtxStringArray* that receives the detokenized result |
| 106 | * \return Error code indicating the success or failure of the operation |
| 107 | */ |
| 108 | extError_t ORTX_API_CALL OrtxDetokenize( |
| 109 | const OrtxTokenizer* tokenizer, const OrtxTokenId2DArray* input, OrtxStringArray** output); |
| 110 | |
| 111 | /** \brief Detokenize the input using the specified tokenizer (1D version: takes a plain array of token IDs) |
| 112 | * |
| 113 | * \param tokenizer Pointer to the tokenizer object (const) |
| 114 | * \param input Pointer to the input token IDs (an array of extTokenId_t) |
| 115 | * \param len Length of the input token IDs array |
| 116 | * \param output Address of an OrtxStringArray* that receives the detokenized result |
| 117 | * \return Error code indicating the success or failure of the operation |
| 118 | */ |
| 119 | extError_t ORTX_API_CALL OrtxDetokenize1D( |
| 120 | const OrtxTokenizer* tokenizer, const extTokenId_t* input, size_t len, OrtxStringArray** output); |
| 121 | |
| 122 | /** \brief Detokenize one token ID using the specified tokenizer and a detokenizer cache |
| 123 | * |
| 124 | * \param tokenizer Pointer to the tokenizer object (const) |
| 125 | * \param cache Pointer to the detokenizer cache (non-const, unlike the tokenizer) |
| 126 | * \param next_id Next token ID to detokenize |
| 127 | * \param text_out Address of a const char* that receives the detokenized text (NOTE(review): lifetime of the returned text is not stated here -- confirm) |
| 128 | * \return Error code indicating the success or failure of the operation |
| 129 | */ |
| 130 | extError_t ORTX_API_CALL OrtxDetokenizeCached( |
| 131 | const OrtxTokenizer* tokenizer, OrtxDetokenizerCache* cache, extTokenId_t next_id, const char** text_out); |
| 132 | |
| 133 | /** \brief Get the length of the string array |
| 134 | * |
| 135 | * \param string_array Pointer to the string array (const) |
| 136 | * \param length Address of a size_t that receives the length of the string array |
| 137 | * \return Error code indicating the success or failure of the operation |
| 138 | */ |
| 139 | extError_t ORTX_API_CALL OrtxStringArrayGetBatch(const OrtxStringArray* string_array, size_t* length); |
| 140 | |
| 141 | /** \brief Get the item at the specified index from the string array |
| 142 | * |
| 143 | * \param string_array Pointer to the string array (const) |
| 144 | * \param index Index of the item to retrieve (NOTE(review): behavior for an out-of-range index is not stated here -- confirm) |
| 145 | * \param item Address of a const char* that receives the retrieved item |
| 146 | * \return Error code indicating the success or failure of the operation |
| 147 | */ |
| 148 | extError_t ORTX_API_CALL OrtxStringArrayGetItem(const OrtxStringArray* string_array, size_t index, const char** item); |
| 149 | |
| 150 | /** \brief Get the batch size of the token ID 2D array |
| 151 | * |
| 152 | * \param token_id_2d_array Pointer to the token ID 2D array (const) |
| 153 | * \param length Address of a size_t that receives the batch size |
| 154 | * \return Error code indicating the success or failure of the operation |
| 155 | */ |
| 156 | extError_t ORTX_API_CALL OrtxTokenId2DArrayGetBatch(const OrtxTokenId2DArray* token_id_2d_array, size_t* length); |
| 157 | |
| 158 | /** \brief Get the item at the specified index from the token ID 2D array |
| 159 | * |
| 160 | * \param token_id_2d_array Pointer to the token ID 2D array (const) |
| 161 | * \param index Index of the item to retrieve (NOTE(review): behavior for an out-of-range index is not stated here -- confirm) |
| 162 | * \param item Address of a const extTokenId_t* that receives the retrieved item |
| 163 | * \param length Address of a size_t that receives the length of the item |
| 164 | * \return Error code indicating the success or failure of the operation |
| 165 | */ |
| 166 | extError_t ORTX_API_CALL OrtxTokenId2DArrayGetItem( |
| 167 | const OrtxTokenId2DArray* token_id_2d_array, size_t index, const extTokenId_t** item, size_t* length); |
| 168 | |
| 169 | #ifdef __cplusplus |
| 170 | } |
| 171 | #endif |
| 172 | |