microsoft/onnxruntime-extensions
Publicmirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable
include/ortx_tokenizer.h
110lines · modecode
| 1 | // Copyright (c) Microsoft Corporation. All rights reserved. |
| 2 | // Licensed under the MIT License. |
| 3 | |
| 4 | // C ABI header file for the onnxruntime-extensions tokenization module |
| 5 | |
| 6 | #pragma once |
| 7 | |
| 8 | #include "ortx_utils.h" |
| 9 | |
| 10 | // typedefs (all aliases of OrtxObject) to avoid a flood of create/dispose functions, and to make the API more C++ friendly with less casting |
| 11 | typedef OrtxObject OrtxTokenizer; |
| 12 | typedef OrtxObject OrtxStringArray; |
| 13 | typedef OrtxObject OrtxTokenId2DArray; |
| 14 | typedef OrtxObject OrtxDetokenizerCache; |
| 15 | |
| 16 | |
| 17 | #ifdef __cplusplus |
| 18 | extern "C" { |
| 19 | #endif |
| 20 | |
| 21 | /** \brief Create a tokenizer object from the specified tokenizer directory |
| 22 | * |
| 23 | * \param[out] tokenizer Address of an OrtxTokenizer pointer that receives the created tokenizer object (NOTE(review): the call that releases it is not declared in this header -- confirm) |
| 24 | * \param[in] tokenizer_path UTF-8 encoded path to the tokenizer directory |
| 25 | * \return Error code indicating the success or failure of the operation |
| 26 | */ |
| 27 | extError_t ORTX_API_CALL OrtxCreateTokenizer(OrtxTokenizer** tokenizer, const char* tokenizer_path); |
| 28 | |
| 29 | /** \brief Tokenize a batch of input strings using the specified tokenizer |
| 30 | * |
| 31 | * \param[in] tokenizer Pointer to the tokenizer object |
| 32 | * \param[in] input Array of batch_size input strings |
| 33 | * \param[in] batch_size Number of input strings in the batch |
| 34 | * \param[out] output Address of an OrtxTokenId2DArray pointer that receives the tokenized result |
| 35 | * \return Error code indicating the success or failure of the operation |
| 36 | */ |
| 37 | extError_t ORTX_API_CALL OrtxTokenize( |
| 38 | const OrtxTokenizer* tokenizer, const char* input[], size_t batch_size, OrtxTokenId2DArray** output); |
| 39 | |
| 40 | /** \brief Detokenize a 2D array of token IDs using the specified tokenizer |
| 41 | * |
| 42 | * \param[in] tokenizer Pointer to the tokenizer object |
| 43 | * \param[in] input Pointer to the 2D array of input token IDs |
| 44 | * \param[out] output Address of an OrtxStringArray pointer that receives the detokenized result |
| 45 | * \return Error code indicating the success or failure of the operation |
| 46 | */ |
| 47 | extError_t ORTX_API_CALL OrtxDetokenize( |
| 48 | const OrtxTokenizer* tokenizer, const OrtxTokenId2DArray* input, OrtxStringArray** output); |
| 49 | |
| 50 | /** \brief Detokenize a 1D array of token IDs using the specified tokenizer (1D version of OrtxDetokenize) |
| 51 | * |
| 52 | * \param[in] tokenizer Pointer to the tokenizer object |
| 53 | * \param[in] input Pointer to the array of input token IDs |
| 54 | * \param[in] len Length of the input token ID array |
| 55 | * \param[out] output Address of an OrtxStringArray pointer that receives the detokenized result |
| 56 | * \return Error code indicating the success or failure of the operation |
| 57 | */ |
| 58 | extError_t ORTX_API_CALL OrtxDetokenize1D( |
| 59 | const OrtxTokenizer* tokenizer, const extTokenId_t* input, size_t len, OrtxStringArray** output); |
| 60 | |
| 61 | /** \brief Detokenize the next token ID using the specified tokenizer, with caching |
| 62 | * |
| 63 | * \param[in] tokenizer Pointer to the tokenizer object |
| 64 | * \param cache Pointer to the detokenizer cache (passed non-const; presumably updated by the call -- confirm) |
| 65 | * \param[in] next_id Next token ID to detokenize |
| 66 | * \param[out] text_out Address of a const char pointer that receives the detokenized text (NOTE(review): lifetime of the returned text is not stated here -- confirm) |
| 67 | * \return Error code indicating the success or failure of the operation |
| 68 | */ |
| 69 | extError_t ORTX_API_CALL OrtxDetokenizeCached( |
| 70 | const OrtxTokenizer* tokenizer, OrtxDetokenizerCache* cache, extTokenId_t next_id, const char** text_out); |
| 71 | |
| 72 | /** \brief Get the length (batch size) of the string array |
| 73 | * |
| 74 | * \param[in] string_array Pointer to the string array |
| 75 | * \param[out] length Pointer to a size_t that receives the length of the string array |
| 76 | * \return Error code indicating the success or failure of the operation |
| 77 | */ |
| 78 | extError_t ORTX_API_CALL OrtxStringArrayGetBatch(const OrtxStringArray* string_array, size_t* length); |
| 79 | |
| 80 | /** \brief Get the string item at the specified index of the string array |
| 81 | * |
| 82 | * \param[in] string_array Pointer to the string array |
| 83 | * \param[in] index Index of the item to retrieve |
| 84 | * \param[out] item Address of a const char pointer that receives the retrieved item (NOTE(review): lifetime of the returned string is not stated here -- confirm) |
| 85 | * \return Error code indicating the success or failure of the operation |
| 86 | */ |
| 87 | extError_t ORTX_API_CALL OrtxStringArrayGetItem(const OrtxStringArray* string_array, size_t index, const char** item); |
| 88 | |
| 89 | /** \brief Get the batch size of the token ID 2D array |
| 90 | * |
| 91 | * \param[in] token_id_2d_array Pointer to the token ID 2D array |
| 92 | * \param[out] length Pointer to a size_t that receives the batch size |
| 93 | * \return Error code indicating the success or failure of the operation |
| 94 | */ |
| 95 | extError_t ORTX_API_CALL OrtxTokenId2DArrayGetBatch(const OrtxTokenId2DArray* token_id_2d_array, size_t* length); |
| 96 | |
| 97 | /** \brief Get the token ID item at the specified index of the token ID 2D array |
| 98 | * |
| 99 | * \param[in] token_id_2d_array Pointer to the token ID 2D array |
| 100 | * \param[in] index Index of the item to retrieve |
| 101 | * \param[out] item Address of a const extTokenId_t pointer that receives the retrieved item |
| 102 | * \param[out] length Pointer to a size_t that receives the length of the item (presumably a count of token IDs -- confirm) |
| 103 | * \return Error code indicating the success or failure of the operation |
| 104 | */ |
| 105 | extError_t ORTX_API_CALL OrtxTokenId2DArrayGetItem( |
| 106 | const OrtxTokenId2DArray* token_id_2d_array, size_t index, const extTokenId_t** item, size_t* length); |
| 107 | |
| 108 | #ifdef __cplusplus |
| 109 | } |
| 110 | #endif |
| 111 | |