microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
0f1f454867b57511e4ebdf5449483d0ce207b368

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

include/ortx_tokenizer.h

110lines · modecode

1// Copyright (c) Microsoft Corporation. All rights reserved.
2// Licensed under the MIT License.
3
4// C ABI header file for the onnxruntime-extensions tokenization module
5
6#pragma once
7
8#include "ortx_utils.h"
9
10// typedefs to avoid a flood of create/dispose functions, and to make the API more C++ friendly with less casting; every alias below names the same OrtxObject type
11typedef OrtxObject OrtxTokenizer;
12typedef OrtxObject OrtxStringArray;
13typedef OrtxObject OrtxTokenId2DArray;
14typedef OrtxObject OrtxDetokenizerCache;
15
16
17#ifdef __cplusplus
18extern "C" {
19#endif
20
21/** \brief Create a tokenizer object from the specified tokenizer path
22 *
23 * \param tokenizer Output parameter: pointer that receives the created tokenizer object
24 * \param tokenizer_path The UTF-8 encoded path to the tokenizer directory
25 * \return Error code indicating the success or failure of the operation
26 */
27extError_t ORTX_API_CALL OrtxCreateTokenizer(OrtxTokenizer** tokenizer, const char* tokenizer_path);
28
29/** \brief Tokenize a batch of input strings using the specified tokenizer
30 *
31 * \param tokenizer Pointer to the tokenizer object
32 * \param input Array of batch_size input strings
33 * \param batch_size Number of input strings in the batch
34 * \param output Output parameter: pointer that receives the resulting token ID 2D array
35 * \return Error code indicating the success or failure of the operation
36 */
37extError_t ORTX_API_CALL OrtxTokenize(
38 const OrtxTokenizer* tokenizer, const char* input[], size_t batch_size, OrtxTokenId2DArray** output);
39
40/** \brief Detokenize a token ID 2D array using the specified tokenizer
41 *
42 * \param tokenizer Pointer to the tokenizer object
43 * \param input Pointer to the token ID 2D array to detokenize
44 * \param output Output parameter: pointer that receives the resulting string array
45 * \return Error code indicating the success or failure of the operation
46 */
47extError_t ORTX_API_CALL OrtxDetokenize(
48 const OrtxTokenizer* tokenizer, const OrtxTokenId2DArray* input, OrtxStringArray** output);
49
50/** \brief Detokenize a flat array of token IDs using the specified tokenizer (1D version)
51 *
52 * \param tokenizer Pointer to the tokenizer object
53 * \param input Pointer to an array of len input token IDs
54 * \param len Length of the input token IDs array
55 * \param output Output parameter: pointer that receives the resulting string array
56 * \return Error code indicating the success or failure of the operation
57 */
58extError_t ORTX_API_CALL OrtxDetokenize1D(
59 const OrtxTokenizer* tokenizer, const extTokenId_t* input, size_t len, OrtxStringArray** output);
60
61/** \brief Detokenize a single token ID using the specified tokenizer and a detokenizer cache
62 *
63 * \param tokenizer Pointer to the tokenizer object
64 * \param cache Pointer to the detokenizer cache (non-const, so the call may modify it)
65 * \param next_id Next token ID to detokenize
66 * \param text_out Output parameter: pointer that receives the detokenized text
67 * \return Error code indicating the success or failure of the operation
68 */
69extError_t ORTX_API_CALL OrtxDetokenizeCached(
70 const OrtxTokenizer* tokenizer, OrtxDetokenizerCache* cache, extTokenId_t next_id, const char** text_out);
71
72/** \brief Get the length (batch size) of the string array
73 *
74 * \param string_array Pointer to the string array
75 * \param length Output parameter: pointer that receives the length of the string array
76 * \return Error code indicating the success or failure of the operation
77 */
78extError_t ORTX_API_CALL OrtxStringArrayGetBatch(const OrtxStringArray* string_array, size_t* length);
79
80/** \brief Get the string at the specified index from the string array
81 *
82 * \param string_array Pointer to the string array
83 * \param index Index of the item to retrieve
84 * \param item Output parameter: pointer that receives the retrieved string
85 * \return Error code indicating the success or failure of the operation
86 */
87extError_t ORTX_API_CALL OrtxStringArrayGetItem(const OrtxStringArray* string_array, size_t index, const char** item);
88
89/** \brief Get the batch size of the token ID 2D array
90 *
91 * \param token_id_2d_array Pointer to the token ID 2D array
92 * \param length Output parameter: pointer that receives the batch size
93 * \return Error code indicating the success or failure of the operation
94 */
95extError_t ORTX_API_CALL OrtxTokenId2DArrayGetBatch(const OrtxTokenId2DArray* token_id_2d_array, size_t* length);
96
97/** \brief Get the item at the specified index from the token ID 2D array
98 *
99 * \param token_id_2d_array Pointer to the token ID 2D array
100 * \param index Index of the item to retrieve
101 * \param item Output parameter: pointer that receives the address of the retrieved item's token IDs
102 * \param length Output parameter: pointer that receives the length of the retrieved item
103 * \return Error code indicating the success or failure of the operation
104 */
105extError_t ORTX_API_CALL OrtxTokenId2DArrayGetItem(
106 const OrtxTokenId2DArray* token_id_2d_array, size_t index, const extTokenId_t** item, size_t* length);
107
108#ifdef __cplusplus
109}
110#endif
111