microsoft/onnxruntime-extensions

Public

mirrored fromhttps://github.com/microsoft/onnxruntime-extensionsAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

82a59c634565e72b210831c109bf971b90030efb

Find a branch or tag

Branches

82a59c634565e72b210831c109bf971b90030efb

Clone

HTTPS

Download ZIP

onnxruntime-extensions/include

include/ortx_extractor.h

183lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`// C ABI header file for the onnxruntime-extensions feature extraction module`
2
3	`#pragma once`
4
5	`#include "ortx_utils.h"`
6
7	`typedef OrtxObject OrtxFeatureExtractor;`
8	`typedef OrtxObject OrtxRawAudios;`
9
10	`#ifdef __cplusplus`
11	`extern "C" {`
12	`#endif`
13
14	`/**`
15	`* @brief Creates a feature extractor object.`
16	`*`
17	`* The extractor is built from the feature definition in fe_def and returned through the extractor parameter.`
18	`*`
19	`* @param[out] extractor Pointer to a pointer that receives the created feature extractor object.`
20	`* @param[in] fe_def The feature definition (NOTE(review): file path vs. inline text is not shown here - confirm).`
21	`*`
22	`* @return An extError_t error code indicating the result of the operation.`
23	`*/`
24	`extError_t ORTX_API_CALL OrtxCreateSpeechFeatureExtractor(OrtxFeatureExtractor** extractor, const char* fe_def);`
25
26	`/**`
27	`* @brief Loads a collection of audio files into memory.`
28	`*`
29	* This function loads the audio files listed in the `audio_paths` array into memory
30	* and returns the loaded audio data through the `audios` output parameter.
31	`*`
32	`* @param[out] audios A pointer to a pointer that will be updated with the loaded audio data.`
33	`* The caller is responsible for freeing the memory allocated for the audio data.`
34	`* @param[in] audio_paths An array of strings representing the paths to the audio files to be loaded.`
35	`* @param[in] num_audios The number of audio files to be loaded.`
36	`*`
37	* @return An `extError_t` value indicating the success or failure of the operation.
38	`*/`
39	`extError_t ORTX_API_CALL OrtxLoadAudios(OrtxRawAudios** audios, const char* const* audio_paths, size_t num_audios);`
40
41	`/**`
42	`* @brief Creates an array of raw audio objects, which refer to the audio data and sizes provided.`
43	`*`
44	`* This function creates an array of raw audio objects based on the provided data and sizes. The created objects are`
45	* returned through the `audios` parameter.
46	`*`
47	`* @param[out] audios Pointer to the variable that will hold the created raw audio objects.`
48	`* @param[in] data Array of pointers to the audio data.`
49	`* @param[in] sizes Array of sizes (int64_t values, not pointers) of the audio data.`
50	`* @param[in] num_audios Number of audio objects to create.`
51	`*`
52	`* @return An extError_t error code indicating the success or failure of the operation.`
53	`*/`
54	`extError_t ORTX_API_CALL OrtxCreateRawAudios(OrtxRawAudios** audios, const void* data[], const int64_t sizes[],`
55	`size_t num_audios);`
56
57	`/**`
58	`* @brief Calculates the log mel spectrogram for a given audio using the specified feature extractor.`
59	`*`
60	`* This function takes a pointer to an OrtxFeatureExtractor, a pointer to an OrtxRawAudios object,`
61	`* and a pointer to an OrtxTensorResult pointer. It calculates the log mel spectrogram for the given audio using`
62	`* the specified feature extractor and stores the result through the provided log_mel pointer.`
63	`*`
64	`* @param[in] extractor The feature extractor to use for calculating the log mel spectrogram.`
65	`* @param[in] audio The raw audio data to process.`
66	`* @param[out] log_mel A pointer to an OrtxTensorResult pointer where the result will be stored.`
67	`* @return An extError_t value indicating the success or failure of the operation.`
68	`*/`
69	`extError_t ORTX_API_CALL OrtxSpeechLogMel(OrtxFeatureExtractor* extractor, OrtxRawAudios* audio,`
70	`OrtxTensorResult** log_mel);`
71
72	`/**`
73	`* @brief Splits an input audio signal and outputs the areas of high vs. low energy based on STFT analysis.`
74	`*`
75	`* This function takes an input waveform tensor and associated parameters such as sample rate,`
76	`* frame length, hop length, and energy threshold (in dB), and identifies contiguous segments`
77	`* of speech or sound activity. It writes the resulting segment start and end indices into`
78	`* the provided output tensor.`
79	`*`
80	`* @param[in] input The input waveform tensor (1D or 2D) containing audio samples.`
81	`* @param[in] sr_tensor A tensor containing the sample rate of the input audio (in Hz).`
82	`* @param[in] frame_ms_tensor A tensor containing the frame size in milliseconds.`
83	`* @param[in] hop_ms_tensor A tensor containing the hop length in milliseconds.`
84	`* @param[in] energy_threshold_db_tensor A tensor specifying the energy threshold in decibels (dB)`
85	`* used to decide which frames are considered active.`
86	`* @param[out] output0 A pointer to an output tensor where the resulting segments will be written.`
87	`* Each row contains two integers: [start_sample, end_sample] for a detected segment.`
88	`* @return An extError_t value indicating the success or failure of the operation.`
89	`*/`
90	`extError_t ORTX_API_CALL OrtxSplitSignalSegments(const OrtxTensor* input, const OrtxTensor* sr_tensor,`
91	`const OrtxTensor* frame_ms_tensor, const OrtxTensor* hop_ms_tensor,`
92	`const OrtxTensor* energy_threshold_db_tensor, OrtxTensor* output0);`
93
94	`/**`
95	`* @brief Merges adjacent signal segments that are separated by short gaps.`
96	`*`
97	`* This function takes a tensor of detected segments (each row containing [start, end] indices)`
98	`* and merges any consecutive segments whose gap is smaller than the specified threshold (in milliseconds).`
99	`*`
100	`* @param[in] segments_tensor The input tensor of detected segments, of shape [N, 2].`
101	`* @param[in] merge_gap_ms_tensor A tensor containing a single integer value representing`
102	`* the maximum allowed gap (in milliseconds) between consecutive segments to be merged.`
103	`* @param[out] output0 A pointer to an output tensor where the merged segments will be stored.`
104	`* Each row contains two integers: [merged_start_sample, merged_end_sample].`
105	`* @return An extError_t value indicating the success or failure of the operation.`
106	`*/`
107	`extError_t ORTX_API_CALL OrtxMergeSignalSegments(const OrtxTensor* segments_tensor,`
108	`const OrtxTensor* merge_gap_ms_tensor, OrtxTensor* output0);`
109
110	`/**`
111	`* @brief Extracts log-mel features from raw audio data using a feature extractor.`
112	`*`
113	`* This function processes the input audio buffers through the provided feature extractor,`
114	`* producing log-mel spectrogram outputs suitable for inference or further signal analysis.`
115	`*`
116	`* @param[in] extractor A pointer to an OrtxFeatureExtractor object that defines the feature`
117	`* extraction pipeline and processing parameters.`
118	`* @param[in] audio A pointer to an OrtxRawAudios structure containing raw audio data buffers`
119	`* and associated metadata (e.g., sampling rate, channels).`
120	`* @param[out] result A pointer to an OrtxTensorResult pointer that will be allocated and set to`
121	`* hold the resulting log-mel spectrogram data and other outputs based on the JSON configuration.`
122	`*`
123	`* @return An extError_t value indicating success or error status. Returns`
124	`* EXT_SUCCESS on success, or an appropriate error code if extraction fails.`
125	`*/`
126	`extError_t ORTX_API_CALL OrtxFeatureExtraction(OrtxFeatureExtractor* extractor, OrtxRawAudios* audio,`
127	`OrtxTensorResult** result);`
128
129	`/**`
130	`* @brief Decode raw audio bytes to float32 PCM samples.`
131	`*`
132	`* Decodes the audio at the given index from an OrtxRawAudios object into`
133	`* float32 PCM samples (mono when stereo_to_mono is non-zero). The result contains two tensors:`
134	`* [0] = float32 PCM samples of shape [1, num_samples], or [num_channels, num_samples] if stereo_to_mono is zero`
135	`* [1] = int64 sample rate of shape [1]`
136	`*`
137	`* Note: this entry point constructs and initializes a fresh AudioDecoder on every call. For`
138	`* decoding many audios at once, prefer OrtxDecodeAudios, which amortizes decoder`
139	`* initialization across the whole batch.`
140	`*`
141	`* @param[in] raw_audios An OrtxRawAudios object containing one or more audio buffers.`
142	`* @param[in] index Index of the audio to decode (0-based).`
143	`* @param[in] target_sample_rate Target sample rate for resampling. Set to 0 to keep the native rate.`
144	`* @param[in] stereo_to_mono If non-zero, multi-channel audio is downmixed to mono. If zero,`
145	`* channels are preserved (PCM tensor shape becomes [num_channels,`
146	`* num_samples]).`
147	`* @param[out] result OrtxTensorResult with [pcm, sample_rate] tensors.`
148	`*`
149	`* @return An extError_t value indicating success or error status.`
150	`*/`
151	`extError_t ORTX_API_CALL OrtxDecodeAudio(OrtxRawAudios* raw_audios, size_t index, int64_t target_sample_rate,`
152	`int stereo_to_mono, OrtxTensorResult** result);`
153
154	`/**`
155	`* @brief Decode all raw audio buffers in an OrtxRawAudios container to float32 PCM samples.`
156	`*`
157	`* Batch variant of OrtxDecodeAudio: initializes the decoder once and decodes every audio in`
158	`* the container, producing one OrtxTensorResult per audio. Each result has the same layout as`
159	`* OrtxDecodeAudio:`
160	`* [0] = float32 PCM samples of shape [1, num_samples], or [num_channels, num_samples] if stereo_to_mono is zero`
161	`* [1] = int64 sample rate of shape [1]`
162	`*`
163	`* Failure semantics: fail-fast. On error, any results already produced are freed and every`
164	* entry in `results` is set to NULL.
165	`*`
166	`* @param[in] raw_audios An OrtxRawAudios object containing one or more audio buffers.`
167	`* @param[in] target_sample_rate Target sample rate for resampling. Set to 0 to keep the native rate.`
168	`* @param[in] stereo_to_mono If non-zero, multi-channel audio is downmixed to mono. If zero,`
169	`* channels are preserved (PCM tensor shape becomes [num_channels,`
170	`* num_samples]).`
171	* @param[out] results Caller-allocated array of size `num_results`; on success each entry
172	`* is set to a new OrtxTensorResult* that the caller must dispose.`
173	* @param[in] num_results Size of the `results` array; must equal the number of audios in
174	* `raw_audios`.
175	`*`
176	`* @return An extError_t value indicating success or error status.`
177	`*/`
178	`extError_t ORTX_API_CALL OrtxDecodeAudios(OrtxRawAudios* raw_audios, int64_t target_sample_rate,`
179	`int stereo_to_mono, OrtxTensorResult** results, size_t num_results);`
180
181	`#ifdef __cplusplus`
182	`}`
183	`#endif`
184

microsoft/onnxruntime-extensions

Branches

Tags

Clone