microsoft/onnxruntime-extensions

Public

mirrored from https://github.com/microsoft/onnxruntime-extensionsAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
copilot/document-operator-contracts

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

include/ortx_extractor.h

131lines · modecode

1// C ABI header file for the onnxruntime-extensions audio feature extraction module
2
3#pragma once
4
5#include "ortx_utils.h"
6
7typedef OrtxObject OrtxFeatureExtractor;  // handle type taken by OrtxSpeechLogMel and OrtxFeatureExtraction
8typedef OrtxObject OrtxRawAudios;  // handle type produced by OrtxLoadAudios and OrtxCreateRawAudios
9
10#ifdef __cplusplus
11extern "C" {
12#endif
13
14/**
15 * @brief Creates a speech feature extractor object.
16 *
17 * This function creates a feature extractor object based on the provided feature definition.
18 * NOTE(review): the expected form of `fe_def` (inline definition text vs. a file path) is not stated here - confirm.
19 * @param[out] extractor Pointer to a pointer to the created feature extractor object.
20 * @param[in] fe_def The feature definition used to create the feature extractor.
21 *
22 * @return An error code indicating the result of the operation.
23 */
24extError_t ORTX_API_CALL OrtxCreateSpeechFeatureExtractor(OrtxFeatureExtractor** extractor, const char* fe_def);
25
26/**
27 * @brief Loads a collection of audio files into memory.
28 *
29 * This function loads a collection of audio files specified by the `audio_paths` array
30 * into memory and returns a pointer to the loaded audio data in the `audios` parameter.
31 *
32 * @param[out] audios A pointer to a pointer that will be updated with the loaded audio data.
33 * The caller is responsible for freeing the memory allocated for the audio data.
34 * @param[in] audio_paths An array of `num_audios` strings representing the paths to the audio files to be loaded.
35 * @param[in] num_audios The number of audio files to be loaded.
36 *
37 * @return An `extError_t` value indicating the success or failure of the operation.
38 */
39extError_t ORTX_API_CALL OrtxLoadAudios(OrtxRawAudios** audios, const char* const* audio_paths, size_t num_audios);
40
41/**
42 * @brief Creates an array of raw audio objects, which refers to the audio data and sizes provided.
43 *
44 * This function creates an array of raw audio objects based on the provided data and sizes. The created objects are
45 * returned through the `audios` parameter.
46 * NOTE(review): "refers to" suggests the buffers are referenced rather than copied, so they may need to outlive `audios` - confirm.
47 * @param[out] audios Pointer to the variable that will hold the created raw audio objects.
48 * @param[in] data Array of pointers to the audio data.
49 * @param[in] sizes Array of the sizes (int64_t values, not pointers) of the audio data; unit is not stated here - TODO confirm.
50 * @param[in] num_audios Number of audio objects to create.
51 *
52 * @return extError_t Error code indicating the success or failure of the operation.
53 */
54extError_t ORTX_API_CALL OrtxCreateRawAudios(OrtxRawAudios** audios, const void* data[], const int64_t sizes[],
55 size_t num_audios);
56
57/**
58 * @brief Calculates the log mel spectrogram for the given audio using the specified feature extractor.
59 *
60 * This function takes an OrtxFeatureExtractor handle, an OrtxRawAudios handle,
61 * and a pointer to an OrtxTensorResult pointer. It calculates the log mel spectrogram for the given audio using
62 * the specified feature extractor and stores the result in the provided log_mel pointer.
63 *
64 * @param[in] extractor The feature extractor to use for calculating the log mel spectrogram.
65 * @param[in] audio The raw audio data to process (an OrtxRawAudios handle).
66 * @param[out] log_mel A pointer to an OrtxTensorResult pointer where the result will be stored.
67 * @return An extError_t value indicating the success or failure of the operation.
68 */
69extError_t ORTX_API_CALL OrtxSpeechLogMel(OrtxFeatureExtractor* extractor, OrtxRawAudios* audio,
70 OrtxTensorResult** log_mel);
71
72/**
73 * @brief Splits an input audio signal and outputs the areas of high vs low energy based on the STFT analysis.
74 *
75 * This function takes an input waveform tensor and associated parameters such as sample rate,
76 * frame length, hop length, and energy threshold (in dB), and identifies contiguous segments
77 * of speech or sound activity. It writes the resulting segment start and end indices into
78 * the provided output tensor.
79 *
80 * @param[in] input The input waveform tensor (1D or 2D) containing audio samples.
81 * @param[in] sr_tensor A tensor containing the sample rate of the input audio (in Hz).
82 * @param[in] frame_ms_tensor A tensor containing the frame size in milliseconds.
83 * @param[in] hop_ms_tensor A tensor containing the hop length in milliseconds.
84 * @param[in] energy_threshold_db_tensor A tensor specifying the energy threshold in decibels (dB)
85 * used to decide which frames are considered active.
86 * @param[out] output0 A pointer to an output tensor (passed in by the caller as a single-level pointer) where the resulting segments will be written.
87 * Each row contains two integers: [start_sample, end_sample] for a detected segment.
88 * @return An extError_t value indicating the success or failure of the operation.
89 */
90extError_t ORTX_API_CALL OrtxSplitSignalSegments(const OrtxTensor* input, const OrtxTensor* sr_tensor,
91 const OrtxTensor* frame_ms_tensor, const OrtxTensor* hop_ms_tensor,
92 const OrtxTensor* energy_threshold_db_tensor, OrtxTensor* output0);
93
94/**
95 * @brief Merges adjacent signal segments that are separated by short gaps.
96 *
97 * This function takes a tensor of detected segments (each row containing [start, end] indices)
98 * and merges any consecutive segments whose gap is smaller than the specified threshold (in milliseconds).
99 * NOTE(review): segments are documented as sample indices but the gap is in milliseconds and no sample rate is passed - confirm how they are compared.
100 * @param[in] segments_tensor The input tensor of detected segments, of shape [N, 2].
101 * @param[in] merge_gap_ms_tensor A tensor containing a single integer value representing
102 * the maximum allowed gap (in milliseconds) between consecutive segments to be merged.
103 * @param[out] output0 A pointer to an output tensor where the merged segments will be stored.
104 * Each row contains two integers: [merged_start_sample, merged_end_sample].
105 * @return An extError_t value indicating the success or failure of the operation.
106 */
107extError_t ORTX_API_CALL OrtxMergeSignalSegments(const OrtxTensor* segments_tensor,
108 const OrtxTensor* merge_gap_ms_tensor, OrtxTensor* output0);
109
110/**
111 * @brief Extracts log-mel features from raw audio data using a feature extractor.
112 *
113 * This function processes the input audio buffers through the provided feature extractor,
114 * producing log-mel spectrogram outputs suitable for inference or further signal analysis.
115 *
116 * @param[in] extractor A pointer to an OrtxFeatureExtractor object that defines the feature
117 * extraction pipeline and processing parameters.
118 * @param[in] audio A pointer to an OrtxRawAudios structure containing raw audio data buffers
119 * and associated metadata (e.g., sampling rate, channels).
120 * @param[out] result A pointer to an OrtxTensorResult pointer that will be allocated and set to
121 * hold the resulting log-mel spectrogram data and other outputs based on json configuration.
122 *
123 * @return An extError_t value indicating success or error status. Returns the success code (named here as
124 * EXT_SUCCESS - NOTE(review): confirm this constant's name in ortx_utils.h) on success, or an error code if extraction fails.
125 */
126extError_t ORTX_API_CALL OrtxFeatureExtraction(OrtxFeatureExtractor* extractor, OrtxRawAudios* audio,
127 OrtxTensorResult** result);
128
129#ifdef __cplusplus
130}
131#endif