openai/openai-python
Publicmirrored from https://github.com/openai/openai-pythonAvailable
examples/realtime/audio_util.py
142lines · modeblame
488ec04bRobert Craigie1 years ago | 1 | from __future__ import annotations |
| 2 | | |
| 3 | import io | |
| 4 | import base64 | |
| 5 | import asyncio | |
| 6 | import threading | |
| 7 | from typing import Callable, Awaitable | |
| 8 | | |
| 9 | import numpy as np | |
| 10 | import pyaudio | |
| 11 | import sounddevice as sd | |
| 12 | from pydub import AudioSegment | |
| 13 | | |
3d3d16abstainless-app[bot]9 months ago | 14 | from openai.resources.realtime.realtime import AsyncRealtimeConnection |
488ec04bRobert Craigie1 years ago | 15 | |
| 16 | CHUNK_LENGTH_S = 0.05 # 50ms | |
| 17 | SAMPLE_RATE = 24000 | |
| 18 | FORMAT = pyaudio.paInt16 | |
| 19 | CHANNELS = 1 | |
| 20 | | |
| 21 | # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false | |
| 22 | | |
| 23 | | |
def audio_to_pcm16_base64(audio_bytes: bytes) -> bytes:
    """Decode an encoded audio file and return it as raw PCM16 bytes.

    The input is parsed with ``AudioSegment.from_file`` and then converted to
    ``SAMPLE_RATE`` Hz, ``CHANNELS`` channel(s) and a 2-byte sample width.

    NOTE: despite the ``base64`` in the name, the value returned is the raw
    byte string taken from ``raw_data``; no base64 encoding is applied here.

    Args:
        audio_bytes: Contents of an audio file in a format pydub can load.

    Returns:
        The converted audio as raw PCM bytes.
    """
    source = io.BytesIO(audio_bytes)
    audio = AudioSegment.from_file(source)
    print(f"Loaded audio: {audio.frame_rate=} {audio.channels=} {audio.sample_width=} {audio.frame_width=}")

    # Convert step by step: sample rate, then channel count, then 16-bit samples.
    resampled = audio.set_frame_rate(SAMPLE_RATE)
    remixed = resampled.set_channels(CHANNELS)
    pcm16 = remixed.set_sample_width(2)
    return pcm16.raw_data
| 31 | | |
| 32 | | |
class AudioPlayerAsync:
    """Plays queued PCM16 audio chunks through a sounddevice output stream.

    Chunks are appended with :meth:`add_data` and consumed by :meth:`callback`,
    which the output stream invokes whenever it needs more frames.  ``lock``
    guards both the chunk queue and the played-frame counter, because the
    stream callback and the code feeding the player run concurrently.
    """

    def __init__(self):
        # Pending audio chunks (1-D int16 arrays), oldest first.
        self.queue = []
        self.lock = threading.Lock()
        self.stream = sd.OutputStream(
            callback=self.callback,
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype=np.int16,
            blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE),
        )
        self.playing = False
        # Number of real (non-padding) frames handed to the stream so far.
        self._frame_count = 0

    def callback(self, outdata, frames, time, status):  # noqa
        """Fill ``outdata`` with the next ``frames`` samples from the queue.

        Queued chunks are consumed oldest first; a chunk that is only partly
        needed keeps its unused tail at the head of the queue.  If the queue
        runs dry the remainder of the buffer is padded with zeros (silence),
        and only the real samples are added to the frame counter.
        """
        with self.lock:
            pieces = []
            filled = 0

            # Take chunks until the buffer is full or nothing is left.
            while filled < frames and self.queue:
                item = self.queue[0]
                frames_needed = frames - filled
                if len(item) > frames_needed:
                    # Use only the head of this chunk and keep the rest queued,
                    # shrinking it in place instead of pop(0) + insert(0).
                    pieces.append(item[:frames_needed])
                    self.queue[0] = item[frames_needed:]
                    filled += frames_needed
                else:
                    pieces.append(item)
                    self.queue.pop(0)
                    filled += len(item)

            self._frame_count += filled

            # Pad with silence when there was not enough queued audio.
            if filled < frames:
                pieces.append(np.zeros(frames - filled, dtype=np.int16))

            # Concatenate once rather than once per consumed chunk.
            data = np.concatenate(pieces) if pieces else np.empty(0, dtype=np.int16)

        outdata[:] = data.reshape(-1, 1)

    def reset_frame_count(self):
        """Reset the played-frame counter to zero."""
        # Take the lock: the callback does a read-modify-write on the counter,
        # and an unguarded reset could be overwritten by an in-flight increment.
        with self.lock:
            self._frame_count = 0

    def get_frame_count(self):
        """Return the number of real frames played since the last reset."""
        with self.lock:
            return self._frame_count

    def add_data(self, data: bytes):
        """Queue a chunk of audio and start playback if it is not running.

        Args:
            data: Raw bytes interpreted as int16 samples (the original comment
                describes them as single-channel PCM16).
        """
        with self.lock:
            # bytes is pcm16 single channel audio data, convert to numpy array
            np_data = np.frombuffer(data, dtype=np.int16)
            self.queue.append(np_data)
            if not self.playing:
                self.start()

    def start(self):
        """Mark the player as playing and start the output stream."""
        self.playing = True
        self.stream.start()

    def stop(self):
        """Stop the output stream and discard any audio still queued."""
        self.playing = False
        # Stop before taking the lock so a callback that is waiting on the lock
        # cannot block the stream from stopping.
        self.stream.stop()
        with self.lock:
            self.queue = []

    def terminate(self):
        """Close the output stream."""
        self.stream.close()
| 93 | | |
| 94 | | |
async def send_audio_worker_sounddevice(
    connection: AsyncRealtimeConnection,
    should_send: Callable[[], bool] | None = None,
    start_send: Callable[[], Awaitable[None]] | None = None,
):
    """Stream microphone audio to a realtime connection until interrupted.

    Audio is read from a sounddevice input stream in blocks of
    ``SAMPLE_RATE * 0.02`` frames and sent base64-encoded as
    ``input_audio_buffer.append`` events.  When ``should_send`` turns false
    after audio has been sent, the buffer is committed and a response is
    requested.

    Args:
        connection: Connection the audio events are sent on.
        should_send: Optional predicate polled for every block; audio is sent
            only while it returns true.  When omitted, audio is always sent.
        start_send: Optional coroutine function awaited once before the first
            block of each sending period.
    """
    sent_audio = False

    device_info = sd.query_devices()
    print(device_info)

    # Number of frames per read: 0.02 s worth of audio at SAMPLE_RATE.
    read_size = int(SAMPLE_RATE * 0.02)

    stream = sd.InputStream(
        channels=CHANNELS,
        samplerate=SAMPLE_RATE,
        dtype="int16",
    )

    try:
        # Start inside the try block so the stream is still closed by the
        # finally clause if starting it fails.
        stream.start()

        while True:
            # Poll instead of blocking in stream.read(); sleep(0) hands control
            # back to the event loop so other tasks keep running.
            if stream.read_available < read_size:
                await asyncio.sleep(0)
                continue

            data, _ = stream.read(read_size)

            if not should_send or should_send():
                if not sent_audio and start_send:
                    await start_send()
                await connection.send(
                    {"type": "input_audio_buffer.append", "audio": base64.b64encode(data).decode("utf-8")}
                )
                sent_audio = True

            elif sent_audio:
                # Sending just stopped: commit what was buffered and ask for a response.
                print("Done, triggering inference")
                await connection.send({"type": "input_audio_buffer.commit"})
                await connection.send({"type": "response.create", "response": {}})
                sent_audio = False

            await asyncio.sleep(0)

    except KeyboardInterrupt:
        # Deliberate: Ctrl-C ends the worker quietly; cleanup happens below.
        pass
    finally:
        stream.stop()
        stream.close()