Here’s how to receive a stream of chunks into a buffer, and only afterwards play the WAV:
import io
import pyaudio
from openai import OpenAI

def byteplay(bytestream):
    """Play raw 16-bit, 24 kHz, mono PCM on the default output device."""
    pya = pyaudio.PyAudio()
    stream = pya.open(format=pya.get_format_from_width(width=2), channels=1, rate=24000, output=True)
    stream.write(bytestream)
    stream.stop_stream()
    stream.close()
    pya.terminate()

client = OpenAI()

with client.audio.speech.with_streaming_response.create(
    model="tts-1",
    voice="alloy",
    input="hello there, I'm making a WAV file today",
    response_format='wav'
) as response:
    # Initialize an empty bytes buffer
    buffer = io.BytesIO()
    # Read audio data from the generator
    for chunk in response.iter_bytes():
        print(len(chunk))
        buffer.write(chunk)

# Go back to the start of the buffer
buffer.seek(0)
# Play the audio from the buffer
byteplay(buffer.getvalue())
#print(len(buffer.getvalue()))
So you can pull the HTTP chunks out and play the assembled audio back.
Can WAV be played instantly? Yes, if you want immediate buffer underruns and noise. With a 100 kB preload threaded playback buffer, I only got one sentence to play before a buffer underrun on my WiFi PC: the API doesn’t deliver uncompressed audio fast enough even at 24 kHz mono (16-bit PCM at that rate needs a sustained 48 kB/s). You also can’t learn the final length from the stream, so you can’t compute how much to prebuffer even if you measure the incoming rate.
Buffered WAV streaming with PyAudio:
import pyaudio
import queue
import threading
from openai import OpenAI

def play_audio_data(pya, audio_queue):
    """Plays audio chunks from the queue."""
    stream = pya.open(format=pya.get_format_from_width(width=2), channels=1, rate=24000, output=True)
    while True:
        chunk = audio_queue.get()
        if chunk is None:  # Sentinel value to stop the playback
            break
        stream.write(chunk)
    stream.stop_stream()
    stream.close()

def stream_audio(model: str, voice: str, input_text: str, initial_buffer_size: int = 150000):
    pya = pyaudio.PyAudio()
    audio_queue = queue.Queue()
    client = OpenAI()
    play_thread = None
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=input_text,
        response_format='wav'
    ) as response:
        buffer = b''  # Temporary buffer to accumulate initial chunks
        playback_started = False
        # Process each chunk only once
        for chunk in response.iter_bytes():
            if not playback_started:
                buffer += chunk
                # Check if initial buffer is sufficiently filled
                if len(buffer) >= initial_buffer_size:
                    # Start the playback thread once the buffer size is reached
                    audio_queue.put(buffer)  # Send the initial buffer to the queue
                    play_thread = threading.Thread(target=play_audio_data, args=(pya, audio_queue))
                    play_thread.start()
                    playback_started = True
                    buffer = b''  # Clear the initial buffer since it's now in the queue
            else:
                audio_queue.put(chunk)
    if not playback_started:
        # If the stream ends before filling the initial buffer, start playback with whatever we have
        audio_queue.put(buffer)
        play_thread = threading.Thread(target=play_audio_data, args=(pya, audio_queue))
        play_thread.start()
    # End signal for the playback thread
    audio_queue.put(None)
    if play_thread:
        play_thread.join()
    # Cleanup
    pya.terminate()

# Usage
stream_audio(model="tts-1", voice="alloy", input_text="hello there, I'm making a wav file today")
Thus we must go compressed (and I choose an open-source codec) to make this thing speak several paragraphs before a chat completions response has even finished.
Unlike AAC, which arrives as a raw stream that you have to mux into an MP4 container if you want a normal file, specifying opus gets you Opus wrapped in an Ogg container stream.
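The request looks the same as before, just with response_format='opus'. A minimal sketch that saves the Ogg stream to disk as it arrives (the filename is arbitrary):

import io
from openai import OpenAI

client = OpenAI()

# Request Opus; the API delivers it inside an Ogg container stream
with client.audio.speech.with_streaming_response.create(
    model="tts-1",
    voice="alloy",
    input="hello there, I'm making an Ogg Opus stream today",
    response_format='opus'
) as response:
    with open("speech.ogg", "wb") as f:
        for chunk in response.iter_bytes():
            f.write(chunk)  # each HTTP chunk is a slice of the Ogg byte stream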
The Ogg audio is sent in small HTTP chunks by OpenAI, but the actual Ogg pages are large. The first two Ogg pages carry no audio, just headers with lots of null tag space. The internal 20 ms frame latency of Opus as a codec can’t be accessed directly. One must do a couple of rounds of buffering, reassemble the Ogg pages, and decode and play them gaplessly. That puts at least a sentence of delay before you could start playing without buffer underruns in a Python app. I’ll make it work though…
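Here’s one way I’d sketch the decode step, assuming PyAV (pip install av) as the demuxer/decoder: ffmpeg’s Ogg demuxer can read a non-seekable pipe, so we can hand it a file-like wrapper around the HTTP iterator and let it pull chunks on demand. A sketch, not battle-tested:

import io
import av  # PyAV: ffmpeg bindings, handles Ogg demuxing and Opus decoding
import pyaudio
from openai import OpenAI

class ChunkReader(io.RawIOBase):
    """File-like wrapper that feeds HTTP chunks to PyAV on demand."""
    def __init__(self, chunk_iter):
        self._iter = chunk_iter
        self._leftover = b""

    def readable(self):
        return True

    def readinto(self, b):
        data = self._leftover or next(self._iter, b"")
        if not data:
            return 0  # EOF
        n = min(len(b), len(data))
        b[:n] = data[:n]
        self._leftover = data[n:]
        return n

client = OpenAI()
pya = pyaudio.PyAudio()
out = pya.open(format=pya.get_format_from_width(2), channels=1, rate=24000, output=True)

with client.audio.speech.with_streaming_response.create(
    model="tts-1",
    voice="alloy",
    input="hello there, streaming Opus this time",
    response_format='opus'
) as response:
    container = av.open(io.BufferedReader(ChunkReader(response.iter_bytes())))
    # Opus decodes at 48 kHz; convert to 16-bit 24 kHz mono to match the earlier PyAudio setup
    resampler = av.AudioResampler(format="s16", layout="mono", rate=24000)
    for frame in container.decode(audio=0):
        for pcm in resampler.resample(frame):  # PyAV >= 9 returns a list of frames
            out.write(bytes(pcm.planes[0]))
    for pcm in resampler.resample(None):  # flush the resampler at end of stream
        out.write(bytes(pcm.planes[0]))

out.stop_stream()
out.close()
pya.terminate()

This starts playing as soon as ffmpeg has probed the stream, so on a slow connection you’d still want the same prebuffering trick as the WAV version in front of it.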
Or just resend the Ogg packets to a browser and let that robust client figure out the playback.
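A minimal sketch of that, assuming Flask (the route and the use of plain chunked HTTP rather than WebRTC are my choices here): proxy the Ogg chunks straight through, and Chromium or Firefox will play them from an <audio> element natively (Safari’s Ogg Opus support varies).

from flask import Flask, Response
from openai import OpenAI

app = Flask(__name__)
client = OpenAI()

@app.route("/speak/<text>")
def speak(text):
    def generate():
        # Proxy the Ogg Opus chunks straight through to the browser
        with client.audio.speech.with_streaming_response.create(
            model="tts-1",
            voice="alloy",
            input=text,
            response_format='opus'
        ) as response:
            yield from response.iter_bytes()
    return Response(generate(), mimetype="audio/ogg")

# In the page: <audio src="/speak/hello%20there" controls autoplay></audio>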