Connection getting closed when streaming audio to the Realtime API using the OpenAI Python SDK

I am writing Python client code to stream audio from a microphone to the Realtime API using AsyncOpenAI, and I get a connection closed error as soon as I start streaming the audio.

If I stream audio from a file, or save the entire audio to a buffer and then stream it, everything works fine. But if I stream audio in real time from a mic, I get the connection closed error.

Error:
websockets.exceptions.ConnectionClosedOK: sent 1000 (OK); then received 1000 (OK)

Please post your code


Here is the Python client code I wrote to stream audio data to the OpenAI Realtime API and get transcriptions/responses.

import asyncio
import base64
from typing import Any, cast

import gradio as gr
import librosa
import numpy as np
from openai import AsyncOpenAI

client = AsyncOpenAI()
connection = None
instruction = "Your knowledge cutoff is 2023-10. You are a highly accurate and efficient AI designed for transcription tasks. When provided with audio data, transcribe the content verbatim. Maintain clarity and fidelity to the original spoken words. Avoid altering the content or providing additional commentary."

# Process audio
async def process_audio_chunk(connection, chunk, sample_rate):
    # Remember the original integer dtype before any conversion,
    # so normalization below uses the right max value even after
    # np.mean() promotes the array to float
    orig_dtype = chunk.dtype

    # Mix down to mono if needed
    if chunk.ndim > 1:
        chunk = np.mean(chunk, axis=1)

    # Convert to floating point and normalize to [-1, 1]
    chunk_float = chunk.astype(np.float32)
    chunk_normalized = chunk_float / np.iinfo(orig_dtype).max

    # Resample to 24 kHz
    resampled = librosa.resample(chunk_normalized, orig_sr=sample_rate, target_sr=24000)

    # Convert to 16-bit PCM
    pcm16 = (resampled * 32767).astype(np.int16)

    # Process in chunks of 8192 bytes
    chunk_size = 8192 // 2  # 4096 samples (8192 bytes)
    for i in range(0, len(pcm16), chunk_size):
        chunk_to_send = pcm16[i:i + chunk_size]

        # Pad the last chunk if necessary
        if len(chunk_to_send) < chunk_size:
            chunk_to_send = np.pad(chunk_to_send, (0, chunk_size - len(chunk_to_send)), 'constant')

        # Send to the model
        await connection.input_audio_buffer.append(
            audio=base64.b64encode(cast(Any, chunk_to_send.tobytes())).decode("utf-8")
        )
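
To sanity-check the conversion steps in isolation, here is a quick test on a synthetic tone. It is standalone and only exercises the numpy/librosa pipeline above; nothing is sent to the API:

# Standalone check of the normalize/resample/PCM16 steps (no API calls)
import numpy as np
import librosa

sr = 48000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = (0.5 * np.iinfo(np.int16).max * np.sin(2 * np.pi * 440 * t)).astype(np.int16)

normalized = tone.astype(np.float32) / np.iinfo(np.int16).max
resampled = librosa.resample(normalized, orig_sr=sr, target_sr=24000)
pcm16 = (resampled * 32767).astype(np.int16)

print(len(tone), "->", len(pcm16))  # expect roughly half the samples (48 kHz -> 24 kHz)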

# Process responses
async def handle_events(connection):
    async for event in connection:
        if event.type == "conversation.item.input_audio_transcription.completed":
            yield event.transcript
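
While debugging, it may also help to watch for error events on the same stream rather than ignoring everything that is not a completed transcription. A rough variant (the exact shape of `event.error` can differ between SDK versions, so treat this as a sketch):

# Variant of handle_events that also surfaces server-side error events
async def handle_events_verbose(connection):
    async for event in connection:
        if event.type == "error":
            # Print the server's complaint instead of failing silently
            print("Realtime API error:", event.error)
        elif event.type == "conversation.item.input_audio_transcription.completed":
            yield event.transcript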

# Main handler
async def transcribe_stream(audio_chunk, state):
    print(f"audio_chunk:{audio_chunk} state:{state}")
    global connection
    global instruction
    if connection is None:
        #state["connection"] = await create_conneciton()
        async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as conn:
            connection = conn
            print("connection:", connection)
            await connection.session.update(session={
                'modalities': ['text'],
                "turn_detection": {"type": "server_vad"},
                "instructions": instruction,
                "temperature": 0.6,
                "input_audio_transcription": {'model': "whisper-1"}
            })

    if audio_chunk is not None:
        await process_audio_chunk(connection, audio_chunk[1], audio_chunk[0])

    transcript = ""
    async for partial_transcript in handle_events(connection):
        transcript += partial_transcript
        yield transcript

    yield transcript

demo = gr.Interface(
    fn=transcribe_stream,
    inputs=[
        gr.Audio(source="microphone", streaming=True)
    ],
    outputs=[
        gr.Textbox(label="Transcription Output")
    ],
    live=True,
    title="Real-time Audio Transcription with OpenAI Realtime API",
    description="Speak into your microphone for real-time transcription."
)

demo.queue().launch(share=True)
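
One note on the connection handling above: `async with client.beta.realtime.connect(...)` closes the websocket the moment the with-block exits, and a clean client-initiated close is exactly what `sent 1000 (OK); then received 1000 (OK)` reports. Since `transcribe_stream` keeps using `connection` after that block has ended, the socket may already be gone when later mic chunks arrive. A rough sketch of keeping it open across Gradio callbacks follows; it assumes the SDK's connection manager exposes an explicit `enter()` (recent openai-python versions appear to, but check your installed version; otherwise keep all use of the connection inside the with-block):

# Sketch: enter the context manager manually so the websocket survives
# across calls, instead of letting `async with` close it immediately.
# `enter()` is an assumption about the installed SDK version.
async def get_connection():
    global connection
    if connection is None:
        connection = await client.beta.realtime.connect(
            model="gpt-4o-realtime-preview"
        ).enter()
        await connection.session.update(session={
            'modalities': ['text'],
            "turn_detection": {"type": "server_vad"},
            "instructions": instruction,
            "input_audio_transcription": {'model': "whisper-1"}
        })
    return connection

With this, `transcribe_stream` would call `connection = await get_connection()` instead of opening the with-block, and close the connection explicitly once the stream ends.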