Here is the Python client code I wrote to stream audio data to the OpenAI Realtime API and get transcriptions/responses.
import base64

import gradio as gr
import librosa
import numpy as np
from openai import AsyncOpenAI

client = AsyncOpenAI()
connection = None
instruction = (
    "Your knowledge cutoff is 2023-10. You are a highly accurate and efficient AI "
    "designed for transcription tasks. When provided with audio data, transcribe the "
    "content verbatim. Maintain clarity and fidelity to the original spoken words. "
    "Avoid altering the content or providing additional commentary."
)
# Process one audio chunk from the Gradio stream and forward it to the Realtime API
async def process_audio_chunk(connection, chunk, sample_rate):
    orig_dtype = chunk.dtype
    # Downmix stereo to mono
    if chunk.ndim > 1:
        chunk = np.mean(chunk, axis=1)
    # Convert to floating point and normalize to [-1, 1]
    chunk_float = chunk.astype(np.float32)
    chunk_normalized = chunk_float / np.iinfo(orig_dtype).max
    # Resample to 24 kHz
    resampled = librosa.resample(chunk_normalized, orig_sr=sample_rate, target_sr=24000)
    # Convert to 16-bit PCM
    pcm16 = (resampled * 32767).astype(np.int16)
    # Process in chunks of 8192 bytes
    chunk_size = 8192 // 2  # 4096 samples (8192 bytes)
    for i in range(0, len(pcm16), chunk_size):
        chunk_to_send = pcm16[i:i + chunk_size]
        # Pad the last chunk if necessary
        if len(chunk_to_send) < chunk_size:
            chunk_to_send = np.pad(chunk_to_send, (0, chunk_size - len(chunk_to_send)), "constant")
        # Send the base64-encoded PCM16 audio to the model
        await connection.input_audio_buffer.append(
            audio=base64.b64encode(chunk_to_send.tobytes()).decode("utf-8")
        )
# Process responses: yield transcripts as transcription-completed events arrive
async def handle_events(connection):
    async for event in connection:
        if event.type == "conversation.item.input_audio_transcription.completed":
            yield event.transcript
# Main handler: stream microphone chunks to the Realtime API and yield transcripts
async def transcribe_stream(audio_chunk, state):
    print(f"audio_chunk:{audio_chunk} state:{state}")
    global connection
    global instruction
    if connection is None:
        # state["connection"] = await create_connection()
        async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as conn:
            connection = conn
            print("connection:", connection)
            # Text-only responses with server-side VAD and Whisper transcription of the input audio
            await connection.session.update(session={
                "modalities": ["text"],
                "turn_detection": {"type": "server_vad"},
                "instructions": instruction,
                "temperature": 0.6,
                "input_audio_transcription": {"model": "whisper-1"},
            })
    if audio_chunk is not None:
        await process_audio_chunk(connection, audio_chunk[1], audio_chunk[0])
        transcript = ""
        # Accumulate partial transcripts and stream them back to the UI
        async for partial_transcript in handle_events(connection):
            transcript += partial_transcript
            yield transcript
        yield transcript
demo = gr.Interface(
    fn=transcribe_stream,
    inputs=[
        gr.Audio(source="microphone", streaming=True)
    ],
    outputs=[
        gr.Textbox(label="Transcription Output")
    ],
    live=True,
    title="Real-time Audio Transcription with OpenAI Real Time API",
    description="Speak into your microphone for real-time transcription."
)

demo.queue().launch(share=True)
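
For reference, below is a minimal offline sketch of the same resampling and PCM16 framing logic, which can be used to sanity-check the audio conversion without calling the Realtime API. The helper name to_pcm16_chunks and the test file "sample.wav" are hypothetical and only for illustration; librosa.load already returns float audio in [-1, 1], so no integer normalization is needed here.

import base64

import librosa
import numpy as np

def to_pcm16_chunks(audio, sample_rate, chunk_samples=4096):
    # Downmix to mono, resample to 24 kHz, convert to 16-bit PCM, and
    # yield base64-encoded chunks of 8192 bytes, mirroring process_audio_chunk.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    resampled = librosa.resample(audio.astype(np.float32), orig_sr=sample_rate, target_sr=24000)
    pcm16 = (np.clip(resampled, -1.0, 1.0) * 32767).astype(np.int16)
    for i in range(0, len(pcm16), chunk_samples):
        chunk = pcm16[i:i + chunk_samples]
        if len(chunk) < chunk_samples:
            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), "constant")
        yield base64.b64encode(chunk.tobytes()).decode("utf-8")

# "sample.wav" is a hypothetical local test file
audio, sr = librosa.load("sample.wav", sr=None, mono=True)
print(sum(1 for _ in to_pcm16_chunks(audio, sr)), "chunks of 8192 bytes")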