[Realtime API] Error committing input audio buffer: the buffer is empty

I’m trying to basically replicate the functionality of the openai-realtime-console.

However, I can’t even get past receiving an audio response back. Currently I’m able to create a session, update it, create a conversation item and create a response. I get a greeting based on the content of the conversation item. After that I’m simply collecting the incoming audio from my frontend, converting it based on what the API requires. When the frontend signals that no more audio is incoming, I append the accumulated audio to to the API buffer and commit, however it claims that the buffer is empty.
I also tried appending the audio as its being recevied and processed in the begginning as it made more sence to me latency-wise.
Clearing the buffer after the initial greeting didn’t help either.

Am I missing a very obvious step here?

import base64
import json
import time
import uuid

from flask import request
from flask_socketio import SocketIO

from decouple import config
import numpy as np
import websockets

# Configuration
OPENAI_API_KEY = config("OPENAI")
socketio = SocketIO(cors_allowed_origins="*")

url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01"
headers = {
    "Authorization": f"Bearer {OPENAI_API_KEY}",
    "OpenAI-Beta": "realtime=v1",
}

active_conversations = {}
processed_audio_chunks = []


# Sockets
def setup_socketio(app):
    socketio.init_app(app)


@socketio.on("connect")
def on_connect():
    sid = request.sid
    print(f"Client connected: {sid}")
    active_conversations[sid] = {"websocket": None}


@socketio.on("disconnect")
def on_disconnect():
    sid = request.sid
    print(f"Client disconnected: {sid}")


@socketio.on("start_conversation")
def start_conversation_wrapper():
    sid = request.sid
    start_ai_conversation(sid=sid)


@socketio.on("user_audio_chunk")
def on_user_audio_chunk(data):
    sid = request.sid
    print()
    handle_input_audio(sid, data)


@socketio.on("end_audio_input")
def on_end_audio_input():
    sid = request.sid
    if sid not in active_conversations:
        print(f"Error: No active conversation found for session {sid}")
        return

    try:
        time.sleep(0.5)
        handle_audio_commit(sid)
    except Exception as e:
        print(f"Error handling audio commit for session {sid}: {str(e)}")


###


def start_ai_conversation(sid):
    try:
        websocket = websockets.sync.client.connect(url, additional_headers=headers)
        active_conversations[sid]["websocket"] = websocket
        ai_conversation(websocket, sid)

        while True:
            event = websocket.recv()
            event_data = json.loads(event)
            event_type = event_data.get("type")

            handle_event(sid, event_type, event_data)

    except Exception as e:
        print(f"Error in start_conversation for {sid}: {str(e)}")
    finally:
        if sid in active_conversations:
            del active_conversations[sid]


def ai_conversation(websocket, sid):
    try:
        session_update = {
            "type": "session.update",
            "event_id": f"event_{uuid.uuid4()}",
            "session": {
                "modalities": ["audio", "text"],
                "instructions": "Your knowledge cutoff is 2023-10. You are a helpful assistant.",
                "voice": "alloy",
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {"model": "whisper-1"},
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 200,
                },
                "temperature": 0.8,
                "max_response_output_tokens": "inf",
            },
        }

        send_websocket_event(websocket, session_update)
    except Exception as e:
        print(f"Error in session_update: {str(e)}")

    try:
        conversation_item = {
            "event_id": f"event_{uuid.uuid4()}",
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "Hello! My name is John"}],
            },
        }

        send_websocket_event(websocket, conversation_item)

    except Exception as e:
        print(f"Error in conversation.item.create: {str(e)}")

    try:
        response_create = {
            "event_id": f"event_{uuid.uuid4()}",
            "type": "response.create",
        }

        send_websocket_event(websocket, response_create)

    except Exception as e:
        print(f"Error in response.create: {str(e)}")


def handle_event(sid, event_type, event_data):
    event_handlers = {
        "session.created": lambda: print("session.created"),
        "session.updated": lambda: print("session.updated", event_data),
        "response.created": lambda: print("response.created"),
        "rate_limits.updated": lambda: print(
            "rate_limits.updated",
            event_data.get("rate_limits", [])[0].get("remaining", "N/A"),
        ),
        "response.audio_transcript.delta": lambda: (
            print("response.audio_transcript.delta"),
        ),
        "response.audio.delta": lambda: handle_audio_delta(sid, event_data),
        "response.done": lambda: print(
            "response.done",
            event_data.get("response", {})
            .get("output", [{}])[0]
            .get("content", [{}])[0]
            .get("text", ""),
        ),
        "error": lambda: handle_error(event_data),
    }
    handler = event_handlers.get(
        event_type, lambda: print(f"Unhandled event for {event_type}")
    )

    handler()


def send_websocket_event(websocket, event):
    websocket.send(json.dumps(event))


def handle_audio_delta(sid, event_data):
    audio_chunk = base64.b64decode(event_data.get("delta", ""))
    socketio.emit(
        "audio_chunk",
        {"chunk": audio_chunk},
    )


def handle_input_audio(sid, data):
    audio_chunk = data.get("audioChunk")
    sample_rate = data.get("sampleRate", 44100)

    websocket = active_conversations[sid].get("websocket")

    if not websocket:
        print(f"Error: No active WebSocket for session {sid}")
        return

    if audio_chunk:
        try:
            if isinstance(audio_chunk, str):
                audio_bytes = base64.b64decode(audio_chunk)
            elif isinstance(audio_chunk, bytes):
                audio_bytes = audio_chunk
            else:
                raise ValueError(f"Unsupported audio_chunk format: {type(audio_chunk)}")

            print(f"Audio chunk length: {len(audio_bytes)} bytes")
            print(f"Sample rate: {sample_rate}")

            if len(audio_bytes) % 2 != 0:
                audio_bytes = audio_bytes[:-1]

            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)

            audio_float = audio_array.astype(np.float32) / 32768.0

            base64_chunk = base64.b64encode(audio_float.tobytes()).decode("utf-8")

            processed_audio_chunks.append({"audio": base64_chunk})

            print(
                f"Processed audio chunk for session {sid}, length: {len(audio_float)} samples, sample_rate: {sample_rate}"
            )
        except Exception as e:
            print(
                f"Error processing and sending audio chunk for session {sid}: {str(e)}"
            )
            import traceback

            print(traceback.format_exc())
    else:
        print(f"Invalid audio data received for session {sid}")


def handle_audio_commit(sid):
    websocket = active_conversations[sid].get("websocket")
    if not websocket:
        print(f"Error: No active WebSocket for session {sid}")
        return

    try:
        for chunk in processed_audio_chunks:
            print(type(chunk["audio"]))
            input_audio_buffer_append = {
                "event_id": f"event_{uuid.uuid4()}",
                "type": "input_audio_buffer.append",
                "audio": chunk["audio"],
            }
            send_websocket_event(websocket, input_audio_buffer_append)

        input_audio_buffer_commit = {
            "event_id": f"event_{uuid.uuid4()}",
            "type": "input_audio_buffer.commit",
        }
        send_websocket_event(websocket, input_audio_buffer_commit)

        print(f"Successfully appended audio buffer for session {sid}")
    except Exception as e:
        print(f"Error appending or committing audio buffer for session {sid}: {str(e)}")
        import traceback

        print(traceback.format_exc())

    try:
        response_create = {
            "event_id": f"event_{uuid.uuid4()}",
            "type": "response.create",
        }
        send_websocket_event(websocket, response_create)
    except Exception as e:
        print(f"Error committing audio buffer for session {sid}: {str(e)}")


def handle_error(error_data):
    error = error_data.get("error", {})
    print(
        f"Error occurred: Type: {error.get('type')}, Code: {error.get('code')}, Message: {error.get('message')}"
    )

1 Like

UPDATE
Fix:
in session_update:
"turn_detection": None

Hey @emkataumre , I have tried same but i’m getting invalid_request_error

event -
{
“type”:“session.update”,
“turn_detection”: “None”,
“session”:“sess_BJvYbMiu8Wfp4JljB97bw”
}

error response -
{“type”:“error”,“event_id”:“event_BJvYne5Zj3yFBS5ysu89N”,“error”:{“type”:“invalid_request_error”,“code”:“unknown_parameter”,“message”:“Unknown parameter: ‘turn_detection’.”,“param”:“turn_detection”,“event_id”:null}}