Problems using session.update with the realtime-api (issue with "input_audio_transcription")

Hi Folks:

I am new to the realtime API. I am writing code to try out features and learn.

I am writing a Python program using the websocket-client library. I am starting simple by reading a wav file containing a command. I want the real-time API to send back text.

My problem is that after I get a conversation.item.created, I do not get a transcription or any further WebSocket events, such as input_audio_buffer.speech_stopped. Here is the console output:
WebSocket connection opened.
Audio written to output_16khz_mono.wav
debug sending frames to openai
sending 0 1024

Sending input_audio_buffer.commit
Event: session.updated - Session has been updated.
{'type': 'session.updated', 'event_id': 'event_AHnNzP28AzQyltxYbHWVc', 'session': {'id': 'sess_AHnNyF82tvZhR75TqFEgC', 'object': 'realtime.session', 'model': 'gpt-4o-realtime-preview-2024-10-01', 'expires_at': 1728805678, 'modalities': ['text'], 'instructions': 'You are a helpful AI assistant. You execute commands to the best of your ability.Please answer in a clear and concise manner.', 'voice': 'alloy', 'turn_detection': {'type': 'server_vad', 'threshold': 0.3, 'prefix_padding_ms': 300, 'silence_duration_ms': 200}, 'input_audio_format': 'pcm16', 'output_audio_format': 'pcm16', 'input_audio_transcription': None, 'tool_choice': 'auto', 'temperature': 0.8, 'max_response_output_tokens': 'inf', 'tools': []}}
Event: input_audio_buffer.speech_started - Speech has started.
Event: input_audio_buffer.committed - Audio buffer has been committed.
{'type': 'input_audio_buffer.committed', 'event_id': 'event_AHnNzPkhZfh7MON8Zuc2v', 'previous_item_id': None, 'item_id': 'item_AHnNzdE29TE1YyjP7vxXw'}
Event: conversation.item.created - A conversation item has been created.
{'type': 'conversation.item.created', 'event_id': 'event_AHnNzLiujXFBaLy6uJ9rz', 'previous_item_id': None, 'item': {'id': 'item_AHnNzdE29TE1YyjP7vxXw', 'object': 'realtime.item', 'type': 'message', 'status': 'completed', 'role': 'user', 'content': [{'type': 'input_audio', 'transcript': None}]}}

If I add input_audio_transcription to the session property, I get:

{'type': 'error', 'event_id': 'event_AHnEmJsPiDD7XR0Dqokux', 'error': {'type': 'invalid_request_error', 'code': 'unknown_parameter', 'message': "Unknown parameter: 'session.input_audio_transcription.enabled'.", 'param': 'session.input_audio_transcription.enabled', 'event_id': None}}
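That error suggests the enabled flag isn't the right shape. From what I can see in other threads, input_audio_transcription is expected to be an object naming a transcription model rather than an enabled boolean, so something like this in the session.update (I haven't confirmed yet that it fixes my problem):

session_update = {
    "type": "session.update",
    "session": {
        "modalities": ["text"],
        "instructions": SYSTEM_MESSAGE,
        "input_audio_format": "pcm16",
        # Assumed shape: an object naming the transcription model,
        # not an "enabled" flag.
        "input_audio_transcription": {"model": "whisper-1"},
    },
}
ws.send(json.dumps(session_update))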

I believe I am sending data as 16khz PCM, mono.
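To double-check what I am actually feeding the encoder, I read back the header of the debug WAV I write out (the same output_16khz_mono.wav that shows up in the log above). A quick sanity check with the standard wave module:

def check_wav_format(path):
    """Print the basic parameters of a WAV file for debugging."""
    with wave.open(path, 'rb') as wav_file:
        print(f"channels={wav_file.getnchannels()}, "
              f"sample_width={wav_file.getsampwidth()} bytes, "
              f"frame_rate={wav_file.getframerate()} Hz, "
              f"frames={wav_file.getnframes()}")

check_wav_format("output_16khz_mono.wav")
# Expected for my input: channels=1, sample_width=2 bytes, frame_rate=16000 Hz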

Here is my code, warts and all:

import base64
import os
import io
import json
import threading
import websocket
import wave
from pydub import AudioSegment


t_list = []
ws = None

# Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # Requires OpenAI Realtime API Access

SYSTEM_MESSAGE = (
    "You are a helpful AI assistant. You execute commands to the best of your ability."
    "Please answer in a clear and concise manner."
)
VOICE = 'alloy'

if not OPENAI_API_KEY:
    raise ValueError('Missing the OpenAI API key. Please set it in the .env file.')
    
# for debugging
import wave

def write_audio_to_wav(audio_data, output_file, sample_rate=16000, num_channels=1, sample_width=2):
    
    with wave.open(output_file, 'wb') as wav_file:
        # Set the WAV file parameters
        wav_file.setnchannels(num_channels)  # Mono
        wav_file.setsampwidth(sample_width)  # 2 bytes for 16-bit audio
        wav_file.setframerate(sample_rate)   # Sample rate

        # Write the raw PCM data to the WAV file
        wav_file.writeframes(audio_data)
    
    print(f"Audio written to {output_file}")


def convert_to_16khz_mono(input_file):
    # Load the audio file
    audio = AudioSegment.from_wav(input_file)

    # Resample to 16kHz and make sure it's mono
    audio = audio.set_frame_rate(16000).set_channels(1)  
 
    return audio.raw_data  

def send_audio_to_openai(ws, audio_data):
    """Send audio data to OpenAI Realtime API in chunks."""

    print(f"debug sending frames to openai")

    frames_per_packet = 1024  # Adjust packet size as needed
    for i in range(0, len(audio_data), frames_per_packet):
        audio_chunk = audio_data[i:i + frames_per_packet]
        audio_append = {
            "type": "input_audio_buffer.append",
            "audio": base64.b64encode(audio_chunk).decode('utf-8')
        }
        print(f"sending {i} {len(audio_chunk)}")
        ws.send(json.dumps(audio_append))

    audio_commit = {
        "type": "input_audio_buffer.commit"
    }
    
    print("Sending input_audio_buffer.commit")
    ws.send(json.dumps(audio_commit))

def on_message(ws, message):
    """Handle incoming WebSocket messages from OpenAI and print the text response."""
    response = json.loads(message)
    
    # Extracting event type from the message
    event_type = response.get('type', '')

    # Using match to handle different event types
    match event_type:
        # Session-related events
        case 'session.created':
            print(f"Event: {event_type} - Session has been created.")
        
        case 'session.updated':
            print(f"Event: {event_type} - Session has been updated.")
            print(f"{response}")

        # Conversation-related events
        case 'conversation.created':
            print(f"Event: {event_type} - Conversation has been created.")
            print(f"{response}")

        case 'conversation.item.created':
            print(f"Event: {event_type} - A conversation item has been created.")
            print(f"{response}")

        case 'conversation.item.input_audio_transcription.completed':
            print(f"Event: {event_type} - Audio transcription completed.")
        
        case 'conversation.item.input_audio_transcription.failed':
            print(f"***** Event: {event_type} - Audio transcription failed. ****")

        case 'conversation.item.truncated':
            print(f"Event: {event_type} - Conversation item was truncated.")

        case 'conversation.item.deleted':
            print(f"Event: {event_type} - Conversation item has been deleted.")
        
        # Input audio buffer events
        case 'input_audio_buffer.committed':
            print(f"Event: {event_type} - Audio buffer has been committed.")
            print(f"{response}")

        case 'input_audio_buffer.cleared':
            print(f"Event: {event_type} - Audio buffer has been cleared.")

        case 'input_audio_buffer.speech_started':
            print(f"Event: {event_type} - Speech has started.")

        case 'input_audio_buffer.speech_stopped':
            print(f"Event: {event_type} - Speech has stopped.")

        # Response-related events
        case 'response.created':
            print(f"Event: {event_type} - A new response has been created.")

        case 'response.done':
            print(f"Event: {event_type} - Response is fully complete.")
            if "response" in response:
                output_items = response["response"].get("output", [])
                full_text = ""
                
                for item in output_items:
                    if item.get("type") == "message":
                        content = item.get("content", [])
                        for content_item in content:
                            if content_item.get("type") == "text":
                                full_text += content_item.get("text", "")
                
                print(f"Final response: {full_text}")
            else:
                print(f"Response completed without output.")

        case 'response.output_item.added':
            print(f"Event: {event_type} - Response output item added.")

        case 'response.output_item.done':
            print(f"Event: {event_type} - Response output item done.")

        case 'response.content_part.added':
            print(f"Event: {event_type} - Response content part added.")

        case 'response.content_part.done':
            print(f"Event: {event_type} - Response content part done.")

        # Partial and full text response events
        case 'response.text.delta':
            print(f"Event: {event_type} - Partial text response: {response.get('text_delta', '')}")

        case 'response.text.done':
            print(f"Event: {event_type} - Text response generation completed.")
            #print(f"Full Text: {response.get('text', '')}")

        # Audio response events
        case 'response.audio_transcript.delta':
            print(f"Event: {event_type} - Partial audio transcript: {response.get('transcript_delta', '')}")

        case 'response.audio_transcript.done':
            print(f"Event: {event_type} - Audio transcript generation completed.")

        case 'response.audio.delta':
            print(f"Event: {event_type} - Partial audio response delta.")

        case 'response.audio.done':
            print(f"Event: {event_type} - Audio response generation completed.")

        # Function call arguments
        case 'response.function_call_arguments.delta':
            print(f"Event: {event_type} - Partial function call argument delta.")

        case 'response.function_call_arguments.done':
            print(f"Event: {event_type} - Function call argument generation completed.")

        # Rate limit updates
        case 'rate_limits.updated':
            print(f"Event: {event_type} - Rate limits have been updated.")

        # Default case for unhandled events
        case _:
            print(f"Unhandled event type: {event_type}")
            print(f"{response}")

def on_error(ws, error):
    print(f"WebSocket Error: {error} {type(error)}")

def on_close(ws, close_status_code, close_msg):
    print(f"WebSocket Closed: {close_status_code} - {close_msg}")

def on_open(ws):
    """Send session update and start sending audio data."""
    print("WebSocket connection opened.")

    # Send session update to OpenAI WebSocket to receive text instead of audio

    session_update = {
        "type": "session.update",

        "session": {
            "input_audio_format": "pcm16",  # Still need to match input audio format
            "instructions": SYSTEM_MESSAGE,
            "modalities": ["text"],             # We are only interested in text
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.3
                },
            "temperature": 0.8,
            "tool_choice": "auto",
        },
    }
 
    ws.send(json.dumps(session_update))

    # Start sending audio data to OpenAI
    wav_file_path = '/.test.wav'  # Replace with your actual WAV file path
    audio_data = convert_to_16khz_mono(wav_file_path)

    # for debugging
    write_audio_to_wav(audio_data, "output_16khz_mono.wav")
    
    # Using a separate thread to send audio
    send_thread = threading.Thread(target=send_audio_to_openai, args=(ws, audio_data))
    t_list.append(send_thread)
    send_thread.start()

def start_websocket():

    websocket_url = 'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01'

    # Initialize WebSocket connection
    ws = websocket.WebSocketApp(
        websocket_url,
        header={"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime=v1"},
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
        on_open=on_open
    )

    # Run the WebSocket connection in the main thread
    ws.run_forever()

if __name__ == "__main__":

    #websocket.enableTrace(True) 

    t_list = []

    send_thread = threading.Thread(target=start_websocket)
    t_list.append(send_thread)
    send_thread.start()

    for t in t_list:
        t.join()

What else should I be looking at?

Cheers,
Andrew


Voice instead of Audio worked for me.

```
app.add_handler(MessageHandler(filters.VOICE, handle_voice))
```

I modified the code to get rid of the threads. I also use a 24k PCM sample as the input file.

import base64
import os
import json
import websocket
import wave
from pydub import AudioSegment

# Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # Requires OpenAI Realtime API Access
SYSTEM_MESSAGE = "You are a helpful AI assistant. Please answer in a clear and concise manner."
VOICE = 'alloy'

if not OPENAI_API_KEY:
    raise ValueError('Missing the OpenAI API key. Please set it in the .env file.')


# Function to convert audio to 24kHz mono
def convert_to_24khz_mono(input_file):
    audio = AudioSegment.from_wav(input_file)
    audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2).raw_data  
    return audio  

def send_audio_to_openai(ws, audio_data):
    """Send raw PCM audio data to OpenAI Realtime API in chunks."""
    frames_per_packet = 1024  # Number of bytes per packet, can be adjusted
    total_length = len(audio_data)
    
    print(f"Sending {total_length} bytes of audio data to OpenAI API in chunks of {frames_per_packet} bytes...")

    try:
        # Loop through audio data in chunks and send each chunk to the API
        for i in range(0, total_length, frames_per_packet):
            audio_chunk = audio_data[i:i + frames_per_packet]
            
            # Prepare the message with Base64-encoded audio chunk
            audio_append = {
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(audio_chunk).decode('utf-8')
            }

            # Send the audio chunk over WebSocket
            ws.send(json.dumps(audio_append))
            print(f"Sent chunk {i // frames_per_packet + 1}/{(total_length // frames_per_packet) + 1}")

        # Commit the audio buffer once all chunks are sent
        audio_commit = {
            "type": "input_audio_buffer.commit"
        }
        ws.send(json.dumps(audio_commit))
        print("Audio buffer committed.")
    
    except Exception as e:
        print(f"Error while sending audio data: {e}")


# Handling different server events with a match statement
def on_message(ws, message):
    response = json.loads(message)
    event_type = response.get('type', '')

    match event_type:
        case 'session.created':
            print(f"Event: {event_type} - Session has been created.")

        case 'session.updated':
            print(f"Event: {event_type} - Session has been updated.")
            print(f"{response}")

        case 'conversation.created':
            print(f"Event: {event_type} - Conversation has been created.")
            print(f"{response}")

        case 'conversation.item.created':
            print(f"Event: {event_type} - A conversation item has been created.")
            print(f"{response}")
            if response['item']['content'][0].get('transcript') is None:
                print("Warning: Transcript is None, possible audio processing issue.")

        case 'conversation.item.input_audio_transcription.completed':
            print(f"Event: {event_type} - Audio transcription completed.")

        case 'conversation.item.input_audio_transcription.failed':
            print(f"***** Event: {event_type} - Audio transcription failed. ****")

        case 'conversation.item.truncated':
            print(f"Event: {event_type} - Conversation item was truncated.")

        case 'conversation.item.deleted':
            print(f"Event: {event_type} - Conversation item has been deleted.")

        case 'input_audio_buffer.committed':
            print(f"Event: {event_type} - Audio buffer has been committed.")
            print(f"{response}")

        case 'input_audio_buffer.cleared':
            print(f"Event: {event_type} - Audio buffer has been cleared.")

        case 'input_audio_buffer.speech_started':
            print(f"Event: {event_type} - Speech has started.")

        case 'input_audio_buffer.speech_stopped':
            print(f"Event: {event_type} - Speech has stopped.")

        case 'response.created':
            print(f"Event: {event_type} - A new response has been created.")

        case 'response.done':
            print(f"Event: {event_type} - Response is fully complete.")
            print(f"{response}")

            """
            if "response" in response:
                output_items = response["response"].get("output", [])
                full_text = ""
                for item in output_items:
                    if item.get("type") == "message":
                        content = item.get("content", [])
                        for content_item in content:
                            if content_item.get("type") == "text":
                                full_text += content_item.get("text", "")
                print(f"Final response: {full_text}")
            else:
                print(f"Response completed without output.")
            """    

        case 'response.output_item.added':
            print(f"Event: {event_type} - Response output item added.")

        case 'response.output_item.done':
            print(f"Event: {event_type} - Response output item done.")

        case 'response.content_part.added':
            print(f"Event: {event_type} - Response content part added.")

        case 'response.content_part.done':
            print(f"Event: {event_type} - Response content part done.")

        case 'response.text.delta':
            print(f"Event: {event_type} - Partial text response: {response.get('text_delta', '')}")

        case 'response.text.done':
            print(f"Event: {event_type} - Text response generation completed.")

        case 'response.audio_transcript.delta':
            print(f"Event: {event_type} - Partial audio transcript: {response.get('transcript_delta', '')}")

        case 'response.audio_transcript.done':
            print(f"Event: {event_type} - Audio transcript generation completed.")

        case 'response.audio.delta':
            print(f"Event: {event_type} - Partial audio response delta.")

        case 'response.audio.done':
            print(f"Event: {event_type} - Audio response generation completed.")

        case 'response.function_call_arguments.delta':
            print(f"Event: {event_type} - Partial function call argument delta.")

        case 'response.function_call_arguments.done':
            print(f"Event: {event_type} - Function call argument generation completed.")

        case 'rate_limits.updated':
            print(f"Event: {event_type} - Rate limits have been updated.")

        case _:
            print(f"Unhandled event type: {event_type}")
            print(f"{response}")

# Handling WebSocket errors
def on_error(ws, error):
    print(f"WebSocket Error: {error} {type(error)}")

# Handling WebSocket closure
def on_close(ws, close_status_code, close_msg):
    print(f"WebSocket Closed: {close_status_code} - {close_msg}")

# WebSocket connection opened event
def on_open(ws):
    print("WebSocket connection opened.")

    # Send session update to OpenAI WebSocket
    
    session_update = {
        "type": "session.update",
        "session": {
            "input_audio_format": "pcm16",
            "instructions": SYSTEM_MESSAGE,
            "modalities": ["text"],
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
                'prefix_padding_ms': 300, 
                'silence_duration_ms': 500
            },
            "temperature": 0.8
        },
    }
    
    ws.send(json.dumps(session_update))

    # Convert the audio file and send it after the session update
    wav_file_path = 'your-file.wav'  # Replace with your actual WAV file path
    audio_data = convert_to_24khz_mono(wav_file_path)
    
    send_audio_to_openai(ws, audio_data)

# Start the WebSocket connection
def start_websocket():
    websocket_url = 'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01'

    ws = websocket.WebSocketApp(
        websocket_url,
        header={"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime=v1"},
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
        on_open=on_open
    )

    # Run the WebSocket connection in a blocking loop (no need for threads)
    ws.run_forever()

if __name__ == "__main__":
    start_websocket()

Were you able to get this to work? I'm running into a strange issue: when I send an event of type conversation.item.create with the audio data, it works.

When I use an event of type input_audio_buffer.append with the same audio data, I get this error:
Error committing input audio buffer: the buffer is empty.


My bad. I didn’t use realtime.
Sorry

Hi @me51, no, the new code doesn't work. I may try using conversation.item.create without chunking the data to see what works.

> When I use an event of type input_audio_buffer.append with the same audio data, I get this error:
> Error committing input audio buffer: the buffer is empty.

What are you initially sending to start the conversation?

Cheers,
Andrew

Here is my current order of operations:

(A) I open connection # standard convo set up here
(B) {type: session.update, …} # I set the session details / instructions here

After audio input I hit
(C) {type: conversation.item.create, item: {"type": "message", "role": "user", "content": {"type": "input_audio", "audio": $base64data}}}
(D) {"type": response.create}

This works for me in terms of starting the conversation and getting audio from the Realtime API to render to my user.

When I replace (C) and (D) with
(C) {"type": "input_audio_buffer.append", "audio": $base64data}
(D) {"type": "input_audio_buffer.commit"}

I get the error stated above
Error committing input audio buffer: the buffer is empty.

I’d like to use input_audio_buffer to take advantage of server side VAD and more to make the experience truly interactive.
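Roughly, in Python terms (assuming ws is the open websocket-client connection and dummy_b64 is a placeholder for the base64-encoded pcm16 audio, not real data), the two sequences I am comparing look like this:

import json

dummy_b64 = "<base64-encoded pcm16 audio>"  # placeholder, not real audio

# Sequence that works for me: create the conversation item directly,
# then ask for a response.
ws.send(json.dumps({
    "type": "conversation.item.create",
    "item": {
        "type": "message",
        "role": "user",
        "content": [{"type": "input_audio", "audio": dummy_b64}],
    },
}))
ws.send(json.dumps({"type": "response.create"}))

# Sequence that fails for me with "buffer is empty":
ws.send(json.dumps({"type": "input_audio_buffer.append", "audio": dummy_b64}))
ws.send(json.dumps({"type": "input_audio_buffer.commit"}))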

Hi @me51

Thanks for the reply. I could try that sequence, though it would be easier to follow along if you posted code snippets.

{type: conversation.item.create, item: {"type": "message", "role": "user", "content": {"type": "input_audio", "audio": $base64data}}}

I don't know what $base64data is. Not a Python variable? JavaScript? What I can tell you is that with input_audio_buffer.append, you're sending your audio in base64-encoded chunks. Does $base64data contain base64-encoded audio data?

Ah, apologies. Yup, $base64data is meant to be a stand-in for a chunk of base64-encoded audio data.

Hi me51:

You can't commit an empty input_audio_buffer. I think this is an oversight. And yes, you are right: you need to send a response.create after you have committed the buffer. I looked at the JavaScript example for streaming user audio.

I’ll update my code, warts and all to:

import base64
import os
import json
import websocket
import wave
from pydub import AudioSegment

# Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')  # Requires OpenAI Realtime API Access
SYSTEM_MESSAGE = "You are a helpful AI assistant. Please answer in a clear and concise manner."
VOICE = 'alloy'

if not OPENAI_API_KEY:
    raise ValueError('Missing the OpenAI API key. Please set it in the .env file.')


# Function to convert audio to 24kHz mono
def convert_to_24khz_mono(input_file):
    audio = AudioSegment.from_wav(input_file)
    audio = audio.set_frame_rate(24000).set_channels(1).set_sample_width(2).raw_data  
    return audio 

def send_audio_to_openai(ws, audio_data):
    """Send raw PCM audio data to OpenAI Realtime API in chunks."""
    frames_per_packet = 1024  # Number of bytes per packet, can be adjusted
    total_length = len(audio_data)
    
    print(f"Sending {total_length} bytes of audio data to OpenAI API in chunks of {frames_per_packet} bytes...")

    try:
        # Loop through audio data in chunks and send each chunk to the API
        for i in range(0, total_length, frames_per_packet):
            audio_chunk = audio_data[i:i + frames_per_packet]
            
            # Prepare the message with Base64-encoded audio chunk
            audio_append = {
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(audio_chunk).decode('utf-8')
            }

            # Send the audio chunk over WebSocket
            ws.send(json.dumps(audio_append))
            print(f"Sent chunk {i // frames_per_packet + 1}/{(total_length // frames_per_packet) + 1}")

        # Commit the audio buffer once all chunks are sent
        audio_commit = {
            "type": "input_audio_buffer.commit"
        }
        ws.send(json.dumps(audio_commit))
        print("Audio buffer committed.")

        response_create = {
            "type" : "response.create"
        }

        # see Javascript example for stream user audio
        ws.send(json.dumps(response_create))
        print("response created")


    except Exception as e:
        print(f"Error while sending audio data: {e}")


# Handling different server events with a match statement
def on_message(ws, message):
    response = json.loads(message)
    event_type = response.get('type', '')

    match event_type:
        case 'session.created':
            print(f"Event: {event_type} - Session has been created.")

        case 'session.updated':
            print(f"Event: {event_type} - Session has been updated.")
            print(f"{response}")

        case 'conversation.created':
            print(f"Event: {event_type} - Conversation has been created.")
            print(f"{response}")

        case 'conversation.item.created':
            print(f"Event: {event_type} - A conversation item has been created.")
            print(f"{response}")
            if response['item']['content'][0].get('transcript') is None:
                print("Warning: Transcript is None, possible audio processing issue.")

        case 'conversation.item.input_audio_transcription.completed':
            print(f"Event: {event_type} - Audio transcription completed.")

        case 'conversation.item.input_audio_transcription.failed':
            print(f"***** Event: {event_type} - Audio transcription failed. ****")

        case 'conversation.item.truncated':
            print(f"Event: {event_type} - Conversation item was truncated.")

        case 'conversation.item.deleted':
            print(f"Event: {event_type} - Conversation item has been deleted.")

        case 'input_audio_buffer.committed':
            print(f"Event: {event_type} - Audio buffer has been committed.")
            print(f"{response}")

        case 'input_audio_buffer.cleared':
            print(f"Event: {event_type} - Audio buffer has been cleared.")

        case 'input_audio_buffer.speech_started':
            print(f"Event: {event_type} - Speech has started.")

        case 'input_audio_buffer.speech_stopped':
            print(f"Event: {event_type} - Speech has stopped.")

        case 'response.created':
            print(f"Event: {event_type} - A new response has been created.")

        case 'response.done':
            print(f"Event: {event_type} - Response is fully complete.")
            print(f"{response}")

            if "response" in response:
                output_items = response["response"].get("output", [])
                full_text = ""
                for item in output_items:
                    if item.get("type") == "message":
                        content = item.get("content", [])
                        for content_item in content:
                            if content_item.get("type") == "text":
                                full_text += content_item.get("text", "")
                print(f"Final response: {full_text}")
            else:
                print(f"Response completed without output.")    

        case 'response.output_item.added':
            print(f"Event: {event_type} - Response output item added.")

        case 'response.output_item.done':
            print(f"Event: {event_type} - Response output item done.")

        case 'response.content_part.added':
            print(f"Event: {event_type} - Response content part added.")

        case 'response.content_part.done':
            print(f"Event: {event_type} - Response content part done.")

        case 'response.text.delta':
            print(f"Event: {event_type} - Partial text response: {response.get('text_delta', '')}")

        case 'response.text.done':
            print(f"Event: {event_type} - Text response generation completed.")

        case 'response.audio_transcript.delta':
            print(f"Event: {event_type} - Partial audio transcript: {response.get('transcript_delta', '')}")

        case 'response.audio_transcript.done':
            print(f"Event: {event_type} - Audio transcript generation completed.")

        case 'response.audio.delta':
            print(f"Event: {event_type} - Partial audio response delta.")

        case 'response.audio.done':
            print(f"Event: {event_type} - Audio response generation completed.")

        case 'response.function_call_arguments.delta':
            print(f"Event: {event_type} - Partial function call argument delta.")

        case 'response.function_call_arguments.done':
            print(f"Event: {event_type} - Function call argument generation completed.")

        case 'rate_limits.updated':
            print(f"Event: {event_type} - Rate limits have been updated.")

        case _:
            print(f"Unhandled event type: {event_type}")
            print(f"{response}")

# Handling WebSocket errors
def on_error(ws, error):
    print(f"WebSocket Error: {error} {type(error)}")

# Handling WebSocket closure
def on_close(ws, close_status_code, close_msg):
    print(f"WebSocket Closed: {close_status_code} - {close_msg}")

# WebSocket connection opened event
def on_open(ws):
    print("WebSocket connection opened.")

    session_update = {
        "type": "session.update",
        "session": {
            "input_audio_format": "pcm16",
            "instructions": SYSTEM_MESSAGE,
            "modalities": ["text"],
            "turn_detection": {
                "type": "server_vad",
                "threshold": 0.5,
            },
            "temperature": 0.8
        },
    }
    
    ws.send(json.dumps(session_update))

    # Convert the audio file and send it after the session update
    wav_file_path = 'your_file_here.wav'  # Replace with your actual WAV file path
    audio_data = convert_to_24khz_mono(wav_file_path)
    
    send_audio_to_openai(ws, audio_data)

# Start the WebSocket connection
def start_websocket():
    websocket_url = 'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01'

    ws = websocket.WebSocketApp(
        websocket_url,
        header={"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime=v1"},
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
        on_open=on_open
    )

    # Run the WebSocket connection in a blocking loop (no need for threads)
    ws.run_forever()

if __name__ == "__main__":
    start_websocket()

Right now, this code sends audio and returns text. I’ll modify it in the days to come to return and play audio.
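For the audio step, my plan (untested sketch) is to collect the base64 payloads from response.audio.delta events, decode them, and write the concatenated pcm16 bytes to a 24 kHz mono WAV with the wave module. The 'delta' field name is my assumption from the docs, so I will verify it against the actual events:

audio_chunks = []

# In on_message, under the 'response.audio.delta' case (field name 'delta'
# is my assumption; adjust if the payload uses a different key):
#     audio_chunks.append(base64.b64decode(response.get('delta', '')))

def write_response_audio(path="response_audio.wav", sample_rate=24000):
    """Write the collected pcm16 chunks to a mono WAV file for playback."""
    with wave.open(path, 'wb') as wav_file:
        wav_file.setnchannels(1)      # mono
        wav_file.setsampwidth(2)      # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(b''.join(audio_chunks))
    print(f"Response audio written to {path}")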

Cheers,
Andrew