Hi Folks:
I am new to the realtime API. I am writing code to try out features and learn.
I am writing a Python program using the websocket-client library. To start simple, I read a WAV file containing a spoken command and want the Realtime API to send back text.
My problem is that after I receive conversation.item.created, I never get a transcription or any further WebSocket events, such as input_audio_buffer.speech_stopped.
WebSocket connection opened.
Audio written to output_16khz_mono.wav
debug sending frames to openai
sending 0 1024
…
Sending input_audio_buffer.commit
Event: session.updated - Session has been updated.
{'type': 'session.updated', 'event_id': 'event_AHnNzP28AzQyltxYbHWVc', 'session': {'id': 'sess_AHnNyF82tvZhR75TqFEgC', 'object': 'realtime.session', 'model': 'gpt-4o-realtime-preview-2024-10-01', 'expires_at': 1728805678, 'modalities': ['text'], 'instructions': 'You are a helpful AI assistant. You execute commands to the best of your ability.Please answer in a clear and concise manner.', 'voice': 'alloy', 'turn_detection': {'type': 'server_vad', 'threshold': 0.3, 'prefix_padding_ms': 300, 'silence_duration_ms': 200}, 'input_audio_format': 'pcm16', 'output_audio_format': 'pcm16', 'input_audio_transcription': None, 'tool_choice': 'auto', 'temperature': 0.8, 'max_response_output_tokens': 'inf', 'tools': []}}
Event: input_audio_buffer.speech_started - Speech has started.
Event: input_audio_buffer.committed - Audio buffer has been committed.
{'type': 'input_audio_buffer.committed', 'event_id': 'event_AHnNzPkhZfh7MON8Zuc2v', 'previous_item_id': None, 'item_id': 'item_AHnNzdE29TE1YyjP7vxXw'}
Event: conversation.item.created - A conversation item has been created.
{'type': 'conversation.item.created', 'event_id': 'event_AHnNzLiujXFBaLy6uJ9rz', 'previous_item_id': None, 'item': {'id': 'item_AHnNzdE29TE1YyjP7vxXw', 'object': 'realtime.item', 'type': 'message', 'status': 'completed', 'role': 'user', 'content': [{'type': 'input_audio', 'transcript': None}]}}
If I add input_audio_transcription to the session.update payload, I get:
{'type': 'error', 'event_id': 'event_AHnEmJsPiDD7XR0Dqokux', 'error': {'type': 'invalid_request_error', 'code': 'unknown_parameter', 'message': "Unknown parameter: 'session.input_audio_transcription.enabled'.", 'param': 'session.input_audio_transcription.enabled', 'event_id': None}}
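Roughly, the extra field I sent looked like this (reconstructed from memory, so the exact payload may differ slightly; the nested enabled flag is the parameter the error complains about):

# Reconstructed from memory: the extra transcription field I added to session.update.
# The nested "enabled" flag is what the server rejects as an unknown parameter.
transcription_update = {
    "type": "session.update",
    "session": {
        "input_audio_transcription": {"enabled": True}
    }
}
ws.send(json.dumps(transcription_update))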
I believe I am sending the data as 16 kHz, 16-bit PCM, mono.
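As a sanity check, I can inspect the debug copy of the audio (written by the code below) with the standard wave module, something like:

import wave

# Sanity check: inspect the debug copy of the audio that gets sent
with wave.open("output_16khz_mono.wav", "rb") as wf:
    print("channels:", wf.getnchannels())      # expect 1 (mono)
    print("sample width:", wf.getsampwidth())  # expect 2 (16-bit PCM)
    print("sample rate:", wf.getframerate())   # expect 16000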
Here is my code, warts and all:
import base64
import os
import io
import json
import threading
import websocket
import wave
from pydub import AudioSegment
t_list = []
ws = None
# Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') # Requires OpenAI Realtime API Access
SYSTEM_MESSAGE = (
"You are a helpful AI assistant. You execute commands to the best of your ability."
"Please answer in a clear and concise manner."
)
VOICE = 'alloy'
if not OPENAI_API_KEY:
raise ValueError('Missing the OpenAI API key. Please set it in the .env file.')
# for debugging (wave is already imported above)
def write_audio_to_wav(audio_data, output_file, sample_rate=16000, num_channels=1, sample_width=2):
with wave.open(output_file, 'wb') as wav_file:
# Set the WAV file parameters
wav_file.setnchannels(num_channels) # Mono
wav_file.setsampwidth(sample_width) # 2 bytes for 16-bit audio
wav_file.setframerate(sample_rate) # Sample rate
# Write the raw PCM data to the WAV file
wav_file.writeframes(audio_data)
print(f"Audio written to {output_file}")
def convert_to_16khz_mono(input_file):
# Load the audio file
audio = AudioSegment.from_wav(input_file)
# Resample to 16kHz and make sure it's mono
audio = audio.set_frame_rate(16000).set_channels(1)
return audio.raw_data
def send_audio_to_openai(ws, audio_data):
"""Send audio data to OpenAI Realtime API in chunks."""
print(f"debug sending frames to openai")
frames_per_packet = 1024 # Adjust packet size as needed
for i in range(0, len(audio_data), frames_per_packet):
audio_chunk = audio_data[i:i + frames_per_packet]
audio_append = {
"type": "input_audio_buffer.append",
"audio": base64.b64encode(audio_chunk).decode('utf-8')
}
print(f"sending {i} {len(audio_chunk)}")
ws.send(json.dumps(audio_append))
audio_commit = {
"type": "input_audio_buffer.commit"
}
print("Sending input_audio_buffer.commit")
ws.send(json.dumps(audio_commit))
def on_message(ws, message):
"""Handle incoming WebSocket messages from OpenAI and print the text response."""
response = json.loads(message)
# Extracting event type from the message
event_type = response.get('type', '')
# Using match to handle different event types
match event_type:
# Session-related events
case 'session.created':
print(f"Event: {event_type} - Session has been created.")
case 'session.updated':
print(f"Event: {event_type} - Session has been updated.")
print(f"{response}")
# Conversation-related events
case 'conversation.created':
print(f"Event: {event_type} - Conversation has been created.")
print(f"{response}")
case 'conversation.item.created':
print(f"Event: {event_type} - A conversation item has been created.")
print(f"{response}")
case 'conversation.item.input_audio_transcription.completed':
print(f"Event: {event_type} - Audio transcription completed.")
case 'conversation.item.input_audio_transcription.failed':
print(f"***** Event: {event_type} - Audio transcription failed. ****")
case 'conversation.item.truncated':
print(f"Event: {event_type} - Conversation item was truncated.")
case 'conversation.item.deleted':
print(f"Event: {event_type} - Conversation item has been deleted.")
# Input audio buffer events
case 'input_audio_buffer.committed':
print(f"Event: {event_type} - Audio buffer has been committed.")
print(f"{response}")
case 'input_audio_buffer.cleared':
print(f"Event: {event_type} - Audio buffer has been cleared.")
case 'input_audio_buffer.speech_started':
print(f"Event: {event_type} - Speech has started.")
case 'input_audio_buffer.speech_stopped':
print(f"Event: {event_type} - Speech has stopped.")
# Response-related events
case 'response.created':
print(f"Event: {event_type} - A new response has been created.")
case 'response.done':
print(f"Event: {event_type} - Response is fully complete.")
if "response" in response:
output_items = response["response"].get("output", [])
full_text = ""
for item in output_items:
if item.get("type") == "message":
content = item.get("content", [])
for content_item in content:
if content_item.get("type") == "text":
full_text += content_item.get("text", "")
print(f"Final response: {full_text}")
else:
print(f"Response completed without output.")
case 'response.output_item.added':
print(f"Event: {event_type} - Response output item added.")
case 'response.output_item.done':
print(f"Event: {event_type} - Response output item done.")
case 'response.content_part.added':
print(f"Event: {event_type} - Response content part added.")
case 'response.content_part.done':
print(f"Event: {event_type} - Response content part done.")
# Partial and full text response events
case 'response.text.delta':
print(f"Event: {event_type} - Partial text response: {response.get('text_delta', '')}")
case 'response.text.done':
print(f"Event: {event_type} - Text response generation completed.")
#print(f"Full Text: {response.get('text', '')}")
# Audio response events
case 'response.audio_transcript.delta':
print(f"Event: {event_type} - Partial audio transcript: {response.get('transcript_delta', '')}")
case 'response.audio_transcript.done':
print(f"Event: {event_type} - Audio transcript generation completed.")
case 'response.audio.delta':
print(f"Event: {event_type} - Partial audio response delta.")
case 'response.audio.done':
print(f"Event: {event_type} - Audio response generation completed.")
# Function call arguments
case 'response.function_call_arguments.delta':
print(f"Event: {event_type} - Partial function call argument delta.")
case 'response.function_call_arguments.done':
print(f"Event: {event_type} - Function call argument generation completed.")
# Rate limit updates
case 'rate_limits.updated':
print(f"Event: {event_type} - Rate limits have been updated.")
# Default case for unhandled events
case _:
print(f"Unhandled event type: {event_type}")
print(f"{response}")
def on_error(ws, error):
print(f"WebSocket Error: {error} {type(error)}")
def on_close(ws, close_status_code, close_msg):
print(f"WebSocket Closed: {close_status_code} - {close_msg}")
def on_open(ws):
"""Send session update and start sending audio data."""
print("WebSocket connection opened.")
# Send session update to OpenAI WebSocket to receive text instead of audio
session_update = {
"type": "session.update",
"session": {
"input_audio_format": "pcm16", # Still need to match input audio format
"instructions": SYSTEM_MESSAGE,
"modalities": ["text"], # We are only interested in text
"turn_detection": {
"type": "server_vad",
"threshold": 0.3
},
"temperature": 0.8,
"tool_choice": "auto",
},
}
ws.send(json.dumps(session_update))
# Start sending audio data to OpenAI
    wav_file_path = './test.wav' # Replace with your actual WAV file path
audio_data = convert_to_16khz_mono(wav_file_path)
# for debugging
write_audio_to_wav(audio_data, "output_16khz_mono.wav")
# Using a separate thread to send audio
send_thread = threading.Thread(target=send_audio_to_openai, args=(ws, audio_data))
    send_thread.start()
    t_list.append(send_thread)  # start() returns None, so keep the thread object itself
def start_websocket():
websocket_url = 'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01'
# Initialize WebSocket connection
ws = websocket.WebSocketApp(
websocket_url,
header={"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime=v1"},
on_message=on_message,
on_error=on_error,
on_close=on_close,
on_open=on_open
)
# Run the WebSocket connection in the main thread
ws.run_forever()
if __name__ == "__main__":
#websocket.enableTrace(True)
t_list = []
send_thread = threading.Thread(target=start_websocket)
t_list.append(send_thread)
    send_thread.start()  # start() returns None; join the thread objects stored in t_list
for t in t_list:
t.join()
What else should I be looking at?
Cheers,
Andrew