Hi all,
I am a new OpenAI user and I am trying to use the Realtime API to get a WAV response to a question that I send as a WAV file. The question audio is generated with Google TTS (for testing); its content is "How do I sum 4 and 5?"
My code is something like this:
import websocket
import json
import base64
import threading
import os
import wave

from dotenv import load_dotenv

# Load the .env file so OPENAI_API_KEY is available in the environment.
load_dotenv()

# Retrieve the value of the "OPENAI_API_KEY" environment variable.
# A KeyError here means the key is missing from both the environment and .env.
API_KEY = os.environ["OPENAI_API_KEY"]

# Path to the question audio file (WAV format)
QUESTION_AUDIO_PATH = "/tmp/sum.wav"
# Path the assistant's spoken answer is written to
RESPONSE_AUDIO_PATH = "/tmp/output.wav"
REALTIME_API_URL = "wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview"

# Required headers (API key & beta access), in "Name: value" form.
headers = [
    "Authorization: Bearer " + API_KEY,
    "OpenAI-Beta: realtime=v1",
]

session_id = None
con_id = None
# Decoded PCM chunks of the audio answer, appended as they arrive.
audio_chunks = []
def save_wav(filename, audio_chunks):
    """Write raw PCM chunks to *filename* as a mono, 16-bit, 24 kHz WAV file.

    Args:
        filename: Destination path of the WAV file (overwritten if present).
        audio_chunks: Iterable of ``bytes`` objects holding raw PCM samples;
            they are concatenated in order.
    """
    with wave.open(filename, "wb") as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit PCM
        wav_file.setframerate(24000)  # 24 kHz sample rate
        wav_file.writeframes(b"".join(audio_chunks))  # Combine & write chunks


# Function to encode the WAV file in Base64
def encode_audio(file_path):
    """Return the contents of *file_path* as a Base64-encoded ASCII string.

    NOTE(review): the whole file is encoded, including any WAV header.
    The Realtime API's default input format is raw pcm16 -- confirm whether
    the header must be stripped (and the sample rate matched) before sending.
    """
    with open(file_path, "rb") as audio_file:
        return base64.b64encode(audio_file.read()).decode("utf-8")


def on_open(ws):
    """Send the spoken question, then ask the model to answer it.

    The user's audio item must be added to the conversation BEFORE
    ``response.create`` is sent; otherwise the model generates a response
    without ever having received the question.
    """
    print("WebSocket Connection Opened")

    # Step 1: add the user's audio question to the conversation
    audio_request = {
        "type": "conversation.item.create",
        "item": {
            "type": "message",
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "audio": encode_audio(QUESTION_AUDIO_PATH),
                }
            ],
        },
    }
    ws.send(json.dumps(audio_request))
    print("🎤 Sent audio data to OpenAI...")

    # Step 2: request a response to the conversation so far
    response_request = {
        "type": "response.create",
        "response": {
            "modalities": ["text", "audio"],
            "instructions": "Please assist the user.",
            "voice": "sage",
            "output_audio_format": "pcm16",
            "max_output_tokens": 1024,
            "conversation": "auto",
        },
    }
    ws.send(json.dumps(response_request))
    print("🎤 Sent request to create response...")


def on_message(ws, message):
    """Handle one server event: collect audio deltas and save them when done.

    Args:
        ws: The WebSocketApp the event arrived on; closed after ``response.done``.
        message: The raw JSON text of the server event.
    """
    global con_id
    print("Response received from OpenAI:", message)
    response = json.loads(message)
    event_type = response.get("type")

    if event_type == "response.created":
        # Remember the event id of the response that was started.
        con_id = response["event_id"]
        print("✅ Response received from OpenAI: con_id ", con_id)
    elif event_type == "response.audio.delta":
        # Each delta carries a Base64-encoded slice of the audio answer.
        audio_chunks.append(base64.b64decode(response["delta"]))
        print("🎤 Received an audio chunk!")
    elif event_type == "response.done":
        status = (response.get("response") or {}).get("status")
        if status == "completed":
            save_wav(RESPONSE_AUDIO_PATH, audio_chunks)
            print(f"✅ Audio saved as {RESPONSE_AUDIO_PATH!r}")
        else:
            # Failed / cancelled / incomplete: report it instead of waiting forever.
            print(f"Response finished with status {status!r}; no audio saved")
        ws.close()  # Close WebSocket when done
# Callback when an error occurs
def on_error(ws, error):
    """Print the error reported for the connection *ws*."""
    print(f"WebSocket Error: {error}")
# Callback when the connection is closed
def on_close(ws, close_status_code, close_reason):
    """Print the status code and reason the connection *ws* was closed with."""
    print(f"WebSocket Closed: {close_status_code} - {close_reason}")
# Create the WebSocketApp with the required headers (API key & beta access)
# and the callbacks that drive the request/response flow.
ws = websocket.WebSocketApp(
    REALTIME_API_URL,
    header=headers,
    on_open=on_open,
    on_message=on_message,
    on_error=on_error,
    on_close=on_close,
)

# Run WebSocket in a separate thread
wst = threading.Thread(target=ws.run_forever)
wst.daemon = True
wst.start()

# Keep the script running until the connection thread finishes.
# join() with a timeout waits without busy-spinning and stays interruptible.
while wst.is_alive():
    wst.join(timeout=1)
I tried changing the question content to text or to another audio file, but I get the same response every time. Can anyone suggest how to get it to work, please?
Thanks to all in advance.