Hello everyone,
I’m attempting to use OpenAI’s Realtime API (gpt-4o-realtime-preview-2024-12-17) in a voice-to-voice setup from my Raspberry Pi. I successfully fetch an ephemeral token (no more 401 errors), but when I send my SDP offer to https://api.openai.com/v1/realtime
, I get HTTP 201 plus my own local SDP, instead of a 200 with a genuine “answer.” Here’s my code:
import os
import sys
import signal
import asyncio
import requests
import pyaudio
import av
from aiortc import RTCPeerConnection, RTCConfiguration, RTCIceServer
from aiortc.mediastreams import AudioStreamTrack
# -------------------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------------------
# Put your standard API key here (or get from environment).
STANDARD_API_KEY = (
"sk-" # <--- Insert your standard key
)
# The model and voice you want from the Realtime API
MODEL_ID = "gpt-4o-realtime-preview-2024-12-17"
VOICE_NAME = "coral" # Or another valid voice, if available
INSTRUCTIONS = "You are helpful. No local STT or TTS here."
# Audio config for your Pi:
MIC_RATE = 48000
MIC_CHANNELS = 1
MIC_FORMAT = pyaudio.paInt16
FRAMES_PER_BUFFER = 1024
# Device indexes:
MIC_DEVICE_INDEX = None # e.g. 2 if your USB mic is index=2
SPEAKER_DEVICE_INDEX = None # e.g. 1 if your speaker is index=1
# -------------------------------------------------------------------------
# 1) GET EPHEMERAL TOKEN
# -------------------------------------------------------------------------
def get_ephemeral_token():
"""
Calls the /v1/realtime/sessions endpoint with your standard API key
to get an ephemeral token for the chosen model + voice.
"""
url = "https://api.openai.com/v1/realtime/sessions"
headers = {
"Authorization": f"Bearer {STANDARD_API_KEY}",
"Content-Type": "application/json",
}
payload = {
"model": MODEL_ID,
"voice": VOICE_NAME,
}
print("[TOKEN] Requesting ephemeral token from Realtime sessions...")
r = requests.post(url, headers=headers, json=payload)
if r.status_code != 200:
print("[ERROR] Could not fetch ephemeral token.")
print("Status code:", r.status_code)
print("Response:", r.text)
sys.exit(1)
data = r.json()
ephemeral_token = data.get("client_secret", {}).get("value", "")
if not ephemeral_token:
print("[ERROR] ephemeral_token is empty. Response data:", data)
sys.exit(1)
print("[TOKEN] Ephemeral token obtained successfully.")
return ephemeral_token
# -------------------------------------------------------------------------
# 2) PyAudioMicTrack (Mic -> WebRTC)
# -------------------------------------------------------------------------
class PyAudioMicTrack(AudioStreamTrack):
kind = "audio"
def __init__(self):
super().__init__()
self.pa = pyaudio.PyAudio()
try:
self.stream = self.pa.open(
format=MIC_FORMAT,
channels=MIC_CHANNELS,
rate=MIC_RATE,
input=True,
input_device_index=MIC_DEVICE_INDEX,
frames_per_buffer=FRAMES_PER_BUFFER,
)
except OSError as e:
print(f"[ERROR] Could not open mic device {MIC_DEVICE_INDEX}: {e}")
sys.exit(1)
async def recv(self):
data = self.stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
# Convert raw PCM to AudioFrame
num_samples = len(data) // 2 # S16 => 2 bytes each
audio_frame = av.AudioFrame.from_ndarray(
buffer_to_ndarray(data),
layout="mono" if MIC_CHANNELS == 1 else "stereo",
sample_rate=MIC_RATE,
)
return audio_frame
def stop(self):
print("[AUDIO] Stopping mic track.")
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
if self.pa:
self.pa.terminate()
self.pa = None
super().stop()
def buffer_to_ndarray(data: bytes):
import numpy as np
return np.frombuffer(data, dtype=np.int16)
# -------------------------------------------------------------------------
# 3) PyAudioSpeaker (Remote track -> Speaker)
# -------------------------------------------------------------------------
class PyAudioSpeaker:
def __init__(self):
self.pa = pyaudio.PyAudio()
self.stream = self.pa.open(
format=MIC_FORMAT,
channels=MIC_CHANNELS,
rate=MIC_RATE,
output=True,
output_device_index=SPEAKER_DEVICE_INDEX,
frames_per_buffer=FRAMES_PER_BUFFER,
)
def play(self, pcm_data: bytes):
self.stream.write(pcm_data)
def close(self):
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
if self.pa:
self.pa.terminate()
self.pa = None
# -------------------------------------------------------------------------
# 4) RealtimeVoiceChat main class
# -------------------------------------------------------------------------
class RealtimeVoiceChat:
def __init__(self):
self.pc = None
self.alive = True
self.speaker = PyAudioSpeaker()
self.ephemeral_token = None
async def start(self):
# 1) ephemeral token
self.ephemeral_token = get_ephemeral_token()
# 2) PeerConnection
ice_config = RTCConfiguration([RTCIceServer("stun:stun.l.google.com:19302")])
self.pc = RTCPeerConnection(ice_config)
# 3) Add local mic
mic_track = PyAudioMicTrack()
self.pc.addTrack(mic_track)
# 4) On remote track => write to speaker
@self.pc.on("track")
def on_track(track):
print("[INFO] Received remote track of kind:", track.kind)
if track.kind == "audio":
asyncio.ensure_future(self._play_remote(track))
# 5) Create Offer
offer = await self.pc.createOffer()
await self.pc.setLocalDescription(offer)
# 6) POST offer to /v1/realtime with ephemeral token
from urllib.parse import urlencode
query = urlencode({
"model": MODEL_ID,
"voice": VOICE_NAME,
"instructions": INSTRUCTIONS,
})
url = f"https://api.openai.com/v1/realtime?{query}"
headers = {
"Authorization": f"Bearer {self.ephemeral_token}",
"Content-Type": "application/sdp"
}
print("[INFO] Sending offer to Realtime API.")
resp = requests.post(url, headers=headers, data=self.pc.localDescription.sdp)
if resp.status_code != 200:
print("[ERROR] Realtime offer->answer failed:", resp.status_code, resp.text)
await self.stop()
return
# 7) setRemoteDescription from answer
answer_sdp = resp.text
answer = {
"type": "answer",
"sdp": answer_sdp,
}
await self.pc.setRemoteDescription(answer)
print("[INFO] WebRTC established. Speak now. Ctrl+C to stop.")
while self.alive:
await asyncio.sleep(1)
async def _play_remote(self, track):
import av
while self.alive:
frame = await track.recv()
buf = frame.planes[0].to_bytes()
self.speaker.play(buf)
async def stop(self):
self.alive = False
if self.pc:
await self.pc.close()
self.speaker.close()
print("[MAIN] RealtimeVoiceChat stopped.")
# -------------------------------------------------------------------------
# MAIN
# -------------------------------------------------------------------------
async def main():
chat = RealtimeVoiceChat()
loop = asyncio.get_event_loop()
def handle_sigint():
print("\n[MAIN] Ctrl+C pressed, stopping chat.")
loop.create_task(chat.stop())
loop.add_signal_handler(signal.SIGINT, handle_sigint)
await chat.start()
await chat.stop()
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
print("[MAIN] KeyboardInterrupt, exiting.")
sys.exit(0)
Symptoms:
- I do createOffer(), setLocalDescription(offer), then POST that SDP to /v1/realtime.
- The endpoint responds with HTTP 201 plus the same SDP I just sent, instead of HTTP 200 + a remote “answer.”
- My code logs Realtime offer->answer failed: 201 … and can’t finalize the WebRTC handshake.
Has anyone encountered this issue, or can suggest what might be going wrong? I’ve verified my ephemeral token is valid (no more 401s), so the problem now is that the server only sends back the local SDP with status 201, not a true answer. Could it be that I’m missing voice permissions, or the model/voice is unavailable?
Any help or guidance would be appreciated. Thank you!