Issues with WebRTC Realtime Endpoint Returning HTTP 201 Instead of an SDP Answer

yadnik22 · December 29, 2024, 2:11pm

Hello everyone,

I’m attempting to use OpenAI’s Realtime API (gpt-4o-realtime-preview-2024-12-17) in a voice-to-voice setup from my Raspberry Pi. I successfully fetch an ephemeral token (no more 401 errors), but when I send my SDP offer to https://api.openai.com/v1/realtime, I get HTTP 201 plus my own local SDP, instead of a 200 with a genuine “answer.” Here’s my code:

import os
import sys
import signal
import asyncio
import requests
import pyaudio
import av

from aiortc import RTCPeerConnection, RTCConfiguration, RTCIceServer
from aiortc.mediastreams import AudioStreamTrack

# -------------------------------------------------------------------------
# CONFIG
# -------------------------------------------------------------------------
# Put your standard API key here (or get from environment).
# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to live in this file; the literal below is only a fallback placeholder.
STANDARD_API_KEY = os.environ.get(
    "OPENAI_API_KEY",
    "sk-",  # <--- Insert your standard key
)

# The model and voice you want from the Realtime API
MODEL_ID = "gpt-4o-realtime-preview-2024-12-17"
VOICE_NAME = "coral"  # Or another valid voice, if available
INSTRUCTIONS = "You are helpful. No local STT or TTS here."

# Audio config for your Pi. These values are used both to open the
# microphone stream and to open the speaker stream.
MIC_RATE = 48000                # samples per second
MIC_CHANNELS = 1                # 1 => "mono" layout, anything else => "stereo"
MIC_FORMAT = pyaudio.paInt16    # 16-bit signed PCM (2 bytes per sample)
FRAMES_PER_BUFFER = 1024        # frames read from the mic per recv() call

# Device indexes (None is passed straight through to PyAudio):
MIC_DEVICE_INDEX = None   # e.g. 2 if your USB mic is index=2
SPEAKER_DEVICE_INDEX = None  # e.g. 1 if your speaker is index=1

# -------------------------------------------------------------------------
# 1) GET EPHEMERAL TOKEN
# -------------------------------------------------------------------------
def get_ephemeral_token():
    """
    Request an ephemeral token from the /v1/realtime/sessions endpoint.

    Posts MODEL_ID and VOICE_NAME as JSON, authenticated with
    STANDARD_API_KEY, and returns the string stored under
    ``client_secret.value`` in the JSON response.

    The process exits with status 1 (after printing diagnostics) when the
    request cannot be sent, the status code is not 200, the body is not
    JSON, or the token is missing/empty.
    """
    url = "https://api.openai.com/v1/realtime/sessions"
    headers = {
        "Authorization": f"Bearer {STANDARD_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": MODEL_ID,
        "voice": VOICE_NAME,
    }
    print("[TOKEN] Requesting ephemeral token from Realtime sessions...")
    try:
        # Without a timeout a stalled connection would hang the program forever.
        r = requests.post(url, headers=headers, json=payload, timeout=30)
    except requests.RequestException as e:
        print("[ERROR] Could not fetch ephemeral token.")
        print("Request failed:", e)
        sys.exit(1)

    if r.status_code != 200:
        print("[ERROR] Could not fetch ephemeral token.")
        print("Status code:", r.status_code)
        print("Response:", r.text)
        sys.exit(1)

    try:
        data = r.json()
    except ValueError:
        print("[ERROR] ephemeral_token is empty. Response data:", r.text)
        sys.exit(1)

    # "client_secret" may be absent or null; treat both as "no token".
    client_secret = data.get("client_secret") or {}
    ephemeral_token = client_secret.get("value", "")
    if not ephemeral_token:
        print("[ERROR] ephemeral_token is empty. Response data:", data)
        sys.exit(1)

    print("[TOKEN] Ephemeral token obtained successfully.")
    return ephemeral_token

# -------------------------------------------------------------------------
# 2) PyAudioMicTrack (Mic -> WebRTC)
# -------------------------------------------------------------------------
class PyAudioMicTrack(AudioStreamTrack):
    """
    Audio track that turns microphone buffers read via PyAudio into
    ``av.AudioFrame`` objects for the peer connection.

    The input stream is opened with MIC_FORMAT / MIC_CHANNELS / MIC_RATE on
    MIC_DEVICE_INDEX; the process exits with status 1 if the device cannot
    be opened.
    """

    kind = "audio"

    def __init__(self):
        super().__init__()
        # Running count of samples already handed out; used as the frame pts.
        self._samples_sent = 0
        self.pa = pyaudio.PyAudio()
        try:
            self.stream = self.pa.open(
                format=MIC_FORMAT,
                channels=MIC_CHANNELS,
                rate=MIC_RATE,
                input=True,
                input_device_index=MIC_DEVICE_INDEX,
                frames_per_buffer=FRAMES_PER_BUFFER,
            )
        except OSError as e:
            print(f"[ERROR] Could not open mic device {MIC_DEVICE_INDEX}: {e}")
            sys.exit(1)

    async def recv(self):
        """
        Read one buffer of FRAMES_PER_BUFFER frames from the microphone and
        return it as a timestamped s16 ``av.AudioFrame``.
        """
        from fractions import Fraction

        # NOTE(review): this read is synchronous, so the event loop is blocked
        # until the buffer is filled; consider moving it to a worker thread.
        data = self.stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)

        # from_ndarray() expects a 2-D array; for packed s16 audio that is a
        # single row holding the (interleaved) samples.
        samples = buffer_to_ndarray(data).reshape(1, -1)
        audio_frame = av.AudioFrame.from_ndarray(
            samples,
            format="s16",
            layout="mono" if MIC_CHANNELS == 1 else "stereo",
        )
        # from_ndarray() takes no sample-rate argument, so set it afterwards,
        # together with the timestamp information.
        audio_frame.sample_rate = MIC_RATE
        audio_frame.pts = self._samples_sent
        audio_frame.time_base = Fraction(1, MIC_RATE)
        self._samples_sent += audio_frame.samples
        return audio_frame

    def stop(self):
        """Close the PyAudio stream and instance (if still open), then stop the track."""
        print("[AUDIO] Stopping mic track.")
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
        if self.pa:
            self.pa.terminate()
            self.pa = None
        super().stop()

def buffer_to_ndarray(data: bytes):
    """Interpret *data* as native-endian 16-bit signed samples and return a 1-D NumPy array."""
    import numpy

    samples = numpy.frombuffer(data, dtype=numpy.int16)
    return samples

# -------------------------------------------------------------------------
# 3) PyAudioSpeaker (Remote track -> Speaker)
# -------------------------------------------------------------------------
class PyAudioSpeaker:
    """Thin wrapper around a PyAudio output stream used to play raw PCM bytes."""

    def __init__(self):
        # The output stream reuses the microphone's format, channel count
        # and sample rate settings.
        self.pa = pyaudio.PyAudio()
        stream_options = {
            "format": MIC_FORMAT,
            "channels": MIC_CHANNELS,
            "rate": MIC_RATE,
            "output": True,
            "output_device_index": SPEAKER_DEVICE_INDEX,
            "frames_per_buffer": FRAMES_PER_BUFFER,
        }
        self.stream = self.pa.open(**stream_options)

    def play(self, pcm_data: bytes):
        """Write *pcm_data* to the output stream."""
        self.stream.write(pcm_data)

    def close(self):
        """Stop and close the stream, then terminate PyAudio; safe to call repeatedly."""
        out_stream = self.stream
        if out_stream:
            out_stream.stop_stream()
            out_stream.close()
            self.stream = None

        audio = self.pa
        if audio:
            audio.terminate()
            self.pa = None

# -------------------------------------------------------------------------
# 4) RealtimeVoiceChat main class
# -------------------------------------------------------------------------
class RealtimeVoiceChat:
    """
    Drives one voice session: fetches an ephemeral token, negotiates a WebRTC
    connection with the Realtime endpoint, sends microphone audio and plays
    the remote audio track on the speaker.
    """

    def __init__(self):
        self.pc = None
        self.alive = True
        self.speaker = PyAudioSpeaker()
        self.ephemeral_token = None
        # Kept so stop() can release the microphone as well.
        self.mic_track = None

    async def start(self):
        """
        Set up the connection and then idle until ``self.alive`` is cleared.

        On a failed offer/answer exchange the error is printed, stop() is
        called, and the method returns early.
        """
        from urllib.parse import urlencode

        from aiortc import RTCSessionDescription

        # 1) ephemeral token
        self.ephemeral_token = get_ephemeral_token()

        # 2) PeerConnection
        ice_config = RTCConfiguration([RTCIceServer("stun:stun.l.google.com:19302")])
        self.pc = RTCPeerConnection(ice_config)

        # 3) Add local mic
        self.mic_track = PyAudioMicTrack()
        self.pc.addTrack(self.mic_track)

        # 4) On remote track => write to speaker
        @self.pc.on("track")
        def on_track(track):
            print("[INFO] Received remote track of kind:", track.kind)
            if track.kind == "audio":
                asyncio.ensure_future(self._play_remote(track))

        # 5) Create Offer
        offer = await self.pc.createOffer()
        await self.pc.setLocalDescription(offer)

        # 6) POST offer to /v1/realtime with ephemeral token
        query = urlencode({
            "model": MODEL_ID,
            "voice": VOICE_NAME,
            "instructions": INSTRUCTIONS,
        })
        url = f"https://api.openai.com/v1/realtime?{query}"
        headers = {
            "Authorization": f"Bearer {self.ephemeral_token}",
            "Content-Type": "application/sdp"
        }

        print("[INFO] Sending offer to Realtime API.")
        try:
            resp = requests.post(
                url,
                headers=headers,
                data=self.pc.localDescription.sdp,
                timeout=30,
            )
        except requests.RequestException as e:
            print("[ERROR] Realtime offer->answer failed:", e)
            await self.stop()
            return

        # The endpoint has been observed to reply with 201 (Created) rather
        # than 200, so any 2xx status is treated as success and the body is
        # used as the answer SDP.
        if not 200 <= resp.status_code < 300:
            print("[ERROR] Realtime offer->answer failed:", resp.status_code, resp.text)
            await self.stop()
            return

        # 7) setRemoteDescription from answer
        # aiortc reads .sdp/.type attributes, so a plain dict is not accepted.
        answer = RTCSessionDescription(sdp=resp.text, type="answer")
        await self.pc.setRemoteDescription(answer)

        print("[INFO] WebRTC established. Speak now. Ctrl+C to stop.")
        while self.alive:
            await asyncio.sleep(1)

    async def _play_remote(self, track):
        """Forward frames from the remote *track* to the speaker until stopped."""
        from aiortc.mediastreams import MediaStreamError

        # NOTE(review): the speaker is opened with MIC_RATE / MIC_CHANNELS;
        # presumably the decoded remote frames may use a different layout or
        # rate (e.g. stereo) - confirm and resample if playback sounds wrong.
        while self.alive:
            try:
                frame = await track.recv()
            except MediaStreamError:
                # The remote track ended (e.g. the connection was closed).
                break
            buf = frame.planes[0].to_bytes()
            self.speaker.play(buf)

    async def stop(self):
        """Signal the loops to end and release the connection, mic and speaker."""
        self.alive = False
        if self.pc:
            await self.pc.close()
        if self.mic_track:
            self.mic_track.stop()
            self.mic_track = None
        self.speaker.close()
        print("[MAIN] RealtimeVoiceChat stopped.")

# -------------------------------------------------------------------------
# MAIN
# -------------------------------------------------------------------------
async def main():
    """
    Create the chat, install a Ctrl+C handler, run the session and always
    clean up afterwards.
    """
    chat = RealtimeVoiceChat()

    loop = asyncio.get_running_loop()

    def handle_sigint():
        print("\n[MAIN] Ctrl+C pressed, stopping chat.")
        # Only clear the flag here: start() polls it once per second and
        # returns, after which the finally-block below performs the single
        # stop() call. This avoids a fire-and-forget task and a double stop.
        chat.alive = False

    loop.add_signal_handler(signal.SIGINT, handle_sigint)

    try:
        await chat.start()
    finally:
        # Runs on normal return and on errors, so audio devices and the peer
        # connection are released either way.
        await chat.stop()

# Script entry point: run the async main() on a new event loop.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # main() installs its own SIGINT handler on the loop, so this branch
        # is only a fallback for a Ctrl+C that still surfaces as an exception.
        print("[MAIN] KeyboardInterrupt, exiting.")
        sys.exit(0)

Symptoms:

I do createOffer(), setLocalDescription(offer), then POST that SDP to /v1/realtime.
The endpoint responds with HTTP 201 plus the same SDP I just sent, instead of HTTP 200 + a remote “answer.”
My code logs Realtime offer->answer failed: 201 … and can’t finalize the WebRTC handshake.

Has anyone encountered this issue, or can suggest what might be going wrong? I’ve verified my ephemeral token is valid (no more 401s), so the problem now is that the server only sends back the local SDP with status 201, not a true answer. Could it be that I’m missing voice permissions, or the model/voice is unavailable?

Any help or guidance would be appreciated. Thank you!

rbcoder12345 · January 4, 2025, 11:24am

Same problem here. Have you found the solution?

Topic		Replies	Views
400 SDP Error with Empty Reason Message During WebRTC Transcription API transcribe , realtime	2	358	October 30, 2025
Realtime WebRTC doesn't work with ephemeral token API	1	539	March 23, 2025
Can't Hear Inbound Audio from OpenAI Realtime Agent (WebRTC) - Outbound Works, Inbound Stuck at 1 kbps API ios-app , agents , audio , api-realtime-speech	2	319	November 10, 2025
Realtime API - SDP response says invalid realtime token Bugs realtime	5	408	October 17, 2025
Trouble establishing connection with wss://api.openai.com/v1/realtime for GPT-4o — No consumption visible in usage dashboard Bugs api	1	612	September 9, 2025

Issues with WebRTC Realtime Endpoint Returning

Related topics