Semantic VAD might not be working in transcription mode

I’m implementing realtime transcription over WebSockets using the Realtime API with semantic VAD specified, but it seems that with small fillers like ‘umm’ the connection breaks and a ‘conversation.item.input_audio_transcription.completed’ event is sent. In the OpenAI Playground’s Realtime API, when I specify semantic VAD with eagerness: low, it waits for about 5 seconds of silence even after I say a filler. Is transcription mode not working correctly?

Or is there something wrong with my current implementation or the way I’m sending requests? For reference, my server action and client page are below.

"use server";

import { OpenAI } from "openai";

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

export async function fetchSTTEphemeralToken(): Promise<OpenAI.Beta.Realtime.Sessions.SessionCreateResponse.ClientSecret> {
  const response = await openai.beta.realtime.transcriptionSessions.create({});

  const ephemeralToken = response.client_secret;

  return ephemeralToken;
}
"use client";

import { useState, useRef, useEffect } from "react";
import { fetchSTTEphemeralToken } from "../actions/fetchSTTEphemeralToken";

// AudioWorklet processor source, defined at the top level and loaded later via a Blob URL
const audioWorkletProcessorCode = `
class PCMProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.sampleRate = 24000;
    this.chunkSize = this.sampleRate * 0.1; // 100ms chunks
    this.buffer = [];
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0];
    if (input && input[0]) {
      const float32Data = input[0];
      this.buffer.push(...float32Data);

      while (this.buffer.length >= this.chunkSize) {
        const chunk = this.buffer.slice(0, this.chunkSize);
        this.buffer = this.buffer.slice(this.chunkSize);

        const int16Buffer = new Int16Array(chunk.length);
        for (let i = 0; i < chunk.length; i++) {
          int16Buffer[i] = Math.max(-1, Math.min(1, chunk[i])) * 0x7fff;
        }

        this.port.postMessage(int16Buffer.buffer, [int16Buffer.buffer]);
      }
    }
    return true;
  }
}

registerProcessor('pcm-processor', PCMProcessor);
`;

export default function TalkPage() {
  const [isRecording, setIsRecording] = useState(false);
  const [transcription, setTranscription] = useState("");
  const [status, setStatus] = useState("Ready");
  const [error, setError] = useState<string | null>(null);

  const websocketRef = useRef<WebSocket | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const audioWorkletNodeRef = useRef<AudioWorkletNode | null>(null);
  const streamRef = useRef<MediaStream | null>(null);

  // Function to start recording
  const startRecording = async () => {
    try {
      setStatus("Initializing...");
      setError(null);

      const ephemeralToken = await fetchSTTEphemeralToken();
      console.log("Ephemeral token:", ephemeralToken);

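      // Connect to the Realtime API transcription endpoint, authenticating with the ephemeral token via WebSocket subprotocols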
      const ws = new WebSocket(
        "wss://api.openai.com/v1/realtime?intent=transcription",
        [
          "realtime",
          `openai-insecure-api-key.${ephemeralToken.value}`,
          "openai-beta.realtime-v1",
        ]
      );

      websocketRef.current = ws;

      ws.onopen = async () => {
        console.log("WebSocket connection established");

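        // Configure the transcription session: model, language, noise reduction, and semantic VAD turn detection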
        const configMessage = {
          type: "transcription_session.update",
          session: {
            input_audio_transcription: {
              model: "gpt-4o-transcribe",
              language: "ja",
            },
            input_audio_noise_reduction: { type: "near_field" },
            turn_detection: {
              type: "semantic_vad",
              eagerness: "low",
            },
          },
        };
        ws.send(JSON.stringify(configMessage));

        try {
          const stream = await navigator.mediaDevices.getUserMedia({
            audio: { sampleRate: 24000, channelCount: 1 },
          });
          streamRef.current = stream;

          const audioContext = new AudioContext({ sampleRate: 24000 });
          audioContextRef.current = audioContext;

          // Set up AudioWorklet
          const blob = new Blob([audioWorkletProcessorCode], {
            type: "application/javascript",
          });
          const workletURL = URL.createObjectURL(blob);
          await audioContext.audioWorklet.addModule(workletURL);

          const source = audioContext.createMediaStreamSource(stream);
          const pcmProcessor = new AudioWorkletNode(
            audioContext,
            "pcm-processor"
          );
          audioWorkletNodeRef.current = pcmProcessor;

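          // Forward each 100 ms PCM16 chunk from the worklet as a base64-encoded input_audio_buffer.append event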
          pcmProcessor.port.onmessage = (event) => {
            if (websocketRef.current?.readyState === WebSocket.OPEN) {
              const audio = Buffer.from(event.data).toString("base64");
              websocketRef.current.send(
                JSON.stringify({
                  type: "input_audio_buffer.append",
                  audio,
                })
              );
            }
          };

          source.connect(pcmProcessor);
          pcmProcessor.connect(audioContext.destination);

          setIsRecording(true);
          setStatus("Recording");
        } catch (err) {
          console.error("Error accessing microphone:", err);
          setError(
            `Microphone error: ${
              err instanceof Error ? err.message : String(err)
            }`
          );
        }
      };

      ws.onmessage = (event) => {
        const message = JSON.parse(event.data);
        console.log("WebSocket message:", message);

        if (
          message.type ===
          "conversation.item.input_audio_transcription.completed"
        ) {
          console.log("Transcription completed:", message);
          if (message.transcript) {
            setTranscription(message.transcript);
          }
        }
      };

      ws.onerror = (error) => {
        console.error("WebSocket error:", error);
        setError(`WebSocket error: ${error}`);
        setStatus("Error");
      };

      ws.onclose = (event) => {
        console.log("WebSocket closed:", event);
        setError(`WebSocket closed: ${event.reason}`);
        setStatus("Disconnected");
        stopRecording();
      };
    } catch (err) {
      console.error("Error starting recording:", err);
      setError(`Error: ${err instanceof Error ? err.message : String(err)}`);
      setStatus("Error");
    }
  };

  const stopRecording = () => {
    if (audioWorkletNodeRef.current) {
      audioWorkletNodeRef.current.disconnect();
      audioWorkletNodeRef.current = null;
    }

    if (audioContextRef.current) {
      audioContextRef.current.close();
      audioContextRef.current = null;
    }

    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => track.stop());
      streamRef.current = null;
    }

    if (websocketRef.current) {
      websocketRef.current.close();
      websocketRef.current = null;
    }

    setIsRecording(false);
    setStatus("Stopped");
  };

  // Clean up on component unmount
  useEffect(() => {
    return () => {
      if (streamRef.current) {
        streamRef.current.getTracks().forEach((track) => track.stop());
      }

      if (websocketRef.current) {
        websocketRef.current.close();
      }
    };
  }, []);

  return (
    <div className="flex flex-col items-center justify-center min-h-screen p-4 bg-gray-50">
      <div className="w-full max-w-md p-6 bg-white rounded-lg shadow-md">
        <h1 className="mb-6 text-2xl font-bold text-center">
          Experimental Talk Page
        </h1>

        <div className="mb-6">
          <div className="p-4 mb-4 text-sm bg-gray-100 rounded-lg">
            <p className="font-semibold">
              Status: <span className="font-normal">{status}</span>
            </p>
            {error && <p className="mt-2 text-red-500">{error}</p>}
          </div>

          <div className="flex justify-center">
            <button
              onClick={isRecording ? stopRecording : startRecording}
              className={`flex items-center justify-center w-16 h-16 rounded-full focus:outline-none ${
                isRecording
                  ? "bg-red-500 hover:bg-red-600"
                  : "bg-blue-500 hover:bg-blue-600"
              }`}
            >
              {isRecording ? (
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="w-8 h-8 text-white"
                  viewBox="0 0 20 20"
                  fill="currentColor"
                >
                  <rect x="6" y="6" width="8" height="8" fill="white" />
                </svg>
              ) : (
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="w-8 h-8 text-white"
                  viewBox="0 0 20 20"
                  fill="currentColor"
                >
                  <path
                    fillRule="evenodd"
                    d="M10 18a8 8 0 100-16 8 8 0 000 16zM9.555 7.168A1 1 0 008 8v4a1 1 0 001.555.832l3-2a1 1 0 000-1.664l-3-2z"
                    clipRule="evenodd"
                  />
                </svg>
              )}
            </button>
          </div>
        </div>

        <div className="p-4 bg-gray-100 rounded-lg">
          <h2 className="mb-2 text-lg font-semibold">Transcription:</h2>
          <div className="p-3 bg-white border rounded-md min-h-[100px]">
            {transcription || (
              <span className="text-gray-400">
                Speak to see transcription here...
              </span>
            )}
          </div>
        </div>
      </div>
    </div>
  );
}

Hi, just to understand the issue, are fillers being ignored in transcription mode? Do you have a recording that shows the problem?

It took me almost a whole day to debug the same issue. The solution is simple: use the model “gpt-4o-realtime-preview-2024-12-17”. I was using the older model, “gpt-4o-realtime-preview-2024-10-01”, which gave me the error message “Semantic VAD is not supported for model gpt-4o-realtime-preview-2024-10-01”.

Please use “gpt-4o-realtime-preview-2024-12-17” and transcription will work for you.
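
If you mint the session server-side with the Node SDK, pinning the newer model when you create the session should be enough. A rough sketch along the lines of your fetchSTTEphemeralToken action, assuming the beta realtime sessions endpoint (adapt it if you stay on the transcription-intent flow):

"use server";

import { OpenAI } from "openai";

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

export async function fetchRealtimeEphemeralToken() {
  // Pin a model that supports semantic VAD when creating the session
  const session = await openai.beta.realtime.sessions.create({
    model: "gpt-4o-realtime-preview-2024-12-17",
    input_audio_transcription: { model: "gpt-4o-transcribe", language: "ja" },
    turn_detection: { type: "semantic_vad", eagerness: "low" },
  });

  // Hand the ephemeral client secret to the browser, same as before
  return session.client_secret;
}

On the WebSocket side you’d then connect without intent=transcription and send session.update instead of transcription_session.update.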