I’m implementing a realtime transcription API over WebSockets with semantic VAD, but small fillers like ‘umm’ seem to cut the turn short and ‘conversation.item.input_audio_transcription.completed’ is sent almost immediately. In the OpenAI Playground’s Realtime API, when I specify semantic VAD with eagerness: low, it waits for about 5 seconds of silence even after I say a filler. Is transcribe mode not working correctly, or is there something wrong with my implementation or the way I’m sending requests?
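For reference, this is the turn detection setting I send in the transcription_session.update event right after the socket opens (full code below), pulled out here as a standalone object; my understanding is that eagerness: "low" should make semantic VAD tolerate longer pauses and fillers before ending a turn:

const turnDetection = {
  type: "semantic_vad",
  eagerness: "low", // expecting it to wait out short fillers like 'umm', as the Playground does
};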
"use server";
import { OpenAI } from "openai";
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
export async function fetchSTTEphemeralToken(): Promise<OpenAI.Beta.Realtime.Sessions.SessionCreateResponse.ClientSecret> {
const response = await openai.beta.realtime.transcriptionSessions.create({});
const ephemeralToken = response.client_secret;
return ephemeralToken;
}
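I also wondered whether the session should instead be configured when the ephemeral token is created, rather than via transcription_session.update after connecting. A minimal sketch of that variant, assuming the SDK's transcriptionSessions.create params mirror the REST /v1/realtime/transcription_sessions body (I have not verified the exact TypeScript field names):

// Hypothetical variant of the server action above; the create-call field names are assumed, not verified
export async function fetchSTTEphemeralTokenConfigured(): Promise<OpenAI.Beta.Realtime.Sessions.SessionCreateResponse.ClientSecret> {
  const response = await openai.beta.realtime.transcriptionSessions.create({
    input_audio_transcription: { model: "gpt-4o-transcribe", language: "ja" },
    input_audio_noise_reduction: { type: "near_field" },
    turn_detection: { type: "semantic_vad", eagerness: "low" },
  });
  return response.client_secret;
}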
"use client";
import { useState, useRef, useEffect } from "react";
import { fetchSTTEphemeralToken } from "../actions/fetchSTTEphemeralToken";
// Add the AudioWorklet processor code at the top level
const audioWorkletProcessorCode = `
class PCMProcessor extends AudioWorkletProcessor {
constructor() {
super();
this.sampleRate = 24000;
this.chunkSize = this.sampleRate * 0.1; // 100ms chunks
this.buffer = [];
}
process(inputs, outputs, parameters) {
const input = inputs[0];
if (input && input[0]) {
const float32Data = input[0];
this.buffer.push(...float32Data);
while (this.buffer.length >= this.chunkSize) {
const chunk = this.buffer.slice(0, this.chunkSize);
this.buffer = this.buffer.slice(this.chunkSize);
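        // Convert float32 samples in [-1, 1] to 16-bit PCM, the format the Realtime API expects for input audio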
const int16Buffer = new Int16Array(chunk.length);
for (let i = 0; i < chunk.length; i++) {
int16Buffer[i] = Math.max(-1, Math.min(1, chunk[i])) * 0x7fff;
}
this.port.postMessage(int16Buffer.buffer, [int16Buffer.buffer]);
}
}
return true;
}
}
registerProcessor('pcm-processor', PCMProcessor);
`;
export default function TalkPage() {
const [isRecording, setIsRecording] = useState(false);
const [transcription, setTranscription] = useState("");
const [status, setStatus] = useState("Ready");
const [error, setError] = useState<string | null>(null);
const websocketRef = useRef<WebSocket | null>(null);
const audioContextRef = useRef<AudioContext | null>(null);
const audioWorkletNodeRef = useRef<AudioWorkletNode | null>(null);
const streamRef = useRef<MediaStream | null>(null);
// Function to start recording
const startRecording = async () => {
try {
setStatus("Initializing...");
setError(null);
const ephemeralToken = await fetchSTTEphemeralToken();
console.log("Ephemeral token:", ephemeralToken);
const ws = new WebSocket(
"wss://api.openai.com/v1/realtime?intent=transcription",
[
"realtime",
`openai-insecure-api-key.${ephemeralToken.value}`,
"openai-beta.realtime-v1",
]
);
websocketRef.current = ws;
ws.onopen = async () => {
console.log("WebSocket connection established");
const configMessage = {
type: "transcription_session.update",
session: {
input_audio_transcription: {
model: "gpt-4o-transcribe",
language: "ja",
},
input_audio_noise_reduction: { type: "near_field" },
turn_detection: {
type: "semantic_vad",
eagerness: "low",
},
},
};
ws.send(JSON.stringify(configMessage));
try {
const stream = await navigator.mediaDevices.getUserMedia({
audio: { sampleRate: 24000, channelCount: 1 },
});
streamRef.current = stream;
const audioContext = new AudioContext({ sampleRate: 24000 });
audioContextRef.current = audioContext;
// Set up AudioWorklet
const blob = new Blob([audioWorkletProcessorCode], {
type: "application/javascript",
});
const workletURL = URL.createObjectURL(blob);
await audioContext.audioWorklet.addModule(workletURL);
const source = audioContext.createMediaStreamSource(stream);
const pcmProcessor = new AudioWorkletNode(
audioContext,
"pcm-processor"
);
audioWorkletNodeRef.current = pcmProcessor;
          pcmProcessor.port.onmessage = (event) => {
            if (websocketRef.current?.readyState === WebSocket.OPEN) {
              // Base64-encode the PCM16 chunk without relying on Node's Buffer,
              // which is not guaranteed to exist in browser code
              const bytes = new Uint8Array(event.data);
              let binary = "";
              for (let i = 0; i < bytes.length; i++) {
                binary += String.fromCharCode(bytes[i]);
              }
              const audio = btoa(binary);
              websocketRef.current.send(
                JSON.stringify({
                  type: "input_audio_buffer.append",
                  audio,
                })
              );
            }
          };
source.connect(pcmProcessor);
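          // The processor writes no output, so this connection is silent; it just keeps
          // the worklet node in the rendering graph so process() keeps being called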
pcmProcessor.connect(audioContext.destination);
setIsRecording(true);
setStatus("Recording");
} catch (err) {
console.error("Error accessing microphone:", err);
setError(
`Microphone error: ${
err instanceof Error ? err.message : String(err)
}`
);
}
};
ws.onmessage = (event) => {
const message = JSON.parse(event.data);
console.log("WebSocket message:", message);
if (
message.type ===
"conversation.item.input_audio_transcription.completed"
) {
console.log("Transcription completed:", message);
if (message.transcript) {
setTranscription(message.transcript);
}
}
};
      ws.onerror = (error) => {
        // The error event carries no useful message, so surface a generic error instead of "[object Event]"
        console.error("WebSocket error:", error);
        setError("WebSocket error (see console for details)");
        setStatus("Error");
      };
      ws.onclose = (event) => {
        console.log("WebSocket closed:", event);
        // Include the close code; the reason string is often empty
        setError(`WebSocket closed (code ${event.code}): ${event.reason || "no reason given"}`);
        setStatus("Disconnected");
        stopRecording();
      };
} catch (err) {
console.error("Error starting recording:", err);
setError(`Error: ${err instanceof Error ? err.message : String(err)}`);
setStatus("Error");
}
};
const stopRecording = () => {
if (audioWorkletNodeRef.current) {
audioWorkletNodeRef.current.disconnect();
audioWorkletNodeRef.current = null;
}
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
if (streamRef.current) {
streamRef.current.getTracks().forEach((track) => track.stop());
streamRef.current = null;
}
if (websocketRef.current) {
websocketRef.current.close();
websocketRef.current = null;
}
setIsRecording(false);
setStatus("Stopped");
};
// Clean up on component unmount
  useEffect(() => {
    return () => {
      if (streamRef.current) {
        streamRef.current.getTracks().forEach((track) => track.stop());
      }
      if (audioContextRef.current) {
        // Also release the audio context so the worklet stops processing
        audioContextRef.current.close();
      }
      if (websocketRef.current) {
        websocketRef.current.close();
      }
    };
  }, []);
return (
<div className="flex flex-col items-center justify-center min-h-screen p-4 bg-gray-50">
<div className="w-full max-w-md p-6 bg-white rounded-lg shadow-md">
<h1 className="mb-6 text-2xl font-bold text-center">
Experimental Talk Page
</h1>
<div className="mb-6">
<div className="p-4 mb-4 text-sm bg-gray-100 rounded-lg">
<p className="font-semibold">
Status: <span className="font-normal">{status}</span>
</p>
{error && <p className="mt-2 text-red-500">{error}</p>}
</div>
<div className="flex justify-center">
<button
onClick={isRecording ? stopRecording : startRecording}
className={`flex items-center justify-center w-16 h-16 rounded-full focus:outline-none ${
isRecording
? "bg-red-500 hover:bg-red-600"
: "bg-blue-500 hover:bg-blue-600"
}`}
>
{isRecording ? (
<svg
xmlns="http://www.w3.org/2000/svg"
className="w-8 h-8 text-white"
viewBox="0 0 20 20"
fill="currentColor"
>
<rect x="6" y="6" width="8" height="8" fill="white" />
</svg>
) : (
<svg
xmlns="http://www.w3.org/2000/svg"
className="w-8 h-8 text-white"
viewBox="0 0 20 20"
fill="currentColor"
>
<path
fillRule="evenodd"
d="M10 18a8 8 0 100-16 8 8 0 000 16zM9.555 7.168A1 1 0 008 8v4a1 1 0 001.555.832l3-2a1 1 0 000-1.664l-3-2z"
clipRule="evenodd"
/>
</svg>
)}
</button>
</div>
</div>
<div className="p-4 bg-gray-100 rounded-lg">
<h2 className="mb-2 text-lg font-semibold">Transcription:</h2>
<div className="p-3 bg-white border rounded-md min-h-[100px]">
{transcription || (
<span className="text-gray-400">
Speak to see transcription here...
</span>
)}
</div>
</div>
</div>
</div>
);
}