I am using the OpenAI Realtime API in transcription mode over WebRTC. I get correct transcripts in the normal case, but if I send an audio buffer over the data channel using the "input_audio_buffer.append" message type, the transcript I get back is wrong. Am I doing something wrong? I have attached all the related code below.
Session creation:
// Create an OpenAI Realtime transcription session.
// Fixes: (1) the HTTP status was never checked, so a failed session creation
// (bad key, bad body) silently produced a useless `response`; (2) `"include": null`
// was sent — the API expects `include` to be an array of strings, so the field is
// now omitted entirely rather than sent as null.
const response = await fetch("https://api.openai.com/v1/realtime/transcription_sessions", {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`
  },
  body: JSON.stringify({
    "input_audio_transcription": {
      "model": dictationModel,
      "prompt": dictationPrompt,
      "language": "en",
    },
    // Server-side voice activity detection segments the audio into turns.
    "turn_detection": {
      "type": "server_vad",
      "threshold": 0.5,
      "prefix_padding_ms": 300,
      "silence_duration_ms": 500
    },
    "input_audio_noise_reduction": {
      "type": "near_field"
    }
    // `include` omitted — pass an array such as
    // ["item.input_audio_transcription.logprobs"] only when extra fields are needed.
  })
});
if (!response.ok) {
  // Surface the API's error payload instead of continuing with a dead session.
  throw new Error(`Failed to create transcription session: ${response.status} ${await response.text()}`);
}
Start capturing audio:
// Capture microphone audio and pre-buffer it as Float32 frames until the data
// channel is ready to receive it.
const stream = await navigator.mediaDevices.getUserMedia({
audio: true, // { sampleRate: 24000, channelCount: 1, echoCancellation: true }, // 24kHz, mono for OpenAI
});
// Force the context to 24 kHz so captured frames match the rate the Realtime
// session expects for PCM16 input.
// NOTE(review): browsers differ when the hardware cannot run at 24 kHz — some
// resample, some throw. Verify audioContextRef.current.sampleRate is actually
// 24000 at runtime; a mismatch here would make the appended audio play at the
// wrong speed server-side and is a plausible cause of garbled transcripts.
audioContextRef.current = new AudioContext({ sampleRate: 24000 });
// ScriptProcessorNode is deprecated (AudioWorklet is the modern replacement)
// but still widely supported. 2048 frames ≈ 85 ms of audio at 24 kHz.
processorRef.current = audioContextRef.current.createScriptProcessor(
2048, // buffer size in sample-frames per onaudioprocess callback
1, // input channels (mono)
1 // output channels (mono)
);
processorRef.current.onaudioprocess = (event) => {
// Drop frames unless pre-capture is active.
if (!isPreCapturing.current) return;
const audioData = event.inputBuffer.getChannelData(0);
// Copy the samples: the underlying buffer is reused by the audio pipeline
// between callbacks, so pushing `audioData` directly would alias stale data.
preAudioBuffer.current.push(new Float32Array(audioData));
};
sourceRef.current =
audioContextRef.current.createMediaStreamSource(stream);
sourceRef.current.connect(processorRef.current);
// A ScriptProcessor must be connected onward for onaudioprocess to fire in some
// browsers. NOTE(review): connecting to `destination` also routes the mic to
// the speakers, which can cause audible feedback — consider routing through a
// zero-gain GainNode instead; confirm before changing.
processorRef.current.connect(audioContextRef.current.destination);
Send buffered audio after data channel open:
// Replay the pre-captured audio into the Realtime session once the data channel
// is open. Each chunk (~85 ms of mono Float32 audio) is converted to base64
// PCM16 by arrayBufferToBase64 and appended to the input audio buffer.
// NOTE(review): over WebRTC the negotiated microphone track may ALSO be
// streaming audio to the session; appended buffers then interleave with live
// track audio, which can produce a garbled transcript — confirm the mic track
// is disabled/muted while this replay runs.
const channel = dataChannelRef.current;
if (!channel || channel.readyState !== 'open') {
  // Guard: send() on a null or non-open channel throws and would abort the
  // replay partway through, leaving a truncated audio buffer server-side.
  throw new Error('Data channel is not open; cannot replay pre-captured audio');
}
for (const chunk of preAudioBuffer.current) {
  channel.send(
    JSON.stringify({
      type: 'input_audio_buffer.append',
      audio: arrayBufferToBase64(chunk),
    })
  );
  // Small delay between chunks to avoid overwhelming the channel's send queue.
  await new Promise((resolve) => setTimeout(resolve, 20));
}
Helpers:
/**
 * Convert normalized Float32 samples (expected range [-1, 1]) into 16-bit
 * little-endian PCM, the raw format the Realtime API accepts.
 *
 * Samples are clamped to [-1, 1] first; negatives scale by 0x8000 (so -1 maps
 * to -32768) and non-negatives by 0x7fff (so 1 maps to 32767).
 *
 * @param samples mono audio samples as produced by getChannelData(0)
 * @returns an ArrayBuffer of samples.length * 2 bytes (LE int16 per sample)
 */
const floatTo16BitPCM = (samples: Float32Array) => {
  const pcm = new ArrayBuffer(samples.length * 2);
  const view = new DataView(pcm);
  for (let i = 0; i < samples.length; i++) {
    const clamped = Math.max(-1, Math.min(1, samples[i]));
    // Explicit little-endian write keeps the byte order platform-independent.
    view.setInt16(i * 2, clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff, true);
  }
  return pcm;
}
/**
 * Base64-encode raw audio bytes for `input_audio_buffer.append`.
 *
 * Accepts a raw ArrayBuffer, an Int16Array of PCM16 samples, or a Float32Array
 * of normalized samples (converted to 16-bit little-endian PCM first).
 *
 * BUG FIX: the Int16Array branch previously used `arrayBuffer.buffer`, which is
 * the WHOLE underlying buffer — for a subarray/offset view this encoded bytes
 * outside the view (wrong audio, hence a wrong transcript). The view's
 * byteOffset/byteLength are now respected.
 *
 * @param arrayBuffer bytes or samples to encode
 * @returns base64 string of the little-endian PCM16 bytes
 */
const arrayBufferToBase64 = (arrayBuffer: ArrayBuffer | Int16Array | Float32Array) => {
  let bytes: Uint8Array;
  if (arrayBuffer instanceof Float32Array) {
    bytes = new Uint8Array(floatTo16BitPCM(arrayBuffer));
  } else if (arrayBuffer instanceof Int16Array) {
    // Honor the view's window into its buffer, not the whole buffer.
    bytes = new Uint8Array(arrayBuffer.buffer, arrayBuffer.byteOffset, arrayBuffer.byteLength);
  } else {
    bytes = new Uint8Array(arrayBuffer);
  }
  let binary = '';
  const chunkSize = 0x8000; // 32K chars per fromCharCode call — stays below engine argument limits
  for (let i = 0; i < bytes.length; i += chunkSize) {
    const chunk = bytes.subarray(i, i + chunkSize);
    binary += String.fromCharCode(...Array.from(chunk));
  }
  return btoa(binary);
}