I don't know if it helps, but this WebSocket implementation seems to work for me:
import { getToken } from './backend.r.js'

async function getWS() {
  const EPHEMERAL_KEY = await getToken()
  const ws = new WebSocket(
    'wss://api.openai.com/v1/realtime?intent=transcription',
    [
      'realtime',
      // Auth
      'openai-insecure-api-key.' + EPHEMERAL_KEY,
      // Optional
      'openai-organization.' + 'org-xxx',
      'openai-project.' + 'proj_xxx',
      // Beta protocol, required
      'openai-beta.realtime-v1',
    ],
  )
  ws.addEventListener('error', (error) => {
    console.error('WebSocket error:', error)
  })
  ws.addEventListener('message', (evt) => {
    console.log(evt.data)
    if (typeof evt.data !== 'string') return
    const deltaType = 'conversation.item.input_audio_transcription.delta'
    // Cheap string pre-filter so we don't JSON.parse every event
    if (!evt.data.includes(deltaType)) return
    const data = JSON.parse(evt.data)
    if (data.type !== deltaType) return
    document.body.textContent += data.delta
  })
  await new Promise((resolve) => ws.addEventListener('open', resolve, { once: true }))
  ws.send(
    JSON.stringify({
      type: 'transcription_session.update',
      session: {
        input_audio_transcription: {
          model: 'gpt-4o-transcribe',
        },
      },
    }),
  )
  return ws
}
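For reference, getToken just needs to hand the browser a short-lived client secret minted server-side with your real API key. A rough sketch of what ./backend.r.js could boil down to, assuming a Node/Express backend and the beta transcription-sessions REST endpoint (the /token route and response shape are mine, so check the current docs):

// server side (Node/Express), hypothetical /token route
import express from 'express'

const app = express()
app.get('/token', async (_req, res) => {
  const r = await fetch('https://api.openai.com/v1/realtime/transcription_sessions', {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      'OpenAI-Beta': 'realtime=v1',
      'Content-Type': 'application/json',
    },
    body: '{}',
  })
  const session = await r.json()
  // client_secret.value is the short-lived key the browser can safely use
  res.json({ token: session.client_secret.value })
})
app.listen(3000)

// client side: what getToken could look like
export async function getToken() {
  const res = await fetch('/token')
  const { token } = await res.json()
  return token
}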
const audioWorkletProcessorCode = `
class PCMProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.sampleRate = 24000; // 24kHz sample rate
    this.chunkSize = this.sampleRate * 0.1; // 100ms worth of samples (2400 samples)
    this.buffer = []; // Buffer to accumulate audio samples
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0];
    if (input && input[0]) {
      const float32Data = input[0];
      // Accumulate samples in the buffer
      this.buffer.push(...float32Data);
      // When the buffer reaches the chunk size, process and send
      while (this.buffer.length >= this.chunkSize) {
        const chunk = this.buffer.slice(0, this.chunkSize); // Take 100ms worth of samples
        this.buffer = this.buffer.slice(this.chunkSize); // Remove processed samples from the buffer
        // Convert Float32 to Int16
        const int16Buffer = new Int16Array(chunk.length);
        for (let i = 0; i < chunk.length; i++) {
          int16Buffer[i] = Math.max(-1, Math.min(1, chunk[i])) * 0x7fff;
        }
        // Post to the main thread (transfer the buffer, no copy)
        this.port.postMessage(int16Buffer.buffer, [int16Buffer.buffer]);
      }
    }
    return true; // Keep the processor alive
  }
}
registerProcessor('pcm-processor', PCMProcessor);
`
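If you want to convince yourself the Float32 to Int16 conversion behaves (including clipping of out-of-range samples), it's easy to check outside the worklet:

// Quick sanity check of the conversion used in the worklet
const samples = new Float32Array([0, 0.5, 1, -1, 1.5]) // 1.5 is out of range
const out = new Int16Array(samples.length)
for (let i = 0; i < samples.length; i++) {
  out[i] = Math.max(-1, Math.min(1, samples[i])) * 0x7fff
}
console.log(Array.from(out)) // [0, 16383, 32767, -32767, 32767] - the 1.5 got clipped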
export async function main() {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: { sampleRate: 24000, channelCount: 1 },
  })
  const audioContext = new AudioContext({ sampleRate: 24000 })
  // Load the worklet from an inline Blob so no separate file is needed
  const blob = new Blob([audioWorkletProcessorCode], {
    type: 'application/javascript',
  })
  const workletURL = URL.createObjectURL(blob)
  await audioContext.audioWorklet.addModule(workletURL)
  const source = audioContext.createMediaStreamSource(stream)
  const pcmProcessor = new AudioWorkletNode(audioContext, 'pcm-processor')
  const ws = await getWS()
  pcmProcessor.port.onmessage = (event) => {
    // event.data is an ArrayBuffer of little-endian PCM16.
    // Buffer is Node-only, so base64-encode with browser APIs instead.
    const bytes = new Uint8Array(event.data)
    let binary = ''
    for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i])
    const audio = btoa(binary)
    ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio }))
    console.log('100ms audio chunk sent')
  }
  source.connect(pcmProcessor)
  // The processor produces no output; this just keeps the graph pulled
  pcmProcessor.connect(audioContext.destination)
}
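One gotcha: browsers only let an AudioContext start (and prompt for the mic) after a user gesture, so I kick everything off from a click rather than on page load:

// start from a user gesture, e.g. a button
const btn = document.createElement('button')
btn.textContent = 'Start transcribing'
btn.onclick = () => main().catch(console.error)
document.body.append(btn)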