WebRTC transcription guide seems to be broken

I don't know if it helps, but this WebSocket implementation seems to work for me:

import { getToken } from './backend.r.js'

async function getWS() {
  const EPHEMERAL_KEY = await getToken()

  const ws = new WebSocket(
    'wss://api.openai.com/v1/realtime?intent=transcription',
    [
      'realtime',
      // Auth
      'openai-insecure-api-key.' + EPHEMERAL_KEY,
      // Optional
      'openai-organization.' + 'org-xxx',
      'openai-project.' + 'proj_xxx',
      // Beta protocol, required
      'openai-beta.realtime-v1',
    ],
  )

  ws.addEventListener('error', (error) => {
    console.error('WebSocket error:', error)
  })

  ws.addEventListener('message', (evt) => {
    console.log(evt.data)

    if (typeof evt.data !== 'string') return

    const deltaType = 'conversation.item.input_audio_transcription.delta'
    const isDelta = evt.data.includes(deltaType)
    if (!isDelta) return

    const data = JSON.parse(evt.data)
    if (data.type !== deltaType) return

    document.body.textContent += data.delta
  })

  await new Promise((resolve) => ws.addEventListener('open', resolve, { once: true }))

  ws.send(
    JSON.stringify({
      type: 'transcription_session.update',
      session: {
        // input_audio_format defaults to 'pcm16' at 24kHz mono, which matches
        // the worklet below
        input_audio_transcription: {
          model: 'gpt-4o-transcribe',
          // Optional fields such as 'language' and 'prompt' can also go here
        },
      },
    }),
  )

  return ws
}
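
The getToken import at the top isn't shown anywhere, so here is a minimal sketch of both halves, assuming an Express backend. The '/token' route name and file layout are made up, and the transcription_sessions endpoint is the one the Realtime docs describe for minting ephemeral keys, so double-check it against the current API reference:

// backend.r.js — client-side helper matching the import above
export async function getToken() {
  const res = await fetch('/token') // '/token' is a hypothetical route on your own server
  const { client_secret } = await res.json()
  return client_secret.value
}

// server.js — keeps the real API key out of the browser
import express from 'express'

const app = express()

app.get('/token', async (req, res) => {
  // Mint a short-lived ephemeral key for the browser to use
  const r = await fetch(
    'https://api.openai.com/v1/realtime/transcription_sessions',
    {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
        'Content-Type': 'application/json',
      },
      body: '{}',
    },
  )
  res.json(await r.json())
})

app.listen(3000)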

const audioWorkletProcessorCode = `
class PCMProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.sampleRate = 24000; // 24kHz sample rate
    this.chunkSize = this.sampleRate * 0.1; // 100ms worth of samples (2400 samples)
    this.buffer = []; // Buffer to accumulate audio samples
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0];
    if (input && input[0]) {
      const float32Data = input[0];

      // Accumulate samples in the buffer
      this.buffer.push(...float32Data);

      // When the buffer reaches the chunk size, process and send
      while (this.buffer.length >= this.chunkSize) {
        const chunk = this.buffer.slice(0, this.chunkSize); // Take 100ms worth of samples
        this.buffer = this.buffer.slice(this.chunkSize); // Remove processed samples from the buffer

        // Convert Float32 to Int16
        const int16Buffer = new Int16Array(chunk.length);
        for (let i = 0; i < chunk.length; i++) {
          int16Buffer[i] = Math.max(-1, Math.min(1, chunk[i])) * 0x7fff;
        }

        // Post to the main thread
        this.port.postMessage(int16Buffer.buffer, [int16Buffer.buffer]);
      }
    }

    return true; // Keep the processor alive
  }
}

registerProcessor('pcm-processor', PCMProcessor);
`

export async function main() {
  // Left over from the WebRTC guide; nothing in this transcription-only flow
  // attaches a track to it, so it can be dropped
  const audioEl = document.createElement('audio')
  audioEl.autoplay = true

  const stream = await navigator.mediaDevices.getUserMedia({
    audio: { sampleRate: 24000, channelCount: 1 },
  })

  const audioContext = new AudioContext({ sampleRate: 24000 })

  const blob = new Blob([audioWorkletProcessorCode], {
    type: 'application/javascript',
  })

  const workletURL = URL.createObjectURL(blob)
  await audioContext.audioWorklet.addModule(workletURL)

  const source = audioContext.createMediaStreamSource(stream)
  const pcmProcessor = new AudioWorkletNode(audioContext, 'pcm-processor')

  const ws = await getWS()

  pcmProcessor.port.onmessage = (event) => {
    // event.data is the transferred ArrayBuffer of PCM16 samples
    const bytes = new Uint8Array(event.data)

    // Buffer is Node-only; in the browser, base64-encode via a binary string + btoa
    let binary = ''
    for (let i = 0; i < bytes.length; i++) {
      binary += String.fromCharCode(bytes[i])
    }
    const audio = btoa(binary)

    if (ws.readyState !== WebSocket.OPEN) return

    ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio }))
    console.log('100ms audio chunk sent')
  }

  source.connect(pcmProcessor)
  // The worklet never writes to its outputs, so this connection just keeps
  // the audio graph pulling samples; it produces silence
  pcmProcessor.connect(audioContext.destination)
}
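
One more note for anyone copy-pasting: getUserMedia and AudioContext generally need a user gesture, so I'd call main() from a click handler rather than on page load. A small sketch (the file name is made up):

// index.js — start transcription on a user gesture
import { main } from './main.js'

const button = document.createElement('button')
button.textContent = 'Start transcription'
button.addEventListener('click', () => {
  button.disabled = true
  main().catch((err) => {
    console.error('Failed to start transcription:', err)
    button.disabled = false
  })
})
document.body.append(button)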
