Realtime: Recording to a stream from browser

Has anyone managed to record to a stream in the browser, i.e. using JavaScript? I’m sending the audio like this:

mediaRecorder.ondataavailable = async (event) => {
  const audioBlob = event.data
  const arrayBuffer = await audioBlob.arrayBuffer()
  ws.value.send(arrayBuffer)
  console.log('Sending audio chunk to server...')
}

But it doesn’t seem to be a valid format. I get back:
Invalid 'audio'. Expected base64-encoded audio bytes (mono PCM16 at 24kHz) but got an invalid value.
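
If I read that right, the problem is that MediaRecorder emits webm/opus chunks, while the API wants raw little-endian 16-bit mono samples at 24kHz, base64-encoded. My understanding is the payload eventually has to look roughly like this (just a sketch based on the error message — sendPcmChunk and pcm16Buffer are placeholders, and it assumes the socket talks to the Realtime API directly):

// Sketch: base64-encode a PCM16 ArrayBuffer and append it to the input buffer
// (assumes pcm16Buffer already holds mono 16-bit samples at 24 kHz)
function sendPcmChunk(ws, pcm16Buffer) {
  const bytes = new Uint8Array(pcm16Buffer)
  let binary = ''
  for (let i = 0; i < bytes.length; i++) binary += String.fromCharCode(bytes[i])
  ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: btoa(binary) }))
}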

Has anyone got working code that can be used in JS (Vue) to record to a stream? I know my backend works, since I have everything running well in my native Flutter app.

Yes, I did… I’ll go through my old code… will post it here once I find it :smiley:

omg… that was back in the jQuery days…

import $ from "jquery";

// Note: the AudioWorkletProcessor below (and the registerProcessor call at the very
// bottom) runs in the audio worklet scope, so it really belongs in its own module
// loaded via audioContext.audioWorklet.addModule(), separate from the jQuery widget.
// ($.widget also requires jQuery UI, not just jQuery.)
class AudioProcessor extends AudioWorkletProcessor {
    constructor() {
        super();
    }
    process(inputs, outputs) {
        const input = inputs[0];
        const output = outputs[0];

        // Pass the input straight through to the output
        for (let channel = 0; channel < input.length; ++channel) {
            output[channel].set(input[channel]);
        }

        // Post (part of) the first channel's raw Float32 samples to the main thread
        if (input[0].length > 0) {
            const bufferData = input[0].slice(0, input[0].length / 2);
            this.port.postMessage(bufferData);
        }

        return true;
    }
}


$.widget("custom.audioRecorder", {
    options: {
        recordButton: "#recordButton",
        decibelLevel: "#decibelLevel",
        numberOfCanvases: 6,
        silenceTimeout: 3000,
        wordEndThreshold: 10,
        canvasIdPrefix: "canvas"
    },

    _create: function() {
        this.isRecording = false;
        this.audioContext = null;
        this.mediaRecorder = null;
        this.audioChunks = [];
        this.canvases = [];
        this.canvasContexts = [];
        this._initializeCanvasElements();

        $(this.options.recordButton).on("click", this._toggleRecording.bind(this));
    },

    _initializeCanvasElements: function() {
        for (let i = 1; i <= this.options.numberOfCanvases; i++) {
            const canvasElement = document.getElementById(`${this.options.canvasIdPrefix}${i}`);
            this.canvases.push(canvasElement);
            this.canvasContexts.push(canvasElement.getContext("2d"));
        }
    },

    _toggleRecording: async function() {
        if (this.isRecording) {
            this._stopRecording();
            $(this.options.recordButton).html('<i class="fas fa-microphone"></i> Aufnahme starten'); // "Start recording"
            $(this.options.decibelLevel).addClass("hidden");
        } else {
            await this._startRecording();
            $(this.options.recordButton).html('<i class="fas fa-microphone-slash"></i> Aufnahme stoppen'); // "Stop recording"
            $(this.options.decibelLevel).removeClass("hidden");
        }
        this.isRecording = !this.isRecording;
    },

    _startRecording: async function() {
        this.audioContext = new AudioContext();
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        this._setupMediaRecorder(stream);
        this._setupAudioAnalyser(stream);
    },

    _setupMediaRecorder: function(stream) {
        this.mediaRecorder = new MediaRecorder(stream);
        this.mediaRecorder.start(this.options.silenceTimeout);

        this.mediaRecorder.addEventListener("dataavailable", (e) => {
            this.audioChunks.push(e.data);
        });

        this.mediaRecorder.addEventListener("stop", async () => {
            await this._handleDataTransfer();
            if (this.isRecording) {
                this.mediaRecorder.start(this.options.silenceTimeout);
            }
        });
    },

    _setupAudioAnalyser: function(stream) {
        const source = this.audioContext.createMediaStreamSource(stream);
        const analyser = this.audioContext.createAnalyser();
        analyser.fftSize = 2048;
        source.connect(analyser);

        this._visualizeAudio(analyser);
        this._updateDecibelLevel(analyser);
    },

    _stopRecording: function() {
        if (this.mediaRecorder && this.mediaRecorder.state !== 'inactive') {
            this.mediaRecorder.stop();
        }
        if (this.audioContext) {
            this.audioContext.close();
            this.audioContext = null;
        }
    },

    _handleDataTransfer: async function() {
        if (this.audioChunks.length > 0) {
            const wavFile = await this._createWavFile(this.audioChunks);
            await this._sendWavFile(wavFile);
            this.audioChunks = [];
        }
    },

    _sendWavFile: async function(wavFile) {
        const response = await fetch('/sound/save', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/octet-stream',
            },
            body: wavFile.buffer,
        });

        if (response.ok) {
            console.log('WAV file sent successfully');
        } else {
            console.error('Failed to send WAV file:', response.statusText);
        }
    },

    _createWavFile: async function(audioChunks) {
        // Note: MediaRecorder chunks are really webm/opus data despite the
        // 'audio/wav' type label; the shell script below turns them into real WAV files.
        const audioBuffer = new Blob(audioChunks, { type: 'audio/wav' });
        const reader = new FileReader();
        reader.readAsArrayBuffer(audioBuffer);
        return new Promise((resolve) => {
            reader.onload = () => {
                resolve(new Uint8Array(reader.result));
            };
        });
    },

    _visualizeAudio: function(analyser) {
        const bufferLength = analyser.frequencyBinCount;
        const dataArray = new Uint8Array(bufferLength);

        for (let i = 0; i < this.options.numberOfCanvases; i++) {
            this.canvases[i].width = 30;
            this.canvases[i].height = 100;
        }

        const draw = () => {
            if (!this.isRecording) {
                return;
            }

            analyser.getByteFrequencyData(dataArray);

            for (let i = 0; i < this.options.numberOfCanvases; i++) {
                const canvasContext = this.canvasContexts[i];
                canvasContext.clearRect(0, 0, 30, 100);

                const barHeight = dataArray[i] * 0.5;
                canvasContext.fillStyle = 'rgb(' + (barHeight+100) + ',50,50)';
                canvasContext.fillRect(0, 100 - barHeight, 30, barHeight);
            }

            requestAnimationFrame(draw);
        }

        draw();
    },

    _updateDecibelLevel: function(analyser) {
        const dataArray = new Uint8Array(analyser.frequencyBinCount);
        let inWord = false;
        let lastSoundTime = Date.now();

        const update = () => {
            if (!this.isRecording) {
                return;
            }

            analyser.getByteFrequencyData(dataArray);
            let sum = dataArray.reduce((a, b) => a + b, 0);
            let average = sum / dataArray.length;

            if (average > this.options.wordEndThreshold) {
                lastSoundTime = Date.now();
                inWord = true;
            } else if (inWord && Date.now() - lastSoundTime >= this.options.silenceTimeout) {
                inWord = false;
                this._handleDataTransfer().then(r => console.log(r));
            }

            $(this.options.decibelLevel).text(`Decibel Level: ${average}`);

            requestAnimationFrame(update);
        }

        update();
    }
});
$(document).ready(function() {
    $("body").audioRecorder();
});

registerProcessor('audio-processor', AudioProcessor);
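
(For completeness: to actually use that processor, the AudioWorkletProcessor part has to live in its own file and be loaded from the main-thread code — roughly like the sketch below; the file name audio-processor.js is an assumption, it is not in the original code.)

// Sketch: load the worklet module and feed it the microphone stream
// (run inside an async function or a <script type="module">)
const audioContext = new AudioContext();
await audioContext.audioWorklet.addModule('audio-processor.js'); // assumed file name
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const source = audioContext.createMediaStreamSource(stream);
const workletNode = new AudioWorkletNode(audioContext, 'audio-processor');
workletNode.port.onmessage = (event) => {
  // event.data is the Float32 slice posted from AudioProcessor.process()
  console.log('Got audio data from the worklet:', event.data.length);
};
source.connect(workletNode);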

and then I had a shell script…

#!/bin/bash

# Iterate over each webm file starting with "recording_"
for file in recording_*.webm; do
    # Convert .webm to .wav
    OUTPUT_WAV="${file%.webm}.wav"
    ffmpeg -i "$file" -acodec pcm_s16le -ac 1 -ar 44100 "$OUTPUT_WAV"

    # Extract silence start and end times
    SILENCE_OUTPUT=$(ffmpeg -i "$OUTPUT_WAV" -af silencedetect=n=-30dB:d=0.5 -f null - 2>&1)

    # Print the entire SILENCE_OUTPUT for debugging
    echo "$SILENCE_OUTPUT"

    # Extract silence times
    FIRST_SILENCE_END=$(echo "$SILENCE_OUTPUT" | grep "silence_end" | awk -F': ' '{print $2}' | awk -F' \\|' '{print $1}' | head -1)
    SECOND_SILENCE_START=$(echo "$SILENCE_OUTPUT" | grep "silence_start" | awk -F': ' '{print $2}' | tail -1)

    # Debug
    echo "Debug: FIRST_SILENCE_END=$FIRST_SILENCE_END"
    echo "Debug: SECOND_SILENCE_START=$SECOND_SILENCE_START"

    # Calculate duration of non-silent part
    if [ "$FIRST_SILENCE_END" ] && [ "$SECOND_SILENCE_START" ]; then
        DURATION=$(echo "$SECOND_SILENCE_START - $FIRST_SILENCE_END" | bc)
    else
        DURATION=0
    fi

    # Debug
    echo "Debug: DURATION=$DURATION"

    # Decide on how to process file
    if [[ "$DURATION" == "0" ]]; then
        # If there's no silence or silence is less than 0.5 seconds, just copy the file with clean_ prefix
        cp "$file" "clean_$file"
    else
        # Extract non-silent part
        ffmpeg -i "$file" -ss "$FIRST_SILENCE_END" -t "$DURATION" "clean_$file"
    fi
done
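
Side note: if the goal were the 24 kHz mono PCM16 the Realtime API asks for, the conversion step would just use a different sample rate than the 44100 above, e.g.:

ffmpeg -i recording.webm -acodec pcm_s16le -ac 1 -ar 24000 recording.wav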

Maybe you can use something from it, or at least get some ideas from it…

I think it also did some sort of frequency analysis… to display pitch and volume or something…

I was working on speaker diarization in JavaScript for a couple of days, but gave up at some point after realizing how much work that would be…

Thank you! I managed to solve it as well, thanks to Copilot.

  1. Created a file pcm-processor.js:
class PCMProcessor extends AudioWorkletProcessor {
  constructor() {
    super()
    this.isRecording = true
    this.buffer = []
    this.bufferSize = 24000 // 24000 bytes = 12000 samples ≈ 0.5 s at 24 kHz mono PCM16; adjust as needed

    this.port.onmessage = (event) => {
      if (event.data === 'STOP') {
        this.isRecording = false
        this.flush()
      }
    }
  }

  flush() {
    if (this.buffer.length > 0) {
      const finalBuffer = new Uint8Array(this.buffer)
      this.port.postMessage(finalBuffer.buffer)
      this.buffer = []
    }
  }

  process(inputs, outputs, parameters) {
    if (!this.isRecording) {
      this.flush()
      return false // Stop processing
    }
    // (outputs and parameters are unused here)

    const input = inputs[0]
    if (input.length > 0) {
      const inputChannel = input[0]
      for (let i = 0; i < inputChannel.length; i++) {
        const sample = Math.max(-1, Math.min(1, inputChannel[i]))
        const intSample = sample < 0 ? sample * 0x8000 : sample * 0x7fff
        this.buffer.push(intSample & 0xff)
        this.buffer.push((intSample >> 8) & 0xff)

        if (this.buffer.length >= this.bufferSize) {
          const outputBuffer = new Uint8Array(this.buffer)
          this.port.postMessage(outputBuffer.buffer)
          this.buffer = []
        }
      }
    }
    return true
  }
}

registerProcessor('pcm-processor', PCMProcessor)

  2. Then in my Vue composable:

const startRecording = async () => {
    if (ws.value && isConnected.value) {
      // Get microphone access
      try {
        stream = await navigator.mediaDevices.getUserMedia({ audio: true })
        audioContext = new AudioContext({ sampleRate: 24000 }) // Ensure 24kHz sample rate

        // Register the AudioWorkletProcessor
        await audioContext.audioWorklet.addModule('/src/audio/pcm-processor.js')

        mediaStreamSource = audioContext.createMediaStreamSource(stream)
        pcmNode = new AudioWorkletNode(audioContext, 'pcm-processor')

        pcmNode.port.onmessage = (event) => {
          ws.value.send(event.data)
          console.log('Sending audio chunk to server...')
        }

        mediaStreamSource.connect(pcmNode)
        pcmNode.connect(audioContext.destination)

        console.log('Recording started...')
        isRecording.value = true
      } catch (err) {
        console.error('Error accessing microphone:', err)
      }
    } else {
      console.error('WebSocket is not connected or invalid')
    }
  }

  const stopRecording = () => {
    console.log('Recording stopping...')
    isRecording.value = false
    const data = { event: 'STOP_SPEAKING' }
    ws.value.send(JSON.stringify(data))
    console.log('Recording stopped.')

    if (mediaStreamSource) {
      mediaStreamSource.disconnect()
    }

    if (pcmNode) {
      pcmNode.port.postMessage('STOP') // Send STOP message to the processor
      pcmNode.port.onmessage = null // Remove the event listener
      pcmNode.disconnect()
    }

    if (audioContext) {
      audioContext.close().then(() => {
        console.log('Audio context closed.')
      })
    }

    if (stream) {
      stream.getTracks().forEach((track) => track.stop())
    }

  }
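
In case it helps, this is roughly how it gets wired into a component (the composable name and import path are just placeholders for my setup):

<script setup>
// Sketch: toggle recording from a button, assuming the composable exposes
// isRecording, startRecording and stopRecording (names as in the code above)
import { useRealtimeAudio } from '@/composables/useRealtimeAudio' // hypothetical name/path
const { isRecording, startRecording, stopRecording } = useRealtimeAudio()

const toggle = () => (isRecording.value ? stopRecording() : startRecording())
</script>

<template>
  <button @click="toggle">
    {{ isRecording ? 'Stop recording' : 'Start recording' }}
  </button>
</template>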