Had all these issues as well. It looks like Whisper works best with a specific file format: mono channel, 16 kHz sample rate, and pcm_s16le encoding. You can convert your audio with these settings using ffmpeg, or use the Web Audio API directly in the browser. This is what worked for me:
/**
 * Convert an arbitrary audio File/Blob into the format Whisper expects:
 * mono, 16 kHz, 16-bit signed little-endian PCM in a WAV container.
 *
 * @param file - Source audio (any container/codec the browser can decode).
 * @returns A Blob with MIME type "audio/wav" holding mono 16 kHz PCM data.
 * @throws If the browser cannot decode the input audio
 *   (`decodeAudioData` rejects).
 */
async function convertAudioToMono(file: File | Blob): Promise<Blob> {
  // Decode at 16 kHz so the browser resamples during decode.
  const audioContext = new AudioContext({ sampleRate: 16000 });
  let audioBuffer: AudioBuffer;
  try {
    const arrayBuffer = await file.arrayBuffer();
    audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
  } finally {
    // Browsers limit the number of live AudioContexts; close this one so
    // repeated conversions don't leak contexts until creation fails.
    void audioContext.close();
  }

  // Mix down to one channel by rendering through a single-channel
  // offline context at the target sample rate.
  const offlineContext = new OfflineAudioContext(1, audioBuffer.length, 16000);
  const source = offlineContext.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(offlineContext.destination);
  source.start();
  const renderedBuffer = await offlineContext.startRendering();

  // Build the WAV file: 44-byte RIFF header + 16-bit PCM payload.
  const dataLength = renderedBuffer.length * 2; // 2 bytes per sample
  const buffer = new ArrayBuffer(44 + dataLength);
  const view = new DataView(buffer);

  // Write an ASCII tag byte-by-byte into the header.
  const writeString = (v: DataView, offset: number, s: string) => {
    for (let i = 0; i < s.length; i++) {
      v.setUint8(offset + i, s.charCodeAt(i));
    }
  };

  writeString(view, 0, "RIFF");
  view.setUint32(4, 36 + dataLength, true); // RIFF chunk size
  writeString(view, 8, "WAVE");
  writeString(view, 12, "fmt ");
  view.setUint32(16, 16, true); // fmt sub-chunk size (16 for PCM)
  view.setUint16(20, 1, true); // audio format: 1 = linear PCM
  view.setUint16(22, 1, true); // channel count: mono
  view.setUint32(24, 16000, true); // sample rate
  view.setUint32(28, 32000, true); // byte rate = 16000 Hz * 1 ch * 2 bytes
  view.setUint16(32, 2, true); // block align = channels * bytes per sample
  view.setUint16(34, 16, true); // bits per sample
  writeString(view, 36, "data");
  view.setUint32(40, dataLength, true); // data sub-chunk size

  // Clamp each float sample to [-1, 1] and scale to signed 16-bit.
  // getChannelData already returns a Float32Array — no copy needed.
  const samples = renderedBuffer.getChannelData(0);
  let offset = 44;
  for (let i = 0; i < samples.length; i++) {
    const sample = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(offset, sample < 0 ? sample * 0x8000 : sample * 0x7fff, true);
    offset += 2;
  }

  return new Blob([buffer], { type: "audio/wav" });
}