Unrecognized file format error whisper BytesIO, can't write to disk

Hi I’m creating an API with transcription functionality and I don’t want to write to disk for obvious reasons. What am I doing wrong here?

async def upload_file(file: UploadFile = File(...), api_key: APIKey = Depends(get_api_key)):
    # Transcribe with Whisper
    try:
        audio_bytes = BytesIO(await file.read())
        transcript = await client.audio.transcriptions.create(
            model="whisper-1", 
            file=audio_bytes,
            response_format="vtt"
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        await file.close()
        audio_bytes.close()
    
    return {"transcription": transcript}

And I get the error:

{"detail":"Error code: 400 - {'error': {'message': \"Unrecognized file format. Supported formats: ['flac', 'm4a', 'mp3', 'mp4', 'mpeg', 'mpga', 'oga', 'ogg', 'wav', 'webm']\", 'type': 'invalid_request_error', 'param': None, 'code': None}}"}

I’ve tried different file formats and I’m uploading correctly.

1 Like

Went ahead and wrote to disk anyway, if anybody has a solution let me know, will be keeping an eye on this thread.

1 Like

Are you uploading a voice memo or recording from an iPhone or Android?
I had issues with both of those and was able to fix them.

If you can give some more info or post the file type that would help.

When I use the same files via Curl it works mate it’s not the files. Where you able to use BytesIO? And they also work when I write to disk in a tempfile, I just don’t like the approach.

I used pydub for my solution on the Samsung files, but I am still saving a tempfile. Not sure of any other way.

@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    print("request received")
    if 'audio' not in request.files:
        return jsonify({"error": "No audio file"}), 400

    audio_file = request.files['audio']

    try:
        # Save the uploaded .3gp file to a temporary file
        with tempfile.NamedTemporaryFile(suffix='.3gp', delete=False) as tmp_3gp:
            audio_file.save(tmp_3gp.name)
            tmp_3gp_path = tmp_3gp.name

        # Convert the .3gp file to .mp3 and save to another temporary file
        with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
            sound = AudioSegment.from_file(tmp_3gp_path, format="3gp")
            sound.export(tmp_mp3.name, format="mp3")
            tmp_mp3_path = tmp_mp3.name

        # Remove the original .3gp temporary file
        os.remove(tmp_3gp_path)

        # OpenAI API call with the converted .mp3 file
        with open(tmp_mp3_path, 'rb') as mp3_file:
            transcript = client.audio.transcriptions.create(
                model="whisper-1",
                file=mp3_file,
                language="en"
            )

        print("transcribed")
        print(transcript)
        return jsonify({"transcription": transcript.text})
    
    except Exception as e:
        traceback.print_exc()
        # Remove temporary files in case of an exception
        os.remove(tmp_3gp_path)
        os.remove(tmp_mp3_path)
        return jsonify({"error": str(e)}), 500
    finally:
        # Cleanup: Ensure temporary files are deleted
        if os.path.exists(tmp_3gp_path):
            os.remove(tmp_3gp_path)
        if os.path.exists(tmp_mp3_path):
            os.remove(tmp_mp3_path)

Thx for your help I’m just using shutil for now… But in future would like to get this resolved with an in memory solution. Anyway i gotta move on cause time is ticking and I need to build out a whole lot more then just this tiny part :D. It’s just for internal use anyway not going to see tons of traffic