I am facing a problem transcribing audio (a blob) to text.

I want to convert audio to text with the Whisper model, but I can't get it to work.

class TextView(APIView):
    def post(self, request, *args, **kwargs):
        if 'audio_file' in request.data:
            audio_blob = request.data.get('audio_file')
            input_text = self.generate_text(audio_blob)
            return input_text

    def generate_text(self, inp_audio):
        model_id = 'whisper-1'
        response = openai.Audio.transcribe(
            api_key=config('OPENAI_API_KEY'),
            model=model_id,
            file=inp_audio
        )
        return response['text']

When I use the process_audio_blob function below, I get accurate results.

def process_audio_blob(self, audio_blob):
    recognizer = sr.Recognizer()
    audio_bytes = audio_blob.read()
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
    wav_data = audio_segment.export(format="wav").read()
    try:
        with io.BytesIO(wav_data) as wav_io:
            audio_data = sr.AudioData(
                wav_io.read(),
                sample_rate=audio_segment.frame_rate,
                sample_width=audio_segment.sample_width,
            )
        text = recognizer.recognize_google(audio_data, language='en-US')
        return text
    except:
        return 'None'

Following the same pattern as process_audio_blob, I tried calling Whisper as shown below, but it still doesn't work: this function can't generate any text. (A note on the likely cause follows the code.)

def generate_text(self, audio_blob):
    model_id='whisper-1'
    audio_bytes = audio_blob.read()
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
    wav_data = audio_segment.export(format="wav").read()
    try:
        with io.BytesIO(wav_data) as wav_io:
            audio_data = sr.AudioData(
                wav_io.read(),
                sample_rate=audio_segment.frame_rate,
                sample_width=audio_segment.sample_width,
            )
        response = openai.Audio.transcribe(
            api_key=config('OPENAI_API_KEY'),
            model=model_id,
            file=audio_data
        )
        print(response['text'])
        return response['text']
    except Exception as e:  # Catch any exception related to transcription
        print(f"Error during transcription: {e}")
        return None
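
The likely cause is the file argument: sr.AudioData is a speech_recognition container object, not a file, so the openai library has nothing it can upload. Below is a minimal drop-in sketch of generate_text, assuming the legacy 0.x openai module and that config comes from python-decouple (as the snippets above suggest); the name attribute on the BytesIO is only a filename hint so the upload is recognized as WAV.

def generate_text(self, audio_blob):
    model_id = 'whisper-1'
    # Decode whatever the browser sent and re-encode it as WAV in memory.
    audio_bytes = audio_blob.read()
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
    wav_io = io.BytesIO()
    audio_segment.export(wav_io, format="wav")
    wav_io.seek(0)
    wav_io.name = "audio.wav"  # filename hint; without it the format may not be detected
    try:
        # Pass the in-memory WAV file itself, not an sr.AudioData object.
        response = openai.Audio.transcribe(
            api_key=config('OPENAI_API_KEY'),
            model=model_id,
            file=wav_io,
        )
        return response['text']
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None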

I recorded the audio with "react-voice-visualizer"; here is the recording code:

import React, { useEffect } from "react";
import { useVoiceVisualizer, VoiceVisualizer } from "react-voice-visualizer";
import Product from "services/Product";

const Record = ({
setLoading,
setSpeech,
setSearchedKeyWord,
setProductList,
}) => {
const recorderControls = useVoiceVisualizer();
const { stopRecording, recordedBlob, error, audioRef } = recorderControls;

useEffect(() => {
    const sendAudioToAPI = async () => {
        stopRecording();
        if (recordedBlob) {
            setLoading(true);
            try {
                let formData = new FormData();
                formData.append("audio_file", recordedBlob);
                const response = await Product.audioProduct(formData);

                if (response.status === 200) {
                    setLoading(false);
                    setSpeech(response?.input_text);
                    // setSearchedKeyWord(response?.products);
                    setProductList(response?.results);
                    console.log("Audio sent successfully!");
                } else {
                    setProductList([]);
                    setLoading(false);
                    console.error("Failed to send audio.");
                }
            } catch (error) {
                console.error("Error sending audio:", error);
            }
        }
    };
    if (recordedBlob) {
        sendAudioToAPI();
    }
    if (error) {
        console.log("Error:", error);
    }
}, [recordedBlob]);

// useEffect(() => {
//   let interval = setTimeout(() => {
//     stopRecording();
//   }, 30000);
//   return () => clearTimeout(interval);
// }, []);

return (
    <div className='w-full relative'>
        <div className=''>
            <VoiceVisualizer
                // onlyRecording={true}
                // backgroundColor="white"
                mainBarColor='green'
                secondaryBarColor='red'
                controls={recorderControls}
                speed={5}
                ref={audioRef}
                // animateCurrentPick={true}
                defaultAudioWaveIconColor='green'
                defaultMicrophoneIconColor='red'
                isControlPanelShown={true}
                barWidth={2}
                // isDefaultUIShown={false}
                isProgressIndicatorShown={true}
                canvasContainerClassName='mb-5'
                progressIndicatorClassName='className'
                fullscreen={true}
                controlButtonsClassName=''
                progressIndicatorTimeClassName='hidden'
                recordingTime={30}
                duration={30}
                isProgressIndicatorTimeShown={false}
            />
        </div>
    </div>
);

};
export default Record;

This is how to make a request with the Python openai library:

from openai import OpenAI
client = OpenAI()

audio_file = open("speech.mp3", "rb")
transcript = client.audio.transcriptions.create(
  model="whisper-1",
  file=audio_file
)

The API key is not a parameter. It is set from an OPENAI_API_KEY environment variable.
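
If the key lives in a settings helper instead of the environment (as with config('OPENAI_API_KEY') in the views above), it can still be handed to the client explicitly. A small sketch, assuming the 1.x client and python-decouple:

from decouple import config  # assuming python-decouple, as in the view code
from openai import OpenAI

# By default the 1.x client reads OPENAI_API_KEY from the environment;
# the key can also be passed explicitly:
client = OpenAI(api_key=config('OPENAI_API_KEY'))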

When it says "file", it means a supported audio file, not an arbitrary stream of bytes: one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. Or an io.BytesIO virtual file produced by an audio encoder.
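
For example, an in-memory file works as long as the client can tell which format it is carrying. A sketch, assuming the 1.x client (passing a ("speech.mp3", file_object) tuple as the file parameter is an alternative to setting name):

import io

from openai import OpenAI

client = OpenAI()

# Wrap raw bytes in a virtual file and give it a filename so the
# format (here mp3) can be recognized.
mp3_io = io.BytesIO(open("speech.mp3", "rb").read())
mp3_io.name = "speech.mp3"

transcript = client.audio.transcriptions.create(
    model="whisper-1",
    file=mp3_io,
)
print(transcript.text)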

Run pip install --upgrade openai (as the user that owns the Python install) to get the latest openai module.

The OpenAI API must be used with an API key from an organization that has added prepaid credits to pay for OpenAI services.

It works for files opened locally (like speech.mp3 above), but not for blob files passed from the front end through the REST API. How can I solve this? In other words, how can I use a blob with Whisper?

class BazChatView(APIView):
    def post(self, request, *args, **kwargs):
        if 'audio_file' in request.data:
            audio_blob = request.data.get('audio_file')
            input_text = self.generate_text(audio_blob)
            print(input_text)

    def blob_to_memory_file(self, audio_blob):
        # Read the blob audio
        audio_bytes = audio_blob.read()
        print(type(audio_bytes))
        # Convert the bytes to an AudioSegment
        audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))

        # Export audio_segment as wav file to byte object
        byte_obj = io.BytesIO()
        audio_segment.export(byte_obj, format="wav")
        byte_obj.seek(0)  # Important: Reset to the start of the bytes stream
        print(type(byte_obj))

        return byte_obj

    def generate_text(self, audio_blob):
        # Model ID for Whisper ASR model
        model_id='whisper-1'

        # Convert Blob to In-Memory wav file
        byte_obj = self.blob_to_memory_file(audio_blob)
        
        try:
            response = client.audio.translations.create(
                model=model_id,
                file=byte_obj,
            )
            return response.text
        except Exception as e:
            print(f"Error during transcription: {e}")
            return None

Now I get this message: "expected str, bytes or os.PathLike object, not InMemoryUploadedFile".
How can I fix it?
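
That error means the InMemoryUploadedFile object itself is reaching a call that expects a path or raw bytes, rather than its contents. One way to avoid it is to make sure only the decoded bytes, wrapped in a named in-memory WAV file, ever reach the client. A minimal end-to-end sketch (not the exact production view), assuming the 1.x client with OPENAI_API_KEY set in the environment and pydub/ffmpeg installed; it returns only input_text and uses transcriptions.create, since translations.create would translate the speech to English rather than transcribe it:

import io

from openai import OpenAI
from pydub import AudioSegment
from rest_framework.response import Response
from rest_framework.views import APIView

client = OpenAI()  # reads OPENAI_API_KEY from the environment


class BazChatView(APIView):
    def post(self, request, *args, **kwargs):
        if 'audio_file' not in request.data:
            return Response({'error': 'audio_file is required'}, status=400)
        audio_blob = request.data.get('audio_file')  # InMemoryUploadedFile
        input_text = self.generate_text(audio_blob)
        return Response({'input_text': input_text})

    def generate_text(self, audio_blob):
        # Read the uploaded blob's bytes and re-encode them to WAV in memory,
        # so the upload object itself never reaches the OpenAI client.
        audio_segment = AudioSegment.from_file(io.BytesIO(audio_blob.read()))
        wav_io = io.BytesIO()
        audio_segment.export(wav_io, format="wav")
        wav_io.seek(0)
        wav_io.name = "audio.wav"  # filename hint for format detection

        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=wav_io,
        )
        return response.text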