My code snippet simulates real-time audio streaming from a microphone to GPT for ASR transcription.
The generate_ogg_audio_chunks function cuts an already-recorded .ogg audio file into 200-byte chunks and sends them one at a time to GPT. The model selected is gpt-4o-transcribe.
However, no matter how large an audio file I choose, GPT only starts its streaming response after the audio upload has completed.
Is my code written incorrectly, or does gpt-4o-transcribe really work this way?
async def generate_ogg_audio_chunks(audio_file_path, state):
    """Download an audio file and yield its bytes in fixed-size chunks.

    The whole file is fetched with ``requests.get`` first; the bytes are then
    sliced into pieces of ``chunk_size`` bytes, with an ``asyncio.sleep(delay)``
    after each piece. ``chunk_size`` and ``delay`` are names defined outside
    this function.

    Args:
        audio_file_path: Location passed to ``requests.get`` to fetch the audio.
        state: Object whose ``last_audio_chunk_time`` attribute is set to
            ``time.time()`` immediately before each chunk is yielded.

    Yields:
        Consecutive ``bytes`` slices of the downloaded content, each at most
        ``chunk_size`` bytes long.

    Raises:
        requests.HTTPError: If the download returns an error status, so that
            an error page is never chunked and sent on as if it were audio.
    """
    # requests is synchronous; running it in the default executor keeps the
    # event loop free to service other tasks while the download is in flight.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, requests.get, audio_file_path)
    response.raise_for_status()
    audio_data_bytes = response.content

    logger.info("Upload START.")
    for i in range(0, len(audio_data_bytes), chunk_size):
        # Record when this chunk was handed over, before yielding it.
        state.last_audio_chunk_time = time.time()
        yield audio_data_bytes[i:i + chunk_size]
        await asyncio.sleep(delay)
    logger.info("Upload END.")
async def call_openai_asr_stream_async_sse(audio_file_path, language_code, model):
    """POST an audio file for transcription and read the SSE reply until it is done.

    The audio is supplied as the ``file`` field of a multipart form, fed by
    ``generate_ogg_audio_chunks``. The response body is read line by line;
    lines starting with ``data:`` are parsed as JSON, and the first event whose
    ``type`` is ``transcript.text.done`` ends processing.

    NOTE(review): the audio travels as the body of a single multipart POST, so
    the server presumably cannot start answering before the upload finishes,
    however the body is chunked on the client side -- confirm against the API
    documentation (a realtime/WebSocket interface may be needed for
    transcription that overlaps with the upload).

    Args:
        audio_file_path: Location of the audio, forwarded to
            ``generate_ogg_audio_chunks``.
        language_code: Language tag; only its first two characters are sent.
        model: Value sent as the ``model`` form field.

    Returns:
        The result of ``stream_metrics_simply(state)`` once the final
        transcript event arrives. ``None`` if the stream ends without one or
        if an error occurs while building or performing the request (the error
        is logged with a traceback).
    """
    # NOTE(review): no base URL is given to the ClientSession below, so this
    # relative path presumably has to be an absolute URL -- confirm (the value
    # may have been shortened when the snippet was shared).
    api_url = "v1/audio/transcriptions"
    language = language_code[:2]  # e.g. "en-US" -> "en"
    headers = {
        'Authorization': 'Bearer ' + os.getenv('OPENAI_ASR_API_KEY'),
    }
    state = TranscriptionState()
    file_stream = generate_ogg_audio_chunks(audio_file_path, state)

    async with aiohttp.ClientSession() as session:
        try:
            data = aiohttp.FormData()
            data.add_field('file', file_stream, filename="audio_file_path.ogg")
            data.add_field('model', model)
            data.add_field('response_format', 'json')
            data.add_field('language', language)
            data.add_field('prompt', get_prompt_by_lang(language))
            data.add_field('stream', "true")

            async with session.post(api_url, data=data, headers=headers) as response:
                response.raise_for_status()
                # Process the SSE stream one line at a time.
                async for raw_line in response.content:
                    line = raw_line.decode('utf-8').strip()
                    if not line:
                        continue
                    logger.info(f"receive {model} sse data: {line}")
                    if not line.startswith('data:'):
                        continue

                    event_data = line[5:].strip()
                    try:
                        res_data = json.loads(event_data)
                    except ValueError as e:
                        # A non-JSON payload is skipped rather than aborting
                        # the stream; later events are still processed.
                        logger.warning(f"Error parsing SSE data: {e}")
                        continue

                    # Events without a "type" field (or non-object payloads)
                    # are ignored instead of raising KeyError/TypeError.
                    if not isinstance(res_data, dict):
                        continue
                    if res_data.get("type") != 'transcript.text.done':
                        continue

                    text = res_data.get("text", "")
                    text = text.replace("�", "")
                    # NOTE(review): U+FEFB is an Arabic ligature code point; if
                    # the intent is to strip a byte-order mark, that would be
                    # U+FEFF -- confirm before changing.
                    text = text.replace("\ufefb", "")
                    state.final_transcript_final = text
                    state.transcript_end = time.time()
                    return stream_metrics_simply(state)
        except Exception as e:
            # Top-level boundary: log with traceback and fall through to
            # return None, as callers of this coroutine do not get an exception.
            logger.error(f"An error occurred: {e}", exc_info=True)