bojo
December 20, 2024, 1:47am
1
I am trying to use the realtime API to translate input audio stream… I am getting this
02:40:28 - response.audio_transcript.done
02:40:28 - response.output_item.done
Transcript: I’m sorry, but I can’t identify who a speaker is based on their voice.
02:40:28 - response.done
I did not ask it to transcribe. And this is definitely not what I was saying.
And ideas
Hi, have you checked if something similar may apply to your case?
hi there, could you please elaborate more on this? Thanks!
bojo
December 20, 2024, 9:11pm
3
I took a look now. But cannot relate to it.
It actually sometimes stop responding all together. I dont get even the message handles saying that voice was appended or anything.
After session started, it stops responding
Sharing the code you used to implement the thing would be a sort of a starting point to help us help you.
works fine on my end, check your code or the documentation
bojo
December 21, 2024, 7:47pm
6
sure, here it is
class RealtimeTranslator:
def __init__(self):
self.api_key = "KEY-HERE" # Set your API key directly here
self.ws = None
self.audio = pyaudio.PyAudio()
# Audio settings matching OpenAI requirements
self.format = pyaudio.paInt16
self.channels = 1
self.rate = 16000
self.chunk = int(self.rate * 5) # Increase chunk size to 5 seconds
self.stream = None
def on_message(self, ws, message):
# logger.info(f"Message received: {message}")
data = json.loads(message)
if data["type"] == "result":
translated_text = data['data']['text']
logger.info(f"Translation received: {translated_text}")
print(f"Translated Text: {translated_text}")
elif data["type"] == "error":
logger.error(f"Error from server: {data['error']['message']}")
pass
elif data["type"] == "session.created":
logger.info("Session created successfully")
pass
elif data["type"] == "session.updated":
logger.info("Session updated successfully")
pass
elif data["type"] == "response.audio.delta":
# Handle response.audio.delta
logger.info("response.audio.delta")
pass
elif data["type"] == "response.audio.done":
# Handle response.audio.done
logger.info("response.audio.done")
pass
elif data["type"] == "response.audio_transcript.done":
# Handle response.audio_transcript.done
logger.info("response.audio_transcript.done")
pass
elif data["type"] == "response.content_part.done":
# Handle response.content_part.done
pass
elif data["type"] == "response.output_item.done":
logger.info("response.output_item.done")
# Handle response.output_item.done
item = data['item']
if item['type'] == 'message' and item['status'] == 'completed':
for content in item['content']:
if content['type'] == 'audio' and 'transcript' in content:
print(f"Transcript: {content['transcript']}")
elif data["type"] == "response.done":
# Handle response.done
logger.info("response.done")
pass
else:
logger.info(f"Unhandled message type: {data['type']}")
pass
def on_error(self, ws, error):
logger.error(f"Error: {error}")
def on_close(self, ws, close_status_code, close_msg):
logger.info("Connection closed")
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.audio.terminate()
def on_open(self, ws):
logger.info("Connection established")
# Set translation instructions and configuration
ws.send(json.dumps({
"event_id": "event_123",
"type": "session.update",
"session": {
"modalities": ["text", "audio"],
"instructions": "You are a helpful assistant. Translate",
"voice": "sage",
"input_audio_format": "pcm16",
"output_audio_format": "pcm16",
"input_audio_transcription": None, # Replace null with None
"turn_detection": {
"type": "server_vad",
"threshold": 0.5,
"prefix_padding_ms": 300,
"silence_duration_ms": 500,
"create_response": True # Fix the typo here
},
"temperature": 0.8,
"max_response_output_tokens": "inf"
}
}))
# Start audio streaming
self.start_audio_stream()
def audio_callback(self, in_data, frame_count, time_info, status):
if self.ws and self.ws.sock and self.ws.sock.connected:
try:
# Validate buffer length
buffer_length = len(in_data)
min_buffer_length = int(self.rate * 0.1 * 2) # 100ms worth of samples, 2 bytes per sample
if buffer_length >= min_buffer_length:
# Append the audio data
audio_hex = bytes(in_data).hex()
self.ws.send(json.dumps({
"type": "input_audio_buffer.append",
"audio": audio_hex
}))
else:
logger.warning(f"Buffer too small: {buffer_length} bytes. Expected at least {min_buffer_length} bytes.")
except Exception as e:
logger.error(f"Error sending audio: {e}")
return (in_data, pyaudio.paContinue)
def start_audio_stream(self):
# Ensure we capture enough samples for 100ms minimum
min_samples = int(self.rate * 0.1) # 100ms worth of samples
self.stream = self.audio.open(
format=self.format,
channels=self.channels,
rate=self.rate,
input=True,
frames_per_buffer=self.chunk, # Use the increased chunk size
stream_callback=self.audio_callback
)
self.stream.start_stream()
logger.info("Audio stream started")
def list_microphones(self):
"""List available microphones."""
info = self.audio.get_host_api_info_by_index(0)
num_devices = info.get('deviceCount')
for i in range(num_devices):
device_info = self.audio.get_device_info_by_host_api_device_index(0, i)
if device_info.get('maxInputChannels') > 0:
print(f"Input Device id {i} - {device_info.get('name')}")
def connect(self):
"""Connect to OpenAI Realtime API and send a session update message."""
try:
websocket.enableTrace(False) # Disable detailed websocket trace logs
self.ws = websocket.WebSocketApp(
"wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
header={
"Authorization": f"Bearer {self.api_key}",
"openai-beta": "realtime=v1"
},
on_open=self.on_open,
on_message=self.on_message,
on_error=self.on_error,
on_close=self.on_close
)
self.ws.run_forever()
except Exception as e:
logger.error(f"Connection error: {e}")
if "invalid_api_key" in str(e):
logger.error("Invalid API key. Please check your OpenAI API key.")
raise
def stop(self):
"""Safely stop the session with the API server and quit."""
if self.ws:
self.ws.close()
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.audio.terminate()
logger.info("Translation service stopped")
if __name__ == "__main__":
try:
translator = RealtimeTranslator()
translator.list_microphones() # List available microphones
translator.connect()
except KeyboardInterrupt:
print("\nStopping translation service...")
translator.stop() # Safely stop the session
except ValueError as e:
print(f"Error: {e}")
translator.stop() # Safely stop the session