Realtime API returning a weird transcription

I am trying to use the Realtime API to translate an input audio stream… This is what I am getting:
02:40:28 - response.audio_transcript.done
02:40:28 - response.output_item.done
Transcript: I’m sorry, but I can’t identify who a speaker is based on their voice.
02:40:28 - response.done

I did not ask it to transcribe. And this is definitely not what I was saying.
Any ideas?
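
Worth noting: response.audio_transcript.done carries the transcript of the audio the model generated for its reply, not a transcription of your input. Input transcription, when enabled in the session, arrives under a separate event type. A minimal sketch of telling the two apart, assuming the event names from the Realtime API reference; the route_transcripts helper is only illustrative:

import json

def route_transcripts(message):
    event = json.loads(message)
    if event.get("type") == "response.audio_transcript.done":
        # Transcript of the audio the model spoke in its reply
        print("Model said:", event.get("transcript"))
    elif event.get("type") == "conversation.item.input_audio_transcription.completed":
        # Transcript of the user's input audio; only sent when
        # input_audio_transcription is configured in session.update
        print("User said:", event.get("transcript"))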

Hi, have you checked if something similar may apply to your case?

I took a look just now, but I can't relate it to my case.
It also sometimes stops responding altogether. I don't even get the handler messages saying that audio was appended or anything.
After the session starts, it just stops responding.

Sharing the code you used to implement this would be a good starting point to help us help you.

works fine on my end, check your code or the documentation

Sure, here it is:

import json
import logging

import pyaudio
import websocket  # from the websocket-client package

logging.basicConfig(level=logging.INFO)  # minimal logging setup; the original wasn't shown
logger = logging.getLogger(__name__)


class RealtimeTranslator:
    def __init__(self):
        self.api_key = "KEY-HERE"  # Set your API key directly here
        self.ws = None
        self.audio = pyaudio.PyAudio()
        
        # Audio settings matching OpenAI requirements
        self.format = pyaudio.paInt16
        self.channels = 1
        self.rate = 16000
        self.chunk = int(self.rate * 5)  # Increase chunk size to 5 seconds
        self.stream = None

    def on_message(self, ws, message):
        # logger.info(f"Message received: {message}")
        data = json.loads(message)
        if data["type"] == "result":
            translated_text = data['data']['text']
            logger.info(f"Translation received: {translated_text}")
            print(f"Translated Text: {translated_text}")
        elif data["type"] == "error":
            logger.error(f"Error from server: {data['error']['message']}")
        elif data["type"] == "session.created":
            logger.info("Session created successfully")
        elif data["type"] == "session.updated":
            logger.info("Session updated successfully")
        elif data["type"] == "response.audio.delta":
            logger.info("response.audio.delta")
        elif data["type"] == "response.audio.done":
            logger.info("response.audio.done")
        elif data["type"] == "response.audio_transcript.done":
            logger.info("response.audio_transcript.done")
        elif data["type"] == "response.content_part.done":
            pass
        elif data["type"] == "response.output_item.done":
            logger.info("response.output_item.done")
            item = data['item']
            if item['type'] == 'message' and item['status'] == 'completed':
                for content in item['content']:
                    if content['type'] == 'audio' and 'transcript' in content:
                        print(f"Transcript: {content['transcript']}")
        elif data["type"] == "response.done":
            logger.info("response.done")
        else:
            logger.info(f"Unhandled message type: {data['type']}")

    def on_error(self, ws, error):
        logger.error(f"Error: {error}")

    def on_close(self, ws, close_status_code, close_msg):
        logger.info("Connection closed")
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
        self.audio.terminate()

    def on_open(self, ws):
        logger.info("Connection established")
        # Set translation instructions and configuration
        ws.send(json.dumps({
            "event_id": "event_123",
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": "You are a helpful assistant. Translate",
                "voice": "sage",
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": None,  # Replace null with None
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 500,
                    "create_response": True  # Fix the typo here
                },
                "temperature": 0.8,
                "max_response_output_tokens": "inf"
            }
        }))
        # Start audio streaming
        self.start_audio_stream()

    def audio_callback(self, in_data, frame_count, time_info, status):
        if self.ws and self.ws.sock and self.ws.sock.connected:
            try:
                # Validate buffer length
                buffer_length = len(in_data)
                min_buffer_length = int(self.rate * 0.1 * 2)  # 100ms worth of samples, 2 bytes per sample

                if buffer_length >= min_buffer_length:
                    # Append the audio data
                    audio_hex = bytes(in_data).hex()
                    self.ws.send(json.dumps({
                        "type": "input_audio_buffer.append",
                        "audio": audio_hex
                    }))
                    
                else:
                    logger.warning(f"Buffer too small: {buffer_length} bytes. Expected at least {min_buffer_length} bytes.")
                
            except Exception as e:
                logger.error(f"Error sending audio: {e}")
                
        return (in_data, pyaudio.paContinue)

    def start_audio_stream(self):
        # Ensure we capture enough samples for 100ms minimum
        min_samples = int(self.rate * 0.1)  # 100ms worth of samples
        self.stream = self.audio.open(
            format=self.format,
            channels=self.channels,
            rate=self.rate,
            input=True,
            frames_per_buffer=self.chunk,  # Use the increased chunk size
            stream_callback=self.audio_callback
        )
        self.stream.start_stream()
        logger.info("Audio stream started")

    def list_microphones(self):
        """List available microphones."""
        info = self.audio.get_host_api_info_by_index(0)
        num_devices = info.get('deviceCount')
        for i in range(num_devices):
            device_info = self.audio.get_device_info_by_host_api_device_index(0, i)
            if device_info.get('maxInputChannels') > 0:
                print(f"Input Device id {i} - {device_info.get('name')}")

    def connect(self):
        """Connect to OpenAI Realtime API and send a session update message."""
        try:
            websocket.enableTrace(False)  # Disable detailed websocket trace logs
            self.ws = websocket.WebSocketApp(
                "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
                header={
                    "Authorization": f"Bearer {self.api_key}",
                    "openai-beta": "realtime=v1"
                },
                on_open=self.on_open,
                on_message=self.on_message,
                on_error=self.on_error,
                on_close=self.on_close
            )
            self.ws.run_forever()
        except Exception as e:
            logger.error(f"Connection error: {e}")
            if "invalid_api_key" in str(e):
                logger.error("Invalid API key. Please check your OpenAI API key.")
            raise

    def stop(self):
        """Safely stop the session with the API server and quit."""
        if self.ws:
            self.ws.close()
        if self.stream:
            self.stream.stop_stream()
            self.stream.close()
        self.audio.terminate()
        logger.info("Translation service stopped")


if __name__ == "__main__":
    try:
        translator = RealtimeTranslator()
        translator.list_microphones()  # List available microphones
        translator.connect()
    except KeyboardInterrupt:
        print("\nStopping translation service...")
        translator.stop()  # Safely stop the session
    except ValueError as e:
        print(f"Error: {e}")
        translator.stop()  # Safely stop the session
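
One thing that stands out in the code above: per the Realtime API docs, input_audio_buffer.append expects the audio field to be base64-encoded PCM bytes, while audio_callback sends a hex string (which would most likely be decoded into noise server-side), and the pcm16 format is 24 kHz, 16-bit, mono rather than 16 kHz. A minimal sketch of the append step under those assumptions (append_audio is just an illustrative helper, not part of the code above):

import base64
import json

def append_audio(ws, pcm16_bytes):
    """Send one chunk of 24 kHz, 16-bit, mono PCM16 audio to the Realtime API."""
    ws.send(json.dumps({
        "type": "input_audio_buffer.append",
        "audio": base64.b64encode(pcm16_bytes).decode("ascii"),  # base64, not .hex()
    }))

In the class above that would mean encoding with base64.b64encode(in_data).decode("ascii") instead of bytes(in_data).hex() in audio_callback, and setting self.rate = 24000 to match.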