Realtime API with WebRTC: issue when the modality is updated to ["text", "audio"]: the audio output is missing

I create a session with the following parameters:

{
    "type": "session.created",
    "event_id": "event_BUZT6pjANkHwGkVWwWFSn",
    "session": {
        "id": "sess_BUZT5m2SNu7y6hYlhb78f",
        "object": "realtime.session",
        "expires_at": 1746627680,
        "input_audio_noise_reduction": null,
        "turn_detection": {
            "type": "server_vad",
            "threshold": 0.5,
            "prefix_padding_ms": 300,
            "silence_duration_ms": 200,
            "create_response": true,
            "interrupt_response": true
        },
        "input_audio_format": "pcm16",
        "input_audio_transcription": {
            "model": "whisper-1",
            "language": "fr",
            "prompt": null
        },
        "client_secret": null,
        "include": null,
        "model": "gpt-4o-realtime-preview-2024-12-17",
        "modalities": [
            "text"
        ],
        "instructions": "Ta Fiche d'Identité est M. Heureux. Ennéagramme : Type 2 (Altruiste) : Chaleureux, sensible et attentif aux besoins, apprécie particulièrement les échanges bienveillants et efficaces.",
        "voice": "ash",
        "output_audio_format": "pcm16",
        "tool_choice": "auto",
        "temperature": 0.8,
        "max_response_output_tokens": "inf",
        "tools": []
    }
}

Then I enter a text message.

{
    "type": "conversation.item.created",
    "event_id": "event_BUZTCwFXtT6Kh8H35nB1w",
    "previous_item_id": null,
    "item": {
        "id": "msg_2025-05-07T135123815Z",
        "object": "realtime.item",
        "type": "message",
        "status": "completed",
        "role": "user",
        "content": [
            {
                "type": "input_text",
                "text": "Bonjour"
            }
        ]
    }
}

The assistant answers:

{
    "type": "response.output_item.added",
    "event_id": "event_BUZTGZrs99S6SzpxOaHVG",
    "response_id": "resp_BUZTFh4d7fribh0IjruAQ",
    "output_index": 0,
    "item": {
        "id": "item_BUZTFAFNDyHfulw3Q6GOw",
        "object": "realtime.item",
        "type": "message",
        "status": "in_progress",
        "role": "assistant",
        "content": []
    }
}
{
    "type": "conversation.item.created",
    "event_id": "event_BUZTGRmSn5JxxEVNtbGwi",
    "previous_item_id": "msg_2025-05-07T135123815Z",
    "item": {
        "id": "item_BUZTFAFNDyHfulw3Q6GOw",
        "object": "realtime.item",
        "type": "message",
        "status": "in_progress",
        "role": "assistant",
        "content": []
    }
}
{
    "type": "response.text.done",
    "event_id": "event_BUZTGbRwmkbB9GP36rGRr",
    "response_id": "resp_BUZTFh4d7fribh0IjruAQ",
    "item_id": "item_BUZTFAFNDyHfulw3Q6GOw",
    "output_index": 0,
    "content_index": 0,
    "text": "Bonjour, je suis navré de vous embêter mais j'ai un petit souci. Merci de m'accorder ce moment. J'ai besoin d'une remise en état de mon plafond, car il a été endommagé par un dégât des eaux. Pourriez-vous m'aider à organiser une intervention rapide ?"
}

Then I update the session to the `["text", "audio"]` modalities.

{
    "type": "session.updated",
    "event_id": "event_BUZvthzIi9VgCYm5Tv9oH",
    "session": {
        "id": "sess_BUZvmQcoFUsH8BiyKenyK",
        "object": "realtime.session",
        "expires_at": 1746629459,
        "input_audio_noise_reduction": null,
        "turn_detection": {
            "type": "server_vad",
            "threshold": 0.5,
            "prefix_padding_ms": 300,
            "silence_duration_ms": 500,
            "create_response": true,
            "interrupt_response": true
        },
        "input_audio_format": "pcm16",
        "input_audio_transcription": {
            "model": "whisper-1",
            "language": "fr",
            "prompt": null
        },
        "client_secret": null,
        "include": null,
        "model": "gpt-4o-realtime-preview-2024-12-17",
        "modalities": [
            "text",
            "audio"
        ],
        "instructions": "Ta Fiche d'Identité est M. Heureux. Ennéagramme : Type 2 (Altruiste) : Chaleureux, sensible et attentif aux besoins, apprécie particulièrement les échanges bienveillants et efficaces.",
        "voice": "ash",
        "output_audio_format": "pcm16",
        "tool_choice": "auto",
        "temperature": 0.8,
        "max_response_output_tokens": "inf",
        "tools": []
    }
}

Just after updating the modality, I start to talk. Here are the server events:

{
    "type": "input_audio_buffer.speech_started",
    "event_id": "event_BUZvuQR2S5AZcTcFi9yk0",
    "audio_start_ms": 6388,
    "item_id": "item_BUZvuXBWUILuJtjUep7c3"
}
{
    "type": "input_audio_buffer.speech_stopped",
    "event_id": "event_BUZvwsJ6L4CmLqDZXfdzg",
    "audio_end_ms": 8032,
    "item_id": "item_BUZvuXBWUILuJtjUep7c3"
}
{
    "type": "input_audio_buffer.committed",
    "event_id": "event_BUZvwSVA85w5TuGVhxQTU",
    "previous_item_id": "item_BUZvrwlU5PWYr3QBsS1ei",
    "item_id": "item_BUZvuXBWUILuJtjUep7c3"
}

```json
{
    "type": "conversation.item.created",
    "event_id": "event_BUZvwUl7EjGbxXBNs6qyV",
    "previous_item_id": "item_BUZvrwlU5PWYr3QBsS1ei",
    "item": {
        "id": "item_BUZvuXBWUILuJtjUep7c3",
        "object": "realtime.item",
        "type": "message",
        "status": "completed",
        "role": "user",
        "content": [
            {
                "type": "input_audio",
                "transcript": null
            }
        ]
    }
}
```
{
    "type": "response.created",
    "event_id": "event_BUZvwt69rvgQv1Tah3yY7",
    "response": {
        "object": "realtime.response",
        "id": "resp_BUZvwuLgR3NaCJa4DGdZd",
        "status": "in_progress",
        "status_details": null,
        "output": [],
        "conversation_id": "conv_BUZvnJyuicd4znrnEOUMX",
        "modalities": [
            "text",
            "audio"
        ],
        "voice": "ash",
        "output_audio_format": "pcm16",
        "temperature": 0.8,
        "max_output_tokens": "inf",
        "usage": null,
        "metadata": null
    }
}
{
    "type": "rate_limits.updated",
    "event_id": "event_BUZvwGp16cBiwvr0yk7cq",
    "rate_limits": [
        {
            "name": "requests",
            "limit": 5000,
            "remaining": 4999,
            "reset_seconds": 0.012
        },
        {
            "name": "tokens",
            "limit": 800000,
            "remaining": 794385,
            "reset_seconds": 0.421
        }
    ]
}
{
    "type": "response.output_item.added",
    "event_id": "event_BUZvwL9V8MjID50Ua3Q0J",
    "response_id": "resp_BUZvwuLgR3NaCJa4DGdZd",
    "output_index": 0,
    "item": {
        "id": "item_BUZvw3B6Xfdjnxksd3wvr",
        "object": "realtime.item",
        "type": "message",
        "status": "in_progress",
        "role": "assistant",
        "content": []
    }
}
{
    "type": "conversation.item.created",
    "event_id": "event_BUZvwljiVq7wZX1TXFxFQ",
    "previous_item_id": "item_BUZvuXBWUILuJtjUep7c3",
    "item": {
        "id": "item_BUZvw3B6Xfdjnxksd3wvr",
        "object": "realtime.item",
        "type": "message",
        "status": "in_progress",
        "role": "assistant",
        "content": []
    }
}
{
    "type": "response.text.done",
    "event_id": "event_BUZvwlM8iQohfIT7i1GBn",
    "response_id": "resp_BUZvwuLgR3NaCJa4DGdZd",
    "item_id": "item_BUZvw3B6Xfdjnxksd3wvr",
    "output_index": 0,
    "content_index": 0,
    "text": "Mon plafond a été endommagé par un dégât des eaux. J'aurais besoin d'une intervention rapide pour éviter que la situation ne s'aggrave. Pourriez-vous m'aider à organiser cela ?"
}
{
    "type": "response.done",
    "event_id": "event_BUZvwuXGJRdLFIwNP1Phj",
    "response": {
        "object": "realtime.response",
        "id": "resp_BUZvwuLgR3NaCJa4DGdZd",
        "status": "completed",
        "status_details": null,
        "output": [
            {
                "id": "item_BUZvw3B6Xfdjnxksd3wvr",
                "object": "realtime.item",
                "type": "message",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "text": "Mon plafond a été endommagé par un dégât des eaux. J'aurais besoin d'une intervention rapide pour éviter que la situation ne s'aggrave. Pourriez-vous m'aider à organiser cela ?"
                    }
                ]
            }
        ],
        "conversation_id": "conv_BUZvnJyuicd4znrnEOUMX",
        "modalities": [
            "text",
            "audio"
        ],
        "voice": "ash",
        "output_audio_format": "pcm16",
        "temperature": 0.8,
        "max_output_tokens": "inf",
        "usage": {
            "total_tokens": 1469,
            "input_tokens": 1426,
            "output_tokens": 43,
            "input_token_details": {
                "text_tokens": 1120,
                "audio_tokens": 306,
                "cached_tokens": 1024,
                "cached_tokens_details": {
                    "text_tokens": 1024,
                    "audio_tokens": 0
                }
            },
            "output_token_details": {
                "text_tokens": 43,
                "audio_tokens": 0
            }
        },
        "metadata": null
    }
}

I notice that the Realtime audio model generates only the text output if I start talking immediately after updating the session.

Workaround: if I wait about a second after updating the modality before I start talking, the audio output is generated. This is still a problem, because the output generation is not reliable.