Hey guys,
I’m new so apologies if this has already been asked.
Right now I’m using the Realtime API to call simple functions intelligently. I used the Twilio + OpenAI app as a template. I would like to leverage both DTMF and voice in order to call functions intelligently.
In order to leverage DTMF, I convert every keypress that the user makes to a plaintext character. So if the user presses “1”, I define a new conversation.item.create
with that keypress and send it to OpenAI.
However, OpenAI doesn’t respond after receiving keypress data; it simply stays silent. I have VAD mode enabled, but I would also like it to respond after it has received all of the keypresses. Is there a way to do this without creating and sending a response.create
after every single keypress? Doing that leads to a jarring experience, because OpenAI starts responding after just one keypress.
Here’s the code that listens to Twilio bidirectional stream events and sends DTMF keypresses as text to OpenAI.
@app.websocket("/media-stream")
async def handle_media_stream(websocket: WebSocket):
"""Handle WebSocket connections between Twilio and OpenAI."""
print("Client connected")
await websocket.accept()
async with websockets.connect(
"wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
extra_headers={
"Authorization": f"Bearer {CONFIG.api_key}",
"OpenAI-Beta": "realtime=v1",
},
) as openai_ws:
await initialize_session(openai_ws)
# Connection specific state
stream_sid = None
call_sid = None
latest_media_timestamp = 0
last_assistant_item = None
mark_queue = []
response_start_timestamp_twilio = None
async def receive_from_twilio():
"""Receive audio data from Twilio and send it to the OpenAI Realtime API."""
nonlocal stream_sid, call_sid, latest_media_timestamp
try:
async for message in websocket.iter_text(): # listening for events from Twilio
data = json.loads(message)
if data["event"] == "media" and openai_ws.open:
latest_media_timestamp = int(data["media"]["timestamp"])
audio_append = {
"type": "input_audio_buffer.append",
"audio": data["media"]["payload"],
}
await openai_ws.send(json.dumps(audio_append))
elif data["event"] == "start":
stream_sid = data["start"]["streamSid"]
call_sid = data["start"]["callSid"]
logging.info(f"Incoming stream has started: {call_sid=}")
response_start_timestamp_twilio = None # noqa: F841
latest_media_timestamp = 0
last_assistant_item = None # noqa: F841
elif data["event"] == "dtmf":
logging.info(f"Received DTMF event: {data['dtmf']}")
digit = data["dtmf"]["digit"]
await openai_ws.send(build_conversation_item(digit)) # creates & sends conversation.item.create
elif data["event"] == "mark":
if mark_queue:
mark_queue.pop(0)
except WebSocketDisconnect:
print("Client disconnected.")
if openai_ws.open:
await openai_ws.close()