How to replace my GPT TTS call for better performance?

Could someone provide a full example of how the chunks need to be returned? I tried to replace my GPT + TTS pipeline with this single call to reduce latency, but I can't seem to figure it out.

url = "https://api.openai.com/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
}
data = {
    "model": "gpt-4o-audio-preview",
    "modalities": ["text", "audio"],
    "audio": {"voice": "alloy", "format": "pcm16"},
    "messages": [
        {"role": "system", "content": "..."},
        {"role": "user", "content": "..."},
    ],
    "stream": True,
}
async with httpx.AsyncClient() as client:
    async with client.stream("POST", url, headers=headers, json=data) as response:
        if response.status_code == 200:
            async for chunk in response.aiter_bytes(1024):
                yield chunk

Here are the chunks for “Hi there!” (with the base64 pcm16 data strings massively truncated for readability):

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"role":"assistant","refusal":null},"finish_reason":null}]}

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"content":null,"audio":{"id":"audio_672960c3a3e48190aed70377a7d6c941","transcript":"Hi"}},"finish_reason":null}]}

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"audio":{"transcript":" there"}},"finish_reason":null}]}

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"audio":{"transcript":"!"}},"finish_reason":null}]}

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"role":"assistant","content":null,"refusal":null,"audio":{"id":"audio_672960c3a3e48190aed70377a7d6c941","data":"CAAIAAkACgAPABEAEQARABMAEAANAAgAAwD9//v/+f/3//L/8v/w//L/8"}},"finish_reason":null}]}

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"audio":{"data":"lBNbDTYZdhycFrgTxg6NBT8BavhC85LyNu/j9Yv8DvqAAuA"}},"finish_reason":null}]}

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"audio":{"data":"J/6d/nsAkAE3/xv7U/jp9ur0bPRY+DH//v//v8IAAcAEgALABsAFAANAAsA//8IAP//8//y//T/7//3//v/+v8TABsADgAdACIAPAACAPn//f/j/zAAIQATADIAQgBbAAwA+f/z/wgA/v8YABkAOAAjANn/"}},"finish_reason":null}]}

data: {"id":"chatcmpl-123456","object":"chat.completion.chunk","created":1730764995,"model":"gpt-4o-audio-preview-2024-10-01","system_fingerprint":"fp_6e2d124157","choices":[{"index":0,"delta":{"audio":{"expires_at":1730768595}}}]}

data: [DONE]


Procedural, non-async code to get chunk data into a REPL environment, so I can also experiment with next():

# Single-shot request (the original chat "loop" only ever ran once, so the
# one-iteration for/continue wrapper is dropped). Use client.stream() so
# chunks are logged and processed AS THEY ARRIVE: client.post() downloads the
# entire body before returning, which defeats the latency goal — iter_bytes()
# on a buffered response merely re-slices already-downloaded content.
request = {**params_template, "messages": chat}
try:
    print("sending API request")
    with httpx.Client(timeout=120) as client:
        with client.stream(
            "POST",
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=request,
        ) as response:
            # Open in "wb" to overwrite the log on each attempt. Iteration
            # must happen inside the stream/client contexts, otherwise the
            # connection is closed before any bytes can be read.
            with open("voice_chunk_log.txt", "wb") as log_file:
                response_content = b""
                for chunk in response.iter_bytes(chunk_size=8192):
                    if chunk:
                        log_file.write(chunk)
                        log_file.flush()  # persist immediately for live tailing
                        response_content += chunk
                        # parse & play to buffer here
except Exception as e:
    print(f"Error: {e}")
1 Like