Here’s a successful call body:
{"messages":[{"role":"system","content":[{"type":"text","text":"You are ChatPAL!"}]},{"role":"user","content":[{"type":"text","text":""},{"type":"input_audio","input_audio":{"data":"SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjYwLjE2LjEwMAAAAAAAAAAAAAAA...","format":"wav"}}]}],"temperature":0.6,"max_completion_tokens":2048,"top_p":1,"frequency_penalty":0,"presence_penalty":0,"model":"gpt-4o-audio-preview","modalities":["text","audio"],"audio":{"voice":"alloy","format":"pcm16"},"stream":true,"stream_options":{"include_usage":true}}
Input with MP3 was also successful.
Could it be that you are sending a WAV format that is not supported?
Here’s code that calls the API endpoint with WAV audio at all common sample rates, in mono and stereo, to test compatibility:
import os
import sys
import base64
import httpx
import io
import json
try:
from pydub import AudioSegment
except ImportError:
print("Please install pydub (e.g., pip install pydub) and ensure ffmpeg is installed.")
sys.exit(1)
def convert_wav_format(wav_path: str, channels: int, sample_rate: int) -> io.BytesIO:
    """
    Load a WAV file and re-encode it with the requested channel count and sample rate.

    :param wav_path: Path to the input WAV file.
    :param channels: Number of audio channels (1 for mono, 2 for stereo).
    :param sample_rate: Desired sample rate in Hz.
    :return: A BytesIO object, rewound to position 0, containing the adjusted WAV audio data.
    :raises RuntimeError: If the source file cannot be loaded. The original
        loader exception is attached as ``__cause__``.
    """
    try:
        # Load the WAV file (format specified as "wav")
        audio = AudioSegment.from_file(wav_path, format="wav")
    except Exception as e:
        # Chain explicitly so the traceback reports the loader failure as the
        # direct cause instead of an incidental "during handling" context.
        raise RuntimeError(f"Error loading WAV file '{wav_path}': {e}") from e

    # Adjust the sample rate and channel count
    audio = audio.set_frame_rate(sample_rate).set_channels(channels)

    # Export the adjusted audio to an in-memory buffer in WAV format
    wav_bytes = io.BytesIO()
    audio.export(wav_bytes, format="wav")
    # Rewind so the caller can read the data from the start.
    wav_bytes.seek(0)
    return wav_bytes
def send_audio_api_call(audio_bytes: io.BytesIO, channels: int, sample_rate: int, api_key: str) -> dict:
    """
    POST a WAV clip to the OpenAI chat completions endpoint and return the parsed reply.

    :param audio_bytes: A BytesIO object with the WAV audio data; it is rewound and read in full.
    :param channels: Number of channels (not sent in the request; kept for logging/testing).
    :param sample_rate: Sample rate in Hz (not sent in the request; kept for logging/testing).
    :param api_key: Your OpenAI API key, sent as a Bearer token.
    :return: The JSON response from the API.
    :raises Exception: Whatever ``response.raise_for_status()`` raises for an
        error status, or any transport error raised by the HTTP client.
    """
    # Rewind first so the whole clip is encoded regardless of the buffer position.
    audio_bytes.seek(0)
    wav_b64 = base64.b64encode(audio_bytes.read()).decode('ascii')

    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": "Perfectly Round-Headed Primate Ramblings!"},
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": ""},
            {
                "type": "input_audio",
                # The clip is always labelled "wav" because the caller supplies WAV data.
                "input_audio": {"data": wav_b64, "format": "wav"},
            },
        ],
    }
    request_body = {
        "messages": [system_message, user_message],
        "temperature": 1,
        "max_completion_tokens": 100,
        "top_p": 0.6,
        "model": "gpt-4o-audio-preview",
        "modalities": ["text"],  # or ["text", "audio"] if needed
        "audio": {"voice": "alloy", "format": "wav"},
        "stream": False,
    }

    endpoint = "https://api.openai.com/v1/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    with httpx.Client(timeout=60) as http:
        reply = http.post(endpoint, json=request_body, headers=request_headers)
        # Turn HTTP error statuses into exceptions so the caller can log them as failures.
        reply.raise_for_status()
        return reply.json()
def test_api_across_formats(wav_path: str, api_key: str):
    """
    Run the API call once for every channel-count / sample-rate combination.

    Each combination is converted from the source file and sent to the API.
    Progress and any exception are printed as the run proceeds, and a summary
    of the combinations that succeeded is printed at the end.

    :param wav_path: Path to the source WAV file.
    :param api_key: Your OpenAI API key.
    :return: A list of dictionaries, one per combination, recording
        ``channels``, ``sample_rate``, ``success`` and either ``result``
        (the API response) or ``error`` (the exception text).
    """
    sample_rates = [16000, 22050, 24000, 32000, 44100, 48000]
    channel_options = [1, 2]  # 1 for mono, 2 for stereo
    # Channel-major order: every sample rate in mono first, then in stereo.
    combinations = [(ch, rate) for ch in channel_options for rate in sample_rates]

    results_log = []
    print("Starting API tests for various audio formats...\n")
    for n_channels, rate in combinations:
        label = f"{rate} Hz, {n_channels} channel(s)"
        print(f"Testing audio ({label})...")
        try:
            # Re-encode the source for this combination, then send it.
            wav_buffer = convert_wav_format(wav_path, n_channels, rate)
            api_reply = send_audio_api_call(wav_buffer, n_channels, rate, api_key)
            print(f" SUCCESS: {label}\n")
            results_log.append({
                "channels": n_channels,
                "sample_rate": rate,
                "success": True,
                "result": api_reply,
            })
        except Exception as exc:
            # Any failure (conversion or API) is recorded; the run continues.
            print(f" FAILURE: {label} - {exc}\n")
            results_log.append({
                "channels": n_channels,
                "sample_rate": rate,
                "success": False,
                "error": str(exc),
            })

    # Group the successful channel counts under their sample rate.
    supported = {}
    for entry in results_log:
        if not entry["success"]:
            continue
        supported.setdefault(entry["sample_rate"], []).append(entry["channels"])

    print("\n=== Supported Frequencies Report ===")
    if not supported:
        print("No supported frequency/channel combinations found.")
        return results_log

    for rate in sorted(supported):
        channels_supported = sorted(supported[rate])
        print(f"{rate} Hz: Channels supported -> {channels_supported}")
    return results_log
if __name__ == "__main__":
    # The API key is read from the environment; stop early if it is missing.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable is not set.")
        sys.exit(1)

    # Source clip, looked up relative to the current working directory.
    wav_file_path = "ash.wav"
    source_present = os.path.exists(wav_file_path)
    if not source_present:
        print(f"Error: Source WAV file '{wav_file_path}' not found.")
        sys.exit(1)

    # Exercise every channel/sample-rate combination and print the report.
    test_results = test_api_across_formats(wav_file_path, api_key)
(Run with the text-only output modality.)
Results:
=== Supported Frequencies Report ===
16000 Hz: Channels supported → [1, 2]
22050 Hz: Channels supported → [1, 2]
24000 Hz: Channels supported → [1, 2]
32000 Hz: Channels supported → [1, 2]
44100 Hz: Channels supported → [1, 2]
48000 Hz: Channels supported → [1, 2]
So I would check whether the input file itself is at fault. Is it perhaps an MP3 being sent with a "wav" format type?
Or you can review the call construction, including the typed system message, and reuse the transcoding offered in the test script to convert the mystery WAV to 16 kHz mono.
That reduces the network data – and makes a bad file fail on your end instead of at the API.