The Responses API (AzureOpenAI) is significantly slower on average than the Chat Completions endpoint. In addition, some Responses requests occasionally show extreme latency outliers (the requests appear to get congested). Please investigate the performance/regression of the Responses endpoint relative to Chat Completions.
Statistics (store=True):
Responses: mean=4.268s median=2.349s min=1.421s max=21.711s stdev=4.903s
Chat : mean=1.354s median=1.298s min=0.902s max=2.385s stdev=0.330s
Statistics (store=False):
Responses: mean=2.901s median=2.264s min=1.476s max=6.520s stdev=1.530s
Chat : mean=1.257s median=1.203s min=0.891s max=1.813s stdev=0.286s
Code snippets
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
import time, random, statistics
import matplotlib.pyplot as plt
# Bearer-token provider (Microsoft Entra ID via DefaultAzureCredential) for the
# Cognitive Services scope; handed to the client instead of an API key.
tp = get_bearer_token_provider(
    DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)
client = AzureOpenAI(
    azure_endpoint="https://aoai-eastus2-0001.openai.azure.com/",
    azure_ad_token_provider=tp,
    api_version="2025-04-01-preview",
)

N = 20  # number of timed request pairs (one Responses + one Chat call each)
store = False  # True
resp_times = []  # wall-clock seconds of each Responses call
chat_times = []  # wall-clock seconds of each Chat Completions call

# Untimed warm-up of both endpoints. The Responses call always runs first in
# the loop below, so without this the first Responses sample would likely also
# absorb one-off costs (initial token acquisition, connection setup) that have
# nothing to do with the endpoint itself. store=False keeps the warm-up from
# leaving stored artifacts behind.
client.responses.create(
    model="gpt-5",
    input="warm-up",
    store=False,
    reasoning={"effort": "minimal"},
    text={"verbosity": "low"},
)
client.chat.completions.create(
    model="gpt-5",
    messages=[{"role": "user", "content": "warm-up"}],
    store=False,
    reasoning_effort="minimal",
    verbosity="low",
)

for i in range(N):
    n = random.randint(1, 1000000)
    # Vary the prompt slightly on every call to avoid hitting a cache.
    prompt_r = f"{n} Hello, one-sentence bedtime story about a unicorn."
    prompt_c = f"{n} Hiii, one-sentence bedtime story about a unicorn."

    # Time one Responses request (same model and minimal-effort/low-verbosity
    # settings as the Chat Completions request below).
    t0 = time.perf_counter()
    r = client.responses.create(
        model="gpt-5",
        input=prompt_r,
        store=store,
        reasoning={"effort": "minimal"},
        text={"verbosity": "low"},
    )
    resp_times.append(time.perf_counter() - t0)

    # Time the equivalent Chat Completions request.
    t0 = time.perf_counter()
    cc = client.chat.completions.create(
        model="gpt-5",
        messages=[{"role": "user", "content": prompt_c}],
        store=store,
        reasoning_effort="minimal",
        verbosity="low",
    )
    chat_times.append(time.perf_counter() - t0)
# --- statistics ---
def stats(a):
    """Summarize a sequence of numeric samples.

    Args:
        a: Non-empty sequence of numbers (in this script, per-request
            latencies in seconds).

    Returns:
        A dict with the keys "n" (sample count), "mean", "median", "stdev",
        "min" and "max". "stdev" is the *population* standard deviation
        (``statistics.pstdev``), reported as 0.0 when there are fewer than
        two samples.

    Raises:
        statistics.StatisticsError: If ``a`` is empty.
    """
    return {
        "n": len(a),
        "mean": statistics.mean(a),
        "median": statistics.median(a),
        # Population (not sample) standard deviation of the timings.
        "stdev": statistics.pstdev(a) if len(a) > 1 else 0.0,
        "min": min(a),
        "max": max(a),
    }
# Summary statistics for each endpoint, printed one line per endpoint.
sr, sc = stats(resp_times), stats(chat_times)
print(
    "\nResponses:",
    f"mean={sr['mean']:.3f}s median={sr['median']:.3f}s min={sr['min']:.3f}s max={sr['max']:.3f}s stdev={sr['stdev']:.3f}s",
)
print(
    "Chat :",
    f"mean={sc['mean']:.3f}s median={sc['median']:.3f}s min={sc['min']:.3f}s max={sc['max']:.3f}s stdev={sc['stdev']:.3f}s",
)

# Per-run latency of both endpoints on one shared axis (runs numbered from 1).
runs = range(1, N + 1)
plt.figure(figsize=(10, 4))
for series, legend_label in (
    (resp_times, "Responses"),
    (chat_times, "Chat Completions"),
):
    plt.plot(runs, series, label=legend_label)
plt.xlabel("Run")
plt.ylabel("Time (s)")
plt.title(f"Responses mean {sr['mean']:.2f}s vs Chat mean {sc['mean']:.2f}s")
plt.legend()
plt.tight_layout()