Do you have a link that now confirms this hands-off policy about speed of output?
Because OpenAI’s barely-documented policy has been the opposite: tier 1 users (those who have paid less than $50 total) were hit with token generation rate penalties, and existing API users were penalized even before the tier system explaining it was published.
In their words, higher tiers “may have better latency” (the wrong term, since what actually changed was token generation rate) or “may be moved to faster models”; in practice, this was implemented by immediately throttling existing API users.
So it is only appropriate to share a method for benchmarking AI models and their performance, one that produces a report which, when pasted into the forum, looks like this:
For 5 trials of gpt-3.5-turbo @ 2024-04-17 08:49PM:

| Stat | Minimum | Maximum | Average |
| --- | --- | --- | --- |
| stream rate | Min: 51.0 | Max: 66.4 | Avg: 58.820 |
| latency (s) | Min: 0.3543 | Max: 0.7639 | Avg: 0.580 |
| total response (s) | Min: 4.5517 | Max: 5.5489 | Avg: 4.951 |
| total rate | Min: 46.135 | Max: 56.243 | Avg: 51.994 |
| response tokens | Min: 256 | Max: 256 | Avg: 256.000 |
For 5 trials of gpt-4-turbo @ 2024-04-17 08:49PM:

| Stat | Minimum | Maximum | Average |
| --- | --- | --- | --- |
| stream rate | Min: 23.3 | Max: 42.8 | Avg: 29.160 |
| latency (s) | Min: 0.6019 | Max: 0.9499 | Avg: 0.742 |
| total response (s) | Min: 6.5735 | Max: 11.7735 | Avg: 9.899 |
| total rate | Min: 21.744 | Max: 38.944 | Avg: 26.994 |
| response tokens | Min: 256 | Max: 256 | Avg: 256.000 |
(The results above are from a Tier-5 account.)
I invite those at tier 1 to put in their literal two cents: run this Python script and see your own performance (which may also vary by time of day). Find out the streaming token rate you actually receive now, regardless of what the documentation says.
api-speed.py
```python
import openai    # requires: pip install openai
import tiktoken  # requires: pip install tiktoken
import time

## Your test parameters
trials = 5
max_tokens = 256
prompt = "Write an extensive article about kittens, 30 paragraphs in length."
models = ['gpt-3.5-turbo', 'gpt-4-turbo']
# models.extend(['gpt-4', 'gpt-3.5-turbo-instruct'])  # uncomment to add more

class Tokenizer:
    """Counts tokens in text with a tiktoken encoder."""
    def __init__(self, encoder="cl100k_base"):
        self.tokenizer = tiktoken.get_encoding(encoder)

    def count(self, text):
        return len(self.tokenizer.encode(text))

class BotDate:
    """Timestamp formatter and simple stopwatch."""
    def __init__(self):
        self.created_time = time.time()
        self.start_time = 0

    def start(self):
        return time.strftime("%Y-%m-%d %I:%M%p", time.localtime(self.created_time))

    def now(self):
        return time.strftime("%Y-%m-%d %I:%M%p", time.localtime(time.time()))

    def set(self):
        self.start_time = time.time()

    def get(self):
        return round(time.time() - self.start_time, 4)

client = openai.Client(timeout=120)  # uses OPENAI_API_KEY env variable
bdate = BotDate()
tok = Tokenizer()
latency_s = 0
stats = {model: {"stream rate": [],
                 "latency (s)": [],
                 "total response (s)": [],
                 "total rate": [],
                 "response tokens": [],
                 } for model in models}

for i in range(trials):  # number of trials per model
    for model in models:
        bdate.set()  # start timer
        response = None
        if model.endswith("instruct"):
            # API request to the completions endpoint
            try:
                response = client.completions.with_raw_response.create(
                    prompt=prompt + "\n\nassistant: ",
                    model=model, top_p=0.0001, stream=True, max_tokens=max_tokens + 1)
            except Exception as e:
                print(f"{model}: {e}")
                continue
        else:
            # API request to the chat completions endpoint
            try:
                response = client.chat.completions.with_raw_response.create(
                    messages=[{"role": "system", "content": "assistant is a helpful AI author."},
                              {"role": "user", "content": prompt}],
                    model=model, top_p=0.0001, stream=True, max_tokens=max_tokens)
            except Exception as e:
                print(f"{model}: {e}")
                continue
        q = response.parse()
        print(f"\n{q.__class__.__name__}:{model}", end="")
        reply = ""  # string to collect response tokens
        for chunk_no, chunk in enumerate(q):
            if reply == "":
                latency_s = bdate.get()  # time until first content arrives
            if q.response.is_success and not chunk.choices[0].finish_reason:
                if q.response.url.path.startswith("/v1/chat"):
                    reply += chunk.choices[0].delta.content or ""  # chat chunks
                else:
                    reply += chunk.choices[0].text or ""  # completion chunks
                print(".", end="")  # progress indicator
        total_s = bdate.get()  # timer end
        # extend model stats lists with latency, total time, tokens, and rates
        stats[model]["latency (s)"].append(round(latency_s, 4))
        stats[model]["total response (s)"].append(round(total_s, 4))
        tokens = tok.count(reply)
        stats[model]["response tokens"].append(tokens)
        stats[model]["total rate"].append(round(tokens / total_s, 3))
        stats[model]["stream rate"].append(round(
            (tokens - 1) / (1 if (total_s - latency_s) == 0 else (total_s - latency_s)), 1))

print("\n")
for key in stats:
    print(f"### For {trials} trials of {key} @ {bdate.now()}:")
    print("| Stat | Minimum | Maximum | Average |")
    print("| --- | --- | --- | --- |")
    for sub_key in stats[key]:
        values = stats[key][sub_key]
        min_value = min(values)
        max_value = max(values)
        avg_value = sum(values) / len(values)
        print(f"| {sub_key} | Min: {min_value} | Max: {max_value} | Avg: {avg_value:.3f} |")
    print()
```
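For reading the two rate columns: the total rate divides all received tokens by the full response time, while the stream rate excludes the latency before the first chunk (and that first token) to better reflect pure generation speed. A minimal sketch of that arithmetic, using the gpt-3.5-turbo averages reported above as sample inputs (so the outputs only approximate the table's per-trial averages):

```python
# Sketch of the script's two rate calculations.
# Sample values taken from the gpt-3.5-turbo averages above, for illustration;
# averaging per-trial ratios gives slightly different numbers than this.
tokens = 256        # response tokens counted by tiktoken
latency_s = 0.580   # seconds until the first streamed content arrives
total_s = 4.951     # seconds for the complete response

total_rate = tokens / total_s                        # ~51.7 tokens/s overall
stream_rate = (tokens - 1) / (total_s - latency_s)   # ~58.3 tokens/s while streaming

print(f"total rate: {total_rate:.1f} tok/s, stream rate: {stream_rate:.1f} tok/s")
```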