This is simply avoidance and blame.
The gpt-3.5-turbo-instruct model is still fast, even for those affected by the gpt-3.5-turbo slowdown to under 10 tokens per second.
Let's link to just one of multiple threads about this (41 posts):
A typical rate is what I get: 25 to 50 tokens per second, not 5-10.
### For 2 trials of gpt-3.5-turbo @ 2023-10-17 05:09PM:
| Stat | Minimum | Maximum | Average |
| --- | --- | --- | --- |
| latency (s) | Min: 0.501 | Max: 0.604 | Avg: 0.552 |
| total response (s) | Min: 2.8842 | Max: 2.9052 | Avg: 2.895 |
| total rate | Min: 34.421 | Max: 34.672 | Avg: 34.546 |
| stream rate | Min: 41.5 | Max: 43.0 | Avg: 42.250 |
| response tokens | Min: 100 | Max: 100 | Avg: 100.000 |
### For 2 trials of gpt-3.5-turbo-instruct @ 2023-10-17 05:09PM:
| Stat | Minimum | Maximum | Average |
| --- | --- | --- | --- |
| latency (s) | Min: 0.229 | Max: 0.795 | Avg: 0.512 |
| total response (s) | Min: 1.273 | Max: 1.8421 | Avg: 1.558 |
| total rate | Min: 54.286 | Max: 78.555 | Avg: 66.421 |
| stream rate | Min: 94.5 | Max: 94.8 | Avg: 94.650 |
| response tokens | Min: 100 | Max: 100 | Avg: 100.000 |
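For clarity on the two rates: "total rate" is response tokens divided by the whole request time (including the latency to the first token), while "stream rate" is the streamed tokens divided by only the time spent streaming after the first token arrives. A quick sanity check against the gpt-3.5-turbo averages above, as a small sketch using only the table's own numbers:

```python
# Recompute the reported rates from the gpt-3.5-turbo averages in the table above.
tokens = 100          # response tokens
total_s = 2.895       # average total response time, seconds
latency_s = 0.552     # average latency to first token, seconds

total_rate = tokens / total_s                        # ~34.5 tokens/s ("total rate")
stream_rate = (tokens - 1) / (total_s - latency_s)   # ~42.3 tokens/s ("stream rate")
print(round(total_rate, 3), round(stream_rate, 1))
```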
Try-it-yourself Python code: compare chat to instruct, producing forum markdown.
(You can increase the number of trial runs per model or include more models in the list if desired.)
import openai # requires pip install openai
import tiktoken # requires pip install tiktoken
import time
import json
openai.api_key = "sk-2156a65Y"  # substitute your own API key
class Tokenizer:
    """Count tokens in text with tiktoken (cl100k_base by default)."""
    def __init__(self, encoder="cl100k_base"):
        self.tokenizer = tiktoken.get_encoding(encoder)

    def count(self, text):
        return len(self.tokenizer.encode(text))

class BotDate:
    """Simple wall-clock helper for timestamps and elapsed-time measurement."""
    def __init__(self):
        self.created_time = time.time()
        self.start_time = 0

    def start(self):
        return time.strftime("%Y-%m-%d %I:%M%p", time.localtime(self.created_time))

    def now(self):
        return time.strftime("%Y-%m-%d %I:%M%p", time.localtime(time.time()))

    def set(self):
        self.start_time = time.time()

    def get(self):
        return round(time.time() - self.start_time, 4)
models = ['gpt-3.5-turbo', 'gpt-3.5-turbo-instruct']
bdate = BotDate()
tok = Tokenizer()
latency = 0
stats = {model: {"latency (s)": [], "total response (s)": [], "total rate": [],
                 "stream rate": [], "response tokens": []} for model in models}
trials = 2
max_tokens = 100
prompt = "Write an article about kittens, 80 paragraphs"
for i in range(trials):  # number of trials per model
    for model in models:
        bdate.set()
        if model.endswith("instruct"):
            response = openai.Completion.create(
                prompt=prompt,
                model=model,
                top_p=0.01, stream=True, max_tokens=max_tokens+1)
        else:
            response = openai.ChatCompletion.create(
                messages=[
                    # {"role": "system", "content": "You are a helpful assistant"},
                    {"role": "user", "content": prompt}],
                model=model,
                top_p=0.01, stream=True, max_tokens=max_tokens)
        # capture the words emitted by the response generator
        reply = ""
        for chunk in response:
            if reply == "":
                latency_s = bdate.get()  # time until the first streamed content
            if not chunk['choices'][0]['finish_reason']:
                if not chunk['object'] == "chat.completion.chunk":
                    reply += chunk['choices'][0]['text']
                else:
                    reply += chunk['choices'][0]['delta'].get('content', '')
                print(".", end="")
        total_s = bdate.get()
        # extend model stats lists with total, latency, tokens for model
        stats[model]["latency (s)"].append(round(latency_s, 4))
        stats[model]["total response (s)"].append(round(total_s, 4))
        tokens = tok.count(reply)
        stats[model]["response tokens"].append(tokens)
        stats[model]["total rate"].append(round(tokens/total_s, 3))
        stats[model]["stream rate"].append(
            round((tokens-1)/(1 if (total_s-latency_s) == 0 else (total_s-latency_s)), 1))
print("\n")
for key in stats:
    print(f"### For {trials} trials of {key} @ {bdate.now()}:")
    print("| Stat | Minimum | Maximum | Average |")
    print("| --- | --- | --- | --- |")
    for sub_key in stats[key]:
        values = stats[key][sub_key]
        min_value = min(values)
        max_value = max(values)
        avg_value = sum(values) / len(values)
        print(f"| {sub_key} | Min: {min_value} | Max: {max_value} | Avg: {avg_value:.3f} |")
    print()
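Side note: the script above targets the pre-1.0 `openai` Python library (0.28-style `openai.Completion` / `openai.ChatCompletion`). If you have `openai>=1.0` installed, the streaming calls are shaped differently; below is a minimal sketch of the equivalent requests (client object, attribute-style chunks), not a drop-in replacement for the whole benchmark:

```python
from openai import OpenAI  # openai>=1.0 style client

client = OpenAI()  # reads OPENAI_API_KEY from the environment
prompt = "Write an article about kittens, 80 paragraphs"
max_tokens = 100

# chat model, streamed
chat_stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
    top_p=0.01, stream=True, max_tokens=max_tokens)
chat_reply = ""
for chunk in chat_stream:
    chat_reply += chunk.choices[0].delta.content or ""  # delta.content can be None

# instruct (completions) model, streamed
instruct_stream = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
    top_p=0.01, stream=True, max_tokens=max_tokens)
instruct_reply = ""
for chunk in instruct_stream:
    instruct_reply += chunk.choices[0].text
```

The timing and stats bookkeeping would work the same way; only the request calls and the chunk field access change.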