Gpt-3.5-turbo-1106 is very slow

Oh noes, a Llama that can make 80 tokens a second?

“I was the first one on the server and it was fast!”

“So angry that this is only twice the rate that gpt-3.5-turbo has been producing for the past few months.”

Here’s free benchmark code I just ran, plus some bonus utilities. If you aren’t also getting this rate at a higher tier than mine, wait for your payment to process.

Python, with openai == 1.2.2 and tiktoken:
import openai
import time
import re
import os
import tiktoken
import httpx

from openai import OpenAI

# read the API key from the environment instead of hard-coding it
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),
                timeout=httpx.Timeout(15.0, read=5.0, write=10.0, connect=3.0))
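
# Note on httpx.Timeout: the positional 15.0 is the default for any phase not
# set explicitly (here the pool wait); connect/write/read are per-phase limits,
# so a stream that stalls longer than 5 s between chunks raises a ReadTimeout.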

class Printer:
    """
    A class for formatted text output, supporting word wrapping, indentation and line breaks.

    Attributes:
        max_len (int): Maximum line length.
        indent (int): Indentation size.
        breaks (str): Characters treated as line breaks.
        line_length (int): Current line length.

    Methods:
        print_word(word): Prints a word with the defined formatting rules.
        reset(): Starts a new line without printing anything.
    """

    def __init__(self, max_len=80, indent=0, breaks=(" ", "-")):  # tuple avoids a mutable default
        self.max_len = max_len
        self.indent = indent
        self.breaks = breaks
        self.line_length = -1

    def reset(self):
        self.line_length = 0

    def document(self, text):
        # Split the text into words (keeping any leading whitespace, so word()
        # can see wrap points and spacing is preserved) plus ending punctuation
        word_pattern = re.compile(r"\s*[\w']+|[.,!?;]")
        words = word_pattern.findall(text)
        for chunk in words:
            self.word(chunk)
            time.sleep(0.1)  # simulate a slow token stream

    def word(self, word):
        # wrap before a word that would overflow, but only at a break character
        if ((len(word) + self.line_length > self.max_len
                and (word and word[0] in self.breaks))
                or self.line_length == -1):
            print("")  # new line
            self.line_length = 0
            word = word.lstrip()
        if self.line_length == 0:  # Indent new lines
            print(" " * self.indent, end="")
            self.line_length = self.indent
        print(word, end="")
        if word.endswith("\n"):  # Indent after AI's line feed
            print(" " * self.indent, end="")
            self.line_length = self.indent  # cursor sits at the indent column
        else:
            self.line_length += len(word)
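
# Usage sketch (illustrative, not executed by the benchmark): wrap text at 40
# columns with a four-space indent, at roughly ten words per second:
#   demo = Printer(max_len=40, indent=4)
#   demo.document("The quick brown fox jumps over the lazy dog.")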


class Tokenizer:
    """ required: import tiktoken; import re;
    usage example:
        cl100 = Tokenizer()
        number_of_tokens = cl100.count("my string")
    """
    def __init__(self, model="cl100k_base"):
        self.tokenizer = tiktoken.get_encoding(model)
        # matches chat-format special tokens such as <|im_start|>
        self.chat_strip_match = re.compile(r'<\|.*?\|>')

    def ucount(self, text):
        # unfiltered count: special-token markup is counted as plain text
        encoded_text = self.tokenizer.encode(text)
        return len(encoded_text)

    def count(self, text):
        # strip special-token markup before counting
        text = self.chat_strip_match.sub('', text)
        encoded_text = self.tokenizer.encode(text)
        return len(encoded_text)
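
# Usage sketch (illustrative): count() strips chat markup first, ucount() doesn't:
#   cl100 = Tokenizer()
#   cl100.count("<|im_start|>Hello world")   # counts only "Hello world"
#   cl100.ucount("<|im_start|>Hello world")  # counts the markup too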


class BotDate:
    """ .start/.now : object creation date/time; current date/time
        .set/.get   : start/reset timer, elapsed time
        .print      : formatted date/time from epoch seconds
    """
    def __init__(self, format_spec="%Y-%m-%d %H:%M%p"):
        self.format_spec = format_spec
        self.created_time = time.time()
        self.start_time = None  # set() must be called before get()
        self.stats1 = []
        self.stats2 = []

    def stats_reset(self):
        self.stats1 = []
        self.stats2 = []

    def start(self):
        return self.format_time(self.created_time)

    def now(self):
        return self.format_time(time.time())

    def print(self, epoch_seconds): # format input seconds
        return self.format_time(epoch_seconds)

    def format_time(self, epoch_seconds):
        formatted_time = time.strftime(self.format_spec, time.localtime(epoch_seconds))
        return formatted_time

    def set(self):
        self.start_time = time.perf_counter()  # Record the current time when set is called

    def get(self):  # elapsed seconds since set(), as a float
        if self.start_time is None:
            return 0.0
        return time.perf_counter() - self.start_time
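
# Usage sketch (illustrative):
#   bdate = BotDate()
#   bdate.set()                       # start the stopwatch
#   elapsed = bdate.get()             # elapsed seconds as a float
#   stamp = bdate.print(time.time())  # e.g. "2023-11-12 14:05PM"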


bdate = BotDate()
tok = Tokenizer()
p = Printer()
latency = 0
user = """Write an article about kittens""".strip()

models = ['gpt-3.5-turbo-1106', 'gpt-3.5-turbo-0613']
trials = 3
stats = {model: {"total response time": [],
                 "latency (s)": [],
                 "response tokens": [],
                 "total rate": [],
                 "stream rate": [],
                 } for model in models}
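
# Per-trial metrics: "total response time" is request to last chunk; "latency"
# is request to first chunk; "response tokens" is the tiktoken count of the
# reply; "total rate" is tokens/total time; "stream rate" divides the tokens
# after the first by the time after the first chunk arrived.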

for i in range(trials):
    for model in models:
        print(f"\n[{model}]")
        time.sleep(.2)
        bdate.set()
        # call the chat API using the openai package and model parameters
        try:
            response = client.chat.completions.create(
                messages=[
                          # {"role": "system", "content": "You are a helpful assistant"},
                          {"role": "user", "content": user}],
                model=model,
                top_p=0.0, stream=True, max_tokens=256)
        except openai.APIConnectionError as e:
            print("The server could not be reached")
            print(e.__cause__)  # an underlying Exception, likely raised within httpx
            continue  # skip this trial; `response` was never assigned
        except openai.RateLimitError as e:
            print(f"OpenAI rate error {e.status_code}: {e.response}")
            continue
        except openai.APIStatusError as e:
            print(f"OpenAI error {e.status_code}: {e.response}")
            continue

        # capture the words emitted by the response generator
        reply = ""
        p.reset()  # start the word-wrapper at column zero for this trial
        for part in response:
            if reply == "":
                latency = bdate.get()  # time to the first received chunk
            if not part.choices[0].finish_reason:
                word = part.choices[0].delta.content or ""
                if reply == "" and word == "\n":
                    word = ""  # drop a leading blank line from the model
                reply += word
                p.word(word)
        total = bdate.get()
        # append this trial's stats for the model
        stats[model]["total response time"].append(total)
        stats[model]["latency (s)"].append(latency)
        tokens = tok.count(reply)
        stats[model]["response tokens"].append(tokens)
        stats[model]["total rate"].append(tokens / total)
        # stream rate excludes the first token and the wait for it
        stream_time = total - latency
        stats[model]["stream rate"].append((tokens - 1) / (stream_time if stream_time > 0 else 1))

print("\n\n")
for key in stats:
    print(f"Report for {trials} trials of {key}:")
    for sub_key in stats[key]:
        values = stats[key][sub_key]
        min_value = min(values)
        max_value = max(values)
        avg_value = sum(values) / len(values)
        print(f"- {sub_key.ljust(20, '.')}"
              f"Min:{str(f'{min_value:.3f}'.zfill(7))} "
              f"Max:{str(f'{max_value:.3f}'.zfill(7))} "
              f"Avg:{str(f'{avg_value:.3f}'.zfill(7))}")


    print()