How do people estimate GPT-4 $$ given that they changed to a pre-paid plan and you don’t know how long the response will be? Do people simply use the longest context length GPT-4 has?
Thanks! A concrete example would be appreciated!
The most that gpt-4-turbo can write is 4k tokens, and it usually produces significantly less than that on tasks like “summary” or “write an article”. So the most that unpredictable output could possibly cost you is $0.03 x 4 = $0.12.
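As a quick back-of-the-envelope check (a sketch assuming gpt-4-turbo’s $0.03 per 1K output tokens and a 4,096-token completion cap):

max_output_tokens = 4096       # hard cap on a single completion
price_per_1k_output = 0.03     # USD, assumed gpt-4-turbo output rate
worst_case = max_output_tokens / 1000 * price_per_1k_output
print(f"Worst-case output cost per call: ${worst_case:.2f}")  # ~$0.12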
The input to the model is under your control (unless using Assistants), so you don’t have to send anything you don’t want to pay for: for example, limit the memory of past chat turns, or make a user interface where you can delete irrelevant past messages.
Just being prepaid doesn’t mean you have to budget by the penny. The initial purchase is $5, and that’s plenty to find out typical usage from your expected work.
You can search for “OpenAI tokenizer” and see how many tokens your text would be, usually about 3-4 letters per token. This entire response is 202 tokens, and you’d probably add some instructions telling the AI what to do with it.
This is what I thought you’d say. Will do this tomorrow… unless you have code for this. I assume someone has already done this before, so code sharing would be nice.
I can write you code. It has gpt-4-turbo prices hard-coded.
Scroll down to the bottom, edit in your own system and user messages in the script, run it, and see what it would cost to send (the code can’t send anything).
This uses tiktoken, which you can install in your environment with
pip install tiktoken
import re
import tiktoken
class Tokenizer:
    """ required: import tiktoken; import re
    usage example:
        cl100 = Tokenizer()
        number_of_tokens = cl100.count("my string")
    """
    def __init__(self, model="cl100k_base"):
        self.tokenizer = tiktoken.get_encoding(model)
        self.chat_strip_match = re.compile(r'<\|.*?\|>')
        self.intype = None
        self.inprice = round(0.01 / 1000, 6)   ### hardcoded gpt-4-turbo input price, $ per token
        self.outprice = round(0.03 / 1000, 6)  ### hardcoded gpt-4-turbo output price, $ per token

    def ucount(self, text):
        """Count tokens without stripping special <|...|> sequences."""
        encoded_text = self.tokenizer.encode(text)
        return len(encoded_text)

    def count(self, text):
        """Count tokens after stripping special <|...|> sequences."""
        text = self.chat_strip_match.sub('', text)
        encoded_text = self.tokenizer.encode(text)
        return len(encoded_text)

    def outputprice(self, text):
        return self.ucount(text) * self.outprice

    def inputprice(self, text):
        return self.ucount(text) * self.inprice

    def message(self, message):
        """
        Extends the input message dictionary or list of dictionaries with a 'tokens' field,
        which contains the token count of the 'role' and 'content' fields
        (and optionally the 'name' field). The token count is calculated using the
        'count' method, which strips out any text enclosed within "<|" and "|>" before counting the tokens.

        Args:
            message (dict or list): A dictionary or a list of dictionaries in ChatML format.
                Each dictionary must have a 'role' field and a 'content' field, and may optionally
                have a 'name' field. The 'role', 'content', and 'name' fields are strings.

        Returns:
            The input message dictionary or list of dictionaries, extended with 'tokens' and
            'price' fields in each dictionary. The 'tokens' field contains the token count of
            the 'role' and 'content' fields (and optionally the 'name' field), calculated using
            the 'count' method. The total token count also includes a fixed overhead of
            3 control tokens per message.

        Raises:
            KeyError: If a dictionary does not have a 'role' or 'content' field.
        """
        if isinstance(message, str):
            # a bare string is wrapped as a user message so it can still be counted
            self.intype = str
            message = {"role": "user", "content": message}
        if isinstance(message, dict):
            self.intype = dict
            message = [message]
        elif isinstance(message, list):
            self.intype = list
        else:
            raise ValueError("no supported format in message")
        for msg in message:
            role_string = msg['role']
            if 'name' in msg:
                role_string += ':' + msg['name']
            role_tokens = self.count(role_string)
            content_tokens = self.count(msg['content'])
            msg['tokens'] = 3 + role_tokens + content_tokens
            msg['price'] = round(msg['tokens'] * self.inprice, 6)
        return message if len(message) > 1 else message[0]
####### Actually using those functions starts here
token = Tokenizer()
system = {"role":"system", "content": """
You are an AI. You need some more instructions on how to behave, though.
""".strip()}
user = {"role":"user", "content": """
I am a human, and I want to know how much it costs to send you a question.
I just type whatever else I want to ask.
""".strip()}
# concatenate message lists together like you do to send to API
messages = [system, user]
print("-- Here's messages after I add token count and token price metadata to them")
print(token.message(messages))
print("-- Here's the sum of all message prices")
total_price = sum(message['price'] for message in messages)
print(f"Input messages price: {total_price}")
# Example of measuring tokens in a file with a function you'd call
def measure_tokens_in_a_file(filename):
    with open(filename, 'r') as file:
        content = file.read()
    print(f"This message has {token.count(content)} tokens.")
    print(f"If it was sent as input, it would cost ${token.inputprice(content):.5f}")
    print(f"If it was received as output, it would cost ${token.outputprice(content):.5f}")

# other uses of my functions
#print(token.count(user['content']))  # the count() method just measures tokens
#print(token.message(system))         # the message() method gets input dict counts
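# hypothetical example: point this at any local text file to estimate what it would cost
#measure_tokens_in_a_file("my_prompt.txt")  # filename is just an illustration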
Thanks! By the way, I’m curious: why did OpenAI change to this model of charging?
# -- Estimate cost $$ for OpenAI API inference
def get_token_char_page_approx_equivalence():
    """
    Rough equivalences for back-of-the-envelope estimates:
    1 tok ~ 4-5 chars, e.g., hello 5, dog 3, help 4, happy 5, the 3, at 2, she 3
    2-3 tok ~ 1 word
    4k toks ~ 2k words ~ 2000 / 500 = 4 pages
    Google doc, 11pt font:
    1 line ~ 13-14 words
    1 page ~ 35-37 lines
    1 page ~ 37 lines/page * 13 words/line ~ 481 words/page
    (1 char ~ 1 byte)
    """
    ...
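# A minimal helper sketch based on the rough equivalences above
# (assumptions: ~2 tokens per word, ~500 words per page; purely illustrative)
def approx_pages_from_tokens(num_tokens: int) -> float:
    tokens_per_word: float = 2.0    # assumed from "2-3 tok ~ 1 word"
    words_per_page: float = 500.0   # assumed from "~481 words / page", rounded
    return num_tokens / tokens_per_word / words_per_page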
def get_cost_inference_per_token(model: str = 'gpt-4-turbo', verbose: bool = True) -> dict:
    # gpt-4-turbo-2024-04-09: in $10.00 / 1M tokens, out $30.00 / 1M tokens
    if 'gpt-4-turbo' in model:
        # convert to cost per token ($ / tok)
        inprice: float = 10 / 1_000_000
        outprice: float = 30 / 1_000_000
        prices: dict = {'in_cost_per_tok': inprice, 'out_cost_per_tok': outprice}
        print(f'{prices=}') if verbose else None
        return prices
    else:
        raise ValueError(f'Unknown model: {model=}')
def estimate_openai_api_inference_cost(
    prompts: list[str],   # e.g., math prompts
    outputs: list[str],   # possibly guessed, just to bound the cost
    model: str = 'gpt-4-turbo',  # ref costs: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken#encodings
    verbose: bool = True,
) -> dict:
    """Estimate the cost of inference for the given prompts using the OpenAI API. ref: https://community.openai.com/t/how-do-people-estimate-gpt4-given-that-they-changed-to-pre-paid-plan-you-dont-know-how-long-the-response-will-be/741443/3"""
    import tiktoken
    assert model in {'gpt-4-turbo', 'gpt-3.5-turbo'}, f'Unknown model: {model=}'
    assert len(prompts) == len(outputs), f'Length of prompts and outputs should be equal but got: {len(prompts)=}, {len(outputs)=}'
    # - get encoding name
    # gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large -> cl100k_base
    if model in {'gpt-4-turbo', 'gpt-3.5-turbo', 'text-embedding-ada-002', 'text-embedding-3-small', 'text-embedding-3-large'}:
        encoding_name: str = 'cl100k_base'
    else:
        raise ValueError(f'Unknown model: {model=}')
    tokenizer = tiktoken.get_encoding(encoding_name)
    cost_per_tok: dict = get_cost_inference_per_token(model)
    in_cost_per_tok, out_cost_per_tok = cost_per_tok['in_cost_per_tok'], cost_per_tok['out_cost_per_tok']
    # compute cost by going through all sequences: tokenize, multiply by cost per token, sum, then return
    print(f'number of requests/seqs to {model=}: {len(prompts)=}')
    print(f'number of outputs of {model=}: {len(outputs)=}')
    # for output tokens, use the (guessed) output strings
    tot_in_cost, tot_out_cost = 0.0, 0.0
    for prompt, output in zip(prompts, outputs):
        # tokenize with tiktoken
        toks_in: list[int] = tokenizer.encode(prompt)
        # print(f'{toks_in=} {len(toks_in)=} {type(toks_in)=}')
        num_toks_per_in_seq: int = len(toks_in)
        toks_out: list[int] = tokenizer.encode(output)
        # print(f'{toks_out=} {len(toks_out)=} {type(toks_out)=}')
        num_toks_per_out_seq: int = len(toks_out)
        # cost per sequence
        in_cost_per_seq: float = num_toks_per_in_seq * in_cost_per_tok
        out_cost_per_seq: float = num_toks_per_out_seq * out_cost_per_tok
        # accumulate total cost
        tot_in_cost += in_cost_per_seq
        tot_out_cost += out_cost_per_seq
    result = {'tot_in_cost': tot_in_cost, 'tot_out_cost': tot_out_cost}
    if verbose:
        print(f'{result=}')
    return result
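# example usage (illustrative values only; the outputs are guesses used to bound the cost):
# prompts = ['Prove that the square root of 2 is irrational.', 'Compute 12 * 17.']
# guessed_outputs = ['A proof by contradiction of roughly two hundred words.', '204']
# estimate_openai_api_inference_cost(prompts, guessed_outputs, model='gpt-4-turbo')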
def estimate_tenacity_vals(model) -> dict:
    """
    Estimate values for the tenacity retry decorator for a given model.
    500 rpm = 500 requests per minute = 500 reqs / 60 sec = 8.33 requests per second
    Backoff: 1s (init) -> 2s (1 retry) -> 4s (2 retries) -> 8s (3 retries) -> 16s (4 retries) -> 32s (5 retries)
    @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=2, max=16))
    max = maximum wait time in seconds.
    multiplier = number to multiply the wait time by after we've been rate limited.
    ref: https://platform.openai.com/settings/organization/limits
    ref: https://chatgpt.com/g/g-KV0CvoH8Y-python-excellent-comments-doc-strings-types/c/9c137c59-1784-4023-9e38-b1e322ede951
    """
    if model == 'gpt-4-turbo':
        rpm: int = 500
        rps: float = rpm / 60  # e.g. 8.33
    else:
        raise ValueError(f'Invalid model: {model=}')
    # at ~8.33 reqs/sec we can keep up with the limit; beyond that we need to wait, but we don't know the cool-off period
    raise NotImplementedError
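For the rate-limit side, here is a minimal sketch of how the decorator described in that docstring could be applied with tenacity; the call_openai wrapper and its client argument are hypothetical stand-ins for your own API call:

from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=2, max=16))
def call_openai(client, **kwargs):
    # hypothetical wrapper: retry the underlying request with exponential backoff
    # (capped at 16 s) when it raises, e.g. on a rate-limit error
    return client.chat.completions.create(**kwargs)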