Hi, I have mixed the toktrack from @davell with the cost details idea from Sebastian Shepard ( Handling OpenAI API Cost Retrieval Changes - at jeypop) to get this alternative:
import json
import requests
import datetime
from dateutil.relativedelta import relativedelta
import os
import pandas as pd
import time
openai_org_id = os.getenv('ORG_ID')
openai_api_key = os.getenv('OPENAI_API_KEY')
first_day_of_month = datetime.date(2023,7,1) #start date
current_day = datetime.date.today() #end date
headers = {
"method": "GET",
"authority": "api.openai.com",
"scheme": "https",
"path": f"/v1/organizations/{openai_org_id}/users",
"authorization": f"Bearer {openai_api_key}",
}
users_response = requests.get(f"https://api.openai.com/v1/organizations/{openai_org_id}/users", headers=headers)
users = users_response.json()["members"]["data"]
df_costs = pd.DataFrame()
for user in users:
id_of_user = user["user"]["id"]
current_date = first_day_of_month
while current_date <= current_day:
# we can get 5 requests per min, so I set 1 request per 15s just to make sure
start = time.time()
usage_headers = {
"method": "GET",
"authority": "api.openai.com",
"authorization": f"Bearer {openai_api_key}",
"openai-organization": openai_org_id,
}
usage_response = requests.get(f"https://api.openai.com/v1/usage?date={current_date}&user_public_id={id_of_user}", headers=usage_headers)
user_data = usage_response.json()
if len(user_data['data'])==0:
current_date += relativedelta(days=1)
end = time.time()
if end - start < 15:
time.sleep(15 - (end - start))
continue
else:
usage_data = user_data["data"]
whisper_data = user_data["whisper_api_data"]
data = usage_data + whisper_data
df = pd.DataFrame(data)
df['local_timestamp'] = df['aggregation_timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
# converting to openai tz
df['system_timestamp'] = df['local_timestamp'].dt.tz_localize('America/Sao_Paulo').dt.tz_convert("UTC")
df['user'] = user["user"]["name"].lower().replace(" ", "_")
df['email'] = user["user"]["email"]
df_costs = pd.concat([df_costs,df])
current_date += relativedelta(days=1)
end = time.time()
if end - start < 15 :
time.sleep(15 - (end - start))
# model costs
model_costs = {
"gpt-3.5-turbo": {"context": 0.0015, "generated": 0.002},
"gpt-3.5-turbo-0301": {"context": 0.0015, "generated": 0.002},
"gpt-3.5-turbo-0613": {"context": 0.0015, "generated": 0.002},
"gpt-3.5-turbo-16k": {"context": 0.003, "generated": 0.004},
"gpt-3.5-turbo-16k-0613": {"context": 0.003, "generated": 0.004},
"gpt-4": {"context": 0.03, "generated": 0.06},
"gpt-4-0314": {"context": 0.03, "generated": 0.06},
"gpt-4-0613": {"context": 0.03, "generated": 0.06},
"gpt-4-32k": {"context": 0.06, "generated": 0.12},
"gpt-4-32k-0314": {"context": 0.06, "generated": 0.12},
"gpt-4-32k-0613": {"context": 0.06, "generated": 0.12},
"text-embedding-ada-002-v2": {"context": 0.0001, "generated": 0},
"text-davinci:003": {"context": 0.03, "generated": 0.12},
"whisper-1": {"context": 0.006 / 60, "generated": 0},
}
mc = pd.DataFrame(model_costs)
mc = mc.T.reset_index()
# cost calculation
df_costs=df_costs.merge(mc, left_on='snapshot_id', right_on='index', how='left')
df_costs['context_costs']=(df_costs['n_context_tokens_total']/1000)*df_costs['context']
df_costs['generated_costs']=(df_costs['n_generated_tokens_total']/1000)*df_costs['generated']
df_costs['total_costs']=df_costs['context_costs']+df_costs['generated_costs']
# saving a json file
with open(f"openai_costs_{first_day_of_month}_to_{current_date}.json", "w") as f:
json.dump(df_costs.to_json(index=False, orient='table'), f)
# cost summary
def summary_costs(x):
names = {
'requests': x['n_requests'].sum()
, 'context_tokens': x['n_context_tokens_total'].sum()
, 'generated_tokens': x['n_generated_tokens_total'].sum()
, 'context_costs': x['context_costs'].sum()
, 'generated_costs': x['generated_costs'].sum()
, 'total_costs': x['total_costs'].sum()
}
return pd.Series(names, index=['requests'
, 'context_tokens'
, 'generated_tokens'
, 'context_costs'
, 'generated_costs'
, 'total_costs'
])
df_costs.groupby(['user', pd.Grouper(key='system_timestamp', freq='D'),'snapshot_id', 'operation']).apply(summary_costs)
# costs by day
df_costs.groupby([pd.Grouper(key='system_timestamp', freq='D')]).apply(summary_costs)
#costs by user
df_costs.groupby(['user']).apply(summary_costs).sort_values('total_costs', ascending=False)