After some testing, it looks like there is some kind of bug on OpenAI’s end.
This is the simplest tool call I could think of to generate using a Run:
def run_with_empty_thread(config: TestConfig) -> TestResult:
    client = get_client()
    start_time = timeit.default_timer()
    # with time_block("run_with_empty_thread", print_output=False) as exec_time:
    with time_block('client.beta.threads.create()'):
        thread = client.beta.threads.create()
    with time_block('client.beta.threads.runs.create_and_poll()'):
        run = client.beta.threads.runs.create_and_poll(
            assistant_id=config.assistant_id,
            thread_id=thread.id,
            model=config.model,
            additional_instructions=config.prompt,
            additional_messages=[{'role': 'user', 'content': config.query}],
            tools=config.tools
        )
    if run.status == 'completed':  # will only complete immediately if there are no tools
        with time_block('client.beta.threads.messages.list()'):
            messages = client.beta.threads.messages.list(thread_id=thread.id, limit=1)
        exec_time = timeit.default_timer() - start_time
        result = TestResult(
            completion_time=exec_time,
            tokens_total=run.usage.total_tokens,
            tokens_prompt=run.usage.prompt_tokens,
            tokens_completion=run.usage.completion_tokens,
            tools=[],
            response=messages.data[0].content[0].text.value
        )
        return result
    elif run.status == 'requires_action':  # when tools are required, we have to submit a second run
        tool_outputs = []  # fake the output, we only care about openai speed
        tools = run.required_action.submit_tool_outputs.tool_calls
        for tool_call in tools:
            tool_outputs.append({'tool_call_id': tool_call.id, 'output': 'success'})  # args are: tool_call.function.arguments
        with time_block('client.beta.threads.runs.create_and_poll() - with tool_outputs'):
            run = client.beta.threads.runs.submit_tool_outputs_and_poll(
                run_id=run.id,
                thread_id=run.thread_id,
                tool_outputs=tool_outputs
            )
        if run.status == 'completed':
            with time_block('client.beta.threads.messages.list()'):
                messages = client.beta.threads.messages.list(thread_id=thread.id, limit=1)
            exec_time = timeit.default_timer() - start_time
            result = TestResult(
                completion_time=exec_time,
                tokens_total=run.usage.total_tokens,
                tokens_prompt=run.usage.prompt_tokens,
                tokens_completion=run.usage.completion_tokens,
                tools=tools,
                response=messages.data[0].content[0].text.value
            )
            return result
        else:
            print(f"Unexpected run status: [{run.status}]")
    else:
        print(f"Unexpected run status: [{run.status}]")
On average, this is about a 5 second process:
Execution time of block[client.beta.threads.create()]: 0.200420 seconds
Execution time of block[client.beta.threads.runs.create_and_poll()]: 2.262922 seconds
Execution time of block[client.beta.threads.runs.create_and_poll() - with tool_outputs]: 1.765574 seconds
Execution time of block[client.beta.threads.messages.list()]: 1.017133 seconds
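(time_block is my own helper; a minimal sketch of it, assuming it only measures wall-clock time and prints in the format above, would be something like this. The commented-out variant in the function also captures the elapsed time via `as exec_time`, which this sketch doesn’t implement.)

import timeit
from contextlib import contextmanager

@contextmanager
def time_block(name: str, print_output: bool = True):
    # time the body of the `with` block and print the elapsed seconds
    start = timeit.default_timer()
    try:
        yield
    finally:
        elapsed = timeit.default_timer() - start
        if print_output:
            print(f"Execution time of block[{name}]: {elapsed:.6f} seconds")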
However, once in a while it takes forever. (This screenshot is from before I started benchmarking each individual client call.)
A 30 second wait is obviously a no-go for user experience.
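To quantify how often this happens, I’ve been repeating the same call and looking at the latency distribution; something like this sketch is enough to catch the outliers (not my exact harness):

import statistics

def benchmark(config: TestConfig, n: int = 20) -> None:
    # repeat the same run and summarize the latency distribution;
    # the occasional outlier is what blows up the average
    times = sorted(run_with_empty_thread(config).completion_time for _ in range(n))
    print(f"min={times[0]:.2f}s  median={statistics.median(times):.2f}s  "
          f"mean={statistics.fmean(times):.2f}s  max={times[-1]:.2f}s")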
Edit:
I also wanted to ask if it’s expected for a Run operation (not including the tool submission) to take 5x longer than a chat completion on average. In my testing, a chat completion with the same tool, prompt, and query takes 0.5-0.7 seconds. I can’t find anything in the documentation about an expected speed difference.
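For context, the chat-completion baseline I’m comparing against is roughly this (a sketch, assuming config.tools is already in the function-tool format both APIs accept):

def completion_baseline(config: TestConfig) -> str:
    # same system prompt, user query, and tool definitions as the Run test
    client = get_client()
    with time_block('client.chat.completions.create()'):
        completion = client.chat.completions.create(
            model=config.model,
            messages=[
                {'role': 'system', 'content': config.prompt},
                {'role': 'user', 'content': config.query},
            ],
            tools=config.tools,
        )
    choice = completion.choices[0]
    if choice.message.tool_calls:  # model chose to call the tool
        return choice.message.tool_calls[0].function.name
    return choice.message.content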