OpenAI Assistant File Processing Issue: Unpredictable Behavior
Hello everyone! I need the community’s support to solve a problem I am experiencing in this project where the OpenAI Assistant is being used. I am sharing a piece of code that I have extracted from my project to show a demo of the pipeline.
Problem Description:
- The Assistant receives an Excel file as input, processes it, and should return an output Excel file (or JSON, though the format is not the issue).
- Inconsistent Behavior: Sometimes, the output file has the correct number of rows (matching the input), while other times, it only generates 5 observations (rows).
- The issue is random and unpredictable.
My Hypothesis:
- I suspect the issue is related to the function-calling step. To use a pydantic model as structured output, I submit an empty string as the tool output whenever the run status is `requires_action`, since no function is actually being executed.
- The status alternates between `requires_action` and other states, but that alone does not seem to be the root of the problem.
I really hope you can help me find a way to solve this issue. The code works and does its intended job, but occasionally, the result is reduced to just 5 rows in the output file.
################### LIBRARIES ##########################
import os
import sys
import json
import openai
import time
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List, Optional
load_dotenv()
SCRIPT_PATH = os.getenv("SCRIPT_PATH")
sys.path.append(SCRIPT_PATH)
client = OpenAI()
MODEL = 'gpt-4o-2024-08-06'
###################### FUNCTIONS ########################
def wait_on_run(run, thread):
while run.status == "queued" or run.status == "in_progress":
run = client.beta.threads.runs.retrieve(
thread_id=thread.id,
run_id=run.id,
)
time.sleep(0.5)
return run
def tool_call(thread, run):
tool_outputs = []
required_actions = run.required_action.submit_tool_outputs.tool_calls
for action in required_actions:
final_str = ""
tool_outputs.append(
{"tool_call_id": action.id,
"output": final_str
}
)
print("Submitting outputs back to the Assistant...")
run = client.beta.threads.runs.submit_tool_outputs(
thread_id=thread.id,
run_id=run.id,
tool_outputs=tool_outputs
)
return run
def get_model_output_file_id(thread):
messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
last_message = messages.data[-1]
role = last_message.role
response = last_message.content[0].text.value
file_id = last_message.attachments[0].file_id
print(f"{role.capitalize()}: ==> {response}")
print(f"FILE ID: ==> {file_id}")
return file_id
def check_run_status(run, thread):
while True:
run = wait_on_run(run, thread)
if run.status == "requires_action":
run = tool_call(thread, run)
elif run.status == "completed":
return run
else:
print(f"Si è riscontrato un problema: \n \t Run Status = {run.status}")
return
################# PYDANTIC MODELS #######################
class DataModel(BaseModel):
ANNO: Optional[int] = Field(description="Anno di riferimento del documento. Deve avere 4 cifre.")
MESE: Optional[str] = Field(description="Mese di riferimento del documento. Deve avere 2 cifre numeriche.")
VALORE_TOTALE: Optional[float] = Field(description="Valore totale delle vendite, con 2 cifre decimali.")
class ListaDataModel(BaseModel):
"""Define a model that validates a list of DataModel osjects"""
lista: List[DataModel] = Field(..., description="Una lista contenente tutte le osservazioni della tabella excel")
########################## ASSISTANT PIPELINE ###############################
# Tools for the Assistant's Function Calling
code_interpreter_tool = {"type": "code_interpreter"}
pydantic_tool = openai.pydantic_function_tool(ListaDataModel)
pydantic_tool['function']['strict'] = True
tools_generic = [pydantic_tool, code_interpreter_tool]
system_prompt = ""
# Assistant object
assistant = client.beta.assistants.create(
name="Data Extraction Assistant",
instructions=system_prompt,
model=MODEL,
tools=tools_generic,
temperature=0.2
)
# Thread object
thread = client.beta.threads.create()
# Message Object
user_prompt = ""
excel_path = ""
file = client.files.create(file=open(excel_path, "rb"), purpose='assistants')
message_input = client.beta.threads.messages.create(
thread_id=thread.id,
role="user",
attachments=[
{
"file_id":file.id,
"tools":[code_interpreter_tool]
}
],
content=user_prompt)
# Create a Run
run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant.id)
# Wait for the run status to be "completed"
run = check_run_status(run, thread)
# Get the output file id
assistant_output = get_model_output_file_id(thread)
# Save Download the excel file
generated_data = client.files.content(assistant_output)
generated_data_bytes = generated_data.read()
with open("filename.xlsx", "wb") as file:
file.write(generated_data_bytes)
######################## FINISH ###################################