OpenAI Assistant File Processing Issue: Unpredictable Behaviour

OpenAI Assistant File Processing Issue: Unpredictable Behaviour

Hello everyone! I need the community’s support to solve a problem I am experiencing in this project where the OpenAI Assistant is being used. I am sharing a piece of code that I have extracted from my project to show a demo of the pipeline.

Problem Description:

  • The Assistant receives an Excel file as input, processes it, and should return an output Excel file (or JSON, though the format is not the issue).
  • Inconsistent Behavior: Sometimes, the output file has the correct number of rows (matching the input), while other times, it only generates 5 observations (rows).
  • The issue is random and unpredictable.

My Hypothesis:

  • I suspect the issue is related to the function calling step. To use a pydantic model as output, I provide an empty string when the run status is set to requires_action, as no function is actually being called.
  • The status alternates between requires_action and not, but that doesn’t seem to be the root of the problem.

I really hope you can help me find a way to solve this issue. The code works and does its intended job, but occasionally, the result is reduced to just 5 rows in the output file.

################### LIBRARIES ##########################

import os
import sys
import json
import openai
import time

from openai import OpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List, Optional

load_dotenv()

SCRIPT_PATH = os.getenv("SCRIPT_PATH")
sys.path.append(SCRIPT_PATH)

client = OpenAI()
MODEL = 'gpt-4o-2024-08-06'

###################### FUNCTIONS ########################

def wait_on_run(run, thread):
    while run.status == "queued" or run.status == "in_progress":
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
        time.sleep(0.5)
    return run

def tool_call(thread, run):
    tool_outputs = []
    required_actions = run.required_action.submit_tool_outputs.tool_calls
    for action in required_actions:
        final_str = ""
        tool_outputs.append(
            {"tool_call_id": action.id, 
             "output": final_str
             }
        )
    print("Submitting outputs back to the Assistant...")
    run = client.beta.threads.runs.submit_tool_outputs(
        thread_id=thread.id, 
        run_id=run.id, 
        tool_outputs=tool_outputs
    )
    return run

        
def get_model_output_file_id(thread):
    messages = client.beta.threads.messages.list(thread_id=thread.id, order="asc")
    last_message = messages.data[-1]

    role = last_message.role
    response = last_message.content[0].text.value
    file_id = last_message.attachments[0].file_id

    print(f"{role.capitalize()}: ==> {response}")
    print(f"FILE ID: ==> {file_id}")
    return file_id

def check_run_status(run, thread):
    while True:
        run = wait_on_run(run, thread)
        if run.status == "requires_action":
            run = tool_call(thread, run)
        elif run.status == "completed":
            return run
        else:
            print(f"Si è riscontrato un problema: \n \t Run Status = {run.status}")
            return
        
################# PYDANTIC MODELS #######################

class DataModel(BaseModel):
    ANNO: Optional[int] = Field(description="Anno di riferimento del documento. Deve avere 4 cifre.")
    MESE: Optional[str] = Field(description="Mese di riferimento del documento. Deve avere 2 cifre numeriche.")
    VALORE_TOTALE: Optional[float] = Field(description="Valore totale delle vendite, con 2 cifre decimali.")

class ListaDataModel(BaseModel):
    """Define a model that validates a list of DataModel osjects"""
    lista: List[DataModel] = Field(..., description="Una lista contenente tutte le osservazioni della tabella excel")

########################## ASSISTANT PIPELINE ###############################

# Tools for the Assistant's Function Calling
code_interpreter_tool = {"type": "code_interpreter"}
pydantic_tool = openai.pydantic_function_tool(ListaDataModel)
pydantic_tool['function']['strict'] = True
tools_generic = [pydantic_tool, code_interpreter_tool]

system_prompt = ""

# Assistant object
assistant = client.beta.assistants.create(
  name="Data Extraction Assistant",
  instructions=system_prompt,
  model=MODEL,
  tools=tools_generic,
  temperature=0.2
)

# Thread object
thread = client.beta.threads.create()

# Message Object
user_prompt = ""
excel_path = ""
file = client.files.create(file=open(excel_path, "rb"), purpose='assistants')
message_input = client.beta.threads.messages.create(
    thread_id=thread.id, 
    role="user",
    attachments=[
        {
            "file_id":file.id,
            "tools":[code_interpreter_tool]
        }
    ],
    content=user_prompt)

# Create a Run
run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant.id)

# Wait for the run status to be "completed"
run = check_run_status(run, thread)

# Get the output file id
assistant_output = get_model_output_file_id(thread)

# Save Download the excel file
generated_data = client.files.content(assistant_output)
generated_data_bytes = generated_data.read()

with open("filename.xlsx", "wb") as file:
    file.write(generated_data_bytes)

######################## FINISH ###################################
1 Like

I have encountered a similar issue. Got tricked by the request for an action. Couldn’t solve the problem :frowning:

1 Like