Hello @_j, and thanks for your support.
This is my code at the moment:
from openai import OpenAI
from dotenv import load_dotenv
import os
# Load environment variables from the .env file located next to this script.
script_dir = os.path.dirname(__file__)
dotenv_path = os.path.join(script_dir, '.env')
load_dotenv(dotenv_path)

# Build the OpenAI client from the key stored under OPENAI_KEY.
api_key = os.environ.get("OPENAI_KEY")
client = OpenAI(api_key=api_key)
# Upload the PDF report with the "assistants" purpose so it can be attached to
# the assistant and to thread messages. The context manager guarantees the
# local file handle is closed once the upload call returns (or raises).
with open("pdf/CARMICHAEL_TCA_BANK 1_02-25-2015_0001.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose='assistants',
    )
# Make the uploaded file available to the assistant's code interpreter tool.
code_interpreter_files = {"code_interpreter": {"file_ids": [file.id]}}

# Create the assistant that will answer questions about the report.
assistant = client.beta.assistants.create(
    model="gpt-4o",  # other models tried: gpt-4o-2024-11-20, gpt-4o-mini
    instructions="You are a maintenance engineer. You answer question about dissolved gas analysis (DGA) in maintenance reports.",
    tools=[{"type": "code_interpreter"}],
    tool_resources=code_interpreter_files,
)
# User prompt describing the report layout and which measurements are wanted.
extraction_request = (
    "I need to extract content from this maintenance report in PDF. "
    "The file contains tabular data, with some columns indicating the different sample dates. "
    "The goal is to extract the different measurements reported in the file for each sample date. "
    "The measurements include C2H2, CO2, CO, C2H6, C2H4, H2, CH4, N2, and O2. "
    "In the pdf file we have rows representing measurements and columns representing dates. "
    "The table might also contain some context information like Laboratory Number, Container Number or Temperature, "
    "which are not relevant for the desired output."
)

# Attach the uploaded PDF to the message for use by the code interpreter tool.
pdf_attachment = {
    "file_id": file.id,
    "tools": [{"type": "code_interpreter"}],
}

# Start a thread seeded with a single user message.
thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": extraction_request,
            "attachments": [pdf_attachment],
        }
    ]
)
# Per-run instructions describing the CSV layout to produce.
# NOTE(review): per the Assistants API reference, run-level `instructions`
# appear to replace the assistant's own instructions for this run; if both
# should apply, `additional_instructions` may be the intended parameter —
# confirm against the API docs.
csv_instructions = (
    "Extract chemicals analysis from this file in a csv. "
    "The csv should have sample date as rows and "
    "C2H2, CO2, CO, C2H6, C2H4, H2, CH4, N2, O2 as columns."
)

# Start the run on the thread and block until it reaches a terminal status.
run = client.beta.threads.runs.create_and_poll(
    assistant_id=assistant.id,
    thread_id=thread.id,
    instructions=csv_instructions,
)
# Dump the thread's messages once the run has completed; otherwise report the
# status the run ended in. The message loop lives inside the "completed"
# branch because `messages` is only defined there — iterating it
# unconditionally raises NameError whenever the run fails or expires.
if run.status == 'completed':
    messages = client.beta.threads.messages.list(
        thread_id=thread.id
    )
    print(messages)
    for message in messages.data:
        # A message may have no content parts, or start with a non-text part
        # (e.g. an image), so pick the first text part instead of blindly
        # indexing content[0].text.
        text_values = [
            part.text.value
            for part in message.content
            if getattr(part, "type", None) == "text"
        ]
        first_text = text_values[0] if text_values else ""
        print(f"Message ID: {message.id}")
        print(f"Assistant ID: {message.assistant_id}")
        print(f"Role: {message.role}")
        print(f"Content: {first_text}")
        print(f"Created At: {message.created_at}")
        print(f"Attachments: {message.attachments}")  # manually retrieve output_file_id here
        print("-" * 40)
else:
    print(run.status)
def write_file_to_temp_dir(some_file_id, output_path):
    """Download an OpenAI file and save its bytes to ``output_path``.

    Args:
        some_file_id: ID of the file to fetch through ``client.files.content``.
        output_path: Destination path on disk. Missing parent directories are
            created; a bare file name (no directory part) is written to the
            current working directory.
    """
    file_data = client.files.content(some_file_id)
    file_data_bytes = file_data.read()

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "wb") as out_file:
        out_file.write(file_data_bytes)
# Destination for the downloaded CSV, and the ID of the generated file
# (copied by hand from the attachments printed for the run's messages).
output_path = 'test/test2.csv'
output_file_id = "file-Fsxke9cCMVrasMnVLqkNxE"
write_file_to_temp_dir(some_file_id=output_file_id, output_path=output_path)
It returns variable results, none of which is reliable or complete in terms of accuracy, whereas on chatgpt.com I get consistent and correct results for the different types of files I tested.
When creating the assistant, I tried replacing gpt-4o with gpt-4o-2024-11-20, but it throws an error:
BadRequestError: Error code: 400 - {'error': {'message': "The requested model 'gpt-4o-2024-11-20' does not exist.", […]
even though I can see it among the available models at https://platform.openai.com/settings/organization/limits.
Thanks in advance for any additional hint,
Elisa