How good is my prompt for pdf analysis using openai API in python?

Hi, I’m creating a simple bot that takes some papers as input and then asks gpt-3.5-turbo to find specific attributes inside those papers.

my questions are:

  1. If you have any suggestions on how to improve the prompting, they would be appreciated. In some cases, in the first part, the answer uses values that don’t fit the request, and in the second part it sometimes happens that the answer is not in a csv-like format.

  2. Is it possible to receive a message (first_answer in the code below) and add a new question without having to create a new thread that passes along the previous messages plus the new one? If so, how would it be counted in terms of token usage?

  3. If you have suggestions in general, feel free to share them.

Here is the code for now:

import os
import pickle

from openai import OpenAI


# Working directories: the PDFs to analyse are read from ./input_pdfs and the
# results are written to ./answers, both relative to the current directory.
current_dir = os.getcwd()
folder_path = os.path.join(current_dir, "input_pdfs")
answers_path = os.path.join(current_dir, "answers")

# The script writes its results into answers_path later on, so make sure the
# directory exists instead of failing on the first write.
os.makedirs(answers_path, exist_ok=True)

# already_processed.pkl persists the paths of the PDFs handled by earlier runs.
# NOTE: pickle.load executes arbitrary code from the file; this is acceptable
# only because the file is produced by this script itself. Do not point it at
# a file coming from an untrusted source.
ap_path = os.path.join(answers_path, "already_processed.pkl")
if os.path.isfile(ap_path):
    with open(ap_path, "rb") as ap_f:
        already_processed = pickle.load(ap_f)
else:
    already_processed = []

# isdir (rather than exists) so that a regular file named "input_pdfs" is
# reported with the same clear error instead of failing inside os.listdir.
if not os.path.isdir(folder_path):
    raise RuntimeError("The input_pdfs directory does not exist.")

# Collect the PDFs that have not been processed yet. The listing is sorted so
# that the processing order is the same on every run and platform, and the
# extension check is case-insensitive so "paper.PDF" is not skipped.
files = sorted(os.listdir(folder_path))
file_paths = []
for f in files:
    if not f.lower().endswith(".pdf"):
        continue
    file_path = os.path.join(folder_path, f)
    if file_path not in already_processed:
        file_paths.append(file_path)

if not file_paths:
    raise RuntimeError("No more PDF files found in the input_pdfs directory.")

# First prompt: sent together with the attached PDF. It asks the model to list
# the protocols described in the paper and to extract four attributes for each.
# Every numbered item ends with an explicit "\n" so the list items and the
# closing instruction stay on separate lines in the text the model receives.
prompt_1 = """
Given only the attached file, I want you to analyze it. First detect which protocols are described, \
both the proposed optimized protocols and the baseline of comparison. Then I want you to analyze the \
information about these protocols and to find, for each protocol, the following information:\n\
1) The percentage of achieved product in terms of dry extraction yield. If you don't find the percentage, try to compute it from related information.\n\
2) The solid to liquid ratio in terms of solvents (expressed as kg/L and represented as x:y). Consider only the solvent or the mix of solvents adopted for this protocol.\n\
3) The solid to liquid ratio in terms of additives (expressed as kg/L and represented as x:y). Consider only the additive or the mix of additives adopted for this protocol.\n\
4) The percentage of water recycle, meaning the percentage of water that can be (or is) reused. Just the water, nothing else.\n\
Focus on finding exactly what is asked, do not include useless information in the answer, and use the correct units of measure, performing conversions when needed.\
"""

# Second prompt, part 1: repeats the first question; the first answer is
# concatenated right after it at call time.
prompt_2_1 = f"""
I asked you this:\n\n{prompt_1}\n\n \
and your answer was this:\n\n
"""
# Second prompt, part 2: appended after the first answer; asks for the same
# content reformatted as CSV with a fixed header line.
prompt_2_2 = """
\n\nCan you summarize your answer in a standardized comma separated value (csv) format? \
I want you to answer with this exact header line: protocol_name(String), achieved_product_amount(%), solvents_solid_to_liquid_ratio(Kg:L), additives_solid_to_liquid_ratio(Kg:L), water_recycle_percentage(%). \
After the header line, I want you to write one line per analyzed protocol containing the comma separated values corresponding to the requested attributes. \
In the lines after the header, be sure to respect the requested units of measure and to omit every surplus information (like units of measure and uncertainty). \
If you cannot find one of the attributes, write None instead.\
"""

# Assistant configuration: name, system instructions and model.
assistant_name = "GreenCheck"
assistant_instructions = "You are an expert in Ecology, Science and waste management. You excel in retrieving data from pdf files using the contextual semantic meaning to find what you're asked."
assistant_model = "gpt-3.5-turbo-0125"

# Read the API key from the OPENAI_API_KEY environment variable instead of
# writing it in the source file: a key pasted here ends up in version control
# and in every copy of the script that gets shared.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Look for an assistant with the configured name so that repeated runs reuse
# it (and its vector store) instead of creating a new one every time.
assistant_exist = False
vector_store_exists = False
for existing in client.beta.assistants.list():
    if existing.name != assistant_name:
        continue
    assistant_exist = True
    assistant_id = existing.id
    print("assistant already exists")
    # tool_resources, its file_search entry and the id list can each be
    # missing on an assistant that never had a vector store attached, so walk
    # the chain defensively instead of dereferencing it directly.
    file_search = getattr(existing.tool_resources, "file_search", None)
    store_ids = getattr(file_search, "vector_store_ids", None) or []
    if store_ids:
        vector_store_exists = True
        vector_store_id = store_ids[0]
        print("also the vector store already exists")
    break

if not assistant_exist:
    assistant_id = client.beta.assistants.create(
        name=assistant_name,
        instructions=assistant_instructions,
        model=assistant_model,
        tools=[{"type": "file_search"}],
    ).id
    print("assistant created")

if not vector_store_exists:
    # Create a vector store and attach it to the assistant's file_search tool.
    vector_store_id = client.beta.vector_stores.create(name="GreenPDF").id

    assistant = client.beta.assistants.update(
        assistant_id=assistant_id,
        tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}},
    )

    print("vector store created and assigned to the assistant")

def _ask_assistant(message):
    """Send one user message in a fresh thread and return the reply text.

    A new thread is created with ``message`` as its only message, a run is
    started with the configured assistant and polled until it finishes, and
    the text of the first message produced by that run is returned. The
    thread is deleted afterwards on a best-effort basis.

    Raises:
        RuntimeError: if the run does not complete or produces no message.
    """
    thread_id = client.beta.threads.create(messages=[message]).id
    try:
        run = client.beta.threads.runs.create_and_poll(
            assistant_id=assistant_id,
            thread_id=thread_id,
        )
        # A failed/expired/cancelled run has no answer to read; report that
        # explicitly rather than failing on an empty message list below.
        if run.status != "completed":
            raise RuntimeError(f"Run {run.id} ended with status {run.status!r}.")

        replies = list(client.beta.threads.messages.list(thread_id=thread_id, run_id=run.id))
        if not replies or not replies[0].content:
            raise RuntimeError(f"Run {run.id} completed without producing an answer.")
        return replies[0].content[0].text.value
    finally:
        # Cleanup must never hide the answer (or the real error), so a failed
        # deletion is only reported.
        try:
            client.beta.threads.delete(thread_id=thread_id)
        except Exception as exc:
            print(f"warning: could not delete thread {thread_id}: {exc}")


def _next_answer_id(directory):
    """Return one more than the highest numeric "<id>_..." prefix in directory.

    The whole prefix before the first underscore is parsed, so ids keep
    growing past 9. Pickle files and names without a numeric prefix are
    ignored. Returns 0 when no numbered file exists yet.
    """
    last_id = -1
    for f_name in os.listdir(directory):
        if ".pkl" in f_name:
            continue
        prefix = f_name.split("_", 1)[0]
        if prefix.isdecimal():
            last_id = max(last_id, int(prefix))
    return last_id + 1


# Only the first pending PDF is processed per run; iterate over file_paths
# itself to process all of them in one go.
for input_file in file_paths[:1]:

    # Upload the PDF; the with-block closes the local file handle once the
    # upload call has returned.
    with open(input_file, "rb") as pdf_f:
        file_id = client.files.create(
            file=pdf_f,
            purpose="assistants",
        ).id

    print("file uploaded")

    # First question: extract the attributes from the attached PDF.
    first_answer = _ask_assistant({
        "role": "user",
        "content": prompt_1,
        "attachments": [
            {"file_id": file_id, "tools": [{"type": "file_search"}]}
        ],
    })
    print("first answer:")
    print(first_answer)

    # Second question: ask for the first answer reformatted as CSV. No file is
    # attached; the first answer is embedded in the prompt text.
    second_answer = _ask_assistant({
        "role": "user",
        "content": prompt_2_1 + first_answer + prompt_2_2,
    })
    print("second answer:")
    print(second_answer)

    # Store both answers under the next free numeric id and remember that this
    # PDF has been handled so later runs skip it.
    new_f_id = _next_answer_id(answers_path)
    with open(os.path.join(answers_path, str(new_f_id) + "_first.txt"), 'w', encoding='utf-8') as f_first:
        f_first.write(first_answer)
    with open(os.path.join(answers_path, str(new_f_id) + "_second.txt"), 'w', encoding='utf-8') as f_second:
        f_second.write(second_answer)
    already_processed.append(input_file)
    with open(ap_path, "wb") as ap_f:
        pickle.dump(already_processed, ap_f)

    # Best-effort remote cleanup: failures are reported but do not abort the
    # run, because the answers are already saved locally at this point.
    try:
        for f in client.beta.vector_stores.files.list(vector_store_id=vector_store_id):
            client.beta.vector_stores.files.delete(vector_store_id=vector_store_id, file_id=f.id)
    except Exception as exc:
        print(f"warning: vector store cleanup failed: {exc}")

    # NOTE(review): this removes EVERY file stored in the account, not only
    # the PDF uploaded above - confirm that this is intended before running
    # the script with an API key that is shared with other projects.
    try:
        for f in client.files.list():
            client.files.delete(file_id=f.id)
    except Exception as exc:
        print(f"warning: uploaded file cleanup failed: {exc}")

Thanks to whoever answers.

1 Like