Trying to get retrieval to loop

Hi! I am trying to build a workflow that iterates over contents from an Excel file, which uses assistant api retrieval. It is always processing the first row and I am not sure what is wrong. Thanks to anyone who helps.

import os
import pandas as pd
import openai
import time
from openai import OpenAI
from dotenv import load_dotenv

# Load the OpenAI API key
load_dotenv('your_env_file.env')
api_key = os.getenv('YOUR_ENV_VARIABLE_FOR_API_KEY')
openai.api_key = api_key  # Make sure to set the API key for the openai package

# Read contents from an Excel file
excel_path = 'path_to_your_excel_file.xlsx'
df = pd.read_excel(excel_path)
content_column = 'your_content_column_name'  # Replace with the name of your column

# Prepare a column in the DataFrame to store responses
df['Assistant_Responses'] = ''

# Upload a file with an "assistants" purpose
client = OpenAI()
file = client.files.create(
  file=open("path_to_your_file.json", "rb"),
  purpose='assistants'
)
print(file.id)

# Iterate over each content in the DataFrame
for index, row in df.iterrows():
    user_input = row[content_column]
    print(f"Processing row {index + 1}/{len(df)}...")

    # Create a thread
    thread = client.beta.threads.create()
    message = client.beta.threads.messages.create(
      thread_id=thread.id,
      role="user",
      content=user_input,
      file_ids=[file.id]
    )

    run = openai.beta.threads.runs.create(
      thread_id=thread.id,
      assistant_id="your_assistant_id"
    )

    while run.status != 'requires_action':
        time.sleep(1)
        run = openai.beta.threads.runs.retrieve(
           thread_id=thread.id,
           run_id=run.id
        )

    try:
        messages = openai.beta.threads.messages.list(
            thread_id=thread.id
        )
        # Assuming the last message is the response
        if messages.data:
            response = messages.data[-1].content
            df.at[index, 'Assistant_Responses'] = response
    except Exception as e:
        print(f"Error processing row {index + 1}: {e}")

# Save the updated DataFrame back to the Excel file
df.to_excel(excel_path, index=False)

print("Processing complete.")

1 Like

Iā€™m looking for the answer as well! I need to loop through 500k records in batch.

Same here! @geteff did you come up with a fix?

import os
import openai
import time
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv('untitled.env')

api_key = os.getenv('OPENAI_API_KEY')
print(api_key)  

client = OpenAI()


#Loop through each line
import pandas as pd

# Function to process queries and get responses
def process_user_queries(client, excel_path, content_column, file_id, assistant_id, instructions):
    # Read contents from an Excel file
    df = pd.read_excel(excel_path)

    # Prepare a column in the DataFrame to store responses if not already present
    if 'Assistant_Responses' not in df.columns:
        df['Assistant_Responses'] = ''

    # Adjust the start row if it's beyond the range of the DataFrame
    start_row = 467

    # Iterate over each content in the DataFrame starting from the start_row
    for index, row in df.iloc[start_row:].iterrows():
        user_query = row[content_column]
        print(f"Processing row {index + 1}/{len(df)}: {user_query}")

        # Create a new thread for each query
        print(f"Creating thread for row {index + 1}...")
        thread = client.beta.threads.create()

        print(f"Sending message for row {index + 1}...")
        message = client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=user_query,
            file_ids=[file_id]
        )

        # Create a run with specific instructions
        print(f"Creating run for row {index + 1}...")
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=assistant_id,
            instructions=instructions
        )

        # Wait for the run to complete
        print(f"Waiting for run to complete for row {index + 1}...")
        while run.status != "completed":
            time.sleep(3)  # Sleep to avoid hitting API limits
            run = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id
            )
        print(run.status)

         # Retrieve and store all messages from the thread
        messages = client.beta.threads.messages.list(
            thread_id=thread.id
        )
        full_conversation = []
        if messages.data:
            for msg in messages.data:
                # Extract the message text and add it to the list
                full_conversation.append(msg.content[0].text.value)

        # Join all messages into a single string
        df.at[index, 'Assistant_Responses'] = '\n'.join(full_conversation)


        # Save the updated DataFrame back to the Excel file after each row is completed
        df.to_excel(excel_path, index=False)
        print(f"Row {index + 1} processing completed and saved.")

    print("All rows processed and saved.")

# Designate file path and content to be processed
excel_path = ''
content_column = ''  # Replace with the name of your column


# Replace with your actual file and assistant IDs and instructions
file_id = ''
assistant_id = ''
instructions = (""" """)



# Process the queries and get responses
process_user_queries(client, excel_path, content_column, file_id, assistant_id, instructions)

I got one code working, as you can see above