Assistants API answers questions using deleted files as reference

Hello everyone,

I’m unsure if this is a unique issue I’m encountering, but I’ve been experiencing some difficulties while using the assistant API. Specifically, I’m encountering problems when looping through questions using various files.

Here’s the process: I pose a question using the first file, then repeat the same question after uploading the second file, and so forth. To conserve memory, I delete the file each time before uploading a new one. However, despite deleting the files after each question, it appears that the assistant still references previous files when answering some questions, even though they’ve been removed.

Thank you in advance for your assistance.

Hey there and welcome to the community!

Could you by chance show us some code to see how you’re looping through this? Also, is it referring to the files directly, or just to the contents within them?

@ergi.mera – if you’re creating these runs in the same thread, the contents of the older files that were used to answer questions earlier in the thread are part of the thread’s context. If you want 0 memory of older files, you’ll want to create “fresh” new threads.

4 Likes
def process_question(file_path, question, assistantid, client):
    """Ask `question` about a single file via the Assistants API and return the answer.

    The file is uploaded, attached to a brand-new thread, and a run is started
    with the given assistant. Once the run completes, the first message listed
    for the thread is fetched, its annotations are rewritten as numbered
    footnotes, and the uploaded file is deleted again.

    Args:
        file_path: Path of the local file to upload.
        question: Text of the user message.
        assistantid: ID of the assistant that executes the run.
        client: API client exposing `files` and `beta.threads`
            (presumably an `openai.OpenAI` instance -- confirm with caller).

    Returns:
        The answer text with each annotation replaced by a ` [n]` marker,
        followed by a blank line and one citation line per annotation.

    Raises:
        RuntimeError: If the run ends as failed, cancelled or expired.
    """
    # Upload the file so it can be attached to the thread message
    with open(file_path, "rb") as file_data:
        file_upload_response = client.files.create(
            file=file_data,
            purpose='assistants'
        )
    file_id = file_upload_response.id

    # Everything after the upload runs under try/finally so the uploaded file
    # is removed even when the run fails or an API call raises.
    try:
        # Create a fresh thread holding only this question and this file
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": question,
                    "file_ids": [file_id]
                }
            ]
        )
        thread_id = thread.id

        # Execute the run
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistantid,
            model="gpt-4-1106-preview",
            tools=[{"type": "code_interpreter"}, {"type": "retrieval"}]
        )
        run_id = run.id

        # Poll until the run completes. A run that ends in any other terminal
        # state will never become "completed", so stop and report it instead
        # of polling forever.
        unsuccessful_statuses = ("failed", "cancelled", "expired")
        while True:
            run = client.beta.threads.runs.retrieve(
                thread_id=thread_id,
                run_id=run_id
            )
            if run.status == "completed":
                break
            if run.status in unsuccessful_statuses:
                raise RuntimeError(
                    f"Run {run_id} ended with status '{run.status}': "
                    f"{getattr(run, 'last_error', None)}"
                )
            time.sleep(3)  # Wait for 3 seconds before checking again

        # Retrieve thread messages
        thread_messages = client.beta.threads.messages.list(thread_id)
        # NOTE(review): the attribute name for the first message ID still
        # needs to be verified against the client library.
        message_id = thread_messages.first_id

        # Retrieve the message object
        message = client.beta.threads.messages.retrieve(
            thread_id=thread_id,
            message_id=message_id
        )

        # Presuming that message.content is a list with text and annotations attributes
        message_content = message.content[0].text
        annotations = getattr(message_content, 'annotations', [])
        citations = []

        # Iterate over the annotations and add footnotes
        for index, annotation in enumerate(annotations):
            # Replace the annotated text with a footnote marker
            message_content.value = message_content.value.replace(annotation.text, f' [{index}]')

            # Gather citations based on annotation attributes
            file_citation = getattr(annotation, 'file_citation', None)
            if file_citation:
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f'[{index}] {file_citation.quote} from {cited_file.filename}')
            # Named differently from the `file_path` parameter to avoid rebinding it
            path_annotation = getattr(annotation, 'file_path', None)
            if path_annotation:
                cited_file = client.files.retrieve(path_annotation.file_id)
                citations.append(f'[{index}] Click <here> to download {cited_file.filename}')

        # Add footnotes to the end of the message before displaying to user
        message_content.value += '\n\n' + '\n'.join(citations)
        return message_content.value
    finally:
        # Delete the upload by the ID returned from files.create; the previous
        # `file.id` referenced a name that is not defined in this function.
        client.files.delete(file_id)

def process_all_pdfs_to_dataframe(directory_path, question, assistantid, client):
    """Run `question` against every PDF in `directory_path` and tabulate the answers.

    Each entry whose name ends in ".pdf" is passed to `process_question`. A
    failure on one file is printed and recorded as an "Error: ..." response so
    the remaining files are still processed.

    Args:
        directory_path: Directory that is scanned (non-recursively) for PDFs.
        question: Question asked about each file.
        assistantid: Assistant ID forwarded to `process_question`.
        client: API client forwarded to `process_question`.

    Returns:
        A pandas DataFrame with the columns "File", "Question" and "Response",
        one row per PDF; it has no rows when the directory contains no PDFs.
    """
    columns = ["File", "Question", "Response"]
    results = []
    for filename in os.listdir(directory_path):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            response = process_question(file_path, question, assistantid, client)
        except Exception as e:
            # Deliberate best-effort boundary: one bad file must not abort the batch.
            print(f"An error occurred while processing {filename}: {e}")
            response = "Error: " + str(e)
        # Success and error rows share the same keys so the frame keeps a
        # single set of columns instead of two half-empty ones.
        results.append({
            "File": filename,
            "Question": question,
            "Response": response
        })
    return pd.DataFrame(results, columns=columns)


Hi Nikunj

this is my code:

def process_question(file_path, question, assistantid, client):
    """Ask `question` about a single file via the Assistants API and return the answer.

    The file is uploaded, attached to a brand-new thread, and a run is started
    with the given assistant. Once the run completes, the first message listed
    for the thread is fetched, its annotations are rewritten as numbered
    footnotes, and the uploaded file is deleted again.

    Args:
        file_path: Path of the local file to upload.
        question: Text of the user message.
        assistantid: ID of the assistant that executes the run.
        client: API client exposing `files` and `beta.threads`
            (presumably an `openai.OpenAI` instance -- confirm with caller).

    Returns:
        The answer text with each annotation replaced by a ` [n]` marker,
        followed by a blank line and one citation line per annotation.

    Raises:
        RuntimeError: If the run ends as failed, cancelled or expired.
    """
    # Upload the file so it can be attached to the thread message
    with open(file_path, "rb") as file_data:
        file_upload_response = client.files.create(
            file=file_data,
            purpose='assistants'
        )
    file_id = file_upload_response.id

    # Everything after the upload runs under try/finally so the uploaded file
    # is removed even when the run fails or an API call raises.
    try:
        # Create a fresh thread holding only this question and this file
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": question,
                    "file_ids": [file_id]
                }
            ]
        )
        thread_id = thread.id

        # Execute the run
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistantid,
            model="gpt-4-1106-preview",
            tools=[{"type": "code_interpreter"}, {"type": "retrieval"}]
        )
        run_id = run.id

        # Poll until the run completes. A run that ends in any other terminal
        # state will never become "completed", so stop and report it instead
        # of polling forever.
        unsuccessful_statuses = ("failed", "cancelled", "expired")
        while True:
            run = client.beta.threads.runs.retrieve(
                thread_id=thread_id,
                run_id=run_id
            )
            if run.status == "completed":
                break
            if run.status in unsuccessful_statuses:
                raise RuntimeError(
                    f"Run {run_id} ended with status '{run.status}': "
                    f"{getattr(run, 'last_error', None)}"
                )
            time.sleep(3)  # Wait for 3 seconds before checking again

        # Retrieve thread messages
        thread_messages = client.beta.threads.messages.list(thread_id)
        # NOTE(review): the attribute name for the first message ID still
        # needs to be verified against the client library.
        message_id = thread_messages.first_id

        # Retrieve the message object
        message = client.beta.threads.messages.retrieve(
            thread_id=thread_id,
            message_id=message_id
        )

        # Presuming that message.content is a list with text and annotations attributes
        message_content = message.content[0].text
        annotations = getattr(message_content, 'annotations', [])
        citations = []

        # Iterate over the annotations and add footnotes
        for index, annotation in enumerate(annotations):
            # Replace the annotated text with a footnote marker
            message_content.value = message_content.value.replace(annotation.text, f' [{index}]')

            # Gather citations based on annotation attributes
            file_citation = getattr(annotation, 'file_citation', None)
            if file_citation:
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f'[{index}] {file_citation.quote} from {cited_file.filename}')
            # Named differently from the `file_path` parameter to avoid rebinding it
            path_annotation = getattr(annotation, 'file_path', None)
            if path_annotation:
                cited_file = client.files.retrieve(path_annotation.file_id)
                citations.append(f'[{index}] Click <here> to download {cited_file.filename}')

        # Add footnotes to the end of the message before displaying to user
        message_content.value += '\n\n' + '\n'.join(citations)
        return message_content.value
    finally:
        # Delete the upload by the ID returned from files.create; the previous
        # `file.id` referenced a name that is not defined in this function.
        client.files.delete(file_id)

def process_all_pdfs_to_dataframe(directory_path, question, assistantid, client):
    """Run `question` against every PDF in `directory_path` and tabulate the answers.

    Each entry whose name ends in ".pdf" is passed to `process_question`. A
    failure on one file is printed and recorded as an "Error: ..." response so
    the remaining files are still processed.

    Args:
        directory_path: Directory that is scanned (non-recursively) for PDFs.
        question: Question asked about each file.
        assistantid: Assistant ID forwarded to `process_question`.
        client: API client forwarded to `process_question`.

    Returns:
        A pandas DataFrame with the columns "File", "Question" and "Response",
        one row per PDF; it has no rows when the directory contains no PDFs.
    """
    columns = ["File", "Question", "Response"]
    results = []
    for filename in os.listdir(directory_path):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            response = process_question(file_path, question, assistantid, client)
        except Exception as e:
            # Deliberate best-effort boundary: one bad file must not abort the batch.
            print(f"An error occurred while processing {filename}: {e}")
            response = "Error: " + str(e)
        # Success and error rows share the same keys so the frame keeps a
        # single set of columns instead of two half-empty ones.
        results.append({
            "File": filename,
            "Question": question,
            "Response": response
        })
    return pd.DataFrame(results, columns=columns)

Maybe I should also delete the thread after each question (I don’t know if that’s possible)?

Thank you in advance