Assistant API answer questions using as reference deleted files

ergi.mera · March 5, 2024, 7:03pm

Hello everyone,

I’m unsure if this is a unique issue I’m encountering, but I’ve been experiencing some difficulties while using the assistant API. Specifically, I’m encountering problems when looping through questions using various files.

Here’s the process: I pose a question using the first file, then repeat the same question after uploading the second file, and so forth. To conserve memory, I delete the file each time before uploading a new one. However, despite deleting the files after each question, it appears that the assistant still references previous files when answering some questions, even though they’ve been removed.

Thank you in advance for your assistance.

Macha · March 5, 2024, 8:04pm

Hey there and welcome to the community!

Could you by chance show us some code to see how you’re looping through this? Also, is it referring to the files directly, or just to the contents within them?

nikunj · March 5, 2024, 11:59pm

@ergi.mera – if you’re creating these runs in the same thread, the contents of the older files that were used to answer questions earlier in the thread are part of the thread’s context. If you want zero memory of older files, you’ll want to create fresh new threads.

ergi.mera · March 6, 2024, 7:55am

def process_question(file_path, question, assistantid, client):
    """Ask ``question`` about one uploaded file and return the answer text.

    The file at ``file_path`` is uploaded with purpose ``'assistants'`` and
    attached to the first message of a newly created thread. A run is started
    on that thread and polled every 3 seconds until it completes. The message
    identified by ``first_id`` of the thread's message list is then read, its
    annotations are rewritten as numbered footnotes, and the uploaded file is
    deleted again (also when an error occurs along the way).

    Args:
        file_path: Path of the local file to upload.
        question: Text of the user message.
        assistantid: ID of the assistant that executes the run.
        client: API client exposing ``files`` and ``beta.threads``
            (presumably an ``openai.OpenAI`` instance -- confirm with caller).

    Returns:
        The message text with every annotation marker replaced by `` [n]``,
        followed by a blank line and one citation line per annotation.

    Raises:
        RuntimeError: If the run reaches a status from which it will not
            complete on its own.
    """
    # Statuses after which polling can never observe "completed".
    # "requires_action" is included because nothing here submits tool outputs.
    dead_end_statuses = (
        "failed", "cancelled", "expired", "incomplete", "requires_action",
    )

    # Create the file object
    with open(file_path, "rb") as file_data:
        file_upload_response = client.files.create(
            file=file_data,
            purpose='assistants'
        )
        file_id = file_upload_response.id

    try:
        # Create a thread whose only message carries the question and the file
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": question,
                    "file_ids": [file_id]
                }
            ]
        )
        thread_id = thread.id

        # Execute the run
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistantid,
            model="gpt-4-1106-preview",
            tools=[{"type": "code_interpreter"}, {"type": "retrieval"}]
        )
        run_id = run.id

        # Wait for the run to complete, bailing out instead of spinning
        # forever when the run ends in any other way.
        while True:
            run = client.beta.threads.runs.retrieve(
                thread_id=thread_id,
                run_id=run_id
            )
            if run.status == "completed":
                break
            if run.status in dead_end_statuses:
                raise RuntimeError(
                    f"Run {run_id} ended with status {run.status!r}"
                )
            time.sleep(3)  # Wait for 3 seconds before checking again

        # Retrieve thread messages
        thread_messages = client.beta.threads.messages.list(thread_id)
        # NOTE(review): relies on the list response exposing ``first_id``.
        message_id = thread_messages.first_id

        # Retrieve the message object
        message = client.beta.threads.messages.retrieve(
            thread_id=thread_id,
            message_id=message_id
        )

        # Presuming that message.content is a list whose first item has a
        # ``text`` object with ``value`` and ``annotations`` attributes
        message_content = message.content[0].text
        annotations = getattr(message_content, 'annotations', [])
        citations = []

        # Iterate over the annotations and add footnotes
        for index, annotation in enumerate(annotations):
            # Replace the annotated text with a footnote marker
            message_content.value = message_content.value.replace(
                annotation.text, f' [{index}]'
            )

            # Gather citations based on annotation attributes
            file_citation = getattr(annotation, 'file_citation', None)
            if file_citation:
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(
                    f'[{index}] {file_citation.quote} from {cited_file.filename}'
                )
            # Named ``path_ref`` so the ``file_path`` parameter is not shadowed
            path_ref = getattr(annotation, 'file_path', None)
            if path_ref:
                cited_file = client.files.retrieve(path_ref.file_id)
                citations.append(
                    f'[{index}] Click <here> to download {cited_file.filename}'
                )

        # Add footnotes to the end of the message before displaying to user
        message_content.value += '\n\n' + '\n'.join(citations)
        return message_content.value
    finally:
        # Delete by the uploaded file's ID; runs on success and on failure so
        # uploads do not accumulate.
        client.files.delete(file_id)

def process_all_pdfs_to_dataframe(directory_path, question, assistantid, client):
    """Ask ``question`` about every ``.pdf`` in a directory and tabulate the answers.

    Each file whose name ends with ``.pdf`` is passed to ``process_question``.
    A failure for one file is printed and recorded as an ``"Error: ..."``
    response so the remaining files are still processed.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).
        question: Question asked about each file.
        assistantid: Assistant ID forwarded to ``process_question``.
        client: API client forwarded to ``process_question``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``File``, ``Question`` and
        ``Response``, one row per PDF, ordered by file name.
    """
    columns = ["File", "Question", "Response"]
    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            response = process_question(file_path, question, assistantid, client)
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(f"An error occurred while processing {filename}: {e}")
            response = "Error: " + str(e)
        # Success and error rows share the same keys so the frame has exactly
        # three columns instead of two disjoint, NaN-padded sets.
        results.append({
            "File": filename,
            "Question": question,
            "Response": response
        })
    # Explicit columns keep the schema stable even when no PDF was found.
    return pd.DataFrame(results, columns=columns)

ergi.mera · March 6, 2024, 7:59am

Hi Nikunj

this is my code:

def process_question(file_path, question, assistantid, client):
    """Ask ``question`` about one uploaded file and return the answer text.

    The file at ``file_path`` is uploaded with purpose ``'assistants'`` and
    attached to the first message of a newly created thread. A run is started
    on that thread and polled every 3 seconds until it completes. The message
    identified by ``first_id`` of the thread's message list is then read, its
    annotations are rewritten as numbered footnotes, and the uploaded file is
    deleted again (also when an error occurs along the way).

    Args:
        file_path: Path of the local file to upload.
        question: Text of the user message.
        assistantid: ID of the assistant that executes the run.
        client: API client exposing ``files`` and ``beta.threads``
            (presumably an ``openai.OpenAI`` instance -- confirm with caller).

    Returns:
        The message text with every annotation marker replaced by `` [n]``,
        followed by a blank line and one citation line per annotation.

    Raises:
        RuntimeError: If the run reaches a status from which it will not
            complete on its own.
    """
    # Statuses after which polling can never observe "completed".
    # "requires_action" is included because nothing here submits tool outputs.
    dead_end_statuses = (
        "failed", "cancelled", "expired", "incomplete", "requires_action",
    )

    # Create the file object
    with open(file_path, "rb") as file_data:
        file_upload_response = client.files.create(
            file=file_data,
            purpose='assistants'
        )
        file_id = file_upload_response.id

    try:
        # Create a thread whose only message carries the question and the file
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": question,
                    "file_ids": [file_id]
                }
            ]
        )
        thread_id = thread.id

        # Execute the run
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistantid,
            model="gpt-4-1106-preview",
            tools=[{"type": "code_interpreter"}, {"type": "retrieval"}]
        )
        run_id = run.id

        # Wait for the run to complete, bailing out instead of spinning
        # forever when the run ends in any other way.
        while True:
            run = client.beta.threads.runs.retrieve(
                thread_id=thread_id,
                run_id=run_id
            )
            if run.status == "completed":
                break
            if run.status in dead_end_statuses:
                raise RuntimeError(
                    f"Run {run_id} ended with status {run.status!r}"
                )
            time.sleep(3)  # Wait for 3 seconds before checking again

        # Retrieve thread messages
        thread_messages = client.beta.threads.messages.list(thread_id)
        # NOTE(review): relies on the list response exposing ``first_id``.
        message_id = thread_messages.first_id

        # Retrieve the message object
        message = client.beta.threads.messages.retrieve(
            thread_id=thread_id,
            message_id=message_id
        )

        # Presuming that message.content is a list whose first item has a
        # ``text`` object with ``value`` and ``annotations`` attributes
        message_content = message.content[0].text
        annotations = getattr(message_content, 'annotations', [])
        citations = []

        # Iterate over the annotations and add footnotes
        for index, annotation in enumerate(annotations):
            # Replace the annotated text with a footnote marker
            message_content.value = message_content.value.replace(
                annotation.text, f' [{index}]'
            )

            # Gather citations based on annotation attributes
            file_citation = getattr(annotation, 'file_citation', None)
            if file_citation:
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(
                    f'[{index}] {file_citation.quote} from {cited_file.filename}'
                )
            # Named ``path_ref`` so the ``file_path`` parameter is not shadowed
            path_ref = getattr(annotation, 'file_path', None)
            if path_ref:
                cited_file = client.files.retrieve(path_ref.file_id)
                citations.append(
                    f'[{index}] Click <here> to download {cited_file.filename}'
                )

        # Add footnotes to the end of the message before displaying to user
        message_content.value += '\n\n' + '\n'.join(citations)
        return message_content.value
    finally:
        # Delete by the uploaded file's ID; runs on success and on failure so
        # uploads do not accumulate.
        client.files.delete(file_id)

def process_all_pdfs_to_dataframe(directory_path, question, assistantid, client):
    """Ask ``question`` about every ``.pdf`` in a directory and tabulate the answers.

    Each file whose name ends with ``.pdf`` is passed to ``process_question``.
    A failure for one file is printed and recorded as an ``"Error: ..."``
    response so the remaining files are still processed.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).
        question: Question asked about each file.
        assistantid: Assistant ID forwarded to ``process_question``.
        client: API client forwarded to ``process_question``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``File``, ``Question`` and
        ``Response``, one row per PDF, ordered by file name.
    """
    columns = ["File", "Question", "Response"]
    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            response = process_question(file_path, question, assistantid, client)
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(f"An error occurred while processing {filename}: {e}")
            response = "Error: " + str(e)
        # Success and error rows share the same keys so the frame has exactly
        # three columns instead of two disjoint, NaN-padded sets.
        results.append({
            "File": filename,
            "Question": question,
            "Response": response
        })
    # Explicit columns keep the schema stable even when no PDF was found.
    return pd.DataFrame(results, columns=columns)

Maybe I should also delete the thread after each question (I don’t know if that is possible)?

Thank you in advance

Topic		Replies	Views
The Assistant API responds by including the non-existent file in the annotation Bugs api , assistants-api , assistants-files	4	79	December 3, 2024
Best Practices for Managing Uploaded Files in Code Interpreter Threads API assistants-api	0	32	December 13, 2024
Inconsistent File Access in Assistant API API gpt-4 , assistants-api	17	2729	May 13, 2024
Assistant API: Files Not Accessible in Messages Despite Successful Upload Bugs assistants-api , gpt-4o	4	245	December 8, 2024
Assistant Retrieve API not Reading File API assistants , assistants-api	1	1196	December 5, 2023

Assistant API answer questions using as reference deleted files

Related topics