Thanks for your suggestions. I tried the example at https://platform.openai.com/docs/assistants/tools/file-search. For regular short questions, it seems to work, for example “Who are the authors of this article?”
When I ask the tool to extract a whole section (for example, the Methods section) out of a scientific article, it is not able to. Even though I specifically asked it to extract the complete section, it only extracted part of it.
But when I uploaded the article through the ChatGPT UI (using GPT-4), it was able to extract the complete section. So I guess file search (RAG) still does not work as well as “myfiles_browser”?
Any suggestions on how to extract a whole section (such as the Methods section) out of a PDF file? Any help would be greatly appreciated!
# Build the API client, then register an assistant with the file_search tool
# enabled so it can answer from uploaded documents.
# NOTE(review): the API key below is a redacted placeholder; consider loading
# the real key from the environment instead of committing it to source.
client = OpenAI(api_key="sk-***")

assistant = client.beta.assistants.create(
    model="gpt-4-turbo-2024-04-09",
    name="Scientific Research Assistant",
    instructions=(
        "You are an expert in biomedical researcher. "
        "Use you knowledge base to answer questions about research articles."
    ),
    temperature=0,
    tools=[{"type": "file_search"}],
)
# Create a vector store and upload the article(s) into it.
vector_store = client.beta.vector_stores.create(name="Scientific Research")

file_paths = ["1234.pdf"]

# Open the files one at a time inside the try block so that every handle that
# was opened is closed again, even if a later open() or the upload itself fails.
file_streams = []
try:
    for path in file_paths:
        file_streams.append(open(path, "rb"))
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )
finally:
    for stream in file_streams:
        stream.close()

# Report how the batch upload went.
print(file_batch.status)
print(file_batch.file_counts)
# Point the assistant's file_search tool at the vector store created for the
# article, replacing the local `assistant` with the updated object.
search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources=search_resources,
)
# Upload the article as a standalone file so it can be attached to a message.
# The `with` block guarantees the local file handle is closed after the upload.
with open("1234.pdf", "rb") as pdf_file:
    message_file = client.files.create(file=pdf_file, purpose="assistants")
# Name of the article section to request from the assistant.
section = "Methods"

# User prompt: a single line of text (no embedded newline) asking for the
# section verbatim.
content = (
    f"Please extract the {section} section from the following text, which is a scientific article. "
    f"Please provide me the whole {section} section completely. Do not miss, change, or summarize any of the words in the section."
)
# Attach the newly uploaded file to the message and make it available to the
# file_search tool.
pdf_attachment = {"file_id": message_file.id, "tools": [{"type": "file_search"}]}

# The thread starts with a single user message carrying the extraction prompt.
user_message = {
    "role": "user",
    "content": content,
    "attachments": [pdf_attachment],
}
thread = client.beta.threads.create(messages=[user_message])
# Start a run of the assistant on the thread and poll it via the SDK helper.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)

# Collect the messages produced by this run.
messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
if not messages:
    # Without this guard, messages[0] below would fail with a bare IndexError
    # that hides the run's status.
    raise RuntimeError(
        f"Run finished with status {run.status!r} and produced no messages"
    )

# NOTE(review): assumes the first content part of the first message is a text
# part; confirm if the assistant can return other content types.
message_content = messages[0].content[0].text
annotations = message_content.annotations

# Replace each inline annotation marker with a numbered reference "[i]" and
# build a matching list of cited file names.
citations = []
for index, annotation in enumerate(annotations):
    message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
    # Only annotations that carry a file_citation contribute a citation entry.
    if file_citation := getattr(annotation, "file_citation", None):
        cited_file = client.files.retrieve(file_citation.file_id)
        citations.append(f"[{index}] {cited_file.filename}")

print(message_content.value)
print("\n".join(citations))