Hi, I am completely new to the ChatGPT API and Python. I am developing a RAG system to discover certain characteristics of single-use plastic bags using a group of regulation PDFs (laws, etc.). I have split those PDFs into several chunks, but my code fails to correctly identify the country to which each characteristic pertains. I think this is because the chunks have no identifier that would allow ChatOpenAI to give the right answer regarding the country. I am using ChromaDB to store the vectors. How could I add an attribute to each chunk so ChatOpenAI can correctly identify the country? Thanks!
Add metadata to each chunk when you insert it:
# Attach a metadata dict to every chunk at insert time. Chroma stores the
# metadata alongside each vector and returns it with query results, so the
# country can be recovered per-chunk (and even filtered on, e.g.
# where={"country": "UK"}).
collection.add(
    documents=["doc1", "doc2", "doc3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    # One metadata dict per document, in the same order as `documents`.
    metadatas=[{"country": "US", "state": "CA"}, {"country": "Japan"}, {"country": "UK"}, ...],
    ids=["id1", "id2", "id3", ...],
)
When you query your DB, the metadata will be included in the results.
Thank you for answering! More recently, I decided it would be better to add the PDFs and query them one at a time. I have this code:
For generating the dataset:
"""Ingestion script: load regulation PDFs, split them into chunks, and
persist their embeddings to a local Chroma vector store."""
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
import shutil
from openai import OpenAI
import os

# NOTE(review): prefer exporting the key in your shell over hard-coding it.
os.environ["OPENAI_API_KEY"] = "MyKey"

CHROMA_PATH = "chroma"
DATA_PATH = "docs"


def main():
    generate_data_store()


def generate_data_store():
    """Load PDFs, split them into chunks, and save the chunks to Chroma."""
    documents = load_documents()
    chunks = split_documents(documents)
    save_to_chroma(chunks)


def load_documents():
    """Load every PDF found under DATA_PATH as LangChain Documents."""
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()


def split_documents(documents: list[Document]):
    """Split documents into overlapping ~800-character chunks for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from the given chunks."""
    # Clear out the database first so stale vectors from a previous run
    # don't mix with the new ones.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


if __name__ == "__main__":
    main()
For the query:
"""Query script: retrieve the top chunks from the Chroma store and ask
ChatOpenAI to answer a question grounded only in that context."""
import argparse
from dataclasses import dataclass
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from openai import OpenAI
import os

# NOTE(review): prefer exporting the key in your shell over hard-coding it.
os.environ["OPENAI_API_KEY"] = "MyKey"

CHROMA_PATH = "chroma"

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

Answer the question based on the above context: {question}
"""


def main():
    # Create CLI.
    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text.")
    args = parser.parse_args()
    query_text = args.query_text

    # Prepare the DB (must already have been built by the ingestion script).
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB for the 3 most relevant chunks; bail out if even the
    # best match scores below the 0.7 relevance threshold.
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    if len(results) == 0 or results[0][1] < 0.7:
        print("Unable to find matching results.")
        return

    # Stitch the retrieved chunks into one context string for the prompt.
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI(model="gpt-4o", temperature=0.0)
    response_text = model.predict(prompt)

    # Report which source files the answer came from.
    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(context_text)
    print(formatted_response)


if __name__ == "__main__":
    main()
How would I add a loop so that each PDF is ingested and queried one at a time? Sorry about my total ignorance of Python, and thanks for your help!