Identify chunks with LangChain and ChromaDB

Hi, I am completely new to the ChatGPT API and Python. I am developing a RAG application to find certain characteristics of single-use plastic bags in a set of regulation PDFs (laws, etc.). I have split those PDFs into several chunks, but my code is failing to identify the country that each characteristic pertains to. I think this is because the chunks carry no identifier that would let ChatOpenAI give the right answer about the country. I am using ChromaDB to store the vectors. How could I add an attribute to each chunk so that ChatOpenAI can correctly identify the country? Thanks!

Add metadata:

collection.add(
    documents=["doc1", "doc2", "doc3", ...],
    embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],
    metadatas=[{"country": "US", "state": "CA"}, {"country": "Japan"}, {"country": "UK"}, ...],
    ids=["id1", "id2", "id3", ...]
)
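
If you're building the store with LangChain instead of calling the Chroma client directly, the same idea applies: set the fields on each Document's metadata dict before the vectors are created. A minimal sketch, assuming one regulation PDF per country and that the country can be read from the file name (e.g. docs/japan.pdf); the naming convention and the tag_with_country helper are only illustrations:

from pathlib import Path
from langchain.schema import Document

def tag_with_country(chunks: list[Document]) -> list[Document]:
    # Copy a country label into each chunk's metadata; here it is
    # guessed from the source file name, e.g. docs/japan.pdf -> "japan".
    for chunk in chunks:
        source = chunk.metadata.get("source", "")
        chunk.metadata["country"] = Path(source).stem.lower()
    return chunks

You would then pass the tagged chunks to Chroma.from_documents as usual, and the country travels with each vector.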

When you query your DB, the metadata will be included in the results.
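
For example, sticking with the raw Chroma client and the made-up embeddings above, you can also filter on that metadata at query time with a where clause (the query vector here is just a placeholder):

results = collection.query(
    query_embeddings=[[1.1, 2.3, 3.2]],  # placeholder query vector
    n_results=3,
    where={"country": "Japan"},  # only search chunks tagged with this country
)
print(results["documents"])
print(results["metadatas"])

The LangChain Chroma wrapper exposes the same thing through the filter argument of similarity_search.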

Thank you for answering! Just recently, I thought it might be better to add the PDFs and query them individually. I have this code:

For generating the dataset:

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
import shutil
from openai import OpenAI
import os

os.environ["OPENAI_API_KEY"] = "MyKey"

CHROMA_PATH = "chroma"
DATA_PATH = "docs"

def main():
    generate_data_store()

def generate_data_store():
    documents = load_documents()
    chunks = split_documents(documents)
    save_to_chroma(chunks)

def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

def save_to_chroma(chunks: list[Document]):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

if __name__ == "__main__":
    main()

For the query:

import argparse
from dataclasses import dataclass
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from openai import OpenAI
import os

os.environ["OPENAI_API_KEY"] = "MyKey"

CHROMA_PATH = "chroma"

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}


Answer the question based on the above context: {question}
"""

def main():
    # Create CLI.
    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text.")
    args = parser.parse_args()
    query_text = args.query_text

    # Prepare the DB.
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    if len(results) == 0 or results[0][1] < 0.7:
        print("Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI(model="gpt-4o", temperature=0.0)
    response_text = model.predict(prompt)

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(context_text)
    print(formatted_response)

if __name__ == "__main__":
    main()

How would I add a loop so that each PDF is added and queried one at a time? Sorry about my total ignorance of Python, and thanks for your help!
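
One possible shape for that loop, as a rough sketch only: it loads each PDF in the docs folder on its own, builds a throwaway Chroma store for that one file, and asks the same question of each. PyPDFLoader, the hard-coded QUESTION, and the non-persistent store are assumptions for illustration, and it expects OPENAI_API_KEY to already be set in the environment:

import os

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma

DATA_PATH = "docs"
QUESTION = "What is the maximum thickness allowed for single-use plastic bags?"  # placeholder question

def query_each_pdf():
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80)
    model = ChatOpenAI(model="gpt-4o", temperature=0.0)
    for filename in sorted(os.listdir(DATA_PATH)):
        if not filename.lower().endswith(".pdf"):
            continue
        # Load and split just this one PDF.
        pages = PyPDFLoader(os.path.join(DATA_PATH, filename)).load()
        chunks = splitter.split_documents(pages)
        # Build a small, non-persistent store for this PDF only and retrieve the top chunks.
        db = Chroma.from_documents(chunks, OpenAIEmbeddings())
        docs = db.similarity_search(QUESTION, k=3)
        context = "\n\n---\n\n".join(doc.page_content for doc in docs)
        prompt = f"Answer the question based only on the following context:\n\n{context}\n\nQuestion: {QUESTION}"
        print(f"{filename}: {model.predict(prompt)}")

if __name__ == "__main__":
    query_each_pdf()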