I have a case in which I want each distinct paragraph of the text extracted from a PDF to be stored as a distinct embedding.
So far I scan a directory and, for each file, I store one embedding in a database:
import os
import time
import psycopg2
from openai import OpenAI
from pypdf import PdfReader
from dotenv import load_dotenv
from lib.embeddings import getEmbeddings
load_dotenv()
docsFolder='./docs'
def getTextFromPDF(fileName):
    text = ""
    reader = PdfReader(fileName)
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text
def createTable(conn):
    cur = conn.cursor()
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
    sql = '''
        CREATE TABLE IF NOT EXISTS embeddings (
            id SERIAL,
            "text" text,
            embedding vector(1536)
        )
    '''
    cur.execute(sql)
    # Remove existing data; we want a clean set before saving
    cur.execute("TRUNCATE embeddings")
    conn.commit()
    cur.close()
def saveEmbeddings(conn, text, embedding):
    print("INSERT DATA")
    sql = "INSERT INTO embeddings(text, embedding) VALUES (%s, %s)"
    cur = conn.cursor()
    cur.execute(sql, (text, embedding))
    conn.commit()
    cur.close()
if __name__ == "__main__":
    conn = psycopg2.connect(
        user=os.getenv("PG_USER", "myuser"),
        password=os.getenv("PG_PASSWORD", ""),
        host=os.getenv("PG_HOST", "localhost"),
        port=int(os.getenv("PG_PORT", 5432)),
        database=os.getenv("PG_DB_NAME", "mydb")
    )
    client = OpenAI(
        # This is the default and can be omitted
        api_key=os.environ.get("OPENAI_API_KEY"),
    )
    print("SETUP DB")
    createTable(conn)
    with os.scandir(docsFolder) as it:
        for entry in it:
            # Skip hidden entries and anything that is not a regular file
            if entry.name.startswith('.') or not entry.is_file():
                continue
            print("EMBEDDINGS CREATION")
            start = time.perf_counter()
            text = getTextFromPDF(docsFolder + "/" + entry.name)
            end = time.perf_counter()
            print("Passage extraction time " + str(end - start) + " seconds")
            embeddings = getEmbeddings(client, text)
            saveEmbeddings(conn, text, embeddings)
            time.sleep(10)
    conn.close()
File lib/embeddings.py:
import tiktoken
import time
import numpy as np
from itertools import islice
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'
def batched(iterable, n):
    """Batch data into tuples of length n. The last batch may be shorter."""
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    while (batch := tuple(islice(it, n))):
        yield batch
def chunked_tokens(text, encoding_name, chunk_length):
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    chunks_iterator = batched(tokens, chunk_length)
    yield from chunks_iterator
def getEmbeddings(client, text, max_tokens=EMBEDDING_CTX_LENGTH, encoding_name=EMBEDDING_ENCODING, average=True):
    chunk_embeddings = []
    chunk_lens = []
    for chunk in chunked_tokens(text, encoding_name=encoding_name, chunk_length=max_tokens):
        res = client.embeddings.create(
            model="text-embedding-ada-002",
            input=chunk,
            encoding_format="float"
        )
        chunk_embeddings.append(res.data[0].embedding)
        chunk_lens.append(len(chunk))
        time.sleep(2)
    if average:
        chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
        chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings)  # normalizes length to 1
        chunk_embeddings = chunk_embeddings.tolist()
    return chunk_embeddings
As you can see, I extract one huge block of text per PDF and store a single embedding for it in the database. But in my case I want to split the text into separate paragraphs and store a separate embedding for each paragraph.
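To make the question concrete, here is a minimal sketch of what I have in mind, assuming paragraphs are separated by blank lines (which pypdf's extract_text() does not guarantee); splitParagraphs is a hypothetical helper, and the loop reuses getEmbeddings and saveEmbeddings from above:

import re

def splitParagraphs(text):
    # Hypothetical helper: treat blank lines as paragraph boundaries.
    # This is only a heuristic for text coming out of a PDF.
    parts = re.split(r'\n\s*\n', text)
    return [p.strip() for p in parts if p.strip()]

# Inside the main loop, instead of embedding the whole document at once:
# text = getTextFromPDF(docsFolder + "/" + entry.name)
# for paragraph in splitParagraphs(text):
#     embedding = getEmbeddings(client, paragraph)
#     saveEmbeddings(conn, paragraph, embedding)

I am not sure whether splitting on blank lines is reliable for PDF-extracted text, or whether there is a better way to detect paragraph boundaries before creating the embeddings.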