How can I split text into paragraphs?

I have a case where I want each distinct paragraph of a text extracted from a PDF to be stored as a distinct embedding.

So far I scan a directory and for each file I store the embedding into a database:

import os
import time

import psycopg2
from openai import OpenAI

from pypdf import PdfReader
from dotenv import load_dotenv
from lib.embeddings import getEmbeddings

load_dotenv()
docsFolder='./docs'

def getTextFromPDF(fileName):
    text = ""
    reader = PdfReader(fileName)
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text


def createTable(conn):

    cur = conn.cursor()

    cur.execute("CREATE EXTENSION IF NOT EXISTS vector")

    sql = '''
        CREATE TABLE IF NOT EXISTS embeddings (
            id SERIAL,
            "text" text,
            embedding vector(1536)
        )
    '''
    cur.execute(sql)
    # Remove existing data; we want a clean set before saving the new embeddings
    cur.execute("TRUNCATE embeddings")
    conn.commit()
    cur.close()

def saveEmbeddings(conn,text,embedding):
    print("INSERT DATA")
    sql = "INSERT INTO embeddings(text,embedding) VALUES (%s,%s)"
    cur = conn.cursor()
    cur.execute(sql, (text, embedding))
    conn.commit()
    cur.close()

if __name__ == "__main__":

    conn = psycopg2.connect(
            user=os.getenv("PG_USER","myuser"),
            password=os.getenv("PG_PASSWORD",""),
            host=os.getenv("PG_HOST","localhost"),
            port=int(os.getenv("PG_PORT",5432)),
            database=os.getenv("PG_DB_NAME","mydb")
        )

    client = OpenAI(
        # This is the default and can be omitted
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

    print("SETUP DB")
    createTable(conn)

    with os.scandir(docsFolder) as it:
        for entry in it:
            if entry.name.startswith('.') or not entry.is_file():  # skip hidden entries and non-files
                continue

            print("EMBEDDINGS CREATION")

            start = time.perf_counter()
            text = getTextFromPDF(entry.path)
            end = time.perf_counter()

            print("Passage Extraction time "+str(end-start)+" seconds")
            embeddings=getEmbeddings(client, text)
            saveEmbeddings(conn, text, embeddings)
            time.sleep(10)

    conn.close()

File lib/embeddings.py:

import tiktoken
import time
import numpy as np
from itertools import islice

EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'

def batched(iterable, n):
    """Batch data into tuples of length n. The last batch may be shorter."""
    if n < 1:
        raise ValueError('n must be at least one')
    it = iter(iterable)
    while (batch := tuple(islice(it, n))):
        yield batch

def chunked_tokens(text, encoding_name, chunk_length):
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    chunks_iterator = batched(tokens, chunk_length)
    yield from chunks_iterator


def getEmbeddings(client,text, max_tokens=EMBEDDING_CTX_LENGTH, encoding_name=EMBEDDING_ENCODING, average=True):

    chunk_embeddings = []
    chunk_lens = []
    for chunk in chunked_tokens(text, encoding_name=encoding_name, chunk_length=max_tokens):
        res = client.embeddings.create(
            model="text-embedding-ada-002",
            input=chunk,
            encoding_format="float"
        )

        chunk_embeddings.append(res.data[0].embedding)
        chunk_lens.append(len(chunk))
        time.sleep(2)

    if average:
        chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
        chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings)  # normalizes length to 1
        chunk_embeddings = chunk_embeddings.tolist()

    return chunk_embeddings

As you can see, I get a huge text file and store its embedding in a database. But in my case I want to split the text into separate paragraphs and store an embedding for each paragraph.
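For reference, the naive programmatic baseline would be splitting on blank lines. This is just a minimal sketch (splitIntoParagraphs is a hypothetical helper), and it only works when the extracted text actually keeps blank lines between paragraphs, which PDF extraction often does not:

import re

def splitIntoParagraphs(text: str) -> list[str]:
    # Naive approach: treat one or more blank lines as a paragraph boundary.
    chunks = re.split(r"\n\s*\n", text)
    return [chunk.strip() for chunk in chunks if chunk.strip()]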


That is nearly impossible to do purely programmatically, but it is something an AI could perform for you.

Searchable text extracted from PDFs is usually broken into lines matching the original presentation format. You can have a document with narrow columns where every visual line ends in a linefeed, with no extra linefeed between paragraphs, and with sentences broken across line breaks.
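To illustrate how fragile a pure heuristic gets, here is a rough sketch (mergeWrappedLines is a hypothetical helper, and the short-line threshold is an arbitrary assumption) that joins hard-wrapped lines and treats blank or unusually short lines as paragraph boundaries:

def mergeWrappedLines(text: str, short_line_ratio: float = 0.6) -> list[str]:
    lines = text.splitlines()
    max_len = max((len(line) for line in lines), default=0)
    paragraphs, current = [], []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Blank line: definite paragraph break
            if current:
                paragraphs.append(" ".join(current))
                current = []
            continue
        current.append(stripped)
        if len(stripped) < max_len * short_line_ratio:
            # A line much shorter than the longest one likely ends a paragraph
            paragraphs.append(" ".join(current))
            current = []
    if current:
        paragraphs.append(" ".join(current))
    return paragraphs

It breaks as soon as the layout changes (multi-column pages, centered headings, tables), which is why a model does better here.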


…/github/pixeltable/pixeltable/blob/release/docs/release/tutorials/rag-operations.ipynb

Would that answer your need? Being able to chunk into paragraphs and embed them, and to apply custom similarity search?

I need a full URL; this is part of a GitHub repository, but I don't know which one you are mentioning.

An alternative approach is to use chat completions with some prompt engineering and a structured response format. To that end I made this Python module (named extract_paragraphs):

from openai import OpenAI
import json

def extractParagraphs(client: OpenAI, text: str):
    text = text.strip()

    if (text == ""):
        raise ValueError("String should noty be an empty string")

    prompt = """
        You are a tool that splits incoming texts and messages into paragraphs and extracts any title from the text.
        Do not alter the incoming message; just output it as JSON with the paragraphs split.

        The text is coming from PDF and DOCX files, therefore omit any page numbers, page headers and footers.

        The Json output should be the following:
        ```
        {
          "text_title":string,

          "paragraphs":[
            {
              "title":string,
              "paragraph":string
            }
          ]
        }
        ```

        * "text_title" is the title of incomming text
        * "paragraphs" is an array with split paragraphs upon each paragraph:
          * "title" is the paragraph title if there's none set it as empty string
          * "paragraph" is the paragraph content

        Feel free to trim any excess or unwanted whitespace and multiple newlines, and do not pretty print the json.
        Replace multiple tabs and spaces in the incoming text with a single space character.
        The output should be raw json that is NOT wrapped in markdown markup.
    """

    response_format={
        "type":"json_schema",
        "json_schema":{
            "name": "paragraph_response",
            "strict": True,
            "schema": {
                "type": "object",
                "properties":{
                    "text_title":{
                        "type":"string"
                    },
                    "paragraphs":{
                        "type": "array",
                        "items": {
                            "type":"object",
                            "properties":{
                                "title":{ "type":"string"},
                                "paragraph":{"type":"string"}
                            },
                            "required": ["title", "paragraph"],
                            "additionalProperties": False
                        }
                    }
                },
                "required": ["text_title","paragraphs"],
                "additionalProperties": False
            }
        }
    }

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text}
        ],
        response_format=response_format
    )

    content = extractChatCompletionMessage(response)

    return json.loads(content)

def extractChatCompletionMessage(response):
    return response.choices[0].message.content

The idea is to use a structured response with a fixed schema and to describe in the system message that I want the text split into paragraphs.

Then I could use it as:

from pypdf import PdfReader
from openai import OpenAI
from extract_paragraphs import extractParagraphs

def getTextFromPDF(fileName):
    text = ""
    reader = PdfReader(fileName)
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

path="mypdf.pdf"

openai = OpenAI()

content = getTextFromPDF(path)
paragraphs = extractParagraphs(openai, content)

print(paragraphs)
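From there, each paragraph can get its own embedding and database row. A minimal sketch, assuming the conn connection and the saveEmbeddings helper from my first script are available:

from lib.embeddings import getEmbeddings

for item in paragraphs["paragraphs"]:
    paragraph_text = item["paragraph"]
    embedding = getEmbeddings(openai, paragraph_text)
    # conn and saveEmbeddings are assumed to come from the first script above
    saveEmbeddings(conn, paragraph_text, embedding)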

Have you used this approach?
I'd like to know about any known pitfalls compared to using a custom model.