I have a large document that I'm testing with Pinecone.
I found that I needed to split the document into chunks and put them into JSON format for the API call to embed them with text-embedding-ada-002.
(I have a few 800-word documents with no split. I found that every time I query Pinecone, it always returns the entire document, which I then have to put into the prompt for the text completion.)
My question:
How do I split the document the right way? (Even manually.)
Example:
I have a document regarding an insurance package (description, key benefits, Q&A, disclaimer, footnotes), and there are a few packages.
- If I split the document, will the context still remain intact? (See the sketch below for the kind of manual split I have in mind.)
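For instance, a manual split could cut on the section headings rather than at a fixed character count, so each chunk stays a coherent section. A rough sketch (the heading names are just the section titles from my example document, and document_text is a placeholder for the loaded text):

import re

def split_by_headings(text, headings):
    # Build an alternation of the literal heading strings.
    pattern = "|".join(re.escape(h) for h in headings)
    # Split just *before* each heading, so every section keeps its title.
    parts = re.split(f"(?=(?:{pattern}))", text)
    return [part.strip() for part in parts if part.strip()]

sections = split_by_headings(
    document_text,
    ["Description", "Key Benefits", "Q&A", "Disclaimer", "Footnotes"],
)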
Here is my implementation. It's not using JSON, but CSV.
import csv

import openai

# Split the input text into smaller chunks of a specified size.
def split_text(text, chunk_size):
    text_chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        text_chunks.append(chunk)
        start = end
    return text_chunks
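# Usage sketch (document_text is a placeholder for the loaded document;
# 1,000 characters is roughly 150-200 English words per chunk):
# chunks = split_text(document_text, 1000)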
def create_embeddings(text_chunks, model="text-embedding-ada-002"):
    embeddings = []
    try:
        # Replace newlines with spaces, as recommended for the embedding models.
        prepared_chunks = [chunk.replace("\n", " ") for chunk in text_chunks]
        response = openai.Embedding.create(input=prepared_chunks, model=model)
        if response and "data" in response:
            for data in response["data"]:
                embeddings.append(data["embedding"])
        return embeddings
    except Exception as e:
        print(f"Error creating embeddings: {e}")
        return None
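# Usage sketch: the response contains one embedding per input chunk, and
# text-embedding-ada-002 returns 1536-dimensional vectors.
# embeddings = create_embeddings(chunks)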
# Write one embedding vector per CSV row.
def write_embeddings_to_csv(embeddings, csv_path):
    with open(csv_path, "w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        for embedding in embeddings:
            csv_writer.writerow(embedding)

# Read the embeddings back, converting each value to float.
def read_embeddings_from_csv(csv_path):
    embeddings = []
    with open(csv_path, "r", newline="") as csvfile:
        csv_reader = csv.reader(csvfile)
        for row in csv_reader:
            embedding = [float(value) for value in row]
            embeddings.append(embedding)
    return embeddings
# Write the raw text chunks to CSV, one chunk per row under a header.
def write_chunks_to_csv(chunks, csv_path):
    with open(csv_path, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["chunk"])
        for chunk in chunks:
            writer.writerow([chunk])

# Read the text chunks back, skipping the header row.
def read_chunks_from_csv(csv_path):
    chunks = []
    with open(csv_path, "r", encoding="utf-8", newline="") as csv_file:
        reader = csv.reader(csv_file)
        next(reader)  # Skip header row
        for row in reader:
            chunks.append(row[0])
    return chunks
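Putting it all together, my end-to-end flow looks roughly like this (the file name and CSV paths are placeholders):

document_text = open("insurance_package.txt", encoding="utf-8").read()
chunks = split_text(document_text, 1000)
embeddings = create_embeddings(chunks)
if embeddings:
    write_chunks_to_csv(chunks, "chunks.csv")
    write_embeddings_to_csv(embeddings, "embeddings.csv")
    # Later runs can read both back and upsert the chunk/embedding pairs into Pinecone.
    stored_chunks = read_chunks_from_csv("chunks.csv")
    stored_embeddings = read_embeddings_from_csv("embeddings.csv")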