We are still following the tutorial on the OpenAI website (OpenAI API), with some modifications. The code is below:
###CHUNKING
# Upper bound on the token count of a single chunk.
max_tokens = 500


def split_into_many(text, max_tokens=max_tokens, count_tokens=None):
    """Split *text* into chunks of at most roughly *max_tokens* tokens.

    The text is cut at newlines and at ``.``, ``!``, ``?`` and ``;``. The
    resulting sentences are stripped, empty ones are discarded, and the rest
    are packed greedily into chunks. Each chunk is its sentences joined by a
    single space with a trailing ``"."``; the original delimiters are not kept.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_tokens: Token budget per chunk.
        count_tokens: Optional callable mapping a string to its token count.
            When omitted, ``len(tokenizer.encode(s))`` is used with the
            module-level ``tokenizer``.

    Returns:
        A list of chunk strings, in the order the sentences appear in *text*.
        A single sentence whose token count exceeds *max_tokens* is dropped.
    """
    if not text:
        return []

    if count_tokens is None:
        def count_tokens(piece):
            return len(tokenizer.encode(piece))

    stripped = (piece.strip() for piece in re.split(r'[\n.!?;]', text))
    sentences = [piece for piece in stripped if piece]
    # Each sentence is measured with a leading space, as in the original code.
    n_tokens = [count_tokens(" " + sentence) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):
        # Flush the current chunk when this sentence would overflow it. The
        # `chunk` guard avoids emitting a bare "." when nothing has been
        # collected yet (e.g. the first sentence is itself oversized).
        if chunk and tokens_so_far + token > max_tokens:
            chunks.append(" ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # A sentence that cannot fit in any chunk is skipped.
        if token > max_tokens:
            continue

        chunk.append(sentence)
        tokens_so_far += token + 1

    # Emit whatever is left after the last sentence.
    if chunk:
        chunks.append(" ".join(chunk) + ".")

    return chunks
# Build a frame from the texts collected so far and count the tokens per row.
# NOTE(review): assumes `shortened` already holds the raw texts at this point;
# it is not defined in the code shown here. Confirm against the earlier steps.
df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# Collect the results in a fresh list. Appending to `shortened` itself would
# leave the unsplit texts (and duplicates of the short ones) in the list.
chunked = []
for _, row in df.iterrows():
    text = row['text']

    # If the text is None, go to the next row
    if text is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row['n_tokens'] > max_tokens:
        chunked.extend(split_into_many(text))
    # Otherwise, add the text to the list of shortened texts
    else:
        chunked.append(text)

# Rebuild the frame from the chunked texts so the later steps, which read
# `df.text` and `df.n_tokens`, operate on the chunks rather than the originals.
shortened = chunked
df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
###END_OF_CHUNKING
###EMBEDDING
# Request an embedding for every row's text (one API call per row) and keep
# the returned vector in the 'embeddings' column.
df['embeddings'] = df.text.apply(
    lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding']
)
# Write the frame, including the embeddings, to disk.
df.to_csv('processed/embeddings.csv')
# Bare expression: its value is discarded unless run interactively.
df.head()
###END_OF_EMBEDDING
###CONTEXT_GENERATION
def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe

    Args:
        question: The question text; it is embedded with
            'text-embedding-ada-002'.
        df: DataFrame with 'text', 'n_tokens' and 'embeddings' columns. A
            'distances' column is written to it in place.
        max_len: Budget for the running total of ``n_tokens + 4`` over the
            selected rows.
        size: Not used by this function; kept so existing callers keep working.

    Returns:
        The selected texts, closest first, joined by "\\n\\n###\\n\\n". Empty
        string if even the closest row exceeds the budget.
    """
    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Walk the rows from the smallest cosine distance upwards.
    for _, row in df.sort_values('distances', ascending=True).iterrows():
        # Add the length of the text to the current length
        # (the extra 4 presumably covers the separator between texts - TODO confirm)
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)
###END_OF_CONTEXT_GENERATION