How to load saved embeddings

It looks like you did not set an index when creating the embeddings file (which is fine). As a result, the first column of the embeddings file is the index (row number) of the original CSV file. Once your embeddings file is generated, add a placeholder column header called idx, like this:

idx,0,1,2,3,4,5,6,7,8,9,...

and then, when loading the file, you need to skip that idx header when computing max_dim (see load_embeddings below).
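pandas can also write that header for you at save time via to_csv's index_label parameter, so you don't have to edit the file by hand. A minimal sketch, reusing the document_embeddings dict and a file name from the script below:

pd.DataFrame(document_embeddings).T.to_csv(
    'luna_skills_copy_embeddings.csv', index_label='idx')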

Here is my Python code; let me know if you have any questions.

import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

MAX_SECTION_LEN = 500
MAX_SECTIONS = 3
SEPARATOR = "\n* "
ENCODING = "gpt2"
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))
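# MAX_SECTION_LEN caps the prompt context in tokens, MAX_SECTIONS caps the
# number of sections used as a fallback when no token counts are available,
# and separator_len is the token cost that SEPARATOR adds per section.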


def load_embeddings(fname: str) -> dict[int, list[float]]:
    """
    Read the saved embeddings CSV back into a dict that maps each original
    row number (the idx column) to its embedding vector.
    """
    df = pd.read_csv(fname, header=0)
    # Ignore the idx placeholder column when computing the embedding width.
    max_dim = max(int(c) for c in df.columns if c != "idx")
    return {
        r.idx: [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }
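# For reference, load_embeddings returns something like
#   {0: [0.0123, -0.0456, ...], 1: [...], ...}
# i.e. it maps each original row number to its embedding vector
# (the values above are illustrative, not real output).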


def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(
        question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        if hasattr(document_section, 'tokens'):
            chosen_sections_len += document_section.tokens + separator_len
            if chosen_sections_len > MAX_SECTION_LEN:
                break
        # With no tokens column, fall back to a simple section-count cap.
        elif len(chosen_sections) >= MAX_SECTIONS:
            break

        print(document_section.skill + ' ' + document_section.mastery +
              ' ' + document_section.authorname)
        chosen_sections.append(
            SEPARATOR + document_section.skill + ' ' + document_section.mastery + ' ' + document_section.authorname)
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"


def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
    result = openai.Embedding.create(
        model=model,
        input=text
    )
    return result["data"][0]["embedding"]


def compute_doc_embeddings(df: pd.DataFrame) -> dict[int, list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    Return a dictionary that maps the index of each row to its embedding vector.
    """
    return {
        # idx: get_embedding(r.title) for idx, r in df.iterrows()
        idx: get_embedding(r.skill + ' ' + r.mastery) for idx, r in df.iterrows()
    }


def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Returns the similarity between two vectors.

    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))


def order_document_sections_by_query_similarity(query: str, contexts: dict[int, list[float]]) -> list[tuple[float, int]]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections.

    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_embedding(query)
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)

    return document_similarities


COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}


def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[int, list[float]],
    show_prompt: bool = False
) -> str:
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )
    print(response["choices"])
    return response["choices"][0]["text"].strip(" \n")


df = pd.read_csv('luna_skills_copy.csv')
print(f"{len(df)} rows in the data.")
# print(df.sample(15))
print(df)

# One-time step: compute the embeddings and save them to a CSV file.
# Leave this block disabled afterwards and just load the saved file below.
'''
document_embeddings = compute_doc_embeddings(df)
example_entry = list(document_embeddings.items())[0]
print(
    f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")
pd.DataFrame(document_embeddings).T.to_csv(
    'luna_skills_copy_embeddings_small2.csv')

quit()
'''
print('loading ...')
document_embeddings = load_embeddings("luna_skills_copy_embeddings.csv")
print('loading done.')

answer = answer_query_with_context(
    "ASP.NET", df, document_embeddings)
print(answer)
