Infinite memory implementation

Hello everyone,
These days I had an idea for adding memory to ChatGPT through recursive function calling, but the model keeps calling the function even when it already has an answer to the question.
The idea: ask memory[n], then memory[n - 1], then memory[n - 2] … until it returns an answer or the index reaches -1.
memory is a list of conversations.
I wonder if there are limitations to using this method, because it would save a lot of tokens.

import json
import openai
import tiktoken
openai.api_key = "sk-****"
memory = [[]]
max_request_tokens = 200
max_response_tokens = 50
index = 0
"""
"""
functions = [
    {
    "name": "history_conversations",
    "description": """
    This function retrieves relevant information from the previous conversation context and returns it as a response
    """,
    "parameters": {
        "type": "object",
        "properties": {
            "question": {
                "type": "string",
                "description": """
                the question that you don't have an answer to in the ongoing conversation:
                E.g., 'What's my name?', 'What's my daughter's age?', 'Do you remember me?'...etc"""
            }
        },
        "required": ["question"]
    },
    "action": "load previous conversation list"
}
]
def generate_response(message):
    global index
    # wrap the new user message and add it to the current memory chunk
    message_list = [{"role": "user", "content": message}]
    memory[index].extend(message_list)
    # if the current chunk is getting full, start a new chunk and move the message there
    if num_tokens_from_messages(memory[index]) + max_response_tokens >= max_request_tokens:
        del memory[index][-1]
        memory.append([])
        index += 1
        memory[index].extend(message_list)
    print(f"tokens={num_tokens_from_messages(message_list)}")
    # send only the current user message; the model can call the function to look up older chunks
    response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-0613",
                messages=message_list,
                max_tokens=100,
                functions=functions,
                function_call="auto",
            )
    response_message = response["choices"][0]["message"]
    if response_message.get("function_call"):
        # Step 3: call the function
        # Note: the JSON response may not always be valid; be sure to handle errors
        available_functions = {
            "history_conversations": memorize,
        }  # only one function in this example, but you can have multiple
        function_name = response_message["function_call"]["name"]
        function_to_call = available_functions[function_name]
        function_args = json.loads(response_message["function_call"]["arguments"])
        # start searching from the current chunk
        return function_to_call(
            message=function_args.get("question"), iterator=index)
    else:
        memory[index].append({"role": "assistant", "content": response_message['content']})
        return response_message['content']
    
def memorize(message, iterator):
    print(f"iterator={iterator}")
    # ran out of memory chunks without finding an answer
    if iterator == -1:
        return "i don't remember"
    # temporarily append the question to the chunk being searched
    memory[iterator].append({"role": "user", "content": message})
    print(f"num tokens={num_tokens_from_messages(memory[iterator])}")
    response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-0613",
                messages=memory[iterator],
                max_tokens=100,
                functions=functions,
                function_call="auto",
            )
    response_message = response["choices"][0]["message"]
    # delete the question for future questions
    del memory[iterator][-1]
    # Step 2: check if GPT wanted to call a function
    if response_message.get("function_call"):
        # Step 3: call the function
        # Note: the JSON response may not always be valid; be sure to handle errors
        available_functions = {
            "history_conversations": memorize,
        }  # only one function in this example, but you can have multiple
        function_name = response_message["function_call"]["name"]
        function_to_call = available_functions[function_name]
        function_args = json.loads(response_message["function_call"]["arguments"])
        # recurse into the next older memory chunk
        return function_to_call(
            message=function_args.get("question"), iterator=iterator - 1)
    else:
        return response_message['content']
    
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif "gpt-3.5-turbo" in model:
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens
    
       
while True:
    user_input = input("USER:")
    print(f"index = {index}")
    AI_response = generate_response(user_input)
    print("AI:" + AI_response)

USER:hello my name is haithem
index = 0
tokens=33
AI:Hello Haithem! How can I assist you today?
USER:what is my name?
index = 0
tokens=31
iterator=0
num tokens=101
AI:Your name is Haithem. <-- here it answered the question correctly
USER:i'm 20 <-- it treated this statement as a question
index = 0
tokens=31
iterator=0
num tokens=129
iterator=-1
AI:i don’t remember


Hi @haithemyk0707

Welcome to the OpenAI community.

You can use embeddings to find semantically relevant messages from the conversation and pass them in to answer questions.

I also wrote a tutorial about it: Use embeddings to retrieve relevant context for AI assistant

While it may not literally give infinite context, it should help you go well beyond the context length of the chat completion models and also save tokens.
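To make that concrete, here is a minimal sketch of that approach (the helper names and the in-memory list are my own illustration, not code from the tutorial; a real setup would use a vector database and cache the embeddings):

import numpy as np
import openai

EMBED_MODEL = "text-embedding-ada-002"
message_store = []  # each entry keeps the raw message plus its embedding

def embed(text):
    resp = openai.Embedding.create(model=EMBED_MODEL, input=text)
    return np.array(resp["data"][0]["embedding"])

def store_message(role, content):
    # store every turn together with its embedding so it can be found later
    message_store.append({"role": role, "content": content, "embedding": embed(content)})

def retrieve_relevant(question, top_k=3):
    # rank stored messages by cosine similarity to the question
    q = embed(question)
    def sim(m):
        e = m["embedding"]
        return float(np.dot(q, e) / (np.linalg.norm(q) * np.linalg.norm(e)))
    best = sorted(message_store, key=sim, reverse=True)[:top_k]
    return [{"role": m["role"], "content": m["content"]} for m in best]

def answer(question):
    # send only the retrieved context plus the new question, instead of the whole history
    messages = retrieve_relevant(question) + [{"role": "user", "content": question}]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613", messages=messages, max_tokens=100)
    return response["choices"][0]["message"]["content"]

Each call to answer() then costs only the retrieved context rather than the full conversation.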


I've heard about that, but it seems naive to use embeddings alone.
I'm willing to combine this method with embeddings and a vector store.
In conclusion, I want to know whether using this method is allowed or not.

In that case you can let the model use the function to retrieve the relevant conversation with embeddings and/or a vector DB when there's no answer available in the current list of messages, while also deciding which messages to remove from the message list to conserve tokens.

There aren't any laws against that. Just make sure you're not going over your org's rate limits, or else you'll get a rate-limit error.

Yes, this is my goal, but as I've shown before it is not working 100% of the time, and I don't know why.
Sometimes I mention my name, let's say in memory[0]:
User: my name is haithem
Bot: nice to meet you haithem!..
and if I then ask it "What's my name?" again, it iterates through all the previous memory chunks until it reaches index -1 without answering my question, returning "i don't remember".
And you said there's no limitation on this, but it doesn't feel that way.

USER:hello wassup my name is haithem
index = 0
tokens=17
AI:Hello Haithem! How can I assist you today?
USER:what is the capital of algeria?
index = 0
tokens=16
AI:The capital of Algeria is Algiers.
USER:what is my name?
index = 0
tokens=12
question= What is my name?
iterator=0
num tokens=89
memory[0]=[{'role': 'user', 'content': 'hello wassup my name is haithem'}, {'role': 'assistant', 'content': 'Hello Haithem! How can I assist you today?'}, {'role': 'user', 'content': 'what is the capital of algeria?'}, {'role': 'assistant', 'content': 'The capital of Algeria is Algiers.'}, {'role': 'user', 'content': 'what is my name?'}, {'role': 'user', 'content': "if the answer is present in conversation don't call the function\nWhat is my name?"}]
iterator=-1
AI:i don't remember

Works just fine for me … maybe you are trying to make it too complicated using functions. Just use the plain old API and embeddings, it should work.

GPT-4:

import requests

headers = {
    "Authorization": "Bearer sk-****",  # replace with your API key
    "Content-Type": "application/json",
}

payload = {
    "model": "gpt-4",
    "messages": [{'role': 'user', 'content': 'hello wassup my name is haithem'}, {'role': 'assistant', 'content': 'Hello Haithem! How can I assist you today?'}, {'role': 'user', 'content': 'what is the capital of algeria?'}, {'role': 'assistant', 'content': 'The capital of Algeria is Algiers.'}, {'role': 'user', 'content': 'what is my name?'}, {'role': 'user', 'content': "if the answer is present in conversation don't call the function\nWhat is my name?"}]
}


response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=payload)
r = response.json()
print(r["choices"][0]["message"]["content"])
print(r)
Your name is Haithem.

GPT-3.5-Turbo:

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{'role': 'user', 'content': 'hello wassup my name is haithem'}, {'role': 'assistant', 'content': 'Hello Haithem! How can I assist you today?'}, {'role': 'user', 'content': 'what is the capital of algeria?'}, {'role': 'assistant', 'content': 'The capital of Algeria is Algiers.'}, {'role': 'user', 'content': 'what is my name?'}, {'role': 'user', 'content': "if the answer is present in conversation don't call the function\nWhat is my name?"}]
}


response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=payload)
r = response.json()
print(r["choices"][0]["message"]["content"])
print(r)
Your name is Haithem.

Yes I’ll just use embedding because
openai has smart system to detect such thing.
With simple math using this method and you just have 8 memory chunks 32k tokens
Let’s say you searching for something exists in
First memory chunk in the worst case scenario
It will only cost 0.048$
While 3.5 turbo-16k it will cost the same amount for one request
And beside that if this was working perfectly
There is no need to load all the conversation
You just need to load user message and if it’s contains something needs for model to remember it he will just loop through memory to answer user question
Unfortunately
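For reference, the arithmetic I had in mind (assuming 8 chunks of about 4k tokens each, and the mid-2023 input prices of roughly $0.0015 per 1K tokens for gpt-3.5-turbo and $0.003 per 1K for gpt-3.5-turbo-16k):

# worst case: the answer sits in the oldest chunk, so all 8 chunks get scanned
chunks, tokens_per_chunk = 8, 4000
recursive_cost = chunks * tokens_per_chunk * 0.0015 / 1000   # ~= $0.048
single_16k_cost = 16000 * 0.003 / 1000                       # ~= $0.048 for one full 16k request
print(recursive_cost, single_16k_cost)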


All the models have a finite memory … 4k, 8k, 16k, 32k …

So to get “infinite memory” you would build a running database of chunks of text, along with embeddings, so that in 5 years when you ask the bot what your name is, it says “Your name is Haithem”.
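A minimal sketch of such a running store (the file name and helper names are placeholders; in practice you would use a proper vector database):

import json, os
import numpy as np
import openai

DB_PATH = "memory_db.json"  # placeholder; any persistent store works

def _embed(text):
    resp = openai.Embedding.create(model="text-embedding-ada-002", input=text)
    return resp["data"][0]["embedding"]

def remember(text):
    # append a chunk of text plus its embedding so it survives restarts
    db = json.load(open(DB_PATH)) if os.path.exists(DB_PATH) else []
    db.append({"text": text, "embedding": _embed(text)})
    json.dump(db, open(DB_PATH, "w"))

def recall(question, top_k=3):
    # load everything ever stored and return the chunks closest to the question
    db = json.load(open(DB_PATH)) if os.path.exists(DB_PATH) else []
    q = np.array(_embed(question))
    def sim(entry):
        e = np.array(entry["embedding"])
        return float(np.dot(q, e) / (np.linalg.norm(q) * np.linalg.norm(e)))
    return [entry["text"] for entry in sorted(db, key=sim, reverse=True)[:top_k]]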

I’m testing it on my app and it works fine.