How to Handle Token Limit Exceeded Error in OpenAI API

I'm getting an error from the OpenAI API stating that the context length exceeds the model's limit, even though I'm only passing the last four messages in the prompt. I've verified that each interaction uses around 1056 tokens, but I'm still hitting the error when sending the prompt to the model, and I'm not sure why I'm exceeding the token limit.

Full error message:

```
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8452 tokens (8415 in the messages, 37 in the functions). Please reduce the length of the messages or functions.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
```
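
For anyone trying to reproduce the numbers above, this is a quick way to sanity-check per-message token counts (a diagnostic sketch using tiktoken, separate from my pipeline; `cl100k_base` is an assumption for this model family):

```python
# Diagnostic sketch: count tokens per message with tiktoken.
# "cl100k_base" is an assumption for this model family.
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def count_message_tokens(messages: list) -> int:
    """Rough token total over a list of LangChain messages."""
    total = 0
    for message in messages:
        content = message.content if isinstance(message.content, str) else str(message.content)
        total += len(encoding.encode(content))
    return total
```

This undercounts slightly, since it ignores per-message overhead and any bound tool schemas (which the error message counts separately as "37 in the functions").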

```python
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve documents relevant to the query."""
    try:
        vector_store = document_embeddings.get_vectorstore()
        # Fetch the top 4 matching chunks; max_tokens_limit is passed
        # through as a search kwarg
        retrieved_docs = vector_store.similarity_search(
            query, k=4, max_tokens_limit=3000
        )
        serialized = "\n\n".join(
            f"Source: {doc.metadata}\nContent: {doc.page_content}"
            for doc in retrieved_docs
        )
        return serialized, retrieved_docs
    except Exception as e:
        print(f"Error during retrieval: {e}")
        raise
```
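
For context, this is the kind of hard cap I expected `max_tokens_limit=3000` to enforce on the serialized context (an untested sketch, not part of my current code; it reuses the tiktoken `encoding` from the sketch above):

```python
# Sketch only: hard-cap the serialized retrieval context by token count.
def cap_docs_by_tokens(docs, max_tokens: int = 3000) -> str:
    parts, used = [], 0
    for doc in docs:
        chunk = f"Source: {doc.metadata}\nContent: {doc.page_content}"
        n = len(encoding.encode(chunk))
        if used + n > max_tokens:
            break  # stop before the serialized context exceeds the budget
        parts.append(chunk)
        used += n
    return "\n\n".join(parts)
```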
```python
from langchain_community.callbacks.manager import get_openai_callback


def filter_messages(messages: list):
    # Simple helper that keeps only the last 4 messages to prevent the
    # context limit error
    return messages[-4:]


def query_or_respond(state: MessagesState):
    llm_with_tools = llm.bind_tools([retrieve])
    # Invoked with the full message history from state
    response = llm_with_tools.invoke(state["messages"])
    return {"messages": [response]}


tools = ToolNode([retrieve])


def generate(state: MessagesState):
    messages = filter_messages(state["messages"])

    # Collect the most recent consecutive tool messages
    recent_tool_messages = []
    for message in reversed(messages):
        if message.type == "tool":
            print("Tool")
            recent_tool_messages.append(message)
        else:
            break
    tool_messages = recent_tool_messages[::-1]

    docs_content = "\n\n".join(message.content for message in tool_messages)
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        f"{docs_content}"
        "\nUse the documents/context above."
    )

    conversation_messages = [
        message
        for message in messages
        if message.type in ("human", "system")
        or (message.type == "ai" and not message.tool_calls)
    ]
    prompt = [SystemMessage(system_message_content)] + conversation_messages
    print(f"prompt: {prompt}")

    # Run the model and log token usage
    with get_openai_callback() as cb:
        response = llm.invoke(prompt)
        print(f"Total Tokens: {cb.total_tokens}")
        print(f"Prompt Tokens: {cb.prompt_tokens}")
        print(f"Completion Tokens: {cb.completion_tokens}")
        print(f"Total Cost (USD): ${cb.total_cost}")

    return {"messages": [response]}


memory = MemorySaver()

graph_builder = StateGraph(MessagesState)
graph_builder.add_node(query_or_respond)
graph_builder.add_node(tools)
graph_builder.add_node(generate)
graph_builder.set_entry_point("query_or_respond")
graph_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {END: END, "tools": "tools"},
)
graph_builder.add_edge("tools", "generate")
graph_builder.add_edge("generate", END)
graph = graph_builder.compile(checkpointer=memory)
```
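
LangChain also ships a token-based trimmer, so `filter_messages` could trim by tokens instead of message count; a rough sketch (untested in my graph; using the `llm` itself as the token counter and the 4000-token budget are both assumptions):

```python
from langchain_core.messages import trim_messages

def filter_messages_by_tokens(messages: list):
    # Keep the most recent messages that fit in a 4000-token budget
    # (budget and token counter are assumptions, not tuned values).
    return trim_messages(
        messages,
        max_tokens=4000,
        strategy="last",      # keep the newest messages
        token_counter=llm,    # let the model count its own tokens
        include_system=True,  # never drop the system message
        start_on="human",     # keep the history well-formed for the API
    )
```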

For the embeddings I'm using OpenAI embeddings with chunk size = 1000 and overlap = 200, parsing with LlamaParse and Unstructured for the Markdown loader.
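
The splitter configuration is equivalent to something like this (`RecursiveCharacterTextSplitter` and the `parsed_docs` variable are assumptions for illustration; the sizes are the ones I actually use):

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap as described above; the splitter class
# itself is an assumption for this sketch.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(parsed_docs)  # parsed Markdown docs
```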

Any advice or solutions would be greatly appreciated!