How do I reduce tokens without losing relevant information?

Hi,

I’m creating a chatbot to analyse football matches based on summaries I’ve given to it. However, I’m running into token limit problems. How can I reduce the token count without losing relevant information?

import os
import re
import pandas as pd
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Set environment variable for OpenAI API Key
os.environ["OPENAI_API_KEY"] = '■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■IMG7zkKB'

def parse_summary(summary):
    if summary.isdigit():  # A bare number is a matchday marker, not a timed event
        return {'matchday': int(summary), 'success': None, 'action_type': None,
                'zone_from': None, 'player_from': None, 'player_to': None,
                'zone_to': None, 'team': None, 'minute': None, 'assist': None}
    
    # Action types can be multi-word ("Shot on target"), so match them with a
    # non-greedy [\w\s]+? instead of \w+
    pattern = (
        r"Matchday (\d+): (Successful|Unsuccessful) ([\w\s]+?) in zone (\w+) by player ([\w\s]+?)"
        r"(?: to player ([\w\s]+?) in zone (\w+))? for team ([\w\s]+) at minute (\d+):(\d+)"
    )

    match = re.search(pattern, summary)
    if match:
        matchday, success, action_type, zone_from, player_from, player_to, zone_to, team, minute, second = match.groups()
        player_from = player_from.strip()
        player_to = player_to.strip() if player_to else "N/A"
        zone_to = zone_to.strip() if zone_to else zone_from
        team = team.strip()

        assist = None
        
        return {
            'matchday': int(matchday),
            'success': success,
            'action_type': action_type,
            'zone_from': zone_from,
            'player_from': player_from,
            'player_to': player_to,
            'zone_to': zone_to,
            'team': team,
            'minute': int(minute),  # Convert minute to integer
            'assist': assist
        }
    return None

# CSV Loader
class CSVLoader:
    def __init__(self, filepath):
        self.filepath = filepath

    def load(self):
        try:
            # Specify dtype=str to force all columns to be loaded as strings
            return pd.read_csv(self.filepath, dtype=str)
        except Exception as e:
            print(f"Failed to load CSV: {e}")
            return None

# Initialize and load documents
loader = CSVLoader("/Users/jesperpilegaard/Desktop/Superliga 2022-2023/csv-summaries/f24-100-2022-2288345-eventdetails.csv")
docs = loader.load()

# Validate the load before touching any columns
if docs is None or not isinstance(docs, pd.DataFrame):
    raise SystemExit("Failed to load or invalid document format.")

# Filter out None values after parsing summaries
parsed_docs = [doc for doc in (parse_summary(summary) for summary in docs['Summary'] if pd.notna(summary)) if doc is not None]

# Implementing the assist identification with a safeguard against None entries
def identify_assists(actions):
    for i in range(len(actions) - 1):
        current_action = actions[i]
        next_action = actions[i + 1]
        
        # Ensure both actions are properly populated dictionaries
        if current_action and next_action:
            # Check conditions only if both entries are valid
            if (current_action['action_type'] == 'Pass' and
                next_action['action_type'] in ['Shot on target', 'Shot on post', 'Shot saved', 'Goal'] and
                current_action['team'] == next_action['team'] and
                abs(next_action['minute'] - current_action['minute']) <= 1):
                current_action['assist'] = True

    return actions

# Apply the function to the parsed data
parsed_docs = identify_assists(parsed_docs)

# Build one short text per event; re-parsing at this point would throw away the assist flags
texts = [
    f"{doc['action_type']}: {'successful' if doc['success'] == 'Successful' else 'unsuccessful'} "
    f"in minute {doc['minute']} by {doc['player_from']} in zone {doc['zone_from']} "
    f"to {doc['player_to']} in zone {doc['zone_to']} for team {doc['team']}"
    for doc in parsed_docs
]
ids = [str(i) for i in range(len(texts))]

# Initialize LangChain components
llm = ChatOpenAI(model="gpt-3.5-turbo")
vectorstore = Chroma.from_texts(texts=texts, ids=ids, embedding=OpenAIEmbeddings(model="text-embedding-3-small"), persist_directory="vectorstore")
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 500})

def format_docs(docs):
    return "\n\n".join(str(doc) for doc in docs)

template = """You are a football analyst. When I say football, I mean European football and not soccer.
You provide analyses based on the data given to you and nothing else. You count all the events in the match.
A match is one CSV file and includes summaries of all the chosen events with player names, team names, zones, matchday and minute.
Each row in the document is one event. Count the ones I am asking for. When I ask for the result, tell me the result of the match based on how many goals each team scored.
Please don't hallucinate and only count how many times an event occurs.

{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("How many goals were scored during the match?")

How did you do your embedding? Are the texts behind each vector too long?

What’s the size of each of your chunks? Is your k set to 500?
If each chunk contains about 1k tokens, then each model run puts roughly 500k tokens of context into the prompt, and you would need a model with a 1M-token context window to run it.
Try a smaller k to stay within the token limit, and look at what is actually sent to the LLM; it takes some effort to test and tune.
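
For example, you can measure what the retriever actually sends before it reaches the model. A minimal sketch, assuming the retriever and format_docs from your code, and using tiktoken (cl100k_base is the encoding used by gpt-3.5-turbo):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

# Retrieve exactly what the chain would retrieve for this question
retrieved = retriever.invoke("How many goals were scored during the match?")
context = format_docs(retrieved)

# If this exceeds the model's context window, lower k or shorten the chunk texts
print(f"{len(retrieved)} chunks, {len(enc.encode(context))} context tokens")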


Thanks, I’ve tried reducing k, but then I get wrong answers. I have a CSV file with a summary of every event in each row, and it seems like the model has to go through every event to count, say, the number of successful passes. If a team has around 400 successful passes, I need a high k, right? If I set k to 10, it won’t count more than 10 successful passes in the file.
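
One way around that trade-off (a sketch, not something from this thread): the counting itself is deterministic, so you can aggregate the events in Python first and give the model the totals instead of hundreds of raw rows. Assuming the parsed_docs list from the code above:

from collections import Counter

# Count deterministic statistics in code instead of asking the LLM to count
successful_passes = Counter(
    doc['team']
    for doc in parsed_docs
    if doc['action_type'] == 'Pass' and doc['success'] == 'Successful'
)
goals = Counter(doc['team'] for doc in parsed_docs if doc['action_type'] == 'Goal')

# A few lines of totals cost a handful of tokens but lose none of the counts
stats_context = "\n".join(
    f"{team}: {passes} successful passes, {goals.get(team, 0)} goals"
    for team, passes in successful_passes.items()
)
print(stats_context)

The model then only needs raw events for questions that a precomputed total can’t answer, so k can stay small.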