Hi,
I’m creating a chatbot that analyses football matches based on event summaries I provide to it. However, I keep running into the model’s token limit. How can I reduce the number of tokens without losing relevant information?
import os
import re
import pandas as pd
from langchain import hub
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
# Read the OpenAI API key from the environment rather than embedding it in the
# source: a key stored in a file (or pasted into a forum post) is leaked and has
# to be revoked. Export it in the shell before running, e.g.
#   export OPENAI_API_KEY="sk-..."
# Fail fast here so a missing key surfaces immediately instead of at the first
# API call further down.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in your environment before running this script."
    )
# Compiled once at import time because parse_summary runs for every CSV row.
# The action type may span several words ("Shot on target"), and player/team
# names may contain dots, apostrophes and hyphens.
_SUMMARY_PATTERN = re.compile(
    r"Matchday (\d+): (Successful|Unsuccessful) ([\w\s]+?) in zone (\w+) "
    r"by player ([\w\s.'-]+?)"
    r"(?: to player ([\w\s.'-]+?) in zone (\w+))? "
    r"for team ([\w\s.'-]+) at minute (\d+):(\d+)"
)


def parse_summary(summary):
    """Parse one event summary string into a dict of event fields.

    Expected format::

        Matchday <n>: <Successful|Unsuccessful> <action> in zone <zone>
        by player <name>[ to player <name> in zone <zone>]
        for team <team> at minute <mm>:<ss>

    Args:
        summary: The summary text of a single event.

    Returns:
        A dict with the keys ``matchday``, ``success``, ``action_type``,
        ``zone_from``, ``player_from``, ``player_to``, ``zone_to``, ``team``,
        ``minute`` and ``assist``, or ``None`` when *summary* is not a string
        or does not match the expected format.

        * A purely numeric summary yields a placeholder record whose
          ``matchday`` and ``minute`` are both that number and whose other
          fields are ``None``.
        * When the event has no recipient, ``player_to`` is ``"N/A"`` and
          ``zone_to`` falls back to ``zone_from``.
        * ``assist`` is always ``None`` here; it is filled in later by the
          assist-identification step.
    """
    if not isinstance(summary, str):
        return None

    if summary.isdigit():  # The summary is a bare integer, not an event description
        number = int(summary)
        return {
            'matchday': number,
            'success': None,
            'action_type': None,
            'zone_from': None,
            'player_from': None,
            'player_to': None,
            'zone_to': None,
            'team': None,
            'minute': number,
            # Same key set as a fully parsed record.
            'assist': None,
        }

    match = _SUMMARY_PATTERN.search(summary)
    if match is None:
        return None

    # The seconds part of the timestamp is matched but not kept.
    (matchday, success, action_type, zone_from, player_from,
     player_to, zone_to, team, minute, _second) = match.groups()

    return {
        'matchday': int(matchday),
        'success': success,
        'action_type': action_type.strip(),
        'zone_from': zone_from,
        'player_from': player_from.strip(),
        'player_to': player_to.strip() if player_to else "N/A",
        'zone_to': zone_to.strip() if zone_to else zone_from,
        'team': team.strip(),
        'minute': int(minute),  # Convert minute to integer
        'assist': None,
    }
# CSV Loader
class CSVLoader:
    """Minimal pandas-backed CSV reader.

    NOTE: this class shadows the ``CSVLoader`` imported from
    ``langchain_community`` at the top of the file; from here on the name
    refers to this class.
    """

    def __init__(self, filepath):
        """Remember the path of the CSV file to read."""
        self.filepath = filepath

    def load(self):
        """Read the CSV into a DataFrame with every column loaded as ``str``.

        Returns:
            The loaded ``pandas.DataFrame``, or ``None`` if reading failed for
            any reason (the error is printed, not raised).
        """
        try:
            # dtype=str forces all columns to be loaded as strings
            frame = pd.read_csv(self.filepath, dtype=str)
        except Exception as err:
            print(f"Failed to load CSV: {err}")
            return None
        return frame
# Initialize and load documents
loader = CSVLoader("/Users/jesperpilegaard/Desktop/Superliga 2022-2023/csv-summaries/f24-100-2022-2288345-eventdetails.csv")
docs = loader.load()
# load() returns None when the file cannot be read. Stop with a clear message
# here, before the DataFrame is used, instead of failing on the calls below.
if not isinstance(docs, pd.DataFrame) or 'Summary' not in docs.columns:
    raise SystemExit("Failed to load or invalid document format.")
docs = docs.astype(str)
# Parse every summary and keep only the rows that produced a record;
# parse_summary returns None for text that does not match the expected format.
parsed_docs = [
    doc
    for doc in (parse_summary(summary) for summary in docs['Summary'] if pd.notna(summary))
    if doc is not None
]
# Implementing the assist identification with a safeguard against None entries
def identify_assists(actions):
    """Mark passes that are directly followed by a shot or goal as assists.

    Walks over consecutive pairs of actions. The first action of a pair gets
    ``'assist'`` set to ``True`` when it is a ``'Pass'``, the second is one of
    the shot/goal action types, both belong to the same team, and their
    minutes differ by at most one. Falsy entries (e.g. ``None``) are skipped.

    Args:
        actions: Sequence of action dicts (as produced by the summary parser).

    Returns:
        The same ``actions`` object, modified in place.
    """
    finishing_types = ['Shot on target', 'Shot on post', 'Shot saved', 'Goal']

    for earlier, later in zip(actions, actions[1:]):
        # Both entries of the pair must be populated dictionaries.
        if not (earlier and later):
            continue
        if earlier['action_type'] != 'Pass':
            continue
        if later['action_type'] not in finishing_types:
            continue
        if earlier['team'] != later['team']:
            continue
        if abs(later['minute'] - earlier['minute']) > 1:
            continue
        earlier['assist'] = True

    return actions
# Flag assisting passes on the records parsed above. The records are used as-is
# from here on; parsing the summaries a second time would throw the flags away.
parsed_docs = identify_assists(parsed_docs)
# Nothing parsed means nothing to embed, so stop with a clear message.
if not parsed_docs:
    raise SystemExit("No events could be parsed from the summaries; nothing to index.")
# One line of text per event. Placeholder records without an action type are
# left out: they would only add "None ..." lines to the prompt. A pass flagged
# as an assist carries a short marker so the model can see it.
texts = [
    f"{doc['action_type']}: {'successful' if doc['success'] == 'Successful' else 'unsuccessful'} in minute {doc['minute']} by {doc['player_from']} in zone {doc['zone_from']} to {doc['player_to']} in zone {doc['zone_to']} for team {doc['team']}"
    + (" (assist)" if doc.get('assist') else "")
    for doc in parsed_docs
    if doc and doc['action_type']
]
ids = [str(i) for i in range(len(texts))]
# Initialize LangChain components
llm = ChatOpenAI(model="gpt-3.5-turbo")
vectorstore = Chroma.from_texts(texts=texts, ids=ids, embedding=OpenAIEmbeddings(model="text-embedding-3-small"), persist_directory="vectorstore")
# NOTE: k is the number of event lines retrieved and placed into {context};
# together with the length of each line it determines the prompt size, so it
# is the first value to lower when the model's token limit is exceeded.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 500})
prompt = hub.pull("rlm/rag-prompt")
def format_docs(docs):
    """Join retrieved documents into one context string for the prompt.

    Uses each document's ``page_content`` when it has one, so only the event
    text reaches the prompt. Calling ``str()`` on a LangChain ``Document``
    also emits the ``page_content='...'`` wrapper and any metadata, which
    spends tokens on every retrieved row. Objects without ``page_content``
    (e.g. plain strings) are converted with ``str()``.

    Args:
        docs: Iterable of retrieved documents (or plain strings).

    Returns:
        The document texts separated by blank lines; ``""`` for no documents.
    """
    return "\n\n".join(
        str(getattr(doc, "page_content", doc)) for doc in docs
    )
# Prompt for the analyst. {context} receives the retrieved event lines and
# {question} the user's question. The first sentence names the sport
# unambiguously: "european football and not soccer" contradicted itself.
template = """You are a football analyst. When I say football, I mean association football (soccer), not American football.
You provide analyses based on the data given to you and nothing else. You count all the events in the match.
A match is one CSV file and includes summaries of all the chosen events with player names, team names, zones, matchday and minute.
Each row in the document is one event. Count the ones I am asking for. When I ask for the result, tell me the result of the match based on how many goals each team scored.
Please don't hallucinate and only count how many times an event occurs.
{context}
Question: {question}
Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)
# Retrieval -> prompt -> model -> plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)
# Keep and show the answer; a bare invoke() call in a script discards it.
answer = rag_chain.invoke("How many goals were scored during the match?")
print(answer)