I’m creating a chatbot to provide football analyses, and I’ve provided it with CSV files that contain summaries of each event I’ve chosen. Each row contains the summary of one event. I’ve tried to parse the relevant information, but the chatbot keeps giving me wrong responses. For example, if I ask “How many successful passes did FC Midtjylland have?”, it keeps giving me the answer 2, which is obviously wrong. I hope you can help me.
import os
import re
import pandas as pd
from langchain import hub
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import PromptTemplate
# Provide a fallback OpenAI API key without clobbering one that is already
# exported in the environment. Replace the placeholder with a real key (or,
# better, export OPENAI_API_KEY outside the source file) before running.
os.environ.setdefault("OPENAI_API_KEY", 'myAPIKey')
# Compiled once at import time because parse_summary runs for every CSV row.
# The action type may span several words ("Shot on target", "Offside Pass"),
# so it is matched lazily up to the literal " in zone ".
_SUMMARY_PATTERN = re.compile(
    r"Matchday (\d+): (Successful|Unsuccessful) ([\w\s]+?) in zone (\w+) by player ([\w\s]+?)"
    r"(?: to player ([\w\s]+?) in zone (\w+))? for team ([\w\s]+) at minute (\d+):(\d+)"
)


def parse_summary(summary):
    """Parse one event summary string into a dictionary of its fields.

    Args:
        summary: Text of one event row, e.g.
            "Matchday 5: Successful Pass in zone A1 by player X to player Y
            in zone B2 for team T at minute 12:34". A string made only of
            digits is also accepted.

    Returns:
        A dict with the keys 'matchday', 'success', 'action_type',
        'zone_from', 'player_from', 'player_to', 'zone_to', 'team',
        'minute' and 'assist', or None when the text does not match the
        expected layout (or is not a string at all).

        'matchday' and 'minute' are integers in every returned dict so that
        callers can do arithmetic on them. When the event has no receiver,
        'player_to' is "N/A" and 'zone_to' repeats 'zone_from'.
    """
    if not isinstance(summary, str):
        return None

    if summary.isdigit():
        # A purely numeric summary carries no event details; the number is
        # stored as both matchday and minute and everything else is None.
        number = int(summary)
        return {
            'matchday': number,
            'success': None,
            'action_type': None,
            'zone_from': None,
            'player_from': None,
            'player_to': None,
            'zone_to': None,
            'team': None,
            'minute': number,
            'assist': None,
        }

    match = _SUMMARY_PATTERN.search(summary)
    if match is None:
        return None

    (matchday, success, action_type, zone_from, player_from,
     player_to, zone_to, team, minute, _second) = match.groups()

    return {
        'matchday': int(matchday),
        'success': success,
        'action_type': action_type.strip(),
        'zone_from': zone_from,
        'player_from': player_from.strip(),
        # The "to player ... in zone ..." part is optional in the summary.
        'player_to': player_to.strip() if player_to else "N/A",
        'zone_to': zone_to.strip() if zone_to else zone_from,
        'team': team.strip(),
        'minute': int(minute),
        'assist': None,
    }
# No parsing can happen at this point: `docs` is only bound once the CSV has
# been loaded further down, so reading docs['Summary'] here raised NameError.
# Start from an empty list; the summaries are parsed after loading.
parsed_docs = []
# Implementing the assist identification with a safeguard against None entries
def identify_assists(actions):
    """Flag passes that are immediately followed by a same-team shot or goal.

    Walks over consecutive pairs of events. When the first event is a 'Pass',
    the next one is a shot/goal by the same team, and the two happen at most
    one minute apart, the pass gets ``'assist': True``.

    Args:
        actions: List of event dicts (entries may be None) carrying at least
            the keys 'action_type', 'team' and 'minute'. 'minute' may be an
            int or a numeric string.

    Returns:
        The same list object, with qualifying passes modified in place.
    """
    finishing_actions = ('Shot on target', 'Shot on post', 'Shot saved', 'Goal')

    for current_action, next_action in zip(actions, actions[1:]):
        # Skip pairs where either entry is missing (unparsed row).
        if not current_action or not next_action:
            continue
        if current_action.get('action_type') != 'Pass':
            continue
        if next_action.get('action_type') not in finishing_actions:
            continue
        if current_action.get('team') != next_action.get('team'):
            continue
        try:
            # Minutes can arrive as strings; coerce before subtracting so the
            # comparison does not raise TypeError.
            gap = abs(int(next_action['minute']) - int(current_action['minute']))
        except (KeyError, TypeError, ValueError):
            continue
        if gap <= 1:
            current_action['assist'] = True

    return actions
# Mark assisting passes on the parsed events; identify_assists hands back the
# same list it was given, so the name simply keeps pointing at it.
parsed_docs = identify_assists(actions=parsed_docs)
# CSV Loader
# NOTE: this definition rebinds the name CSVLoader that the top of the file
# imports from langchain_community; from here on the name refers to this class.
class CSVLoader:
    """Read a CSV file into a pandas DataFrame whose columns are all strings."""

    def __init__(self, filepath):
        """Remember the path of the CSV file; nothing is read yet."""
        self.filepath = filepath

    def load(self):
        """Read the CSV at ``self.filepath``.

        Returns:
            The DataFrame, with every column read as ``str``, or ``None``
            (after printing the error) when reading raises.
        """
        frame = None
        try:
            # dtype=str keeps every column as text instead of letting pandas
            # infer numeric types.
            frame = pd.read_csv(self.filepath, dtype=str)
        except Exception as error:
            print(f"Failed to load CSV: {error}")
        return frame
# Initialize and load documents
loader = CSVLoader("/Users/jesperpilegaard/Desktop/Superliga 2022-2023/csv-summaries/f24-100-2022-2288344-eventdetails.csv")
docs = loader.load()

# Validate before touching the frame: load() returns None when reading fails,
# and calling .astype on None would raise AttributeError ahead of this check.
if docs is None or not isinstance(docs, pd.DataFrame):
    # Nothing below can work without the event data, so stop here.
    raise SystemExit("Failed to load or invalid document format.")
docs = docs.astype(str)

# Keep only the rows parse_summary recognised (it returns None otherwise).
# NOTE(review): this rebinding replaces whatever parsed_docs held before, so
# assist flags set earlier in the file are not carried over.
parsed_docs = [doc for doc in map(parse_summary, docs['Summary']) if doc is not None]

# One sentence per event for the vector store. Entries without an action type
# (purely numeric summaries) are left out; they would otherwise be embedded as
# "None: unsuccessful ..." and pollute the counts.
texts = [
    f"{doc['action_type']}: {'successful' if doc['success'] == 'Successful' else 'unsuccessful'} in minute {doc['minute']} by {doc['player_from']} in zone {doc['zone_from']} to {doc['player_to']} in zone {doc['zone_to']} for team {doc['team']}"
    for doc in parsed_docs
    if doc['action_type'] is not None
]
ids = [str(i) for i in range(len(texts))]

# Initialize LangChain components
llm = ChatOpenAI(model="gpt-3.5-turbo")
vectorstore = Chroma.from_texts(texts=texts, ids=ids, embedding=OpenAIEmbeddings())
# NOTE(review): as_retriever() without search_kwargs returns only a few top
# matches per query (LangChain's default is k=4 - confirm for the installed
# version). Counting questions such as "how many passes ..." are therefore
# answered from a handful of events, not from the whole match. Counting in
# pandas over parsed_docs, or raising k, is needed for correct totals.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
def format_docs(docs):
    """Join retrieved documents into one context string for the prompt.

    Args:
        docs: Iterable of retrieved items. Objects exposing a
            ``page_content`` attribute contribute that text; anything else
            is converted with ``str``.

    Returns:
        The texts separated by blank lines ("" for an empty iterable).
    """
    # Using page_content keeps the context to the event sentence itself
    # instead of the object's full string form (which can include metadata).
    return "\n\n".join(str(getattr(doc, "page_content", doc)) for doc in docs)
# Prompt text: {context} receives the formatted retrieved events and
# {question} receives the user's question unchanged.
template = """You are a football analyst. When I say football, I mean european football and not soccer.
You provide analyses based on the data given to you and nothing else. You count all the events in the match, whether it is Pass,
Offside Pass, Foul, Corner Awarded, Shot off target, Shot on post, Shot saved, Tackle, Interception, Save, Clearance, Goal or Card.
A match is one CSV file and includes summaries of all the chosen events with player names, team names, zones, matchday and minute.
Each row in the document is one event. Count the ones I am asking for.
{context}
Question: {question}
Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

# First stage of the chain: look up context for the question and pass the
# question itself through untouched.
_chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Inputs -> prompt -> chat model -> plain string answer.
rag_chain = _chain_inputs | custom_rag_prompt | llm | StrOutputParser()

rag_chain.invoke("How many passes did Randers FC have?")