Why doesn't my chatbot count the correct amount of events?

I’m creating a chatbot to provide football analyses, and I’ve provided it with CSV files that contains summaries of each event I’ve chosen. Each row contains a summary with an event. I’ve tried to parse the relevant information, but the chatbot keeps giving me wrong responses. For example if I ask “How many successful passes did FC Midtjylland have”, it keeps giving me the answer 2 which is obviously wrong. I hope you can help me.

import os
import re
import pandas as pd
from langchain import hub
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import PromptTemplate

# Set environment variable for OpenAI API Key
os.environ["OPENAI_API_KEY"] = 'myAPIKey'

def parse_summary(summary):
    if summary.isdigit():  # Check if the summary is a numeric value (integer)
        return {'matchday': int(summary), 'success': None, 'action_type': None,
                'zone_from': None, 'player_from': None, 'player_to': None,
                'zone_to': None, 'team': None, 'minute': int(summary)}
    
    pattern = (
        r"Matchday (\d+): (Successful|Unsuccessful) (\w+) in zone (\w+) by player ([\w\s]+?)"
        r"(?: to player ([\w\s]+?) in zone (\w+))? for team ([\w\s]+) at minute (\d+):(\d+)"
    )

    match = re.search(pattern, summary)
    if match:
        matchday, success, action_type, zone_from, player_from, player_to, zone_to, team, minute, second = match.groups()
        player_from = player_from.strip()
        player_to = player_to.strip() if player_to else "N/A"
        zone_to = zone_to.strip() if zone_to else zone_from
        team = team.strip()

        assist = None
        
        return {
            'matchday': int(matchday),
            'success': success,
            'action_type': action_type,
            'zone_from': zone_from,
            'player_from': player_from,
            'player_to': player_to,
            'zone_to': zone_to,
            'team': team,
            'minute': minute,
            'assist': assist
        }
    return None

# Filter out None values after parsing summaries
parsed_docs = [doc for doc in (parse_summary(summary) for summary in docs['Summary'] if pd.notna(summary)) if doc is not None]

# Implementing the assist identification with a safeguard against None entries
def identify_assists(actions):
    for i in range(len(actions) - 1):
        current_action = actions[i]
        next_action = actions[i + 1]
        
        # Ensure both actions are properly populated dictionaries
        if current_action and next_action:
            # Check conditions only if both entries are valid
            if (current_action['action_type'] == 'Pass' and
                next_action['action_type'] in ['Shot on target', 'Shot on post', 'Shot saved', 'Goal'] and
                current_action['team'] == next_action['team'] and
                abs(next_action['minute'] - current_action['minute']) <= 1):
                current_action['assist'] = True

    return actions

# Apply the function to the parsed data
parsed_docs = identify_assists(parsed_docs)

# CSV Loader
class CSVLoader:
    def __init__(self, filepath):
        self.filepath = filepath

    def load(self):
        try:
            # Specify dtype=str to force all columns to be loaded as strings
            return pd.read_csv(self.filepath, dtype=str)
        except Exception as e:
            print(f"Failed to load CSV: {e}")
            return None

# Initialize and load documents
loader = CSVLoader("/Users/jesperpilegaard/Desktop/Superliga 2022-2023/csv-summaries/f24-100-2022-2288344-eventdetails.csv")
docs = loader.load()
docs = docs.astype(str)

if docs is None or not isinstance(docs, pd.DataFrame):
    print("Failed to load or invalid document format.")
else:
    parsed_docs = [parse_summary(doc) for doc in docs['Summary'] if pd.notna(doc)]
    texts = [f"{doc['action_type']}: {'successful' if doc['success'] == 'Successful' else 'unsuccessful'} in minute {doc['minute']} by {doc['player_from']} in zone {doc['zone_from']} to {doc['player_to']} in zone {doc['zone_to']} for team {doc['team']}" for doc in parsed_docs if doc]
    ids = [str(i) for i in range(len(texts))]

    # Initialize LangChain components
    llm = ChatOpenAI(model="gpt-3.5-turbo")
    vectorstore = Chroma.from_texts(texts=texts, ids=ids, embedding=OpenAIEmbeddings())
    retriever = vectorstore.as_retriever()

    prompt = hub.pull("rlm/rag-prompt")
    def format_docs(docs):
        return "\n\n".join(str(doc) for doc in docs)

    template = """You are a football analyst. When I say football, I mean european football and not soccer.
    You provide analyses based on the data given to you and nothing else. You count all the events in the match, whether it is Pass, 
    Offside Pass, Foul, Corner Awarded, Shot off target, Shot on post, Shot saved, Tackle, Interception, Save, Clearance, Goal or Card.
    A match is one CSV file and includes summaries of all the chosen events with player names, team names, zones, matchday and minute.
    Each row in the document is one event. Count the ones I am asking for.

    {context}

    Question: {question}

    Helpful Answer:"""

    custom_rag_prompt = PromptTemplate.from_template(template)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | custom_rag_prompt
        | llm
        | StrOutputParser()
    )

rag_chain.invoke("How many passes did Randers FC have?")

Just a Suggestion, your use case fits the OpenAI Assistants. I see you’re using LangChain which I think use the OpenAI embedding and ChatCompletion. Instead of this, build your own assistant.

If you’re looking on how Assistant V2 works, here’s a video on OpenAI Assistant V2