Search tool for university research assignment

found it!

@anciaroux30 This script is kinda like a search-engine for your PDFs. It dives into your files, grabs all the juicy details and text, and then uses some smart NLP/AI magic to figure out the best matches for whatever you’re looking for. It will also work with single PDFs, but it can scour through whole folders or even ZIPs full of them!

Once it’s done hunting, it’ll tell you where in which PDF it found the closest matches. And, for the curious ones, it’ll even show how long the whole treasure hunt took. To get started, just run the script, point it to where your PDFs are (be it a folder or a ZIP), and throw in what you’re curious about.

Before diving into the script, make sure you've got all the tools in your Python toolbox. Here's what you need:

:wrench: First, ensure you’ve got the required packages by running these commands:

pip install pdfminer.six
pip install spacy
pip install scikit-learn

:books: And don’t forget to download the English model for spaCy:

python -m spacy download en_core_web_sm

After setting up these tools, you’re good to go with the script. Remember: just run it, point to your stash of PDFs, and ask away! Like so: python script_name.py /where/your/PDFs/are "what's bugging you". Happy searching! :man_detective::mag:

here's the script:

import os
import sys
import tempfile
import zipfile
from io import StringIO

import spacy
from pdfminer.converter import PDFPageAggregator
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


#timing function
import time

start_time = time.time()


class PDFInfo:
    """Citation record for one searched PDF: title, author, file name and
    (optionally) the page a match came from."""

    def __init__(self, title, author, filename, page_number):
        self.title = title
        self.author = author
        self.filename = filename
        self.page_number = page_number

    def __str__(self):
        # Human-readable citation line used when printing results.
        return (
            f"{self.title} by {self.author} "
            f"in {self.filename} on page {self.page_number}"
        )

def decode_with_fallback(text_bytes, encodings):
    """Decode *text_bytes* trying each codec in *encodings* in order.

    Falls back to lossy UTF-8 decoding if every codec fails.  Unknown codec
    names are skipped (the original only caught UnicodeDecodeError, so an
    invalid name such as 'Unicode' raised LookupError and crashed the whole
    extraction).  A value that is already a str is returned unchanged, since
    PDF metadata values are sometimes pre-decoded.

    Args:
        text_bytes: bytes (or str) to decode.
        encodings: iterable of codec names to try, in priority order.

    Returns:
        The decoded string.
    """
    if isinstance(text_bytes, str):
        return text_bytes
    for encoding in encodings:
        try:
            return text_bytes.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError covers codec names Python does not know about.
            pass
    return text_bytes.decode('utf-8', errors='ignore')

def extract_pdf_metadata_and_text(pdf_path):
    """Parse a PDF file and return its metadata plus per-page text boxes.

    Args:
        pdf_path: filesystem path to the PDF.

    Returns:
        (title, author, extracted_text) where extracted_text is a list of
        (text, page_number) tuples; page numbers start at 1.
    """
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        # Only registered codec names: the original list also contained
        # 'Unicode', 'ASCII85' and 'binary', which are not Python codecs and
        # made bytes.decode raise LookupError instead of moving to the next
        # candidate.
        encodings = ['utf-8', 'utf-16', 'ISO-8859-1', 'ascii']

        # doc.info is an empty list for PDFs without an Info dictionary;
        # indexing [0] unconditionally raised IndexError on such files.
        info = doc.info[0] if doc.info else {}
        author = info.get('Author', b'Unknown Author')
        title = info.get('Title', b'Unknown Title')

        author_decoded = decode_with_fallback(author, encodings)
        title_decoded = decode_with_fallback(title, encodings)

        laparams = LAParams(line_margin=0.5)
        resource_manager = PDFResourceManager()
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)

        extracted_text = []
        for pagenum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            layout = device.get_result()
            for element in layout:
                # Keep only horizontal text boxes; figures/lines are skipped.
                if isinstance(element, LTTextBoxHorizontal):
                    extracted_text.append((element.get_text().strip(), pagenum + 1))

    return title_decoded, author_decoded, extracted_text


def preprocess_text(text_and_page_numbers, window_size=3):
    """Split extracted PDF text into sentence snippets with token context.

    Each sentence is expanded by up to *window_size* tokens on both sides so
    matches retain some surrounding context.

    Args:
        text_and_page_numbers: list of (text, page_number) tuples.
        window_size: number of extra tokens to include before/after each
            sentence (default 3).

    Returns:
        List of (context_text, page_number) tuples.
    """
    # Cache the spaCy pipeline on the function itself: loading the model is
    # expensive and the original reloaded it for every single PDF.
    nlp = getattr(preprocess_text, "_nlp", None)
    if nlp is None:
        nlp = preprocess_text._nlp = spacy.load("en_core_web_sm")

    processed_text = []
    for text, page_number in text_and_page_numbers:
        doc = nlp(text)
        for sent in doc.sents:
            # Clamp the context window to the document boundaries.
            start = max(0, sent.start - window_size)
            end = min(len(doc), sent.end + window_size)
            context = doc[start:end].text.strip()
            processed_text.append((context, page_number))
    return processed_text

def process_query(query):
    """Run the user's query through spaCy and return the processed Doc.

    Args:
        query: raw query string.

    Returns:
        The spaCy Doc for the query.
    """
    # Cache the pipeline on the function: the original called spacy.load on
    # every invocation, which dominates the runtime for repeated queries.
    nlp = getattr(process_query, "_nlp", None)
    if nlp is None:
        nlp = process_query._nlp = spacy.load("en_core_web_sm")
    return nlp(query)

def match_text(query, sentences_and_page_numbers, top_k):
    """Rank candidate snippets against the query by TF-IDF cosine similarity.

    Args:
        query: query string.
        sentences_and_page_numbers: list of (sentence, page_number) tuples.
        top_k: number of best matches to return.

    Returns:
        Up to top_k (sentence, page_number) tuples, best match first.
    """
    if not sentences_and_page_numbers:
        # Nothing to rank (e.g. an image-only PDF); zip(*[]) would raise.
        return []

    sentences, _ = zip(*sentences_and_page_numbers)

    # Fit the vocabulary and IDF weights on the corpus, then project the
    # query into that space.  The original fit on the single query document,
    # which made every IDF weight identical and dropped all non-query terms
    # from the sentence vectors.
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)
    query_vector = vectorizer.transform([query])

    similarity_scores = cosine_similarity(query_vector, sentence_vectors)[0]
    top_k_indices = similarity_scores.argsort()[-top_k:][::-1]

    return [sentences_and_page_numbers[idx] for idx in top_k_indices]

def fine_search(query, top_k_matches):
    """Hook for a second, finer-grained re-ranking pass.

    Currently an identity pass-through; a transformer-based re-ranker
    (e.g. BERT or GPT embeddings) could be plugged in here later.
    """
    return top_k_matches


def search_pdf_file(pdf_path, query, top_k=5):
    """Search a single PDF for the query.

    Returns a (best_matches, PDFInfo) pair, where best_matches is a list of
    (snippet, page_number) tuples ranked best-first.
    """
    title, author, page_texts = extract_pdf_metadata_and_text(pdf_path)
    candidates = preprocess_text(page_texts)
    processed_query = process_query(query)
    coarse_matches = match_text(str(processed_query), candidates, top_k)
    refined_matches = fine_search(query, coarse_matches)
    info = PDFInfo(title, author, os.path.basename(pdf_path), None)
    return refined_matches, info

def search_directory(directory_path, query, top_k=5):
    """Search every PDF directly inside *directory_path* (non-recursive).

    Args:
        directory_path: folder containing PDF files.
        query: query string.
        top_k: matches to keep per PDF.

    Returns:
        Dict mapping each file's PDFInfo to its list of best matches.
    """
    answers = {}
    for filename in os.listdir(directory_path):
        # Case-insensitive check so files named e.g. 'REPORT.PDF' are not
        # silently skipped (the original only matched lowercase '.pdf').
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(directory_path, filename)
            best_matches, pdf_info = search_pdf_file(pdf_path, query, top_k)
            answers[pdf_info] = best_matches
    return answers

def search_zip_file(zip_path, query, top_k=5):
    """Search every PDF inside a ZIP archive.

    Members are extracted to a temporary directory first because the
    extraction pipeline opens files by path: the original passed the
    in-archive file object straight through, and open(file_object, 'rb')
    inside extract_pdf_metadata_and_text raised TypeError.

    Args:
        zip_path: path to the .zip archive.
        query: query string.
        top_k: matches to keep per PDF.

    Returns:
        Dict mapping each PDF's PDFInfo to its list of best matches.
    """
    answers = {}
    with zipfile.ZipFile(zip_path, "r") as zf:
        # Temporary files are cleaned up automatically when the block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            for member in zf.namelist():
                if member.lower().endswith(".pdf"):
                    extracted_path = zf.extract(member, tmpdir)
                    best_matches, pdf_info = search_pdf_file(
                        extracted_path, query, top_k
                    )
                    answers[pdf_info] = best_matches
    return answers

def main(input_path, queries):
    """Run each query against the PDFs at *input_path* and print the hits.

    *input_path* may be a directory of PDFs or a .zip archive; anything
    else raises ValueError.  After each query the elapsed wall-clock time
    since module start (see start_time) is printed.
    """
    for query in queries:
        print("\n")
        print(f"Searching for: {query}\n")

        if os.path.isdir(input_path):
            answers = search_directory(input_path, query)
        elif input_path.endswith(".zip"):
            answers = search_zip_file(input_path, query)
        else:
            raise ValueError("Input path must be a directory or a .zip file")

        for pdf_info, best_matches in answers.items():
            for best_match, page_number in best_matches:
                print(f"\nAnswer from {pdf_info.filename} on page {page_number}: \n {best_match}")
        print("\n")

        # Report wall-clock time since module import.
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time:.4f} seconds")
if __name__ == "__main__":
    # Fail fast with a usage message instead of a bare IndexError when the
    # path or the query is missing.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <pdf_directory_or_zip> <query> [query ...]")
    input_path = sys.argv[1]
    queries = sys.argv[2:]
    main(input_path, queries)

I hope that helps :smiley:

3 Likes