Is it possible to scan some local files from the laptop with the API?

Is it possible to scan some local files from the laptop with the API?

For example, I want to check some PDF files that do not open, or that open but show an error. I want to diagnose them in bulk, using AI, Python, and an API key.

Can ChatGPT do this?

1 Like

Maybe, but you would have to send the raw content and not the actual document. It would also cost quite a bit.

Maybe you could send batches of the content? Starting from the first 100 characters or something, hoping that this will catch most of the issues.

3 Likes

Can you give me more details? Please be more specific.

Bruh I’m not ChatGPT. Experiment and find out & report back!

1 Like

Ah, the eternal struggle between humans and misbehaving PDFs! :face_with_monocle:

While I can’t directly scan files on your laptop (still waiting on that teleportation upgrade), you can definitely use Python to read your PDFs and then have me analyze them via the API. Here’s a roadmap for your grand quest:

  1. Python to the Rescue: Use libraries like PyPDF2, pdfminer.six, or PyMuPDF to attempt reading the PDFs.
  2. Diagnose the Culprits: For PDFs that throw errors or refuse to open, you can catch exceptions in Python to see what’s going wrong—perhaps they’re corrupted or encrypted by a secret society?
  3. Summon the AI: Send the extracted content (or error details) to the OpenAI API. I’ll do my best to provide insights, suggestions, or just a sympathetic virtual shoulder to cry on.

Just remember, while I’m pretty good at pattern recognition and diagnostics, I can’t fix a physically corrupted file—yet. So, handle those tricky PDFs gently!

Happy debugging, and may your code be ever free of semicolons! :rocket:

1 Like

thanks @RonaldGRuckus and @PaulBellow

I tested this code on some bad PDFs, and Python repaired them. Maybe there is another way, but Python and AI make it simple. There may be other solutions, but this one works in my case. If anyone has other good code…

import os
import PyPDF2
from pdfminer.high_level import extract_text
import fitz  # PyMuPDF
from PIL import Image

def print_step(step):
    """Print *step* as a banner: a blank line, then the text framed by '=' rules."""
    rule = '=' * 20
    print(f"\n{rule} {step} {rule}")

def repair_with_pypdf2(input_path, output_path):
    """Rewrite a PDF by copying its pages into a fresh PyPDF2 writer.

    Every page that ``PyPDF2.PdfReader`` yields for ``input_path`` is added
    to a new ``PyPDF2.PdfWriter``, which is then written to ``output_path``.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the rewritten PDF is saved to.

    Returns:
        True if the rewritten file was written, False if any step raised.
        Errors are reported on stdout rather than propagated.
    """
    print_step("Reparare cu PyPDF2")
    # Tracks whether we have (tried to) open the output file, so that a
    # failure while *reading* never deletes an output from an earlier run.
    output_started = False
    try:
        with open(input_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            writer = PyPDF2.PdfWriter()

            for page in reader.pages:
                writer.add_page(page)

            output_started = True
            with open(output_path, 'wb') as output_file:
                writer.write(output_file)
    except Exception as e:
        if output_started:
            # A failed write leaves a truncated file behind; remove it so it
            # is not mistaken for a successfully repaired PDF.
            try:
                os.remove(output_path)
            except OSError:
                pass
        print(f"Reparare cu PyPDF2 eșuată: {str(e)}")
        return False
    else:
        print("Reparare cu PyPDF2 reușită")
        return True

def convert_to_images_and_back(input_path, output_path):
    """Rasterize every page of a PDF and save the images as a new PDF.

    Each page is rendered with PyMuPDF, wrapped in a Pillow RGB image, and
    the images are saved together as a multi-page PDF at ``output_path``.
    The result contains page images only.

    Args:
        input_path: Path of the PDF to render.
        output_path: Path the image-based PDF is saved to.

    Returns:
        True if the output was saved, False if any step raised (including a
        document with no pages). Errors are reported on stdout.
    """
    print_step("Conversie PDF la imagini și înapoi")
    try:
        images = []
        # The context manager closes the document even if rendering fails,
        # so the input file handle is not leaked.
        with fitz.open(input_path) as doc:
            for page in doc:
                # alpha=False keeps the samples at 3 bytes per pixel, which
                # is the layout Image.frombytes("RGB", ...) expects.
                pix = page.get_pixmap(alpha=False)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                images.append(img)

        if not images:
            # Fail with a clear message instead of an IndexError on images[0].
            raise ValueError("documentul nu conține nicio pagină")

        images[0].save(output_path, "PDF", save_all=True, append_images=images[1:])
        print("Conversie PDF la imagini și înapoi reușită")
        return True
    except Exception as e:
        print(f"Conversie PDF la imagini și înapoi eșuată: {str(e)}")
        return False

def analyze_pdf(file_path):
    """Print basic diagnostics for a PDF using three independent parsers.

    PyPDF2, pdfminer and PyMuPDF are each tried in turn inside their own
    ``try`` block, so a failure in one parser is printed and does not stop
    the others from running.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        None. All results and errors are written to stdout.
    """
    print_step(f"Analizăm fișierul: {file_path}")

    try:
        # PyPDF2: page count and metadata.
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            print(f"PyPDF2 - Număr de pagini: {len(reader.pages)}")
            print(f"PyPDF2 - Metadate: {reader.metadata}")
    except Exception as e:
        print(f"Eroare PyPDF2: {str(e)}")

    try:
        # PDFMiner: length of the extracted text.
        text = extract_text(file_path)
        print(f"PDFMiner - Lungime text extras: {len(text)} caractere")
    except Exception as e:
        print(f"Eroare PDFMiner: {str(e)}")

    try:
        # PyMuPDF: page count and metadata. The context manager closes the
        # document so its file handle is released right after the report.
        with fitz.open(file_path) as doc:
            print(f"PyMuPDF - Număr de pagini: {doc.page_count}")
            print(f"PyMuPDF - Metadate: {doc.metadata}")
    except Exception as e:
        print(f"Eroare PyMuPDF: {str(e)}")

def repair_pdf(input_path, output_dir_1, output_dir_2):
    """Run both repair strategies on one PDF and analyse each result.

    Args:
        input_path: Path of the PDF to repair.
        output_dir_1: Directory for the PyPDF2 rewrite
            (file name ``<name>_pypdf2<ext>``).
        output_dir_2: Directory for the image round-trip result
            (file name ``<name>_img2pdf<ext>``).

    Returns:
        None. Progress and outcomes are written to stdout.
    """
    stem, suffix = os.path.splitext(os.path.basename(input_path))

    # Method 1: rewrite the page tree with PyPDF2.
    pypdf2_target = os.path.join(output_dir_1, f"{stem}_pypdf2{suffix}")
    if not repair_with_pypdf2(input_path, pypdf2_target):
        print("Metoda repair_with_pypdf2 a eșuat.")
    else:
        print(f"Analizăm fișierul reparat cu PyPDF2: {pypdf2_target}")
        analyze_pdf(pypdf2_target)

    # Method 2: render pages to images and reassemble them into a PDF.
    image_target = os.path.join(output_dir_2, f"{stem}_img2pdf{suffix}")
    if not convert_to_images_and_back(input_path, image_target):
        print("Metoda convert_to_images_and_back a eșuat.")
    else:
        print(f"Analizăm fișierul reparat prin conversie: {image_target}")
        analyze_pdf(image_target)

    print_step("Procesul de reparare și analiză a fost finalizat")

def process_directory(input_dir, output_dir_1, output_dir_2):
    """Repair every entry in ``input_dir`` whose name ends in ``.pdf``.

    The extension check is case-insensitive and the listing is not
    recursive. Each matching entry is announced and passed to
    ``repair_pdf`` together with the two output directories.
    """
    for entry in os.listdir(input_dir):
        if not entry.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(input_dir, entry)
        print_step(f"Procesăm fișierul: {entry}")
        repair_pdf(pdf_path, output_dir_1, output_dir_2)

if __name__ == "__main__":
    # Hard-coded locations: the folder holding the damaged PDFs and one
    # output folder per repair method.
    input_dir = r"d:\PDF PROASTE"
    output_dir_1 = r"d:\PDF PROASTE\OUTPUT-1"
    output_dir_2 = r"d:\PDF PROASTE\OUTPUT-2"

    # Create the output directories if they do not exist yet. exist_ok=True
    # avoids the check-then-create race, and the loop variable no longer
    # shadows the builtin dir().
    for out_dir in (output_dir_1, output_dir_2):
        os.makedirs(out_dir, exist_ok=True)

    process_directory(input_dir, output_dir_1, output_dir_2)

I think this is a good start. I’d imagine that these libraries, and maybe others more specific to your case, are pretty good at catching & repairing broken PDFs.

If these fail it may be worth seeing how the model could handle the raw content and if it could be helpful - I’d love to know myself. These models have continued to surprise me, so :person_shrugging:

3 Likes

Why did the code disappear?

If I recall correctly I believe the person who posted code was just copying + pasting ChatGPT output.