Is it possible to scan local files on my laptop with the API?
For example, I want to check some PDF files that do not open, or that open but show an error. I want to diagnose them in bulk using AI, with Python and an API key.
Can ChatGPT do this?
1 Like
Maybe, but you would have to send the raw content and not the actual document. It would also cost quite a bit.
Maybe you could send the content in batches? Start with the first 100 characters or so, hoping that this will catch most of the issues.
3 Likes
Can you give me more details? Please be more specific.
Bruh I’m not ChatGPT. Experiment and find out & report back!
1 Like
thanks @RonaldGRuckus and @PaulBellow
I tested this code on some bad PDFs, and Python repaired them. Maybe there is another way, but Python and AI make it simple. There may be other solutions, but this one works in my case. If anyone has other good code, please share it…
import os
import PyPDF2
from pdfminer.high_level import extract_text
import fitz # PyMuPDF
from PIL import Image
def print_step(step):
    """Print a banner announcing a processing step.

    The banner is an empty line followed by the step text framed by twenty
    ``=`` characters on each side.

    Args:
        step: Text shown between the two runs of ``=``.
    """
    print(f"\n{'='*20} {step} {'='*20}")
def repair_with_pypdf2(input_path, output_path):
    """Rewrite a PDF page by page with PyPDF2.

    Every page that PyPDF2 reads from ``input_path`` is added to a fresh
    writer, which is then saved to ``output_path``.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the rewritten PDF is saved to.

    Returns:
        True when the rewritten file was saved, False if any step raised.
        Failures are reported on stdout rather than propagated.
    """
    print_step("Reparare cu PyPDF2")
    try:
        with open(input_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            writer = PyPDF2.PdfWriter()
            for page in reader.pages:
                writer.add_page(page)
            # Save while the input handle is still open: the copied pages
            # may still be backed by the source stream.
            with open(output_path, 'wb') as output_file:
                writer.write(output_file)
        print("Reparare cu PyPDF2 reușită")
        return True
    except Exception as e:
        # Deliberately broad: a damaged PDF can fail in many ways and the
        # caller only needs a success/failure answer.
        print(f"Reparare cu PyPDF2 eșuată: {e}")
        return False
def convert_to_images_and_back(input_path, output_path):
    """Rasterize each page of a PDF and save the images as a new PDF.

    Each page is rendered with PyMuPDF, wrapped in a Pillow RGB image, and
    all images are written to ``output_path`` as one multi-page PDF. The
    result contains page images only.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the image-based PDF is saved to.

    Returns:
        True when the new PDF was saved, False if any step raised
        (including a document with no pages). Failures are reported on
        stdout rather than propagated.
    """
    print_step("Conversie PDF la imagini și înapoi")
    try:
        # The context manager closes the document even when rendering
        # fails, so the input file handle is not leaked.
        with fitz.open(input_path) as doc:
            images = []
            for page in doc:
                pix = page.get_pixmap()
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
        if not images:
            # Saving needs a first image; fail with a clear reason instead
            # of an IndexError from images[0].
            raise ValueError("PDF-ul nu are nicio pagină")
        images[0].save(output_path, "PDF", save_all=True, append_images=images[1:])
        print("Conversie PDF la imagini și înapoi reușită")
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort fallback repair.
        print(f"Conversie PDF la imagini și înapoi eșuată: {e}")
        return False
def analyze_pdf(file_path):
    """Print what PyPDF2, PDFMiner and PyMuPDF each report for a PDF.

    The three libraries are tried independently: an error in one is printed
    and does not stop the others from running.

    Args:
        file_path: Path of the PDF to inspect.
    """
    print_step(f"Analizăm fișierul: {file_path}")

    # PyPDF2: page count and metadata.
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            print(f"PyPDF2 - Număr de pagini: {len(reader.pages)}")
            print(f"PyPDF2 - Metadate: {reader.metadata}")
    except Exception as e:
        print(f"Eroare PyPDF2: {e}")

    # PDFMiner: length of the extracted text.
    try:
        text = extract_text(file_path)
        print(f"PDFMiner - Lungime text extras: {len(text)} caractere")
    except Exception as e:
        print(f"Eroare PDFMiner: {e}")

    # PyMuPDF: page count and metadata. The context manager closes the
    # document so the file handle is released after the report.
    try:
        with fitz.open(file_path) as doc:
            print(f"PyMuPDF - Număr de pagini: {doc.page_count}")
            print(f"PyMuPDF - Metadate: {doc.metadata}")
    except Exception as e:
        print(f"Eroare PyMuPDF: {e}")
def repair_pdf(input_path, output_dir_1, output_dir_2):
    """Run both repair methods on one PDF and analyze each result.

    Args:
        input_path: Path of the PDF to repair.
        output_dir_1: Directory that receives the PyPDF2 rewrite.
        output_dir_2: Directory that receives the image-based rebuild.
    """
    stem, suffix = os.path.splitext(os.path.basename(input_path))

    # Method 1: PyPDF2 rewrite.
    pypdf2_target = os.path.join(output_dir_1, f"{stem}_pypdf2{suffix}")
    if not repair_with_pypdf2(input_path, pypdf2_target):
        print("Metoda repair_with_pypdf2 a eșuat.")
    else:
        print(f"Analizăm fișierul reparat cu PyPDF2: {pypdf2_target}")
        analyze_pdf(pypdf2_target)

    # Method 2: render to images and rebuild.
    image_target = os.path.join(output_dir_2, f"{stem}_img2pdf{suffix}")
    if not convert_to_images_and_back(input_path, image_target):
        print("Metoda convert_to_images_and_back a eșuat.")
    else:
        print(f"Analizăm fișierul reparat prin conversie: {image_target}")
        analyze_pdf(image_target)

    print_step("Procesul de reparare și analiză a fost finalizat")
def process_directory(input_dir, output_dir_1, output_dir_2):
    """Repair every PDF file found directly inside ``input_dir``.

    Entries are visited in sorted order so repeated runs process files in
    the same sequence. Only regular files whose name ends in ``.pdf``
    (case-insensitive) are handled; subdirectories are not descended into.

    Args:
        input_dir: Directory to scan for PDF files.
        output_dir_1: Output directory passed on for the PyPDF2 rewrite.
        output_dir_2: Output directory passed on for the image-based rebuild.
    """
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith('.pdf'):
            continue
        input_path = os.path.join(input_dir, filename)
        # A directory can also be named "something.pdf"; skip anything that
        # is not a regular file.
        if not os.path.isfile(input_path):
            continue
        print_step(f"Procesăm fișierul: {filename}")
        repair_pdf(input_path, output_dir_1, output_dir_2)
if __name__ == "__main__":
    # Hard-coded locations: the folder holding the damaged PDFs and one
    # output folder per repair method.
    input_dir = r"d:\PDF PROASTE"
    output_dir_1 = r"d:\PDF PROASTE\OUTPUT-1"
    output_dir_2 = r"d:\PDF PROASTE\OUTPUT-2"
    # Create the output directories if they do not exist. exist_ok avoids a
    # separate existence check, and the loop name no longer shadows the
    # builtin ``dir``.
    for output_dir in (output_dir_1, output_dir_2):
        os.makedirs(output_dir, exist_ok=True)
    process_directory(input_dir, output_dir_1, output_dir_2)
I think this is a good start. I’d imagine that these libraries and maybe more specific for your case are pretty good at catching & repairing broken PDFs.
If these fail, it may be worth seeing how the model handles the raw content and whether it could be helpful - I’d love to know myself. These models have continued to surprise me.
3 Likes
Why did the code disappear?
If I recall correctly, the person who posted the code was just copying and pasting ChatGPT output.