Hi everyone,
I’m currently working on a project where I need to parse a PDF and send multiple images extracted from the PDF to GPT-4o for analysis. The primary challenge I’m facing is passing multiple base64-encoded images along with a prompt to the GPT-4o API. Here’s a summary of my approach and the difficulties encountered:
Use Case:
- PDF Parsing: I need to convert a PDF into multiple images, where each image represents a page or section of the PDF.
- Image Analysis: After conversion, I want to send these images to GPT-4o along with a prompt to get insights or extract information.
Current Approach:
- Convert PDF to Images: I’ve successfully converted the PDF into individual image files.
- Encode Images: Each image is encoded into a base64 string.
- Send Images to GPT-4o: I am attempting to send these base64-encoded images along with a textual prompt using the GPT-4o API.
Problem:
Passing Multiple Images: I’m encountering issues with sending multiple images together with a prompt. My current implementation seems to have problems when combining base64-encoded images and prompts in the API request.
API Response: The API may not be processing multiple images correctly or there might be constraints on handling such requests.
Request for Help:
Handling Multiple Images: What’s the best way to pass multiple base64-encoded images to GPT-4o along with a prompt?
API Constraints: Are there any constraints or best practices for sending multiple images to GPT-4o?
Error Handling: How should I handle potential issues or errors when dealing with multiple images?
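The code below does this with gpt-4o as the model: it converts the PDF pages to JPEGs, uploads them to S3, and passes them to the API as image URLs (rather than base64 strings) together with the prompt. It will NOT work with gpt-4o-mini as the model.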
# gpt4oUploadImg01.py
# gpt-4o
import fitz # PyMuPDF
import os
import sys
import boto3
import base64
import httpx
from openai import OpenAI
def pdf_to_jpeg(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a JPEG file.

    Pages are written to ``output_folder`` as ``page_1.jpg``, ``page_2.jpg``,
    ... (1-based). The folder is created if it does not exist.

    Args:
        pdf_path: Path of the PDF to render.
        output_folder: Directory that receives the JPEG files.
        dpi: Rendering resolution. PDF user space is 72 units per inch, so
            the zoom factor applied to each page is ``dpi / 72``.
    """
    os.makedirs(output_folder, exist_ok=True)
    # The zoom matrix is the same for every page, so build it once.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)
    # The context manager closes the document even if rendering or saving
    # a page raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=matrix)
            pix.save(os.path.join(output_folder, f"page_{page_number}.jpg"))
def upload_to_s3(local_folder, s3_bucket, s3_output_key):
    """Upload the contents of a local folder (recursively) to an S3 bucket.

    Each file is stored under ``s3_output_key`` followed by its path relative
    to ``local_folder``.

    Args:
        local_folder: Directory whose files are uploaded.
        s3_bucket: Name of the destination bucket.
        s3_output_key: Key prefix under which the files are stored.
    """
    # One Bucket handle is enough for all uploads.
    bucket = boto3.resource('s3').Bucket(s3_bucket)
    for root, _dirs, files in os.walk(local_folder):
        for file_name in files:
            local_file_path = os.path.join(root, file_name)
            # S3 keys always use '/', but relpath uses the OS separator
            # (a backslash on Windows), so normalise before building the key.
            relative_key = os.path.relpath(local_file_path, local_folder).replace(os.sep, '/')
            s3_file_key = f"{s3_output_key}/{relative_key}".replace('//', '/')
            bucket.upload_file(local_file_path, s3_file_key)
    print(f'Output folder {local_folder} uploaded to s3://{s3_bucket}/{s3_output_key}')
def process_images_with_gpt4o(s3_bucket, s3_output_key, api_key):
    """Send every ``.jpg`` under an S3 prefix to gpt-4o in a single request.

    The images are attached, in page order, as ``image_url`` parts of one
    user message that also carries the extraction prompt.

    Args:
        s3_bucket: Bucket holding the page images.
        s3_output_key: Key prefix under which the page images were uploaded.
        api_key: OpenAI API key.

    Returns:
        A ``(content, usage)`` tuple: the text of the first choice and the
        usage object of the completion response.
    """
    import re
    from urllib.parse import quote

    def natural_key(key):
        # Split into alternating text / digit runs (digit runs land on odd
        # indices) so that "page_2" sorts before "page_10".
        parts = re.split(r'([0-9]+)', key)
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    client = OpenAI(api_key=api_key)
    MODEL = "gpt-4o"
    prompt = """
You are a very professional image to text document extractor.
Please extract the text from these images, treating them as pages of a PDF document.
Try to format any tables found in the images.
Do not include page numbers, page headers, or page footers.
Please double-check to make sure that any words in all capitalized letters with strikethrough letters are excluded.
Return only the extracted text. No commentary.
**Include Tables:** Tables should be preserved in the extracted text.
**Exclude Page Headers, Page Footers, and Page Numbers:** Eliminate these elements which are typically not part of the main content.
"""
    s3_client = boto3.client('s3')
    listing = s3_client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_output_key)
    # Only jpg files are page images.
    image_keys = [
        obj['Key'] for obj in listing.get('Contents', [])
        if obj['Key'].endswith('.jpg')
    ]
    # S3 returns keys in lexicographic order (page_1, page_10, page_2, ...);
    # re-sort numerically so the model receives the pages in document order.
    image_keys.sort(key=natural_key)

    user_content = [{"type": "text", "text": prompt}]
    for key in image_keys:
        # Percent-encode the key so spaces and special characters still
        # produce a valid URL; '/' is left untouched.
        # NOTE(review): the endpoint is hard-coded to us-west-2, and the
        # objects presumably must be readable by OpenAI's fetcher - confirm.
        url = f"https://s3.us-west-2.amazonaws.com/{s3_bucket}/{quote(key)}"
        user_content.append({"type": "image_url", "image_url": {"url": url}})

    messages = [
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown."},
        {"role": "user", "content": user_content},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.0,
    )
    return completion.choices[0].message.content, completion.usage
if __name__ == "__main__":
    # Pipeline: PDF -> JPEG pages -> S3 -> gpt-4o -> text file next to the PDF.
    # Exactly five positional arguments are expected after the script name.
    if len(sys.argv) != 6:
        print("Usage: python script.py <pdf_path> <output_folder> <s3_bucket> <s3_output_key> <openai_api_key>")
        sys.exit(1)
    pdf_path, output_folder, s3_bucket, s3_output_key, openai_api_key = sys.argv[1:6]

    # Convert PDF to JPEG
    pdf_to_jpeg(pdf_path, output_folder)

    # Construct the full s3_output_key including the folder name.
    # normpath strips a trailing separator first; without it basename('out/')
    # is '' and the prefix would collapse to the bare s3_output_key, so the
    # later listing would pick up unrelated objects under that prefix.
    folder_name = os.path.basename(os.path.normpath(output_folder))
    full_s3_output_key = f"{s3_output_key.rstrip('/')}/{folder_name}"

    # Upload to S3
    upload_to_s3(output_folder, s3_bucket, full_s3_output_key)

    # Process images with GPT-4o
    response, usage = process_images_with_gpt4o(s3_bucket, full_s3_output_key, openai_api_key)

    # Write the response to a text file alongside the PDF (same base name).
    output_text_file = os.path.splitext(pdf_path)[0] + '.txt'
    with open(output_text_file, 'w', encoding='utf-8') as f:
        f.write(response)
    print(f"Extracted text has been written to: {output_text_file}")

    # Report token usage for the request.
    print("\nUsage tokens:")
    print(f"Input tokens: {usage.prompt_tokens}")
    print(f"Output tokens: {usage.completion_tokens}")
1 Like