Unable to pass multiple images through openai api to GPT-4o

Hi everyone,

I’m currently working on a project where I need to parse a PDF and send multiple images extracted from the PDF to GPT-4o for analysis. The primary challenge I’m facing is passing multiple base64-encoded images along with a prompt to the GPT-4o API. Here’s a summary of my approach and the difficulties encountered:

Use Case:

  1. PDF Parsing: I need to convert a PDF into multiple images, where each image represents a page or section of the PDF.
  2. Image Analysis: After conversion, I want to send these images to GPT-4o along with a prompt to get insights or extract information.

Current Approach:

  1. Convert PDF to Images: I’ve successfully converted the PDF into individual image files.
  2. Encode Images: Each image is encoded into a base64 string.
  3. Send Images to GPT-4o: I am attempting to send these base64-encoded images along with a textual prompt using the GPT-4o API.

Problem:

Passing Multiple Images: I’m encountering issues with sending multiple images together with a prompt. My current implementation seems to have problems when combining base64-encoded images and prompts in the API request.
API Response: The API may not be processing multiple images correctly or there might be constraints on handling such requests.

Request for Help:
Handling Multiple Images: What’s the best way to pass multiple base64-encoded images to GPT-4o along with a prompt?
API Constraints: Are there any constraints or best practices for sending multiple images to GPT-4o?
Error Handling: How should I handle potential issues or errors when dealing with multiple images?

This code will do it using gpt-4o as the model. It will NOT do it using gpt-4o-mini as the model.

# gpt4oUploadImg01.py
# gpt-4o
import base64
import os
import re
import sys

import boto3
import fitz  # PyMuPDF
import httpx
from openai import OpenAI

def pdf_to_jpeg(pdf_path, output_folder):
    """Convert each page of *pdf_path* into a 300-DPI JPEG in *output_folder*.

    Pages are written as ``page_1.jpg``, ``page_2.jpg``, ... (1-based),
    creating *output_folder* if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(pdf_path)
    try:
        # 300/72 scales from PDF user space (72 DPI) up to 300 DPI output.
        zoom = fitz.Matrix(300 / 72, 300 / 72)
        for page_num in range(doc.page_count):
            pix = doc[page_num].get_pixmap(matrix=zoom)
            pix.save(os.path.join(output_folder, f"page_{page_num + 1}.jpg"))
    finally:
        # Release the document handle even if rendering or saving fails;
        # the original leaked it on any exception.
        doc.close()

def upload_to_s3(local_folder, s3_bucket, s3_output_key):
    """Recursively upload every file under *local_folder* to s3://<bucket>/<key>/..."""
    # Bind the target bucket once instead of resolving it per file.
    bucket = boto3.resource('s3').Bucket(s3_bucket)

    for dirpath, _dirnames, filenames in os.walk(local_folder):
        for filename in filenames:
            src_path = os.path.join(dirpath, filename)
            # Key mirrors the file's path relative to the folder root.
            rel_path = os.path.relpath(src_path, local_folder)
            dest_key = f"{s3_output_key}/{rel_path}".replace('//', '/')
            bucket.upload_file(src_path, dest_key)

    print(f'Output folder {local_folder} uploaded to s3://{s3_bucket}/{s3_output_key}')

def process_images_with_gpt4o(s3_bucket, s3_output_key, api_key):
    """Send every page image under an S3 prefix to GPT-4o and return the text.

    Parameters:
        s3_bucket: bucket holding the page JPEGs.
        s3_output_key: key prefix under which the JPEGs were uploaded.
        api_key: OpenAI API key.

    Returns:
        (extracted_text, usage) — the model's message content and the
        token-usage object from the completion response.
    """
    client = OpenAI(api_key=api_key)
    MODEL = "gpt-4o"

    prompt = """
    You are a very professional image to text document extractor.
    Please extract the text from these images, treating them as pages of a PDF document. 
    Try to format any tables found in the images. 
    Do not include page numbers, page headers, or page footers.
    Please double-check to make sure that any words in all capitalized letters with strikethrough letters are excluded.
    Return only the extracted text.  No commentary.
    **Include Tables:** Tables should be preserved in the extracted text.
    **Exclude Page Headers, Page Footers, and Page Numbers:** Eliminate these elements which are typically not part of the main content.
    """

    s3_client = boto3.client('s3')

    # Collect every .jpg key under the prefix. The paginator handles
    # result sets larger than the 1000-key cap of a single
    # list_objects_v2 call, which the original silently truncated at.
    jpg_keys = []
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_output_key):
        for obj in page.get('Contents', []):
            if obj['Key'].endswith('.jpg'):
                jpg_keys.append(obj['Key'])

    def _page_number(key):
        """Numeric page index from 'page_<n>.jpg'; 0 when it doesn't match."""
        match = re.search(r'page_(\d+)\.jpg$', key)
        return int(match.group(1)) if match else 0

    # S3 lists keys lexicographically, which puts page_10.jpg before
    # page_2.jpg — sort numerically so pages reach the model in order.
    jpg_keys.sort(key=_page_number)

    content = [{"type": "text", "text": prompt}]
    for key in jpg_keys:
        # Presigned URLs let OpenAI fetch the images even from a private
        # bucket; a hand-built public https URL (as before) only works if
        # the objects are world-readable, and it hard-coded the region.
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': s3_bucket, 'Key': key},
            ExpiresIn=3600,  # one hour is ample for a single completion call
        )
        content.append({"type": "image_url", "image_url": {"url": url}})

    messages = [
        {"role": "system", "content": "You are a helpful assistant that responds in Markdown."},
        {"role": "user", "content": content},
    ]

    response = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.0,  # deterministic extraction, no creative variance
    )

    return response.choices[0].message.content, response.usage

if __name__ == "__main__":
    # Require exactly five positional arguments.
    if len(sys.argv) != 6:
        print("Usage: python script.py <pdf_path> <output_folder> <s3_bucket> <s3_output_key> <openai_api_key>")
        sys.exit(1)

    pdf_path, output_folder, s3_bucket, s3_output_key, openai_api_key = sys.argv[1:6]

    # Step 1: rasterize the PDF into per-page JPEGs.
    pdf_to_jpeg(pdf_path, output_folder)

    # Step 2: mirror the image folder into S3 under <prefix>/<folder-name>.
    folder_name = os.path.basename(output_folder)
    full_s3_output_key = f"{s3_output_key.rstrip('/')}/{folder_name}"
    upload_to_s3(output_folder, s3_bucket, full_s3_output_key)

    # Step 3: have GPT-4o transcribe the uploaded pages.
    response, usage = process_images_with_gpt4o(s3_bucket, full_s3_output_key, openai_api_key)

    # Persist the transcription next to the source PDF (<pdf-name>.txt).
    output_text_file = os.path.splitext(pdf_path)[0] + '.txt'
    with open(output_text_file, 'w', encoding='utf-8') as f:
        f.write(response)

    print(f"Extracted text has been written to: {output_text_file}")

    # Report token consumption for cost tracking.
    print("\nUsage tokens:")
    print(f"Input tokens: {usage.prompt_tokens}")
    print(f"Output tokens: {usage.completion_tokens}")
1 Like