Why can't I read images via Completions API?

PROMPT:

I have several jpg files in the folder e:\De pus pe FTP 2\Test\. I want Python code that will scan them with OCR and create a docx and a pdf following the same layout as the original, keeping the text formatting identical and keeping the images identical: their size, their place on the page, and so on. I want you to use a ChatGPT API key.

See the Python code below. The code does not read with OCR via the API key; it only prints to the screen. By OCR I mean that I can copy each word separately, not just select the image. I believe the ChatGPT API is better than Tesseract OCR at reading images, isn't it?

import os
from PIL import Image
import base64
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Inches
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import openai

# Set the path to the directory containing JPG files
input_dir = r"e:\De pus pe FTP 2\Test"

# Configure OpenAI API client (replace YOUR-API-KEY with your actual key)
client = openai.OpenAI(api_key="YOUR-API-KEY")

def encode_image(image_path):
    """Convert image to base64 for API upload."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def perform_ocr_with_chatgpt(image_path):
    """Use ChatGPT API to perform OCR on the image and extract text."""
    # Encode the image to base64
    base64_image = encode_image(image_path)

    try:
        # Send the image to ChatGPT API for OCR
        response = client.chat.completions.create(
            model="gpt-4o",  # Using the correct GPT-4 Omni model
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract all text from this image accurately, preserving the layout as much as possible. Return the text with line breaks where appropriate."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=2000  # Increased to handle more text
        )
        extracted_text = response.choices[0].message.content.strip()
        return extracted_text
    except openai.APIError as e:
        print(f"OpenAI API error: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None

def create_docx(output_path, text, image_path):
    """Create a .docx file with text and image preserving layout."""
    doc = Document()
    doc.styles['Normal'].font.name = 'Times New Roman'
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), 'Times New Roman')

    # Add image (adjusted to fit page); python-docx preserves the aspect
    # ratio automatically when only a width is given
    img = Image.open(image_path)
    img_width, img_height = img.size

    # Adjust image width to fit document margins (6 inches is reasonable)
    max_doc_width = 6.0
    if img_width > 0:
        doc.add_picture(image_path, width=Inches(max_doc_width))

    doc.add_paragraph()  # Spacing after image

    # Add extracted text with preserved line breaks
    paragraphs = text.split('\n')
    for para in paragraphs:
        if para.strip():
            p = doc.add_paragraph()
            p.add_run(para)

    # Save the document
    doc.save(output_path)
    print(f"Created DOCX: {output_path}")

def create_pdf(output_path, text, image_path):
    """Create a .pdf file with text and image preserving layout."""
    c = canvas.Canvas(output_path, pagesize=letter)

    # Add image (full size, adjusted to fit page)
    img = Image.open(image_path)
    img_width, img_height = img.size

    if img_width > 0:
        aspect = img_height / float(img_width)
        max_width = 500  # Max width in points
        new_height = max_width * aspect

        # Check if image fits on page, adjust if necessary
        if new_height > 600:
            new_height = 600
            max_width = new_height / aspect

        c.drawImage(image_path, 30, 650 - new_height, width=max_width, height=new_height)

        # Add text below image
        text_y = 650 - new_height - 20
    else:
        text_y = 650

    # Add text
    text_obj = c.beginText(30, text_y)
    text_obj.setFont("Helvetica", 10)

    paragraphs = text.split('\n')
    for para in paragraphs:
        if para.strip():
            # Check if we need a new page
            if text_obj.getY() < 50:
                c.drawText(text_obj)
                c.showPage()
                text_obj = c.beginText(30, 750)
                text_obj.setFont("Helvetica", 10)
            text_obj.textLine(para)

    c.drawText(text_obj)
    c.save()
    print(f"Created PDF: {output_path}")

def process_images():
    """Process all JPG files in the input directory."""
    if not os.path.exists(input_dir):
        print(f"Error: Directory not found: {input_dir}")
        return

    jpg_files = [f for f in os.listdir(input_dir) if f.lower().endswith('.jpg')]

    if not jpg_files:
        print(f"No JPG files found in: {input_dir}")
        return

    print(f"Found {len(jpg_files)} JPG files to process")

    for filename in jpg_files:
        image_path = os.path.join(input_dir, filename)
        print(f"\nProcessing: {filename}")

        try:
            # Perform OCR
            text = perform_ocr_with_chatgpt(image_path)

            if text is None:
                print(f"Failed to extract text from {filename}")
                continue

            print(f"Successfully extracted text ({len(text)} characters)")

            # Generate output file names
            base_name = os.path.splitext(filename)[0]
            docx_output = os.path.join(input_dir, base_name + '.docx')
            pdf_output = os.path.join(input_dir, base_name + '.pdf')

            # Create .docx and .pdf files
            create_docx(docx_output, text, image_path)
            create_pdf(pdf_output, text, image_path)

            print(f"Successfully processed {filename}")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

if __name__ == "__main__":
    print("Starting OCR processing with ChatGPT...")
    print(f"Scanning folder: {input_dir}")
    process_images()
    print("\nProcessing complete!")

ChatGPT is the web client product. The API ≠ ChatGPT. (I've updated your title and tags to reflect that.)


See this screenshot. The code with the API key must read with OCR and distinguish between text and the small images that are inserted into the base image. Then it must copy everything into a docx, keeping the same format as the image, with all the text and the small images from the large image. Then, convert to pdf.

This seems like an unreasonable instruction.

Your request to the model may return text (probably fine, though I've not tested it), but I don't see how it will be able to "preserve layout".

How did you expect layout to be maintained?

You don’t seem to extract bespoke layout information at all and are simply tacking on the text below the images when you create the new documents.
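For contrast, a minimal sketch of what even crude layout preservation would take with python-docx, assuming a separate step had already produced per-region crops and per-region OCR text (the file names and strings here are hypothetical). python-docx offers no absolute positioning, so a table approximates a two-column page:

from docx import Document
from docx.shared import Inches

doc = Document()

# Approximate a two-column scanned page with a 1x2 table, since python-docx
# has no "place this element at (x, y)" API.
table = doc.add_table(rows=1, cols=2)
left, right = table.rows[0].cells

# Left column: a map crop produced by a separate slicing step (hypothetical)
left.paragraphs[0].add_run().add_picture("map_crop.jpg", width=Inches(3))

# Right column: OCR text for that region only, not for the whole page
right.paragraphs[0].add_run("Right-column body text extracted by OCR...")

doc.save("layout_sketch.docx")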

Are you the author of this code or is it something you picked up from somewhere?


Picked it up from a bot.

The functions make no sense, just from an interface perspective: they accept a string and an image path, with a docstring that expects magic to happen.

Layout parser…
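As a sketch of that direction, assuming the open-source layoutparser package with one of its pretrained Detectron2 models (both are separate installs, and the file name is hypothetical):

import cv2
import layoutparser as lp

image = cv2.imread("scan.jpg")  # hypothetical scanned page

# Pretrained PubLayNet model: detects Text, Title, List, Table and Figure blocks
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)

layout = model.detect(image)
for block in layout:
    # Measured bounding boxes, unlike coordinates guessed by a language model
    print(block.type, block.block.coordinates)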


I am the author of the code. I also wrote the prompt. Maybe that is why it's not working…


It sounds like the answer to your question is:

" text = perform_ocr_with_chatgpt(image_path)"

What is the code for that function?

You shared code for a top-level method/function but you didn’t share your input/output results.

If you're sharing pseudocode then obviously it's not going to work, because it's pseudocode.

If you actually have a full project with real methods/functions and want help debugging certain I/O results, then share the relevant code.

Or better yet, share the code with GPT along with your current I/O results and your desired I/O results, and watch the magic of LLMs take place.

Yes, I'm an average programmer; I combined about four pieces of code to make this.

My point is: do you have real code that you're trying to make work? Otherwise, what are you trying to do with what you shared, and what are you looking for here?


To answer your question another way: is there any possibility that the API can accomplish this task?

I struggle in vain to write this code if ChatGPT and the API can't do it.

A language AI will simply answer your question in natural language. What you ask for is magic: it needs multiple processing paths along which the data simply cannot flow.

I gave a link to Google’s specialized models, where you can see “layout parser”.

GPT-4 vision does not have a high-quality “grounding”, the ability to reference locations of elements in an image.
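By contrast, a conventional OCR engine grounds every word it reads; a sketch assuming pytesseract with a local Tesseract install (file name hypothetical):

import pytesseract
from pytesseract import Output
from PIL import Image

# image_to_data returns per-word text plus pixel-accurate bounding boxes
data = pytesseract.image_to_data(Image.open("scan.jpg"), output_type=Output.DICT)

for i, word in enumerate(data["text"]):
    if word.strip():
        print(word, (data["left"][i], data["top"][i],
                     data["width"][i], data["height"][i]))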

One thing it certainly cannot do without an extensive agentic framework is “put this map in an identical output document”.


Even though I know right off the bat what it cannot do, let's assume that we understand the need to extract individual sections, such as individual columns and individual images, to have any hope of proceeding further. Then we go in with more educated naivety:

Here's an application built with the top OpenAI reasoner, which can accept larger images than other models due to its patch-based algorithm for vision input (although the image you provided is small and of poor quality).

SYSTEM

You are Docuo, a language model that uses your vision ability to reproduce data from documents as accurately and comprehensively as possible.
Stages of document preparation you may fulfill:

  • initial analysis of contents and quality
  • layout parsing, allowing slicing of the document programmatically into constituent elements
  • element data extraction, such as complete OCR and a complete visual description for reconstruction.

Then we embark on just one round:

For the attached image, provide a structural framework about the document composition and layout for parsing/extraction:

  1. full image dimensions received,
  2. four-corner coordinate bounding boxes for contiguous texts
  3. four-corner bounding boxes for images
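Wired up, that one round looks roughly like the sketch below; the model name is a placeholder (the post used an unnamed reasoning model), the prompts are abbreviated, and the file name is hypothetical:

import base64
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

with open("scan.jpg", "rb") as f:
    b64_image = base64.b64encode(f.read()).decode("utf-8")

system_prompt = "You are Docuo, a language model that uses your vision ability ..."
user_prompt = ("For the attached image, provide a structural framework about the "
               "document composition and layout for parsing/extraction: ...")

response = client.chat.completions.create(
    model="gpt-4o",  # placeholder; substitute your vision-capable model
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"}},
        ]},
    ],
)
print(response.choices[0].message.content)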

Then, for an image that is actually 550 px wide by 750 px tall (yours), we can see that each element produced does not fit the task. Vision cannot function as an element extractor to facilitate any kind of document reconstruction:

Below is a suggested “slicing” of the scanned page into its principal structural blocks. All coordinates are given in pixels relative to the top‐left of the full image, which we take here as width=785 px, height=535 px.

Full‐page
• Corner points:
– top‐left (0, 0)
– top‐right (785, 0)
– bottom‐right (785, 535)
– bottom‐left (0, 535)

Header band (runs across the full width; contains “LAOS” / “66” / “LAO‐TZI”)
• Approx. y = 0 → 50 px
• Corners:
– TL (0, 0)
– TR (785, 0)
– BR (785, 50)
– BL (0, 50)

Left‐column map image (“LAOS – scară …”)
• Approx. x = 10 → 380, y = 60 → 260 px
• Corners:
– TL (10, 60)
– TR (380, 60)
– BR (380, 260)
– BL (10, 260)

Left‐column body text (under the map, continues to bottom margin)
• Approx. x = 10 → 380, y = 260 → 525 px
• Corners:
– TL (10, 260)
– TR (380, 260)
– BR (380, 525)
– BL (10, 525)

Right‐column body text (starts under the header, above the portrait)
• Approx. x = 395 → 775, y = 60 → 350 px
• Corners:
– TL (395, 60)
– TR (775, 60)
– BR (775, 350)
– BL (395, 350)

Right‐column portrait image (Lao-tzi) plus its caption
• Approx. x = 390 → 785, y = 350 → 525 px
• Corners:
– TL (390, 350)
– TR (785, 350)
– BR (785, 525)
– BL (390, 525)

You can of course refine these slightly in your own layout tool, but these six boxes will let you isolate (a) the header, (b) two image assets, and (c) the two main text flows for OCR.

Result: its first “extraction” by coordinates does not line up with the actual page at all.
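Reproducing that extraction step is mechanical once the model's boxes are taken at face value; a sketch assuming PIL, with a hypothetical file name and the coordinates copied verbatim from the reply above:

from PIL import Image

page = Image.open("scan.jpg")  # actually 550 x 750, not the claimed 785 x 535

# (x1, y1, x2, y2) boxes exactly as the model reported them
boxes = {
    "header":     (0,   0,   785, 50),
    "map_image":  (10,  60,  380, 260),
    "left_text":  (10,  260, 380, 525),
    "right_text": (395, 60,  775, 350),
    "portrait":   (390, 350, 785, 525),
}

for name, box in boxes.items():
    # Crops that extend past the real 550 px right edge come back padded
    # with black, which is exactly the failure this demonstration exposes.
    page.crop(box).save(f"{name}.png")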

I would suggest that you use PDF software tools that are document-aware and are specifically designed to enhance PDF files with searchable text.

1 Like

Nice work. But how did you get those coordinates?

It is supposed to demonstrate to you that OpenAI vision language models are not suitable for sectioning documents into constituent elements.

You see the messages passed along with the image, and you see completely wrong image dimensions and coordinates returned. It is thus not worth continuing with that approach.
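A cheap guard against this failure mode is to compare the claimed dimensions against the file itself before doing anything with the boxes; a sketch assuming PIL, with a hypothetical file name:

from PIL import Image

actual_w, actual_h = Image.open("scan.jpg").size   # real file: 550 x 750

claimed_w, claimed_h = 785, 535  # dimensions asserted in the model's reply

if (claimed_w, claimed_h) != (actual_w, actual_h):
    print(f"Wrong dimensions: model claimed {claimed_w}x{claimed_h}, "
          f"file is {actual_w}x{actual_h}; its coordinates cannot be trusted.")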