Trouble with OCR Using Multiple Photos

I have a lot of letters that I am tasked with transcribing. I have taken pictures of them and converted them to jpeg. GPT4o can do a great job transcribing them if I upload them one at a time, but if I upload multiple or send a batch as a zip file it fails to read any of them. Any advice would be appreciated.

1 Like

Send them one at a time.

If you are using ChatGPT you can easily just use the API to automate your process.

That sounds good, thank you!

import os
import base64
import requests
import time

# OpenAI API Key
# NOTE(review): avoid hardcoding secrets in source; prefer reading from an
# environment variable, e.g. os.environ["OPENAI_API_KEY"]
api_key = "YOUR_API_KEY"

# Helper: turn an image file into a base64 string for the data-URL payload
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as a UTF-8 base64 string."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode('utf-8')

# Directory containing the images to process
image_directory = "YOUR_IMAGE_DIRECTORY"
# Path to the output file where results will be saved
output_file_path = "Text_File_To_Write_TO.txt"

# API endpoint for OpenAI GPT-4V
# (the Chat Completions endpoint; image input is sent as an image_url content part)
api_url = "https://api.openai.com/v1/chat/completions"

# Headers for the API request (bearer auth with the key defined above)
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Function to send one image to the Chat Completions API and return the transcription
def process_image_with_gpt(image_path):
    """Transcribe the text in the image at *image_path* via the vision model.

    Returns the model's transcription string on success, or an error string of
    the form "Error: <status>, <body>" when the API responds with a non-200
    status. Network failures (including timeouts) raise requests exceptions,
    which the caller's try/except handles.
    """
    base64_image = encode_image(image_path)

    # Payload for the API request
    payload = {
        "model": "gpt-4o-mini",  # vision-capable chat model
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transcribe the text from the image and include no other response."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # NOTE(review): MIME type is hard-coded to jpeg even for
                            # .png/.gif/etc. inputs; the API generally tolerates this,
                            # but consider deriving it from the file extension.
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 1000  # Adjust if you want more or fewer tokens in the response
    }

    # Send the POST request to the API. A timeout is essential here: without
    # one, a single stalled connection would hang the whole batch forever.
    response = requests.post(api_url, headers=headers, json=payload, timeout=120)

    # Check if the response is successful
    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']
    # Preserve the caller-visible contract: API failures come back as a string
    return f"Error: {response.status_code}, {response.text}"

# Accumulate results as we go so that partial progress survives an error
# (the finally block below always flushes whatever was collected).
results = []

try:
    # Loop through all images in the directory, filtering by common image extensions
    for filename in os.listdir(image_directory):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
            image_path = os.path.join(image_directory, filename)
            # Bug fix: the log/output f-strings previously contained the literal
            # text "(unknown)" instead of the {filename} placeholder, so neither
            # the console nor the output file identified which image each
            # transcription belonged to.
            print(f"Processing {filename}...")

            # Process the image and get the result
            result = process_image_with_gpt(image_path)

            # Store the result, labelled with its source file
            results.append(f"Result for {filename}:\n{result}\n\n")

            # Print the result for logging purposes
            print(f"Finished processing {filename}. Result:\n{result}\n")

            # Simple rate limiting between API calls (adjust as necessary)
            time.sleep(5)  # Wait for 5 seconds

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    # Write all results to the output text file, even if an error occurred
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for result in results:
            output_file.write(result)

    print(f"All processed results (so far) saved in {output_file_path}.")

Here is the script I used for future reference.

I have faced this problem daily. Any photo I send, GPT Plus is unable to identify the text in the image, which damages my experience. I suggest that the OpenAI team improve this feature so that the tool can read text from images directly, without requiring separate OCR or conversion to another format.