I have a lot of letters that I am tasked with transcribing. I have taken pictures of them and converted them to jpeg. GPT4o can do a great job transcribing them if I upload them one at a time, but if I upload multiple or send a batch as a zip file it fails to read any of them. Any advice would be appreciated.
Send them one at a time.
If you are using ChatGPT you can easily just use the API to automate your process.
That sounds good, thank you!
import os
import base64
import requests
import time
# OpenAI API key. Prefer the OPENAI_API_KEY environment variable so the secret
# never has to be written into (or shared along with) this script; the literal
# is only a placeholder fallback used when the variable is unset or empty.
api_key = os.environ.get("OPENAI_API_KEY") or "YOUR_API_KEY"
def encode_image(image_path) -> str:
    """Return the contents of the file at *image_path* as base64 text.

    Args:
        image_path: Path of the image file; it is opened in binary mode and
            read in full.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
# Folder that is scanned for image files to transcribe
image_directory = "YOUR_IMAGE_DIRECTORY"

# Text file that receives the collected transcriptions
output_file_path = "Text_File_To_Write_TO.txt"

# OpenAI chat-completions endpoint
api_url = "https://api.openai.com/v1/chat/completions"

# HTTP headers for each request: JSON body plus bearer-token authorization
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer {}".format(api_key),
}
def process_image_with_gpt(image_path, *, timeout=120):
    """Ask the OpenAI chat-completions API to transcribe one image.

    The image is base64-encoded, embedded in a ``data:`` URL and sent together
    with a transcription instruction as a single user message.

    Args:
        image_path: Path of the image file to transcribe.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the whole
            batch indefinitely.

    Returns:
        The transcription text when the API answers with HTTP 200; otherwise
        a string of the form ``"Error: <status code>, <response body>"``.

    Raises:
        OSError: If the image file cannot be read.
        requests.exceptions.RequestException: If the request fails at the
            network level or exceeds *timeout*.
    """
    # Function-scope import: only this function needs it.
    import mimetypes

    base64_image = encode_image(image_path)

    # Label the data URL with the file's actual media type (the caller also
    # passes PNG/GIF/... files). Fall back to JPEG when the extension is
    # unknown or does not map to an image type.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
    if not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Payload for the API request
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Transcribe the text from the image and include no other response."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 1000  # Adjust if you want more or fewer tokens in the response
    }

    # Send the POST request to the API
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    # HTTP errors are reported as text rather than raised, so the caller can
    # record the failure for this image alongside the other results.
    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']
    return f"Error: {response.status_code}, {response.text}"
# Results are accumulated in memory so that everything processed so far can
# still be written out if the run is cut short by an error.
results = []
try:
    # os.listdir() returns entries in arbitrary order; sort so the images are
    # processed (and written to the output file) in filename order.
    for filename in sorted(os.listdir(image_directory)):
        # NOTE(review): the API may not accept every format listed here
        # (e.g. BMP/TIFF) - confirm against the OpenAI image-input docs.
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')):
            continue

        image_path = os.path.join(image_directory, filename)
        print(f"Processing {filename}...")

        # Process the image and get the result
        result = process_image_with_gpt(image_path)

        # Store the result in the results list
        results.append(f"Result for {filename}:\n{result}\n\n")

        # Print the result for logging purposes
        print(f"Finished processing {filename}. Result:\n{result}\n")

        # Pause between requests (adjust the time as necessary)
        time.sleep(5)  # Wait for 5 seconds
except Exception as e:
    # Top-level boundary: report the problem, then fall through to `finally`
    # so the partial results are still saved.
    print(f"An error occurred: {str(e)}")
finally:
    # Write all results to the output text file, even if an error occurs
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(results)
    print(f"All processed results (so far) saved in {output_file_path}.")
Here is the script I used for future reference.