I have no idea if GPT4 can read that - but it might be worth a shot.
import os
import subprocess
import openai

# Function to get the chafa ASCII/ANSI representation of the image
def get_chafa_representation(image_path):
    result = subprocess.run(['chafa', '--colors=256', image_path], capture_output=True, text=True)
    return result.stdout

# calculate_cost was missing from the generated script; this is a rough
# guess assuming a per-1M-input-token rate
def calculate_cost(token_count, input_token_rate=5.00):
    return (token_count / 1_000_000) * input_token_rate

# Function to call GPT-4o API with the chafa output and question
def call_gpt4o(api_key, chafa_output, question):
    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        model="gpt-4-vision-preview",  # Adjust if using another specific model variant
        messages=[
            {
                "role": "user",
                "content": f"{question}\n\nImage:\n{chafa_output}"
            }
        ],
        max_tokens=300
    )
    # Print out the model's response
    print(response['choices'][0]['message']['content'])

# CLI script
def main():
    api_key = input("Enter your OpenAI API key: ")
    input_folder = input("Enter the input folder path: ")
    image_files = [f for f in os.listdir(input_folder) if f.lower().endswith(('png', 'jpg', 'jpeg', 'gif', 'bmp'))]
    if not image_files:
        print("No image found in the input folder.")
        return
    image_path = os.path.join(input_folder, image_files[0])
    print(f"Selected Image: {image_files[0]}")

    # Get chafa representation of the image
    chafa_output = get_chafa_representation(image_path)
    print(f"Chafa output:\n{chafa_output}")

    # Estimate tokens (rough estimate as chafa output is smaller than base64)
    token_count = len(chafa_output.split())
    print(f"Estimated token count for the image: {token_count}")

    # Calculate cost
    estimated_cost = calculate_cost(token_count)
    print(f"Estimated cost for processing this image: ${estimated_cost:.5f}")

    # Ask for user confirmation
    proceed = input("Do you want to proceed with this request? (yes/no): ").strip().lower()
    if proceed != 'yes':
        print("Operation cancelled.")
        return

    # Ask the user for a question
    question = input("Enter your question for the model: ")

    # Call the GPT-4o API with the chafa output and question
    call_gpt4o(api_key, chafa_output, question)

if __name__ == "__main__":
    main()
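If you want a tighter token estimate than the word split above, tiktoken can tokenize the chafa output directly; a minimal sketch (o200k_base is the encoding GPT-4o uses, and the ANSI escape sequences usually tokenize a lot heavier than a word count suggests):

import tiktoken

def count_chafa_tokens(chafa_output):
    # o200k_base is GPT-4o's tokenizer; escape sequences in the chafa
    # output often cost far more tokens than whitespace-splitting implies
    enc = tiktoken.get_encoding("o200k_base")
    return len(enc.encode(chafa_output))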
The code was just generated by ChatGPT, so you will have to adjust it. I think gpt-4-vision-preview does not exist anymore, and "response = openai.ChatCompletion.create(" does not match the newest version of the openai lib (you can pip install openai==0.28.0 though, or look up the correct call for the current version in openai's GitHub repo).
Anyways, here is how you can call GPT-4o with an image:
import os
import base64
import openai

# Initialize OpenAI client
api_key = os.getenv("OPENAI_API_KEY") or input("Enter your OpenAI API key: ")
client = openai.Client(api_key=api_key)

# Function to encode the image to base64
def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Function to estimate the cost of processing the image
def estimate_cost(token_count, input_token_rate=5.00, output_token_rate=15.00):
    input_cost = (token_count / 1_000_000) * input_token_rate
    output_cost = (token_count / 1_000_000) * output_token_rate
    return input_cost + output_cost

# Function to call GPT-4o API with the base64 image and a question
def call_gpt4o(client, base64_image, question, estimated_cost):
    print(f"Estimated cost for processing this image: ${estimated_cost:.5f}")
    proceed = input("Do you want to proceed with this request? (yes/no): ").strip().lower()
    if proceed != 'yes':
        print("Operation cancelled.")
        return
    response = client.chat.completions.create(
        model="gpt-4o",  # Use GPT-4o model identifier
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "low"  # Adjust to "high" for detailed analysis
                        }
                    }
                ]
            }
        ],
        max_tokens=300,
    )
    # Print the response from the model
    print(response.choices[0].message.content)

# CLI script
def main():
    input_folder = input("Enter the input folder path: ")
    image_files = [f for f in os.listdir(input_folder) if f.lower().endswith(('png', 'jpg', 'jpeg', 'gif', 'bmp'))]
    if not image_files:
        print("No image found in the input folder.")
        return
    image_path = os.path.join(input_folder, image_files[0])
    print(f"Selected Image: {image_files[0]}")

    # Encode the image to base64
    base64_image = encode_image_to_base64(image_path)

    # Estimate the number of tokens for the image (assuming roughly 1 token per 4 bytes of base64)
    token_count = len(base64_image) // 4

    # Estimate cost
    estimated_cost = estimate_cost(token_count)

    # Ask the user for a question
    question = input("Enter your question for the model: ")

    # Call the GPT-4o API with the base64 image and question
    call_gpt4o(client, base64_image, question, estimated_cost)

if __name__ == "__main__":
    main()
It is tested (no worries, I have revoked the API key used in that).
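One caveat on the cost estimate in there: if I remember OpenAI's vision pricing docs correctly, an image sent with detail "low" is billed at a flat 85 input tokens regardless of size, and "high" adds 170 tokens per 512x512 tile, so len(base64_image) // 4 overshoots by a lot. A sketch of that rule (the constants are my assumption from the docs, check the current ones):

import math

def vision_token_estimate(width, height, detail="low"):
    # Constants assumed from OpenAI's published vision pricing rules;
    # verify against the current docs before trusting the numbers
    if detail == "low":
        return 85  # flat rate for low-detail images
    # High detail: fit into 2048x2048, scale shortest side to 768,
    # then count 512x512 tiles
    scale = min(1.0, 2048 / max(width, height))
    w, h = width * scale, height * scale
    scale = min(1.0, 768 / min(w, h))
    w, h = w * scale, h * scale
    tiles = math.ceil(w / 512) * math.ceil(h / 512)
    return 85 + 170 * tiles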
OCR sucks, but it sort of depends what your downstream task is. We use unstructured, and the OCR they use isn't great (whatever Adobe is doing in Acrobat is outstanding), but for a lot of RAG-like tasks unstructured is good enough. The model's really good at predicting what the missing text is, but of course if what's missing is a value like a star rating or a dollar amount, it's not going to predict that.
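For reference, the unstructured flow is only a couple of lines; a sketch, assuming unstructured[pdf] is installed and with "report.pdf" as a placeholder path:

from unstructured.partition.pdf import partition_pdf

# "report.pdf" is a placeholder path
elements = partition_pdf("report.pdf")
text = "\n\n".join(el.text for el in elements if el.text)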
For many tasks even pdfminer is good enough to extract the text layer of a pdf. But OCR really sucks in many ways.
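If all you need is that text layer, pdfminer.six exposes it in one call; "document.pdf" below is a placeholder path:

from pdfminer.high_level import extract_text

# No OCR happens here; this only works when the PDF ships a text layer
text = extract_text("document.pdf")
print(text[:500])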
I wasted 3 complete years already.
Even trained a model with thousands of generated random polygons to find the 3 out of 5.
Plus you've got to remove stuff like coffee stains and crinkles, and, you won't believe it, some designers put background images behind the text in CVs, which kind of turns them into captchas.
So hybrid solutions on top of OpenCV edge detection, spatial grouping, and good old 60s-style flood-fill-and-pixel-count OCR work pretty solidly.
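Roughly that kind of pipeline, as a sketch (adaptive threshold against stains and uneven lighting, corner-seeded flood fill against background art, connected components to drop specks; every threshold here is a made-up placeholder):

import cv2
import numpy as np

def clean_scan(path):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    # Adaptive threshold copes with coffee stains and uneven lighting
    binary = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 31, 15)
    # Flood-fill from each corner to wipe large connected background regions
    h, w = binary.shape
    mask = np.zeros((h + 2, w + 2), np.uint8)
    for seed in [(0, 0), (w - 1, 0), (0, h - 1), (w - 1, h - 1)]:
        cv2.floodFill(binary, mask, seed, 0)
    # Keep only connected components big enough to be ink, not specks
    n, labels, stats, _ = cv2.connectedComponentsWithStats(binary)
    out = np.zeros_like(binary)
    for i in range(1, n):
        if stats[i, cv2.CC_STAT_AREA] >= 10:
            out[labels == i] = 255
    return out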
Even spatial comparison of bounding boxes across multiple AWS Textract results is no fun.
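And lining up two Textract runs usually comes down to IoU over the normalized BoundingBox geometry (Textract reports Left/Top/Width/Height as fractions of the page size); a sketch:

def bbox_iou(a, b):
    # a, b: Textract BoundingBox dicts with Left/Top/Width/Height ratios
    ax2, ay2 = a["Left"] + a["Width"], a["Top"] + a["Height"]
    bx2, by2 = b["Left"] + b["Width"], b["Top"] + b["Height"]
    iw = max(0.0, min(ax2, bx2) - max(a["Left"], b["Left"]))
    ih = max(0.0, min(ay2, by2) - max(a["Top"], b["Top"]))
    inter = iw * ih
    union = a["Width"] * a["Height"] + b["Width"] * b["Height"] - inter
    return inter / union if union else 0.0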
I’ve made stored procedures that even o3 won’t be capable of producing