Can ChatGPT (via an API key) read images?

I am using Python and the ChatGPT API (with an API key) to read all of my local images and rename each one with a better description of its content. I have tried many combinations, and also tried doing it with the help of AI, but nothing worked.

Maybe someone knows a trick for reading the images reliably and renaming them with a new and better description.

I have tried many things, but none of them worked.

import re
import os
import requests
from PIL import Image
import io
import base64

# Set up OpenAI configuration
# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be written into (or shared along with) this script; the placeholder below is
# only the fallback when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY')

def prepare_image(image_path, max_size=(800, 800)):
    """Load an image, shrink it to fit ``max_size``, and return PNG bytes.

    Args:
        image_path: Path of the image file to open with Pillow.
        max_size: ``(max_width, max_height)`` bounding box. Images larger
            than this in either dimension are scaled down with the aspect
            ratio preserved; smaller images keep their size.

    Returns:
        The image, converted to RGB and encoded as PNG, as ``bytes``.

    Raises:
        Exception: Any error from opening, converting, resizing or saving
            the image is printed and then re-raised unchanged.
    """
    try:
        with Image.open(image_path) as source:
            # Normalise the colour mode so every image is handled as RGB.
            picture = source if source.mode == 'RGB' else source.convert('RGB')

            limit_w, limit_h = max_size[0], max_size[1]
            width, height = picture.size
            if width > limit_w or height > limit_h:
                # The smaller of the two ratios keeps both sides in bounds.
                scale = min(limit_w/width, limit_h/height)
                target = (int(width * scale), int(height * scale))
                picture = picture.resize(target, Image.Resampling.LANCZOS)

            # Encode into an in-memory buffer instead of a temporary file.
            buffer = io.BytesIO()
            picture.save(buffer, format='PNG', optimize=True, quality=85)
            return buffer.getvalue()

    except Exception as e:
        print(f"❌ Error preparing image: {str(e)}")
        raise

def get_image_description(image_path):
    """Get a short description of an image from the OpenAI Chat Completions API.

    The image is first downscaled and re-encoded as PNG by ``prepare_image``.
    It is then embedded in the request as a base64 ``data:`` URL inside an
    ``image_url`` content part, which is the shape the Chat Completions
    endpoint expects for image input.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The text content of the first choice in the API response.

    Raises:
        Exception: If preparing the image fails, the HTTP request fails or
            times out, or the API answers with a non-200 status code. The
            error is printed and then re-raised.
    """
    try:
        # Prepare image
        print("Preparing image...")
        image_data = prepare_image(image_path)
        print("✓ Image prepared successfully")

        # Convert image to base64
        base64_image = base64.b64encode(image_data).decode('utf-8')

        headers = {
            'Authorization': f'Bearer {OPENAI_API_KEY}',
            'Content-Type': 'application/json'
        }

        payload = {
            # A vision-capable chat model; "gpt-4-vision" is not a model name
            # the API recognises.
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image in around 18 words, focusing on the main elements. Give only the description, no introduction or explanation."
                        },
                        {
                            # Images are sent as an "image_url" part; local
                            # files go in as a base64 data URL whose MIME type
                            # matches the PNG bytes made by prepare_image.
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 100
        }

        print("Sending request to API using model:", payload["model"])
        response = requests.post(
            'https://api.openai.com/v1/chat/completions',
            headers=headers,
            json=payload,
            # Without a timeout a stalled connection would hang the whole run.
            timeout=60
        )

        print("Response status code:", response.status_code)
        print("Full response:", response.text)

        if response.status_code == 200:
            description = response.json()['choices'][0]['message']['content']
            print(f"Raw description received: {description}")
            return description
        else:
            print(f"API Error Response: {response.text}")
            raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

    except Exception as e:
        print(f"❌ Error getting image description: {str(e)}")
        print("Full error details:", str(e))
        raise

def clean_description(description):
    """Turn a model-written description into a short, filename-safe title.

    The result is used as part of a file name, so besides dropping periods
    and commas this also replaces characters that file systems reject (the
    Windows-reserved punctuation and control characters such as newlines)
    with spaces, and collapses runs of whitespace. Common lead-in phrases
    ("the image shows", "a", ...) are then stripped from the start, and
    every word is capitalized except a few short articles, prepositions
    and conjunctions.

    Args:
        description: Raw description text, e.g. as returned by the model.

    Returns:
        The cleaned description with words separated by single spaces.
        An empty or punctuation-only input yields an empty string.
    """
    # One translation pass: delete "." and ",", blank out anything that is
    # not allowed in a file name so it cannot break the later rename.
    replacements = {ord("."): None, ord(","): None}
    replacements.update({ord(ch): " " for ch in '<>:"/\\|?*'})
    replacements.update({code: " " for code in range(32)})
    description = description.translate(replacements).lower()

    # Collapse whitespace first so the prefix checks below are not defeated
    # by a leading space/newline or a doubled space inside a phrase.
    description = " ".join(description.split())

    # Remove common unnecessary prefixes (checked in order, each at most once)
    prefixes_to_remove = [
        "the image shows ",
        "in the image ",
        "this is ",
        "there is ",
        "we can see ",
        "the picture shows ",
        "i see ",
        "shows ",
        "depicting ",
        "displaying ",
        "this image shows ",
        "a ",
        "an "
    ]

    for prefix in prefixes_to_remove:
        if description.startswith(prefix):
            description = description[len(prefix):]

    # Capitalize appropriately: small connecting words stay lowercase.
    small_words = {'a', 'an', 'the', 'in', 'on', 'at', 'with', 'and', 'or'}
    words = [word if word in small_words else word.capitalize()
             for word in description.split()]

    return ' '.join(words)

def analyze_and_rename_images(folder_path):
    """Rename numbered PNG files in a folder after a description of each one.

    Only files named ``<number>_<anything>.png`` are considered, and they are
    handled in ascending order of that number. Each file is described via
    ``get_image_description``, the text is tidied with ``clean_description``,
    and the file is renamed to ``<number>_<description>.png``. A file is
    left alone when the new name equals the old one or when the new name is
    already taken. A failure on one file is printed with its stack trace
    and does not stop the remaining files.

    Args:
        folder_path: Directory that holds the images.
    """
    print(f"\nAnalyzing images in folder: {folder_path}")

    # List all files in directory
    all_files = os.listdir(folder_path)
    print(f"All files in directory: {all_files}")

    # Keep PNG files that start with a number, ordered by that number
    pattern = r"^(\d+)_.*\.png$"
    candidates = ((re.match(pattern, entry), entry) for entry in all_files)
    images = sorted(
        ((int(hit.group(1)), entry) for hit, entry in candidates if hit),
        key=lambda pair: pair[0],
    )
    print(f"Found {len(images)} images to process: {[img[1] for img in images]}")

    for number, image_name in images:
        print(f"\nProcessing image: {image_name}")
        image_path = os.path.join(folder_path, image_name)

        try:
            # Describe the image, then tidy the text for use in a file name
            description = clean_description(get_image_description(image_path))
            print(f"Cleaned description: {description}")

            new_filename = f"{number}_{description}.png"
            new_path = os.path.join(folder_path, new_filename)
            print(f"New filename will be: {new_filename}")

            # Nothing to do when the name would not change
            if image_path == new_path:
                print("Skip renaming: Source and destination are the same")
                continue

            # Never rename onto a name that is already in use
            if os.path.exists(new_path):
                print(f"Warning: {new_filename} already exists!")
                continue

            os.rename(image_path, new_path)
            print(f"✓ Successfully renamed: {image_name} -> {new_filename}")

        except Exception as e:
            print(f"❌ Error processing {image_name}: {str(e)}")
            print("Stack trace:")
            import traceback
            traceback.print_exc()

def main():
    """Run the analyse-and-rename pass over the configured image folder.

    Any exception that escapes ``analyze_and_rename_images`` is reported
    with its stack trace instead of being propagated.
    """
    import traceback

    folder_path = "e:/Carte/BB/17 - Site Leadership/alte/Ionel Balauta/Aryeht/Task 1 - Traduce tot site-ul/Doar Google Web/Andreea/Meditatii/2023/Pictory.AI/generated_images"
    print("Starting image analysis and renaming process...")
    try:
        analyze_and_rename_images(folder_path)
        print("\n✓ Process completed successfully!")
    except Exception as e:
        print(f"\n❌ Process failed: {str(e)}")
        print("Stack trace:")
        traceback.print_exc()


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

It appears that the AI that wrote this for you has no idea that images are already resized so that their shortest dimension is at most 768px, told you to replace the model name in the code with one that does not exist, and completely fabricated how to send an image. There should also be a system message that does a better job of instructing the AI (which is not named ChatGPT on the API) about exactly what it is doing with its vision skill, and that it must provide valid file names of a particular length and in a particular style.

Correct usage examples can be found in “API reference” on the forum side bar, under “chat”, or even searching this forum for “python vision example” or similar.

1 Like

I asked ChatGPT about your idea and my code, and this is the answer. So, right now, ChatGPT doesn’t know the solution.

ChatGPT has a cut off circa end of 2023, you can make use of the web search functionality to improve this, but you are still best off going through the API documentation for specific API calls and their syntax.

https://platform.openai.com/overview

2 Likes

OpenAI vision models DO know how to use their image analysis skill.

In fact, models like gpt-4o can do this simple task of appending a description to a passed file name and its image content quite well. Providing that the API is used properly.

Here is a Python framework that has all the components of accepting a file name and its path as input, creating a message specifically for this vision task using asyncio and reusable functions, and sending to OpenAI, returning a new name.

It uses structured outputs, and uses an anyOf schema, meaning that the AI can either produce a JSON as output for renaming the file, or instead, can avoid renaming, and give you a reason why. The structured output ensures that there is no additional chat or possible damage to the file system, because there is an enforced format for receiving the new file name, along with other information like a longer description you can view.

detail:low is used, which limits the size used for AI vision to under 512px, as well as limiting the cost.

(My rename_image_file function is written simply to raise an exception if there was no new_vision_file_name returned by the AI, like if it gave a reason why it didn’t want to rename the file).

Example response from the AI:

{'image_description': 'A large open square with a tall historic church tower on the left and modern buildings on the right. The sky is clear with visible contrails.', 'original_file_name': 'hotel.jpg', 'new_vision_file_name': 'hotel_square_church_tower.jpg'}

Example value returned from function:

hotel_square_church_tower.jpg

Full code block

from pydantic import BaseModel
from typing import List, Optional, Union
import base64
import asyncio
from openai import AsyncOpenAI
from PIL import Image
import os

def encode_image_to_base64(file_path):
    """
    Read a file in binary mode and return its contents as base64 text.

    Args:
        file_path (str): Path to the image file.

    Returns:
        str: The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def create_message_with_images(
    image_paths: Union[str, List[str]],
    system: Optional[str] = None,
    user: Optional[str] = None
) -> List[dict]:
    """
    Creates the message list for the OpenAI Chat Completions API,
    containing images encoded in base64.

    Each image contributes two parts to the user message: a text part that
    states its path, and an ``image_url`` part holding the file as a base64
    data URL requested at ``detail: low``. The MIME type in the data URL is
    guessed from the file extension, falling back to ``image/webp`` when
    the extension is unknown or not an image type.

    Args:
        image_paths: A single image file path string or a Python list of paths.
        system (str, optional): System message text; a default is used when
            omitted or empty.
        user (str, optional): User message text; a default is used when
            omitted or empty.

    Returns:
        list of dict: A system message followed by a user message, ready to
        be used with the OpenAI API.
    """
    import mimetypes  # local import: only this function needs it

    if isinstance(image_paths, str):
        image_paths = [image_paths]

    # Use provided system message or default if not provided
    system_prompt = system if system else (
        "You are GPT-4o with built-in image analysis skill.\n\nComputer vision: Enabled"
        )
    system_message = [
        {"type": "text", "text": system_prompt}
    ]

    # Set default user input instruction if not provided or empty
    user_prompt = user if user else (
        "Analyze these images, producing a report of what you see"
        )

    # Create user message content with the user input instruction
    content = [
        {"type": "text", "text": user_prompt}
    ]

    # Encode each image and add to the user message content
    for path in image_paths:
        base64_image = encode_image_to_base64(path)

        # Label the data URL with the file's real type (e.g. image/jpeg for
        # .jpg) instead of always claiming WebP.
        guessed_type, _ = mimetypes.guess_type(str(path))
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/webp"

        content.append(
                # Add user content with the image file name
            {
                "type": "text",
                "text": f"Input image path and file name: `{path}`"
            }
        )
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}",
                    "detail": "low"
                }
            }
        )

    return [
        {
            "role": "system",
            "content": system_message
        },
        {
            "role": "user",
            "content": content
        }
    ]

async def call_async(messages):
    """Use the OpenAI API to receive a structured response regarding file names.

    The request uses structured outputs with a wrapper schema whose single
    ``result`` field is either a ``RenameFiles`` object (description plus
    old and new file name) or a ``NoRename`` object (a reason for declining
    to rename), so the model can only answer in one of those two shapes.

    Args:
        messages: Chat Completions message list, e.g. as built by
            ``create_message_with_images``.

    Returns:
        The parsed ``result`` object: a ``RenameFiles`` instance when the
        model proposes a new name, otherwise a ``NoRename`` instance.

    Raises:
        ValueError: If the response carries no parsed content (for example
            when the model refused the request).
    """

    # structured output schema, specific to the task of renaming files

    class RenameFiles(BaseModel):
        image_description: str
        original_file_name: str
        new_vision_file_name: str

    class NoRename(BaseModel):
        reason_to_avoid_renaming: str

    # Structured output schema with anyOf logic; the union sits one level
    # down in ``result`` rather than at the root of the schema.
    class RenameResponse(BaseModel):
        result: Union[RenameFiles, NoRename]

    client = AsyncOpenAI()
    response = await client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=messages,
        max_tokens=500,
        top_p=0.01,
        # Pass the wrapper (not RenameFiles) so the NoRename branch that the
        # prompt refers to is actually available to the model.
        response_format=RenameResponse
    )
    message = response.choices[0].message
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"No structured output returned (refusal: {refusal})")
    # Unwrap so callers receive the RenameFiles / NoRename object directly.
    return message.parsed.result

async def rename_image_file(image_path: str=None):
    """Ask the vision model for a new, descriptive file name for one image.

    This only returns the proposed name; it does not touch the file system.

    Args:
        image_path: Path of the image file to analyze. Required, despite
            the ``None`` default kept for signature compatibility.

    Returns:
        The ``new_vision_file_name`` string from the structured response.

    Raises:
        ValueError: If ``image_path`` is missing or empty, or if the API
            call fails or the response has no ``new_vision_file_name``
            (the original error is chained as the cause).
    """
    if not image_path:
        # Fail with a clear message instead of an unrelated error later on.
        raise ValueError("image_path is required: pass the path of the image file to rename")

    user_message = ("Attached is contents of an image file to be renamed. Describe what you see. "
                    "Produce a new file name, appending a few words based on image description. "
                    "Without good quality image analysis, you must use reason_to_avoid_renaming."
        )

    messages = create_message_with_images(image_paths=image_path, user=user_message)
    try:
        structured_response = await call_async(messages)
        print(structured_response.model_dump())
        print(structured_response.new_vision_file_name)
        return structured_response.new_vision_file_name
    except Exception as e:
        # "from e" keeps the underlying traceback attached for debugging.
        raise ValueError(f"API image failure or refusal on {image_path}: {e}") from e

# Ensures the main function is run only if the script is executed directly
if __name__ == '__main__':
    new_name = asyncio.run(rename_image_file('./hotel.jpg'))

The important part is in properly constructing the user messages that contain images. Here I have a reusable function that can add multiple images with their file names, for other purposes; a snippet from that:

    # Encode each image and add to the user message content
    for path in image_paths:
        base64_image = encode_image_to_base64(path)
        content.append(
                # Add user content with the image file name
            {
                "type": "text",
                "text": f"Input image path and file name: `{path}`"
            }
        )
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/webp;base64,{base64_image}",
                    "detail": "low"
                }
            }
        )

Altering the file system or looping through files in a directory is NOT provided above.

This gives the whole file path to the AI. You may want to refactor so the AI only sees the file name, and you manage the output directory.

2 Likes

I tested your code, also with ChatGPT and Claude.ai.

It didn’t succeed…

What error did you get?

You will have fundamental challenges in writing an API application if you do not first embark on understanding the programming language or what the application is doing, or if you are unable to communicate well with an assistant about the issues you experience.

Here is an example chat session with such communication.

A beginner has encountered this fully-functional modern Python demonstration code for the latest openai API SDK that they want to try. What are the requirements to use it for minimum Python version and other modules? Produce some instructions on how to pip install, targeting a system or user Python install, how to add an environment variable OPENAI_API_KEY automatically used by the openai module, other key understandings of useful modifications of the code to adapt it to not pass the AI a full path name, and other reasons why the demonstrated method may not run as a single python file.


To use the provided Python demonstration script with the latest OpenAI API SDK, follow these steps:


Minimum Requirements

  1. Python Version: Ensure you have Python 3.11 or later.
  2. Libraries:
    • pydantic (for data validation)
    • typing (for type hinting, built-in with Python 3.11+) [correction to AI: you must still import, as this is backwards-compatible]
    • base64 and asyncio (built-in with Python)
    • openai (latest SDK required)
    • Pillow (for image handling)

Installation Instructions

Install Required Libraries

Run the following pip commands to install the dependencies. Adjust pip to pip3 if necessary.

  • For system installation:

    pip install openai pydantic Pillow
    
  • For user-only installation (no admin rights) [this also is a falsehood by AI, you must use an administrator command shell or sudo account elevation to modify a system install, or use a venv]:

    pip install --user openai pydantic Pillow
    

Verify Installation

Check versions:

python -m pip show openai pydantic Pillow

Ensure the openai SDK version aligns with the code structure (e.g., 2024 versions).


Environment Variable Configuration

The OpenAI SDK requires an API key to authenticate requests. Add the OPENAI_API_KEY environment variable:

Windows

Run the following in the command prompt:

setx OPENAI_API_KEY "your-api-key-here"

Mac/Linux

Run the following in the terminal:

export OPENAI_API_KEY="your-api-key-here"

To make it persistent, add the export line to your ~/.bashrc, ~/.zshrc, or equivalent shell configuration file.


Key Considerations and Adaptations

1. Avoid Passing Full Path Names

Passing full paths can expose sensitive directory structures. Modify the script to only pass filenames and retrieve the path dynamically:

from pathlib import Path

def encode_image_to_base64(file_name: str, image_dir: str = "./images"):
    """Read ``file_name`` from ``image_dir`` and return it as base64 text.

    Args:
        file_name: Name of the image file, relative to ``image_dir``.
        image_dir: Directory that holds the images. Defaults to
            ``./images``.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    file_path = Path(image_dir) / file_name
    with open(file_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

Place images in a dedicated directory (e.g., ./images) for organization and security.


2. File System Considerations

If run on systems where asyncio support is limited or asynchronous file I/O could fail (e.g., legacy filesystems), adapt the code to work synchronously or add retries for I/O operations.


3. API Limits and Errors

  • Quota Limits: Check your OpenAI API usage limits to prevent quota exhaustion.
  • Structured Outputs: Validate the API schema by testing small payloads first.

4. Production Settings

  • Consider setting timeouts for the API requests.
  • Implement robust error handling for file operations and API failures.
  • Handle edge cases, such as unsupported image formats or unreadable files.

Why This Code May Not Run as a Single File

  1. Missing API Key: The absence of an OPENAI_API_KEY environment variable will cause the script to fail.
  2. Dependencies: Without the required packages installed, the script will not execute.
  3. Path Issues: If ./hotel.jpg doesn’t exist or is incorrectly referenced, the script will error.
  4. Async Context: Mixing async functions and a synchronous main program can introduce runtime errors on platforms with restricted event loops.

Final Note

For streamlined development, encapsulate the code into a Python package or module with clear separation of concerns:

  • Place utility functions in a utils.py.
  • Add CLI support to specify images dynamically.
  • Provide configuration files for environment variables and defaults.
1 Like

I don’t remember all the errors

I have tried so many Python scripts, and now I get the error: "429 Client Error: Too Many Requests for url: https://api.openai.com/v1/chat/completions"

I will try again after 24 hours. The script below is the last one that I tested.

import os
import base64
import asyncio
from pathlib import Path
from openai import OpenAI, AsyncOpenAI

# Set your OpenAI API Key
# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be written into (or shared along with) this script; the placeholder below is
# only the fallback when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR-API-KEY")

def encode_image_to_base64(image_path: str) -> str:
    """Return the bytes of the file at ``image_path`` as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

async def analyze_image_description(image_path: str) -> str:
    """
    Analyze the image and generate a detailed, descriptive file name.

    The image is sent to the model as a base64 data URL whose MIME type is
    guessed from the file extension (falling back to ``image/jpeg``). The
    model's answer is reduced to letters, digits, ``-`` and ``_`` (spaces
    become underscores) and cut to at most 200 characters.

    Args:
        image_path: Path of the image file to analyze.

    Returns:
        The sanitized base name, without a file extension.

    Raises:
        ValueError: If reading the file or calling the API fails, or if the
            model's answer contains no usable characters. The original
            error is chained as the cause.
    """
    import mimetypes  # local import: only this function needs it

    try:
        base64_image = encode_image_to_base64(image_path)

        # Label the data URL with the file's real type; the directory scan
        # feeds this function PNG as well as JPEG files.
        guessed_type, _ = mimetypes.guess_type(image_path)
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/jpeg"

        async with AsyncOpenAI(api_key=OPENAI_API_KEY) as client:
            response = await client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system",
                        "content": "You are an image analysis expert. Create a comprehensive filename with at least 15 descriptive keywords. Include elements, colors, style, mood, composition, and technical details. Separate words with underscores."
                    },
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Create a detailed descriptive filename that fully captures this image's content and characteristics."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}",
                                    "detail": "high"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=150,
                temperature=0.7
            )

            # The message content can be missing; treat that as empty text.
            new_name = (response.choices[0].message.content or "").strip()
            new_name = "".join(c for c in new_name if c.isalnum() or c in ('-', '_', ' '))
            new_name = new_name.replace(' ', '_')
            new_name = new_name[:200]
            if not new_name:
                # An empty base name would rename the file to just ".png".
                raise ValueError("model returned no usable filename text")
            return new_name

    except Exception as e:
        raise ValueError(f"OpenAI API Error: {e}") from e

async def analyze_and_rename_image(file_path: Path):
    """
    Analyze and rename a single image.

    The new base name comes from ``analyze_image_description`` and the
    original extension is kept. If a different file already has the target
    name, a numeric suffix (``_1``, ``_2``, ...) is appended so that no
    existing file is replaced. Any error is printed and swallowed so that
    one failing image does not abort the others.

    Args:
        file_path: Path of the image file to rename.
    """
    try:
        print(f"\nProcesez: {file_path.name}")
        new_base_name = await analyze_image_description(str(file_path))
        new_name = f"{new_base_name}{file_path.suffix}"
        new_path = file_path.with_name(new_name)

        # Path.rename silently replaces an existing target on POSIX (and
        # fails on Windows), so pick a free name first. There is no await
        # between this check and the rename, so other tasks on the same
        # event loop cannot take the name in between.
        counter = 1
        while new_path.exists() and new_path != file_path:
            new_name = f"{new_base_name}_{counter}{file_path.suffix}"
            new_path = file_path.with_name(new_name)
            counter += 1

        print(f"Redenumesc:\nDin: {file_path.name}\nÎn:  {new_name}")
        file_path.rename(new_path)
        print("✓ Redenumit cu succes!")

    except Exception as e:
        print(f"❌ Eroare la procesarea {file_path.name}: {e}")

async def rename_images_in_directory_parallel(directory: str, max_concurrency: int = 4):
    """
    Rename all images in the directory concurrently.

    Regular files ending in ``.png``, ``.jpg`` or ``.jpeg`` (any letter
    case) are each passed to ``analyze_and_rename_image``. At most
    ``max_concurrency`` of them are in flight at a time, so a large folder
    does not fire every API request at once.

    Args:
        directory: Directory that holds the images.
        max_concurrency: Upper bound on simultaneously processed images.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    print(f"\nProcesez imaginile din: {directory}")

    dir_path = Path(directory)
    image_suffixes = {".png", ".jpg", ".jpeg"}

    # Collect the files up front; the renames happen only after the
    # directory listing has been read completely.
    image_files = [
        file_path for file_path in dir_path.iterdir()
        if file_path.is_file() and file_path.suffix.lower() in image_suffixes
    ]

    if not image_files:
        print("\n❌ Nu s-au găsit imagini de procesat!")
        return

    # NOTE(review): unbounded parallel requests are a likely trigger for the
    # HTTP 429 responses; an exhausted quota would need a different fix.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def process_with_limit(file_path: Path):
        async with semaphore:
            await analyze_and_rename_image(file_path)

    # Process all images, bounded by the semaphore
    await asyncio.gather(*(process_with_limit(file_path) for file_path in image_files))
    print("\n✓ Procesare completă!")

if __name__ == "__main__":
    image_directory = "e:/Carte/BB/17 - Site Leadership/alte/Ionel Balauta/Aryeht/Task 1 - Traduce tot site-ul/Doar Google Web/Andreea/Meditatii/2023/Pictory.AI - Text to video/temp"
    asyncio.run(rename_images_in_directory_parallel(image_directory))