Open AI vision model claiming it's text only?

Ok, so this code was able to get the app to describe the image once and the next few responses were errors or it saying that it couldn’t see images and the last response was

This image contains a base64 encoded string that represents a JPEG image file. The string is very long and contains a series of characters and symbols that cannot be interpreted without decoding it. The decoded image would likely display a picture, but without decoding the base64 string, it is not possible to describe the content of the image.

Chat suggested using CLIP for image analysis. Does this make sense?

import gradio as gr
import json
import openai
import base64
import numpy as np
import cv2

# Load API keys from config file
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    OPENAI_API_KEY = config["openai_api_key"]

openai.api_key = OPENAI_API_KEY

def image_to_base64(image):
    # Resize the image to reduce size
    resized_image = cv2.resize(image, (128, 128))  # Resize to 256x256 or other suitable size
    _, encoded_image = cv2.imencode('.png', resized_image)
    base64_image = base64.b64encode(encoded_image).decode('utf-8')
    return base64_image

def chat_with_bot(combined_input, base64_image=None):
        messages = [{"role": "user", "content": combined_input}]
        if base64_image:
            # Format the image as a data URL string
            image_data_url = f"data:image/jpeg;base64,{base64_image}"
            image_message = {"role": "system", "content": image_data_url}


        response = openai.ChatCompletion.create(
            max_tokens=300  # Adjust as needed
        return response['choices'][0]['message']['content']
    except Exception as e:
        return f"An error occurred: {str(e)}"

def handle_input(question, image):
    base64_image = ""
    if image is not None:
        base64_image = image_to_base64(image)

    return chat_with_bot(question, base64_image)

# Function to generate images using Dall-e
def generate_image(prompt):
    openai.api_key = OPENAI_API_KEY

        response = openai.Image.create(
            n=1,  # Number of images to generate
            size="1024x1024"  # Image size
        # Extracting the image URL from the response
        image_url = response['data'][0]['url']
        # Return HTML with flexbox for centering
        return f"<div style='display: flex; justify-content: center;'><img src='{image_url}' width='512' /></div>"
    except Exception as e:
        return f"An error occurred: {e}"

def handle_submission(question, api_key, action):
    if action == "submit":
        return chat_with_bot(question)
    elif action == "generate":
        return generate_image(question)
        return "Invalid action"

# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("### Milo")

    with gr.Row():
        question = gr.Textbox(label="Ask a question or describe the image:")
        image_input = gr.Image(label="Or upload an image for analysis")
        submit_btn = gr.Button("Submit")

    response_text = gr.Textbox(label="Response"), inputs=[question, image_input], outputs=response_text)

    gr.Markdown("### Image Generation")
    image_prompt = gr.Textbox(label="Enter prompt for image generation")
    generate_btn = gr.Button("Generate Image")
    image_output = gr.HTML()  # Use HTML component for displaying image from URL, inputs=[image_prompt], outputs=image_output)


1 Like

Shouldn’t it be “data:image/png;base64” not “data:image/jpeg;base64” ?

I noticed you are setting your api key twice as well.


Ok, I started from scratch with the code in the vision docs and I’m getting some decent results, which is really great!

Here is the code

import gradio as gr
import base64
import cv2
import requests
import json

# Load API keys from config file
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    api_key = config["openai_api_key"]

def encode_image(image):
    # Convert the NumPy array (image) to a format suitable for base64 encoding
    _, buffer = cv2.imencode('.jpg', image)
    base64_image = base64.b64encode(buffer).decode('utf-8')
    return base64_image

def chat_with_bot(question, image):
    base64_image = encode_image(image)

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"

    payload = {
      "model": "gpt-4-vision-preview",
      "messages": [
          "role": "user",
          "content": [
              "type": "text",
              "text": question
              "type": "image_url",
              "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
      "max_tokens": 1000

    response ="", headers=headers, json=payload)

    # Extracting the descriptive text from the response
    descriptive_text = response.json()['choices'][0]['message']['content']
    return descriptive_text

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Image Analysis Chatbot")
    with gr.Row():
        question = gr.Textbox(label="Ask a question about the image")
        image = gr.Image(label="Upload an image")
        submit_btn = gr.Button("Submit")

    response = gr.Textbox(label="Response"), inputs=[question, image], outputs=response)


Thanks for checking over my code. I appreciate your help!

1 Like

Good job looks great! :slight_smile:

(Extra words)