OpenAI vision model claiming it's text only?

OK, so this code got the app to describe the image once, but the next few responses were either errors or the model saying that it couldn't see images. The last response was:

> This image contains a base64 encoded string that represents a JPEG image file. The string is very long and contains a series of characters and symbols that cannot be interpreted without decoding it. The decoded image would likely display a picture, but without decoding the base64 string, it is not possible to describe the content of the image.

ChatGPT suggested using CLIP for image analysis. Does this make sense?
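For reference, here is roughly what the CLIP route would look like (a sketch using Hugging Face transformers; the model name and calls are taken from their docs, and I haven't run this). Note that CLIP scores an image against candidate text labels rather than generating a free-form description:

from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Load the public CLIP checkpoint from the Hugging Face hub
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("example.jpg")  # hypothetical local file
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

# CLIP ranks the candidate labels by similarity to the image
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=1)

for label, prob in zip(labels, probs[0].tolist()):
    print(f"{label}: {prob:.3f}")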

import gradio as gr
import json
import openai
import base64
import numpy as np
import cv2

# Load API keys from config file
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    OPENAI_API_KEY = config["openai_api_key"]

openai.api_key = OPENAI_API_KEY

def image_to_base64(image):
    # Shrink the image to 128x128 to reduce the size of the base64 payload
    resized_image = cv2.resize(image, (128, 128))
    _, encoded_image = cv2.imencode('.png', resized_image)
    base64_image = base64.b64encode(encoded_image).decode('utf-8')
    return base64_image

def chat_with_bot(combined_input, base64_image=None):
    try:
        messages = [{"role": "user", "content": combined_input}]
        if base64_image:
            # Format the image as a data URL string
            image_data_url = f"data:image/jpeg;base64,{base64_image}"
            image_message = {"role": "system", "content": image_data_url}

            messages.append(image_message)

        response = openai.ChatCompletion.create(
            model="gpt-4-vision-preview",
            messages=messages,
            max_tokens=300  # Adjust as needed
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        return f"An error occurred: {str(e)}"



def handle_input(question, image):
    base64_image = ""
    if image is not None:
        base64_image = image_to_base64(image)

    return chat_with_bot(question, base64_image)


# Function to generate images using Dall-e
def generate_image(prompt):
    openai.api_key = OPENAI_API_KEY

    try:
        response = openai.Image.create(
            prompt=prompt,
            n=1,  # Number of images to generate
            size="1024x1024"  # Image size
        )
        # Extracting the image URL from the response
        image_url = response['data'][0]['url']
        # Return HTML with flexbox for centering
        return f"<div style='display: flex; justify-content: center;'><img src='{image_url}' width='512' /></div>"
    except Exception as e:
        return f"An error occurred: {e}"

# Note: this handler is not wired into the UI below, and its api_key argument is unused
def handle_submission(question, api_key, action):
    if action == "submit":
        return chat_with_bot(question)
    elif action == "generate":
        return generate_image(question)
    else:
        return "Invalid action"

# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("### Milo")

    with gr.Row():
        question = gr.Textbox(label="Ask a question or describe the image:")
        image_input = gr.Image(label="Or upload an image for analysis")
        submit_btn = gr.Button("Submit")

    response_text = gr.Textbox(label="Response")
    submit_btn.click(fn=handle_input, inputs=[question, image_input], outputs=response_text)

    gr.Markdown("### Image Generation")
    image_prompt = gr.Textbox(label="Enter prompt for image generation")
    generate_btn = gr.Button("Generate Image")
    image_output = gr.HTML()  # Use HTML component for displaying image from URL
    generate_btn.click(fn=generate_image, inputs=[image_prompt], outputs=image_output)

demo.launch()


Shouldn't it be "data:image/png;base64" rather than "data:image/jpeg;base64", since image_to_base64 encodes with cv2.imencode('.png', ...)?

I noticed you are setting your API key twice as well.
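If you want to keep "data:image/jpeg" in the data URL, something like this would make the encoding match (just a sketch, keeping your 128x128 resize):

import base64
import cv2

def image_to_base64(image):
    # Encode as JPEG so the bytes match the "data:image/jpeg" label in the data URL
    resized_image = cv2.resize(image, (128, 128))
    _, encoded_image = cv2.imencode('.jpg', resized_image)
    return base64.b64encode(encoded_image).decode('utf-8')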


OK, I started from scratch with the code in the vision docs, and I'm getting some decent results, which is really great!

Here is the code:

import gradio as gr
import base64
import cv2
import requests
import json

# Load API keys from config file
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    api_key = config["openai_api_key"]

def encode_image(image):
    # Convert the NumPy array (image) to a format suitable for base64 encoding
    _, buffer = cv2.imencode('.jpg', image)
    base64_image = base64.b64encode(buffer).decode('utf-8')
    return base64_image

def chat_with_bot(question, image):
    # Guard against a missing upload, which would otherwise crash cv2.imencode
    if image is None:
        return "Please upload an image first."

    base64_image = encode_image(image)

    headers = {
      "Content-Type": "application/json",
      "Authorization": f"Bearer {api_key}"
    }

    payload = {
      "model": "gpt-4-vision-preview",
      "messages": [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": question
            },
            {
              "type": "image_url",
              "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
              }
            }
          ]
        }
      ],
      "max_tokens": 1000
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    result = response.json()

    # Surface API errors instead of failing with a KeyError on a missing 'choices' field
    if "error" in result:
        return f"API error: {result['error']['message']}"

    # Extracting the descriptive text from the response
    return result['choices'][0]['message']['content']

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Image Analysis Chatbot")
    with gr.Row():
        question = gr.Textbox(label="Ask a question about the image")
        image = gr.Image(label="Upload an image")
        submit_btn = gr.Button("Submit")

    response = gr.Textbox(label="Response")
    submit_btn.click(fn=chat_with_bot, inputs=[question, image], outputs=response)

demo.launch()

Thanks for checking over my code. I appreciate your help!


Good job, looks great! 🙂
