How do I do few-shot prompting with images using the GPT-4 Vision API message structure? Can someone provide code to do so?
Welcome to the community @deeksha.s.nayak
You can use multi-modal content in messages to pass image(s) and the desired text outcome in content blocks, akin to how it's used in this example from the docs.
It would be essentially the same as sending other few-shot examples: you give an input, and you demonstrate the way the AI responds, so that it can begin following a pattern.
These latest models, such as the 1106 version of gpt-4-turbo that vision is based on, are highly trained on chat responses, so previous input has far less impact on behavior.
After the system message (which still needs to demonstrate the task to the AI), you then pass example messages as if they were chat that had already occurred.
One can also experiment with how the AI interprets the “name” field, where you can use a name like “example”.
multi_shot_messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Perform the programmed vision task on two images"
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{base64_image1}", "detail": "low"}
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{base64_image2}", "detail": "high"}
            }
        ]
    },
    {
        "role": "assistant",
        "content": '{"similarity": 75, "commonality": "dogs"}'
    }
]
If you read the response the AI writes, you also can probably figure out what I want you to do…
Then, to build your final API call:
messages = system + multi_shot_messages + history + user_input
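A minimal sketch of assembling those pieces with the openai Python SDK, reusing the multi_shot_messages list above (the system text, placeholder image URLs, and the "name" tagging are only illustrative, not required by the API):

# Minimal sketch: assumes the openai Python SDK (v1+) and OPENAI_API_KEY set in
# the environment; the system text and image URLs below are placeholders.
from openai import OpenAI

client = OpenAI()

system = [
    {
        "role": "system",
        "content": 'You compare two images and answer only in JSON, e.g. {"similarity": 0-100, "commonality": "noun"}.'
    }
]

# Optionally tag the demonstration turns with the "name" field mentioned above
for message in multi_shot_messages:
    message["name"] = "example"

history = []  # prior real conversation turns, if any

user_input = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Perform the programmed vision task on two images"},
            {"type": "image_url", "image_url": {"url": "https://example.com/image1.jpg", "detail": "low"}},
            {"type": "image_url", "image_url": {"url": "https://example.com/image2.jpg", "detail": "low"}}
        ]
    }
]

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=system + multi_shot_messages + history + user_input,
    max_tokens=300,
)
print(response.choices[0].message.content)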
I'm a bit confused as to how to incorporate this into my code. The code below is supposed to identify a product on a retail shelf. It performs badly at counting, which is why I want to try few-shot prompting to see whether it helps. Could you please help me figure out how to modify it?
import streamlit as st
import requests
import base64
def main():
    st.title("Product Detection on Retail Shelf")

    # File upload widgets
    uploaded_product_image = st.file_uploader("Upload Product Image", type=["jpg", "jpeg"])
    uploaded_shelf_image = st.file_uploader("Upload Retail Shelf Image", type=["jpg", "jpeg"])

    if uploaded_product_image and uploaded_shelf_image:
        product_image_content = uploaded_product_image.read()
        shelf_image_content = uploaded_shelf_image.read()

        # Display uploaded images
        st.image(product_image_content, caption='Uploaded Product Image', use_column_width=True)
        st.image(shelf_image_content, caption='Uploaded Shelf Image', use_column_width=True)

        # Check if the product is present on the retail shelf
        product_present = check_product_presence(product_image_content, shelf_image_content)

        if product_present:
            st.write("The product is present on the retail shelf.")
        else:
            st.write("The product is not present on the retail shelf.")

        # Allow the user to ask questions if the product is present
        if product_present:
            question = st.text_input("Ask a question:")
            if st.button("Ask"):
                if question:
                    answer = ask_question(question, product_image_content, shelf_image_content)
                    st.markdown(answer)
                else:
                    st.warning("Please enter a question.")
    else:
        st.warning("Please upload both product and shelf images.")
def check_product_presence(product_image_content, shelf_image_content):
    # Encode images to base64
    encoded_product_image = base64.b64encode(product_image_content).decode('ascii')
    encoded_shelf_image = base64.b64encode(shelf_image_content).decode('ascii')

    # Configuration
    GPT4V_KEY = ""
    headers = {
        "Content-Type": "application/json",
        "api-key": GPT4V_KEY,
    }

    # Payload for the request
    payload = {
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "Is product in product image present in shelf image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_product_image}"
                        }
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_shelf_image}"
                        }
                    }
                ]
            }
        ],
        "temperature": 0.5,
        # "top_p": 0.95,
        "max_tokens": 800
    }

    GPT4V_ENDPOINT = ""

    # Send request
    try:
        response = requests.post(GPT4V_ENDPOINT, headers=headers, json=payload)
        response.raise_for_status()  # Raises an HTTPError if the request returned an unsuccessful status code
    except requests.RequestException as e:
        st.error(f"Failed to make the request. Error: {e}")
        return False

    # Handle the response as needed (e.g., print or process)
    response_json = response.json()
    if 'choices' in response_json and len(response_json['choices']) > 0 and 'message' in response_json['choices'][0]:
        content = response_json['choices'][0]['message']['content']
        return "yes" in content.lower()
    return False
def ask_question(question, product_image_content, shelf_image_content):
    # Encode images to base64
    encoded_product_image = base64.b64encode(product_image_content).decode('ascii')
    encoded_shelf_image = base64.b64encode(shelf_image_content).decode('ascii')

    # Configuration
    GPT4V_KEY = ""
    headers = {
        "Content-Type": "application/json",
        "api-key": GPT4V_KEY,
    }

    # Payload for the request
    payload = {
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": question
                    },
                    {
                        "type": "text",
                        "text": "Product Image"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_product_image}"
                        }
                    },
                    {
                        "type": "text",
                        "text": "Retail Shelf Image"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_shelf_image}"
                        }
                    }
                ]
            }
        ],
        "temperature": 0.5,
        # "top_p": 0.95,
        "max_tokens": 800
    }

    GPT4V_ENDPOINT = ""

    # Send request
    try:
        response = requests.post(GPT4V_ENDPOINT, headers=headers, json=payload)
        response.raise_for_status()  # Raises an HTTPError if the request returned an unsuccessful status code
    except requests.RequestException as e:
        return f"Failed to make the request. Error: {e}"

    # Handle the response as needed (e.g., print or process)
    response_json = response.json()
    if 'choices' in response_json and len(response_json['choices']) > 0 and 'message' in response_json['choices'][0]:
        content = response_json['choices'][0]['message']['content']
        return content
    return "No answer was returned by the API."  # fallback so st.markdown never receives None
if __name__ == "__main__":
    main()
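For context, here is roughly how I imagined slotting a few-shot example into the payload of check_product_presence. The encoded_example_product / encoded_example_shelf variables and the hard-coded assistant answer are placeholders of my own, so I'm not sure this structure is right:

# Sketch only: encoded_example_product / encoded_example_shelf would be base64
# strings of a hand-labelled example pair; the assistant answer is hard-coded.
few_shot_messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Is the product in the product image present in the shelf image? How many units do you count?"
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded_example_product}"}
            },
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded_example_shelf}"}
            }
        ]
    },
    {
        "role": "assistant",
        "content": "Yes, it is present. I count 4 units on the second shelf from the top."
    }
]

payload = {
    "messages": [
        {
            "role": "system",
            "content": "You compare a product image against a retail shelf image, say whether the product is present, and count the units."
        },
        *few_shot_messages,
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Is the product in the product image present in the shelf image? How many units do you count?"
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encoded_product_image}"}
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encoded_shelf_image}"}
                }
            ]
        }
    ],
    "temperature": 0.5,
    "max_tokens": 800
}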
Were you able to figure this out? I'm banging my head against this as well.
Yes. This kind of helped.
payload = {
    "model": "gpt-4-vision-preview",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Is the product in product image1 present in the shelf image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_product_image1}"
                    }
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_shelf_image1}"
                    }
                },
                {
                    "type": "text",
                    "text": "Yes it is present. It is present on second shelf from top."
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Is the product in product image present in the shelf image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_product_image2}"
                    }
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_shelf_image1}"
                    }
                },
                {
                    "type": "text",
                    "text": "Yes it is present. It is located on the top shelf in the middle."
                }
            ]
        },
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "Is the product in product image present in the shelf image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_product_image3}"
                    }
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_shelf_image2}"
                    }
                }
            ]
        }
    ],
What is the issue? You said, "This kind of helped."