I ran into an error today where the output of my vision model was wrapped in a triple-backtick ```json code fence around the object,
and hence parsing it with json.loads resulted in a JSON decode error. I recently migrated the code, getting rid of LangChain and the non-JSON approaches; when I last tested it, it was working, but it seems I still have work to do.
I looked through this thread: "How do I use the new JSON mode?" (post #64, by EricGT).
It says that the vision preview API was not able to output JSON, so people used other methods. Since GPT-4o is multimodal and supports JSON mode, that is no longer the case, right? The vision docs make no mention of the JSON format, so can you please help me understand the current state of the matter?
Here is my current vision code. My prompts clearly mention to output in JSON.
# Function to encode an image to base64 format
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the result decoded
    to a ``str`` using UTF-8. Any failure is logged with ``logging.error``
    and reported to the caller as ``None``; a missing file gets its own
    log message.
    """
    encoded = None
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
    except FileNotFoundError:
        logging.error(f"File not found: {image_path}")
    except Exception as e:
        # Best-effort helper: every other failure is logged and mapped to None.
        logging.error(f"Error encoding image: {e}")
    return encoded
# Function to create payload for image-based API requests
def create_image_payload(image_paths):
    """Build the ``image_url`` content parts for a list of image files.

    Each path is passed to ``encode_image``; paths for which it returns a
    falsy value (it logs the reason itself) are skipped, so the result may
    be shorter than the input or empty.

    Args:
        image_paths: Iterable of image file paths. ``None`` is treated as
            an empty iterable.

    Returns:
        A list of ``{"type": "image_url", "image_url": {"url": ...}}``
        dicts, in input order, where each URL is a base64 ``data:`` URL.
    """
    # Local import so the block does not depend on the file's import header.
    import mimetypes

    image_payload = []
    for image_path in image_paths or ():
        base64_image = encode_image(image_path)
        if not base64_image:
            continue
        # Label the data URL with the type implied by the file extension
        # instead of declaring every file a JPEG; fall back to the previous
        # "image/jpeg" when the extension is unknown or not an image type.
        guessed_type, _ = mimetypes.guess_type(str(image_path))
        if guessed_type and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/jpeg"
        image_payload.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
            }
        )
    return image_payload
# Function to generate headers and payload for Vision API
def generate_vision_payload(
    prompt,
    image_paths,
    *,
    model="gpt-4o",
    max_tokens=4000,
    response_format=None,
):
    """Build the HTTP headers and JSON body for a vision chat request.

    Args:
        prompt: Text placed first in the single user message.
        image_paths: Image file paths, converted by ``create_image_payload``
            and appended after the text part.
        model: Value for the request's ``"model"`` field.
        max_tokens: Value for the request's ``"max_tokens"`` field.
        response_format: Optional value for the request's
            ``"response_format"`` field, e.g. ``{"type": "json_object"}``
            to ask the API for JSON mode. Omitted from the payload when
            ``None``, which keeps the request identical to the previous
            behaviour.

    Returns:
        A ``(headers, payload)`` tuple, or a bare ``None`` (not a tuple)
        when no image could be added to the payload. Callers must check
        for ``None`` before unpacking.
    """
    image_payload = create_image_payload(image_paths)
    if not image_payload:
        logging.error("No valid images found for the payload")
        return None

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key_gpt4}",
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}, *image_payload],
            }
        ],
        "max_tokens": max_tokens,
    }
    if response_format is not None:
        payload["response_format"] = response_format
    return headers, payload
# Main function to make the Vision API request and return response content
def call_vision_api(prompt, image_paths, *, response_format=None):
    """Send ``prompt`` and the images to the Vision API and return the reply text.

    Args:
        prompt: Text prompt for the request.
        image_paths: Image file paths to attach.
        response_format: Optional value for the request's
            ``"response_format"`` field, e.g. ``{"type": "json_object"}``
            to ask the API for JSON mode. When ``None`` the payload is sent
            as generated.

    Returns:
        The ``content`` string of the first choice in the response, or
        ``None`` when the payload could not be built, the response has no
        ``"choices"`` key, or the call raised an exception. All failures
        are logged with ``logging.error``.
    """
    # generate_vision_payload returns a bare None on failure, so it must be
    # checked before tuple-unpacking; unpacking None raises TypeError and
    # the failure branch below would never run.
    request_parts = generate_vision_payload(prompt, image_paths)
    headers, payload = request_parts if request_parts else (None, None)
    if not payload:
        logging.error("Payload generation failed. Exiting Vision API call.")
        return None

    if response_format is not None:
        payload["response_format"] = response_format

    try:
        response = make_vision_api_call(headers=headers, json=payload)
        if response and "choices" in response:
            return response["choices"][0]["message"]["content"]
        logging.error("Vision API returned an invalid or empty response.")
    except Exception as e:
        # Top-level boundary for this call: log and report failure as None.
        logging.error(f"Error during Vision API call: {e}")
    return None
# Function to call the Vision API with retries
@retry_with_exponential_backoff
def make_vision_api_call(**kwargs):
    """POST to ``VISION_API_URL`` and return the decoded JSON response body.

    Args:
        **kwargs: Passed straight through to ``requests.post`` (for example
            ``headers=`` and ``json=``). A ``timeout`` supplied by the
            caller is respected; otherwise a default is applied.

    Returns:
        The value of ``response.json()``.

    Raises:
        requests.exceptions.RequestException: Logged and re-raised, for
            connection errors, timeouts, and non-2xx statuses (via
            ``raise_for_status``). NOTE(review): presumably the
            ``retry_with_exponential_backoff`` decorator relies on this
            exception to trigger a retry; confirm against its definition.
    """
    # requests applies no timeout by default, so a stalled connection would
    # block forever and never reach the retry logic. (connect, read) seconds;
    # the read limit is generous because the reply is not streamed.
    kwargs.setdefault("timeout", (10, 300))
    try:
        response = requests.post(VISION_API_URL, **kwargs)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during Vision API request: {e}")
        raise