Multimodal GPT-4o ignores detail: "low" for small JPEG images

When sending many low-resolution JPEG frames through the Chat Completions API using detail:"low" (or "auto"), GPT-4o and GPT-4o-mini report massively inflated prompt token counts.

Even with images downscaled to 128×72, usage reports 49k–180k prompt tokens for a simple MCQ prompt containing:

  • 32 main video frames

  • 4 options × 8 frames each

  • Total images = 64

Expected usage (per docs):
64 images × 85 tokens ≈ 5,440 tokens (plus a small amount of text)

Actual usage:
GPT-4o → ~49,000 prompt tokens
GPT-4o-mini → ~180,000 prompt tokens
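
For reference, here is the arithmetic the expected figure comes from (the docs specify a flat 85 tokens per image at detail:"low"; the comparison below uses the observed numbers above):

# Documented flat cost at detail:"low" is 85 tokens per image,
# so the image portion of this prompt should cost:
num_images = 32 + 4 * 8   # main frames + 4 options x 8 frames = 64
print(num_images * 85)    # 5440 -- roughly 9x less than gpt-4o reports,
                          # and roughly 33x less than gpt-4o-mini reports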

This suggests detail:"low" is not taking effect, and that the entire base64 payload, plus the structural JSON, is being tokenized as plain text.
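
One rough way to test that hypothesis locally is to count the text tokens in a single data URL with tiktoken and project over all 64 images (a sketch; frame.jpg stands in for any one of the resized frames, and o200k_base is the text encoding GPT-4o uses):

# Rough check of the "base64 is tokenized as text" hypothesis: if it holds,
# the reported usage should be close to the text-token length of the data URLs.
import base64
import tiktoken

enc = tiktoken.get_encoding("o200k_base")  # GPT-4o's text encoding

with open("frame.jpg", "rb") as f:         # any one of the resized frames
    b64 = base64.b64encode(f.read()).decode("utf-8")

data_url = f"data:image/jpeg;base64,{b64}"
per_image = len(enc.encode(data_url))
print("text tokens in one data URL:", per_image)
print("projected for 64 images:", per_image * 64)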

I have tried resizing the images to very small dimensions and reducing the JPEG compression quality, but the issue persists (the token usage is exactly the same).

Here’s an example script similar to the one I am running:

import base64, cv2, numpy as np, os
from openai import OpenAI

VIDEO_PATH = "/path/to/video.mp4"
OPTION_FRAME_ROOT = "/path/to/options"  # contains A/B/C/D folders
NUM_MAIN_FRAMES = 32
NUM_OPTION_FRAMES = 8
MODEL = "gpt-4o"

client = OpenAI()

def extract_frames(video_path, n=32):
    # Sample n frames evenly across the video, downscale them, and return
    # base64-encoded JPEG strings.
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    idxs = [int(i * total / n) for i in range(n)]
    out = []
    for idx in idxs:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            continue
        frame = cv2.resize(frame, (426, 240))
        _, buf = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 70])
        out.append(base64.b64encode(buf).decode("utf-8"))
    cap.release()
    return out

main_frames = extract_frames(VIDEO_PATH, NUM_MAIN_FRAMES)

def load_option_frames(folder):
    # Load the first NUM_OPTION_FRAMES images from an option folder as
    # base64-encoded JPEG strings.
    frames = []
    files = sorted(os.listdir(folder))[:NUM_OPTION_FRAMES]
    for f in files:
        img = cv2.imread(os.path.join(folder, f))
        img = cv2.resize(img, (426, 240))
        _, buf = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), 70])
        frames.append(base64.b64encode(buf).decode("utf-8"))
    return frames

options = {k: load_option_frames(os.path.join(OPTION_FRAME_ROOT, k))
           for k in ["A", "B", "C", "D"]}

# Build one user message interleaving text and image parts, with
# detail:"low" set on every image.
content = [{"type": "text", "text": "Analyze the following video frames:"}]

for b64 in main_frames:
    content.append({"type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{b64}",
                                  "detail": "low"}})

content.append({"type": "text", "text": "Question: What is happening?"})

for key in ["A", "B", "C", "D"]:
    content.append({"type": "text", "text": f"Option {key}:"})
    for b64 in options[key]:
        content.append({"type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64}",
                                      "detail": "low"}})

content.append({"type": "text", "text": "Answer with one letter."})
messages = [{"role": "user", "content": content}]

resp = client.chat.completions.create(
    model=MODEL,
    messages=messages,
    temperature=0
)

usage = resp.usage
print("prompt_tokens:", usage.prompt_tokens)
print("completion_tokens:", usage.completion_tokens)
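
To isolate the problem, a single-image request makes the discrepancy easy to read off: at detail:"low" prompt_tokens should come out around 85 plus a handful of text tokens, so anything in the thousands points at the payload being miscounted. (A sketch meant to be run after the script above; it reuses client, MODEL, and main_frames, and the prompt string is illustrative.)

# Minimal single-image check: at detail:"low" this should report roughly
# 85 image tokens plus a few text tokens; a much larger number reproduces
# the bug in isolation.
single = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": [
        {"type": "text", "text": "Describe this frame in one word."},
        {"type": "image_url",
         "image_url": {"url": f"data:image/jpeg;base64,{main_frames[0]}",
                       "detail": "low"}},
    ]}],
    temperature=0,
)
print("single-image prompt_tokens:", single.usage.prompt_tokens)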

Thank you for such a thorough report! The backend fix has now been merged and deployed - let me know how everything looks on your end now.
