When sending many low-resolution JPEG frames through the Chat Completions API using detail:"low" (or "auto"), GPT-4o and GPT-4o-mini produce massively inflated prompt token counts.
Even with 128×72 JPEG images, the models tokenize 49k–180k tokens for a simple MCQ prompt with:
- 32 main video frames
- 4 options × 8 frames each
- Total images = 64
Expected usage (per docs):
64 images × ~85 tokens ≈ 5,440 tokens
Actual usage:
GPT-4o → ~49,000 prompt tokens
GPT-4o-mini → ~180,000 prompt tokens
This suggests detail:"low" is not taking effect, and the entire base64 payload + structural JSON is being fully tokenized.
I have tried resizing images to quite small dimensions, and also reducing the jpeg compression quality, but the issue persists (exact same token usage).
Here’s an example script similar to the one I am running:
import base64
import os

import cv2
import numpy as np
from openai import OpenAI

# --- Configuration ----------------------------------------------------------
# NOTE: the original paste used curly "smart quotes" (e.g. “/path/to/video.mp4”),
# which are a SyntaxError in Python; all literals below use plain ASCII quotes.
VIDEO_PATH = "/path/to/video.mp4"
OPTION_FRAME_ROOT = "/path/to/options"  # contains A/B/C/D folders
NUM_MAIN_FRAMES = 32     # frames sampled evenly from the main video
NUM_OPTION_FRAMES = 8    # frames loaded per option folder
MODEL = "gpt-4o"

client = OpenAI()  # reads OPENAI_API_KEY from the environment
def extract_frames(video_path, n=32):
    """Sample ``n`` evenly spaced frames from a video as base64 JPEG strings.

    Args:
        video_path: Path to a video file readable by OpenCV.
        n: Number of frames to sample (default 32).

    Returns:
        List of base64-encoded JPEG strings, resized to 426x240 at JPEG
        quality 70. May be shorter than ``n`` if some frames fail to decode.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Guard: unreadable video reports total <= 0; n == 0 would divide by zero.
        if total <= 0 or n <= 0:
            return []
        idxs = [int(i * total / n) for i in range(n)]
        out = []  # the original paste dropped the `[]` literal here
        for idx in idxs:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ok, frame = cap.read()
            if not ok:
                continue  # skip frames the decoder cannot seek to / decode
            frame = cv2.resize(frame, (426, 240))
            _, buf = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 70])
            out.append(base64.b64encode(buf).decode("utf-8"))
        return out
    finally:
        # Release the capture even if resize/encode raises (original leaked it).
        cap.release()
main_frames = extract_frames(VIDEO_PATH, NUM_MAIN_FRAMES)
def load_option_frames(folder):
    """Load up to NUM_OPTION_FRAMES images from ``folder`` as base64 JPEGs.

    Args:
        folder: Directory containing one option's frame image files.

    Returns:
        List of base64-encoded JPEG strings (426x240, quality 70), in sorted
        filename order. Non-image files in the folder are skipped.
    """
    frames = []  # the original paste dropped the `[]` literal here
    files = sorted(os.listdir(folder))[:NUM_OPTION_FRAMES]
    for f in files:
        img = cv2.imread(os.path.join(folder, f))
        if img is None:
            # cv2.imread returns None for unreadable/non-image files;
            # passing None to cv2.resize would raise.
            continue
        img = cv2.resize(img, (426, 240))
        _, buf = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), 70])
        frames.append(base64.b64encode(buf).decode("utf-8"))
    return frames
# Load the candidate-answer frames, keyed by option letter.
options = {
    k: load_option_frames(os.path.join(OPTION_FRAME_ROOT, k))
    for k in ["A", "B", "C", "D"]
}

# Assemble the multimodal message content: intro text, the main frames,
# the question, then each option label followed by its frames.
content = [{"type": "text", "text": "Analyze the following video frames:"}]
for b64 in main_frames:
    content.append({
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{b64}",
            # "low" requests the fixed low-detail token budget per image.
            "detail": "low",
        },
    })
content.append({"type": "text", "text": "Question: What is happening?"})
for key in ["A", "B", "C", "D"]:
    content.append({"type": "text", "text": f"Option {key}:"})
    for b64 in options[key]:
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{b64}",
                "detail": "low",
            },
        })
content.append({"type": "text", "text": "Answer with one letter."})

messages = [{"role": "user", "content": content}]
resp = client.chat.completions.create(
    model=MODEL,
    messages=messages,
    temperature=0,
)

# Report token accounting from the API's own usage object.
usage = resp.usage
print("prompt_tokens:", usage.prompt_tokens)
print("completion_tokens:", usage.completion_tokens)