You’d want to:
- constrain the size of inputs, targeting a particular AI input form
- reduce the number of individual API calls to services
- reduce the number of requests made by services to remote resources
- reduce the internal API lookups against persistent state content
- avoid frontend services that do extra intermediary work and return excessive data
- don’t use models that perform extra internal deliberation (reasoning) before the response; instead, use ones with high token production rate, and limit that production to just what length is needed.
- don’t use services such as strict structured outputs that need artifacts built, cached, retrieved, cache missed.
So essentially, that means using chat completions with an instructed JSON schema (you can use multiple examples also in system messages to ensure JSON quality, as input context runs fast). Then a fast token-producing model, and one that responds to “service_tier”: “priority”.
One thing to note is that gpt-4.1-mini and gpt-5-mini use “patches” for images. These models don’t respond to “detail”:“low” by giving a fixed cost from a fast run over only a single 512px x 512px max “tile” for vision. Instead, they use a pretty complicated formula for up to 1536 tokens of input.
What you’d want to do for transmission:
- keep extra network and file transfers low - images in memory, directly sent as base64.
- don’t resize images that the endpoint also wouldn’t automatically resize, unless you measure a threshold of network transmission and encoding time where this would be beneficial.
I skipped past benchmarking models on vision, which I’ve done before to generalize performance, and where each model has a setup time per image that adds a significant step of a few seconds compared to runs without images. Just use gpt-4.1-mini and cap the size of the image input lower, both for the benefit of transmission and for decompression by OpenAI.
What I have for you instead is a chat completions attachment-maker, creating the additional parts of content for a user message. Then, I attached the “patches” formula right out of my own pricing calculator that agrees with OpenAI down to the pixel (and is closer than OpenAI’s own pricing calculator to what the API delivers on extreme edge cases).
You can pass in a file path location or a PIL Image type already loaded (such as a user upload that never hit disk except for posterity).
Then you have detail:low actually do something — cap the input tokens at half when resizing, staying aware of the underlying format. Or you can pass the helper function your own patches-to-tokens budget and have it do the magic, informed by the model.
Here is that in Python as our coding fun for the day. Give it all the CPU you’ve got.
from __future__ import annotations
from pathlib import Path
from io import BytesIO
import base64
from typing import Any
from PIL import Image, ImageOps
def calculate_patches_tokens_and_resized(
    original_width: int,
    original_height: int,
    *,
    patch_size: int = 32,
    max_patches: int = 1536,
    max_dim_absolute: int = 2000,
) -> dict[str, int]:
    """
    Compute the vision "patches" token cost and server-side resize dimensions.

    Port of the provided JS algorithm. Do not change the logic.

    Args:
        original_width: Source image width in pixels.
        original_height: Source image height in pixels.
        patch_size: Edge length of one square patch (default 32 px).
        max_patches: Patch/token budget the image must fit inside.
        max_dim_absolute: Hard cap applied to either dimension first.

    Returns:
        {"tokens", "resizedWidth", "resizedHeight", "patchW", "patchH"}.
    """
    width = original_width
    height = original_height

    # Stage 1: hard-cap either dimension at max_dim_absolute, keeping aspect.
    if original_width > max_dim_absolute or original_height > max_dim_absolute:
        scale_ratio = 1.0
        if original_width > max_dim_absolute:
            scale_ratio = min(scale_ratio, max_dim_absolute / original_width)
        if original_height > max_dim_absolute:
            scale_ratio = min(scale_ratio, max_dim_absolute / original_height)
        # `// 1` floors the float exactly as the JS source does before int().
        width = int(width * scale_ratio // 1)
        height = int(height * scale_ratio // 1)
        if width == 0:
            width = 1
        if height == 0:
            height = 1

    def _ceil_div(a: int, b: int) -> int:
        return (a + b - 1) // b

    # Stage 2: if the capped image already fits the patch budget, we're done.
    initial_patch_w = _ceil_div(width, patch_size)
    initial_patch_h = _ceil_div(height, patch_size)
    initial_total = initial_patch_w * initial_patch_h
    if initial_total <= max_patches:
        return {
            "tokens": initial_total,
            "resizedWidth": width,
            "resizedHeight": height,
            "patchW": initial_patch_w,
            "patchH": initial_patch_h,
        }

    # Stage 3: area-based shrink so the patch grid fits max_patches...
    first_scale = ((max_patches * (patch_size ** 2)) / (width * height)) ** 0.5
    width1 = int((width * first_scale) // 1)
    height1 = int((height * first_scale) // 1)

    # ...then snap width down to a whole number of patches and rescale height
    # by the same correction factor.
    patch_w1 = width1 / patch_size
    final_patch_w = int(patch_w1 // 1)
    adjustment_scale = (final_patch_w / patch_w1) if patch_w1 != 0 else 0.0
    width_final = int((width1 * adjustment_scale) // 1)
    height_final = int((height1 * adjustment_scale) // 1)
    final_patch_h = _ceil_div(height_final, patch_size)
    final_tokens = final_patch_w * final_patch_h

    # Stage 4: the ceil on height can still overshoot by one row; drop it.
    if final_tokens > max_patches:
        final_patch_h -= 1
        height_final = final_patch_h * patch_size
        final_tokens = final_patch_w * final_patch_h

    # Degenerate guard: a dimension that collapsed to zero patches costs 0.
    if final_patch_w <= 0 or final_patch_h <= 0:
        final_patch_w = max(0, final_patch_w)
        final_patch_h = max(0, final_patch_h)
        final_tokens = 0
    if final_patch_w == 0:
        width_final = 0
    if final_patch_h == 0:
        height_final = 0

    return {
        "tokens": final_tokens,
        "resizedWidth": width_final,
        "resizedHeight": height_final,
        "patchW": final_patch_w,
        "patchH": final_patch_h,
    }
def make_content_part(item: str | Image.Image, *, kind: str | None = None, detail: str = "auto") -> dict[str, Any]:
    """
    Create a single chat content part for Chat Completions.

    Output shapes:
      - text            => {"type": "text", "text": "..."}
      - PDF file        => {"type": "file", "file": {"file_id": "..."}}
                           or {"type": "file", "file": {"filename": "...", "file_data": "data:application/pdf;base64,..."}}  # local only
      - audio (mp3/wav) => {"type": "input_audio", "input_audio": {"data": "...", "format": "mp3|wav"}}
      - image           => {"type": "image_url", "image_url": {"url": "<http(s) or data: URI>", "detail": "low|high|auto"}}

    Classification precedence (default):
      1) file id: length 20-39 and startswith "file-" or "file_"
      2) image URL: http(s) with no whitespace OR data:image/*;base64,...
      3) local file: existing path:
         - .pdf -> PDF "file"
         - .mp3/.wav -> input_audio
         - else -> image as data URI (JPEG if no extension)
      4) PIL.Image.Image -> image as data URI (auto downsize if needed)
      5) fallback -> plain text

    Args:
        item: Text, URL, data URI, uploaded-file id, local path, or PIL image.
        kind: Hint ("text" | "file" | "image" | "audio"); used only if it is
            satisfiable without violating the spec, else auto-classification.
        detail: Passed only for image content; defaults to "auto". Local files
            and PIL images are auto-downsized to the token/patch budget implied
            by this value, using calculate_patches_tokens_and_resized().
            Data-URI/base64 and http(s) inputs are passed through untouched.

    Returns:
        One content-part dict for a user message's "content" array.
    """
    def _normalize_detail(d: str) -> str:
        # Anything unrecognized (or empty) degrades safely to "auto".
        d = (d or "auto").lower()
        return d if d in {"low", "high", "auto"} else "auto"

    # Patch budgets per detail level. Tweakable per model if needed; the
    # patches algorithm is fixed, only these budgets vary.
    DETAIL_BUDGET: dict[str, int] = {
        "low": 768,
        "high": 1536,
        "auto": 1536,
    }
    PATCH_SIZE = 32
    MAX_DIM_ABSOLUTE = 2000

    def _is_http_url(s: str) -> bool:
        return isinstance(s, str) and s.startswith(("http://", "https://")) and not any(ch.isspace() for ch in s)

    def _is_data_uri(s: str) -> bool:
        return isinstance(s, str) and s.startswith("data:")

    def _is_data_image_uri(s: str) -> bool:
        return isinstance(s, str) and s.startswith("data:image/")

    # Extension -> MIME type for image payloads; blank extension means JPEG.
    _EXT_MIME: dict[str, str] = {
        "": "image/jpeg", "jpg": "image/jpeg", "jpeg": "image/jpeg",
        "png": "image/png", "gif": "image/gif", "webp": "image/webp",
        "tif": "image/tiff", "tiff": "image/tiff", "bmp": "image/bmp",
        "svg": "image/svg+xml", "ico": "image/x-icon",
    }

    def _image_mime_for_ext(ext: str) -> str:
        e = ext.lower().lstrip(".")
        # Unknown extensions fall back to a best-effort image/<ext>.
        return _EXT_MIME.get(e, f"image/{e}")

    # Extension -> PIL save format; anything unrecognized re-encodes as JPEG.
    _EXT_PIL_FMT: dict[str, str] = {
        "jpg": "JPEG", "jpeg": "JPEG", "": "JPEG",
        "png": "PNG", "gif": "GIF", "webp": "WEBP",
        "tif": "TIFF", "tiff": "TIFF", "bmp": "BMP",
    }

    def _pil_format_for_ext(ext: str) -> str:
        return _EXT_PIL_FMT.get(ext.lower().lstrip("."), "JPEG")

    def _as_image_url(url: str) -> dict[str, Any]:
        return {"type": "image_url", "image_url": {"url": url, "detail": detail_norm}}

    # PIL format -> MIME for data URIs; unsupported formats re-encode as PNG.
    _FMT_MIME: dict[str, str] = {
        "JPEG": "image/jpeg", "PNG": "image/png", "WEBP": "image/webp",
        "GIF": "image/gif", "TIFF": "image/tiff", "TIF": "image/tiff",
        "BMP": "image/bmp",
    }

    def _encode_pil_to_data_uri(img: Image.Image, *, preferred_format: str | None = None) -> tuple[str, str]:
        """Serialize a PIL image to (data URI, mime)."""
        fmt = (preferred_format or (img.format or "PNG")).upper()
        mime = _FMT_MIME.get(fmt)
        if mime is None:
            fmt, mime = "PNG", "image/png"
        if fmt == "JPEG" and img.mode not in ("RGB", "L"):
            # JPEG cannot hold alpha/palette modes; flatten to RGB.
            img = img.convert("RGB")
        buf = BytesIO()
        save_kwargs: dict[str, Any] = {}
        if fmt in {"JPEG", "WEBP"}:
            save_kwargs.update({"quality": 95, "optimize": True})
        img.save(buf, format=fmt, **save_kwargs)
        b64 = base64.b64encode(buf.getvalue()).decode("ascii")
        return f"data:{mime};base64,{b64}", mime

    def _maybe_downsize_pil(img: Image.Image) -> Image.Image:
        """Return img (EXIF-upright) downsized to the detail budget if needed."""
        # BUGFIX: apply EXIF orientation BEFORE measuring, so the budget is
        # computed against the displayed width/height (rotated images would
        # otherwise get a swapped, aspect-distorting target size).
        img = ImageOps.exif_transpose(img)
        w, h = img.size
        budget = DETAIL_BUDGET[detail_norm]
        calc = calculate_patches_tokens_and_resized(
            w, h, patch_size=PATCH_SIZE, max_patches=budget, max_dim_absolute=MAX_DIM_ABSOLUTE
        )
        target_w = calc["resizedWidth"]
        target_h = calc["resizedHeight"]
        if target_w <= 0 or target_h <= 0:
            # Degenerate result; send the original rather than a 0-px image.
            return img
        if target_w < w or target_h < h:
            return img.resize((target_w, target_h), Image.Resampling.LANCZOS)
        return img

    def _file_path_to_image_data_uri(p: Path) -> str:
        """Load an image file, downsize if needed, and inline as a data URI."""
        ext = p.suffix.lower().lstrip(".")
        if ext == "svg":
            # SVG is not raster; pass bytes through untouched.
            raw = p.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
            return f"data:image/svg+xml;base64,{b64}"
        try:
            with Image.open(p) as im:
                im2 = _maybe_downsize_pil(im)
                # Keep original file's intent where possible
                fmt = _pil_format_for_ext(ext)
                uri, _ = _encode_pil_to_data_uri(im2, preferred_format=fmt)
                return uri
        except Exception:
            # Fallback: pass-through bytes as if image/* (may be non-raster)
            raw = p.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
            mime = _image_mime_for_ext(ext)
            return f"data:{mime};base64,{b64}"

    detail_norm = _normalize_detail(detail)

    # PIL.Image.Image input: downsize if needed, then inline as a data URI.
    if isinstance(item, Image.Image):
        im = _maybe_downsize_pil(item)
        # Prefer PNG if alpha present (JPEG would destroy transparency).
        has_alpha = ("A" in im.getbands())
        preferred = "PNG" if has_alpha else "JPEG"
        uri, _mime = _encode_pil_to_data_uri(im, preferred_format=preferred)
        return _as_image_url(uri)

    # String input from here on.
    s = item
    is_file_id = isinstance(s, str) and 20 <= len(s) < 40 and s.startswith(("file-", "file_"))
    is_http = _is_http_url(s) if isinstance(s, str) else False
    is_data_img = _is_data_image_uri(s) if isinstance(s, str) else False
    is_data_any = _is_data_uri(s) if isinstance(s, str) else False
    p = Path(s) if isinstance(s, str) else None
    exists = bool(p and p.exists() and p.is_file())
    ext = p.suffix.lower().lstrip(".") if exists else ""

    # Respect the hint when possible
    if kind:
        k = kind.lower().strip()
        if k in {"text", "input_text"}:
            return {"type": "text", "text": str(s)}
        if k in {"file", "pdf"}:
            if is_file_id:
                return {"type": "file", "file": {"file_id": str(s)}}
            if is_data_any:
                # Pass-through any data:* as "local" file payload
                return {"type": "file", "file": {"filename": "inline", "file_data": str(s)}}
            if exists and ext == "pdf":
                b64 = base64.b64encode(p.read_bytes()).decode("ascii")
                return {"type": "file", "file": {"filename": p.name, "file_data": f"data:application/pdf;base64,{b64}"}}
            # fall through to auto if not satisfiable
        if k in {"audio", "input_audio"}:
            if exists and ext in {"mp3", "wav"}:
                b64 = base64.b64encode(p.read_bytes()).decode("ascii")
                return {"type": "input_audio", "input_audio": {"data": b64, "format": ext}}
            # fall through to auto
        if k in {"image", "image_url"}:
            if is_http or is_data_img:
                return _as_image_url(str(s))
            if is_data_any:
                # Pass-through any data:* (e.g., prebuilt data:image/*) unmodified
                return _as_image_url(str(s))
            if exists:
                uri = _file_path_to_image_data_uri(p)
                return _as_image_url(uri)
            # fall through to auto

    # Default auto-classification
    if is_file_id:
        return {"type": "file", "file": {"file_id": str(s)}}
    if is_http or is_data_img:
        return _as_image_url(str(s))
    if exists:
        raw = p.read_bytes()
        b64 = base64.b64encode(raw).decode("ascii")
        if ext == "pdf":
            return {"type": "file", "file": {"filename": p.name, "file_data": f"data:application/pdf;base64,{b64}"}}
        if ext in {"mp3", "wav"}:
            return {"type": "input_audio", "input_audio": {"data": b64, "format": ext}}
        # Image (or other) file => use PIL-driven path if possible to allow downsizing
        try:
            with Image.open(p) as _:
                uri = _file_path_to_image_data_uri(p)
                return _as_image_url(uri)
        except Exception:
            # Not readable by PIL: pass bytes through with a guessed image MIME.
            mime = _image_mime_for_ext(ext)
            return _as_image_url(f"data:{mime};base64,{b64}")
    return {"type": "text", "text": str(s)}
This is ignorant of non-patches models, and is for your “fast” application.