Hi, how do I count the number of tokens each image uses with the gpt-4-vision-preview model?
According to the pricing page, every image is resized (if too big) in order to fit in a 1024x1024 square, and is first globally described by 85 base tokens.
Tiles
To be fully recognized, an image is covered by 512x512 tiles.
Each tile provides 170 tokens. So, by default, the formula is the following:
total tokens = 85 + 170 * n, where n = the number of tiles needed to cover your image.
Implementation
This can be easily computed this way:
from math import ceil
def resize(width, height):
if width > 1024 or height > 1024:
if width > height:
height = int(height * 1024 / width)
width = 1024
else:
width = int(width * 1024 / height)
height = 1024
return width, height
def count_image_tokens(width: int, height: int):
    """Return the token count for an image of the given pixel size.

    The size is first passed through ``resize``; the result is then covered
    with 512x512 tiles. The total is 85 base tokens plus 170 tokens per tile.
    """
    fitted_width, fitted_height = resize(width, height)
    tiles_across = ceil(fitted_width / 512)
    tiles_down = ceil(fitted_height / 512)
    tile_count = tiles_down * tiles_across
    return 85 + 170 * tile_count
Some examples
- 500x500 → 1 tile is enough to cover this up, so total tokens = 85+170 = 255
- 513x500 → you need 2 tiles → total tokens = 85+170*2 = 425
- 513x513 → you need 4 tiles → total tokens = 85+170*4 = 765
low_resolution mode
In “low resolution” mode, there is no tile; only the 85 base tokens remain, no matter the size of your image.
This makes sense to me, except when you use the calculator it seems to be resizing images.
Like what is going on here in a 2048x2048 image:
Why is it resizing, and why is this 4 tiles and not 16 tiles?
They should put this in official documentation
Thanks!
def calculate_image_tokens(width: int, height: int, detail: str = "high") -> int:
    """Estimate the token cost of one image for a vision model.

    In ``"low"`` detail mode the cost is a flat 85 tokens. Otherwise the size
    is reduced in two steps before 512x512 tiles are counted:

    1. If either side exceeds 2048, scale down to fit a 2048x2048 square.
    2. If the shorter side still exceeds 768, scale down so it equals 768.

    The total is 85 base tokens plus 170 tokens per tile.

    Args:
        width: Image width in pixels (expected to be positive).
        height: Image height in pixels (expected to be positive).
        detail: ``"low"`` for the flat rate; any other value uses tiling.

    Returns:
        The estimated number of tokens.
    """
    if detail == "low":
        return 85

    if width > 2048 or height > 2048:
        # Integer arithmetic avoids the rounding error of chained float
        # operations; the clamp keeps a very elongated image (e.g. 5000x1)
        # from collapsing to a zero-sized side and therefore zero tiles.
        if width > height:
            width, height = 2048, max(1, height * 2048 // width)
        else:
            width, height = max(1, width * 2048 // height), 2048

    if width >= height and height > 768:
        width, height = width * 768 // height, 768
    elif height > width and width > 768:
        width, height = 768, height * 768 // width

    tiles_width = ceil(width / 512)
    tiles_height = ceil(height / 512)
    return 85 + 170 * tiles_width * tiles_height
This node.js code helped me calculate the actual tokens
/**
 * Estimate the token cost of one image for a vision model.
 *
 * "low" detail is a flat 85 tokens. Otherwise the size is first scaled to fit
 * a 2048x2048 square, then scaled so the shorter side is at most 768, and the
 * result is covered with 512x512 tiles: 85 base tokens + 170 per tile.
 *
 * @param {number} width  Image width in pixels.
 * @param {number} height Image height in pixels.
 * @param {string} [detail="high"] "low" for the flat rate; anything else tiles.
 * @returns {number} Estimated token count.
 */
function calculateVisionPricing(width, height, detail = "high") {
  if (detail === "low") {
    return 85;
  }

  // Start from the real image size; defaulting to 768x768 would bill every
  // small image as four tiles.
  let newWidth = width;
  let newHeight = height;

  if (newWidth > 2048 || newHeight > 2048) {
    const aspectRatio = newWidth / newHeight;
    // Math.floor instead of parseInt: parseInt stringifies its argument first.
    // Clamp to 1 px so a very elongated image never yields zero tiles.
    if (aspectRatio > 1) {
      newHeight = Math.max(1, Math.floor(2048 / aspectRatio));
      newWidth = 2048;
    } else {
      newWidth = Math.max(1, Math.floor(2048 * aspectRatio));
      newHeight = 2048;
    }
  }

  // Work on the already-reduced size and update BOTH sides together.
  if (newWidth >= newHeight && newHeight > 768) {
    newWidth = Math.floor((768 / newHeight) * newWidth);
    newHeight = 768;
  } else if (newHeight > newWidth && newWidth > 768) {
    newHeight = Math.floor((768 / newWidth) * newHeight);
    newWidth = 768;
  }

  const tilesWidth = Math.ceil(newWidth / 512);
  const tilesHeight = Math.ceil(newHeight / 512);
  return 85 + 170 * (tilesWidth * tilesHeight);
}
This answer is out of date now
Here is what I am using atm:
/**
 * Estimate the token cost of one image for a vision model.
 *
 * "low" detail is a flat 85 tokens. Otherwise the size is scaled to fit a
 * 2048x2048 square, then (if both sides still exceed 768) scaled so the
 * shorter side is 768, and finally covered with 512x512 tiles:
 * 85 base tokens + 170 per tile.
 *
 * @param width  Image width in pixels.
 * @param height Image height in pixels.
 * @param detail 'low' for the flat rate; any other value uses tiling.
 * @returns Estimated token count.
 */
function calculateVisionPricing(width: number, height: number, detail: string = 'high'): number {
  if (detail === 'low') {
    return 85;
  }

  const maxSize = 2048;
  const minSize = 768;
  const tileSize = 512;

  // Work on copies so the caller-visible parameters are not reassigned.
  let scaledWidth = width;
  let scaledHeight = height;

  // Scale down to fit within a 2048 x 2048 square if necessary.
  if (scaledWidth > maxSize || scaledHeight > maxSize) {
    const aspectRatio = scaledWidth / scaledHeight;
    // Math.floor replaces parseInt(String(x)), which misreads exponent
    // notation; the clamp keeps a very elongated image (e.g. 5000x1) from
    // collapsing to a zero-sized side and therefore zero tiles.
    if (aspectRatio > 1) {
      scaledHeight = Math.max(1, Math.floor(maxSize / aspectRatio));
      scaledWidth = maxSize;
    } else {
      scaledWidth = Math.max(1, Math.floor(maxSize * aspectRatio));
      scaledHeight = maxSize;
    }
  }

  // Resize so the shortest side is 768px when both sides exceed 768px.
  if (scaledWidth > minSize && scaledHeight > minSize) {
    const aspectRatio = scaledWidth / scaledHeight;
    if (aspectRatio > 1) {
      scaledWidth = Math.floor(minSize * aspectRatio);
      scaledHeight = minSize;
    } else {
      scaledHeight = Math.floor(minSize / aspectRatio);
      scaledWidth = minSize;
    }
  }

  const tilesWidth = Math.ceil(scaledWidth / tileSize);
  const tilesHeight = Math.ceil(scaledHeight / tileSize);
  return 85 + 170 * (tilesWidth * tilesHeight);
}
/**
 * Run calculateVisionPricing against a fixed table of sizes, logging one
 * PASSED/FAILED line per case followed by an overall summary line.
 */
function runTests() {
  const cases = [
    { width: 128, height: 128, detail: 'high', expected: 255 },
    { width: 512, height: 512, detail: 'high', expected: 255 },
    { width: 612, height: 134, detail: 'high', expected: 425 },
    { width: 767, height: 767, detail: 'high', expected: 765 },
    { width: 900, height: 767, detail: 'high', expected: 765 },
    { width: 900, height: 900, detail: 'high', expected: 765 },
    { width: 3000, height: 1200, detail: 'high', expected: 1445 },
    { width: 3000, height: 5000, detail: 'high', expected: 1105 },
    { width: 4096, height: 8192, detail: 'low', expected: 85 },
  ];

  // Count failures instead of and-ing a flag; the summary only needs "any failed?".
  let failureCount = 0;
  cases.forEach(({ width, height, detail, expected }) => {
    const result = calculateVisionPricing(width, height, detail);
    const ok = result === expected;
    if (!ok) {
      failureCount += 1;
    }
    console.log(`Test ${ok ? 'PASSED' : 'FAILED'}: width=${width}, height=${height}, detail=${detail}, expected=${expected}, got=${result}`);
  });

  console.log(failureCount === 0 ? 'All tests passed!' : 'Some tests failed.');
}