Hi everyone! I’m using the gpt-4o model for video understanding. I take one frame per second from the video and send it to the model. Here is my simplified code:
import cv2
import base64
def process_videos(video_paths, seconds_per_frame=1):
    """Sample one frame every `seconds_per_frame` seconds from each video.

    Args:
        video_paths: iterable of file paths readable by OpenCV.
        seconds_per_frame: sampling interval in seconds (default 1).

    Returns:
        A flat list of base64-encoded JPEG strings, in video order.
    """
    all_base64_frames = []
    for video_path in video_paths:
        video = cv2.VideoCapture(video_path)
        try:
            total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = video.get(cv2.CAP_PROP_FPS)
            # BUG FIX: if FPS reads back as 0 (corrupt/unsupported file),
            # a skip of 0 would loop forever on the same frame.
            frames_to_skip = max(1, int(fps * seconds_per_frame))
            curr_frame = 0
            # BUG FIX: the original `< total_frames - 1` drops the last frame
            # and returns nothing at all for a single-frame video.
            while curr_frame < total_frames:
                video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
                success, frame = video.read()
                if not success:
                    break
                # Check the encode success flag instead of discarding it.
                ok, buffer = cv2.imencode(".jpg", frame)
                if ok:
                    all_base64_frames.append(
                        base64.b64encode(buffer).decode("utf-8")
                    )
                curr_frame += frames_to_skip
        finally:
            # Release the capture even if decoding/encoding raises.
            video.release()
    return all_base64_frames
def get_markdown_content(client, base64Frames, question, model, prompt):
    """Send video frames to the model and return its Markdown answer.

    Args:
        client: an OpenAI client instance.
        base64Frames: list of base64-encoded JPEG frames.
        question: the user's question about the video(s).
        model: model name, e.g. "gpt-4o".
        prompt: system prompt text.

    Returns:
        (markdown_content, prompt_tokens): the model's reply text and the
        number of prompt tokens the request consumed.
    """
    messages = [
        {
            "role": "system",
            "content": prompt,
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "These are frames from multiple videos. Each frame has a timestamp in the top right corner."},
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            # BUG FIX: the correct MIME type is image/jpeg, not
                            # image/jpg. With an unrecognized MIME type the API
                            # can process the image at full/high detail instead
                            # of honoring "detail": "low" (85 tokens/image) —
                            # the likely cause of the ~50k-token request.
                            "url": f"data:image/jpeg;base64,{x}",
                            "detail": "low",
                        },
                    }
                    for x in base64Frames
                ],
                # BUG FIX: every item of a multi-part "content" list must be a
                # typed part object; a bare string here is invalid.
                {"type": "text", "text": f"Analyze the scenes in all videos and answer the question: {question}. "},
            ],
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    markdown_content = response.choices[0].message.content
    prompt_tokens = response.usage.prompt_tokens
    return markdown_content, prompt_tokens
if __name__ == "__main__":
    from openai import OpenAI

    # Script entry point: sample frames from one local video and ask the
    # model for a Markdown summary.
    video_files = ["/Users/anvarganiev/Downloads/test_video_1.mp4"]
    user_question = "describe the video"
    system_prompt = "You are generating a video summary. The video frames contain a timestamp in the top right corner. Please extract this timestamp from the frames and include it in your description. Use the timestamp to structure your answer to the given question. Respond in Markdown format."

    # NOTE(review): credentials are hard-coded placeholders here; in real use
    # load them from the environment instead of the source file.
    settings = {
        "OPENAI_TOKEN": "***",
        "OPENAI_ORG": "org-***",
        "OPENAI_PROJECT": "***",
        "OPENAI_MODEL": "gpt-4o",
    }

    api_client = OpenAI(
        api_key=settings["OPENAI_TOKEN"],
        organization=settings["OPENAI_ORG"],
        project=settings["OPENAI_PROJECT"],
    )

    frames = process_videos(video_files)
    summary, used_tokens = get_markdown_content(
        api_client, frames, user_question, settings["OPENAI_MODEL"], system_prompt
    )
    print("Markdown Content:")
    print(summary)
    print(f"Prompt Tokens Used: {used_tokens}")
I use the `detail: low` parameter, and as far as I understand from the documentation:
- Regardless of input size, low detail images are a fixed cost.
And that cost is 85 tokens per image.
When I send an 11-second video there is no problem (the video shape is 848 × 624). The output is good, and it reports “Prompt Tokens Used: 1062”.
But when I send a 62-second video (shape 1280 × 720) it throws a token rate-limit error: Limit 30000, Requested 49838.
I cannot understand why. Even if there are 6–7 times more frames, it should be at most ~10k tokens, not 50k. Please explain what I am doing wrong.