Image Upload / Recognition via Bing Chat

In the absence of a CLIP API, I decided to generate captions for images using Salesforce’s BLIP (Bootstrapping Language-Image Pre-training) model. It’s not perfect, but it works, and it only uses 32 MB of RAM.

Usage

The script is intended to be run from the command line. The images should be placed in an ‘images’ directory located in the same directory as the script. The script will process all images in the ‘images’ directory and save the results to a file in the same directory as the script.

Output

The script generates a CSV file named ‘output.csv’ with the following columns:

  • Filename
  • Type (PNG, JPEG, JPG, GIF)
  • Number of Frames (for GIFs; still images are reported as 1)
  • Height in Pixels
  • Width in Pixels
  • File Size in MB
  • Description (Caption generated by the BLIP model)

import os
import csv
from PIL import Image
from fractions import Fraction
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the BLIP processor and captioning model from the same pretrained checkpoint
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

def get_image_dimensions(image_path):
    """Return the ``(width, height)`` in pixels of the image at *image_path*."""
    with Image.open(image_path) as picture:
        # Read both values while the file is still open, return after closing it.
        dimensions = (picture.width, picture.height)
    return dimensions

def get_gif_frame_count(gif_path):
    """Return the number of frames in the image at *gif_path*.

    Pillow exposes the frame count as the ``n_frames`` attribute, but only on
    image plugins that support animation. Images without that attribute are
    single-frame, so the count falls back to 1.
    """
    with Image.open(gif_path) as img:
        # getattr with a default is the pattern Pillow documents for reading
        # the frame count regardless of image format.
        return getattr(img, "n_frames", 1)

def compute_aspect_ratio(width, height):
    """Return the aspect ratio ``width / height`` as a reduced ``Fraction``.

    Raises ``ZeroDivisionError`` when *height* is 0.
    """
    exact_ratio = Fraction(width, height)
    # limit_denominator() with no argument caps the denominator at 1,000,000.
    return exact_ratio.limit_denominator()

def get_file_size_mb(file_path):
    """Return the size of the file at *file_path* in megabytes (1 MB = 1024 * 1024 bytes)."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / bytes_per_mb

def write_csv(csv_file, csv_data):
    """Write the header row followed by *csv_data* to *csv_file*.

    Args:
        csv_file: Path of the CSV file to create; an existing file is overwritten.
        csv_data: Iterable of rows, each with one value per header column.
    """
    # Pin the encoding: filenames and captions may contain non-ASCII characters,
    # and the platform default encoding is not guaranteed to be able to encode them.
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Filename", "Type", "Number of Frames", "Height in Pixels", "Width in Pixels", "File Size in MB", "Description"])
        writer.writerows(csv_data)

def get_image_caption(image_path):
    """Return a caption for the image at *image_path*, generated by the BLIP model.

    The image is converted to RGB, passed through the module-level
    ``processor`` to build PyTorch tensors, and captioned with the
    module-level ``model``. The first generated sequence is decoded with
    special tokens stripped.
    """
    # Open inside a context manager so the underlying file handle is released;
    # convert() returns a new image that remains usable after the file is closed.
    with Image.open(image_path) as img:
        raw_image = img.convert('RGB')
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

def main():
    """Caption every image in the ``images`` directory and write ``output.csv``.

    The ``images`` directory is looked up next to this script. Every entry
    whose name ends in .png, .jpg, .jpeg or .gif (case-insensitive) becomes
    one CSV row: filename, type, frame count, height, width, size in MB and
    the BLIP caption. ``output.csv`` is written next to the script.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    extract_dir = os.path.join(current_dir, 'images')

    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    csv_data = []

    # os.listdir() returns entries in arbitrary order; sort so the rows are
    # written in a reproducible order.
    for filename in sorted(os.listdir(extract_dir)):
        if not filename.lower().endswith(image_extensions):
            continue

        file_path = os.path.join(extract_dir, filename)
        # Report the actual type (PNG, JPEG, JPG, GIF) taken from the extension
        # instead of a constant placeholder. The endswith() check above
        # guarantees the name contains a dot.
        image_type = filename.rsplit('.', 1)[-1].upper()
        width, height = get_image_dimensions(file_path)
        # Only GIFs are scanned for multiple frames; everything else is one frame.
        frames = get_gif_frame_count(file_path) if image_type == 'GIF' else 1
        description = get_image_caption(file_path)
        file_size_mb = get_file_size_mb(file_path)
        csv_data.append([filename, image_type, frames, height, width, file_size_mb, description])

    csv_file = os.path.join(current_dir, 'output.csv')
    write_csv(csv_file, csv_data)

# Run main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
1 Like