Same here - I made a script to automatically exclude all JSONL elements that do not pass the Moderation API call, but it still fails with the very uninformative message "Too many files were skipped due to moderation". Very frustrating.
import os
import requests
import time
from openai import OpenAI
import json
# The API key is read from the OPENAI_API_KEY environment variable so the
# secret never has to be written into the script itself. Failing here gives a
# clear message instead of a confusing authentication error later on.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this script."
    )

# Shared client used by the moderation and fine-tuning helpers below.
client = OpenAI(api_key=api_key)
def check_image_moderation(image_url):
    """Run the moderation endpoint on one image and report whether it is flagged.

    Args:
        image_url: URL of the image to moderate.

    Returns:
        The ``flagged`` value of the first moderation result. ``True`` is
        also returned when the request fails for any reason, so that an
        image that could not be checked is treated as flagged.
    """
    try:
        response = client.moderations.create(
            model="omni-moderation-latest",
            # A bare string input is moderated as text (i.e. the characters of
            # the URL). Wrapping it in an ``image_url`` part asks the API to
            # inspect the image itself.
            input=[{"type": "image_url", "image_url": {"url": image_url}}],
        )
        moderation_result = response.results[0]

        # Collect the names of the categories whose flag is set.
        flagged_categories = [
            category
            for category, is_flagged in vars(moderation_result.categories).items()
            if is_flagged
        ]

        # Print result in a simplified format.
        if not flagged_categories:
            print(f"Moderation result for {image_url}: all false")
        else:
            print(f"Moderation result for {image_url}: flagged categories - {', '.join(flagged_categories)}")

        return moderation_result.flagged
    except Exception as e:
        print(f"Error during moderation request for {image_url}: {e}")
        return True  # If there is an error, assume it's flagged
def filter_images(jsonl_path):
    """Return the JSONL entries whose image is not flagged by moderation.

    Each non-blank line of the file is parsed as JSON and the image URL is
    read from ``messages[0].content[0].image_url.url``. Lines that are not
    valid JSON, or that do not have that structure, are reported and skipped
    instead of aborting the whole run.

    Args:
        jsonl_path: Path of the JSONL file to read.

    Returns:
        A list of the parsed entries for which ``check_image_moderation``
        returned a falsy value, in file order.
    """
    filtered_images = []
    with open(jsonl_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # Blank lines (e.g. a trailing newline at end of file) carry no entry.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                # Extract the image URL
                image_url = data["messages"][0]["content"][0]["image_url"]["url"]
            except json.JSONDecodeError as e:
                print(f"Skipping line {line_number} due to invalid JSON: {e}")
                continue
            except KeyError as e:
                print(f"Skipping entry due to missing key: {e}")
                continue
            except (IndexError, TypeError) as e:
                # Empty ``messages``/``content`` lists, or ``content`` given
                # as a plain string, end up here.
                print(f"Skipping line {line_number} due to unexpected structure: {e}")
                continue

            # Use the moderation API to check the image
            if not check_image_moderation(image_url):
                filtered_images.append(data)
    return filtered_images
def save_filtered_jsonl(data, output_path):
    """Write ``data`` to ``output_path`` as JSONL, one JSON object per line.

    Args:
        data: Iterable of JSON-serialisable entries.
        output_path: Destination file; it is created or overwritten.
    """
    with open(output_path, "w", encoding="utf-8") as file:
        for entry in data:
            file.write(json.dumps(entry) + "\n")
def upload_file(file_path, api_key, timeout=300):
    """Upload a JSONL file to the OpenAI Files endpoint for fine-tuning.

    Args:
        file_path: Path of the file to upload.
        api_key: API key sent as a Bearer token.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status.
    """
    headers = {
        "Authorization": f"Bearer {api_key}"
    }
    filename = os.path.basename(file_path)
    mime_type = "application/jsonl"
    with open(file_path, 'rb') as file:
        # The file must still be open while the request is sent, so the POST
        # happens inside the ``with`` block.
        files = {
            'file': (filename, file, mime_type),
            'purpose': (None, 'fine-tune')
        }
        response = requests.post(
            "https://api.openai.com/v1/files",
            headers=headers,
            files=files,
            timeout=timeout,
        )
    response.raise_for_status()
    return response.json()
def create_fine_tuning_job(training_file_id, validation_file_id, model, n_epochs=2):
    """Create a fine-tuning job from previously uploaded files.

    Args:
        training_file_id: ID of the uploaded training file.
        validation_file_id: ID of the uploaded validation file.
        model: Name of the base model to fine-tune.
        n_epochs: Value passed as the ``n_epochs`` hyperparameter.

    Returns:
        The job object returned by ``client.fine_tuning.jobs.create``.
    """
    response = client.fine_tuning.jobs.create(
        training_file=training_file_id,
        validation_file=validation_file_id,
        model=model,
        hyperparameters={
            "n_epochs": n_epochs,  # Adjust as needed
        }
    )
    return response
def check_fine_tuning_status(job_id, poll_interval=60):
    """Poll a fine-tuning job until it reaches a terminal status.

    Args:
        job_id: ID of the fine-tuning job to watch.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The job object from the last retrieval, i.e. the one carrying the
        terminal status.
    """
    # "cancelled" is terminal too; leaving it out would make a cancelled job
    # poll forever.
    terminal_statuses = ("succeeded", "failed", "cancelled")
    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        print(f"Job status: {job.status}")
        if job.status in terminal_statuses:
            return job
        time.sleep(poll_interval)
def main():
    """Filter both datasets through moderation, upload them and fine-tune.

    Steps, in order: filter and save the training and validation JSONL files,
    upload the two filtered files, create the fine-tuning job, then poll it
    until it finishes. An HTTP error during either upload stops the run; any
    error while creating or monitoring the job is printed.
    """
    jsonl_train_path = "output_data_train.jsonl"
    jsonl_val_path = "output_data_val.jsonl"
    filtered_train_path = "filtered_output_data_train.jsonl"
    filtered_val_path = "filtered_output_data_val.jsonl"
    model = "gpt-4o-2024-08-06"  # Your specific model name

    # Filter training images
    print("Filtering training images...")
    filtered_train_data = filter_images(jsonl_train_path)
    if not filtered_train_data:
        # Uploading an empty training file cannot produce a usable job.
        print("No training examples passed moderation; nothing to upload.")
        return
    save_filtered_jsonl(filtered_train_data, filtered_train_path)
    print(f"Saved filtered training JSONL to {filtered_train_path}")

    # Filter validation images
    print("Filtering validation images...")
    filtered_val_data = filter_images(jsonl_val_path)
    save_filtered_jsonl(filtered_val_data, filtered_val_path)
    print(f"Saved filtered validation JSONL to {filtered_val_path}")

    # Upload filtered training JSONL file
    print("Uploading filtered training JSONL file...")
    try:
        result_train = upload_file(filtered_train_path, api_key)
        training_file_id = result_train['id']
        print(f"Uploaded filtered training JSONL file. File ID: {training_file_id}")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error during training JSONL file upload: {e}")
        return

    # Upload filtered validation JSONL file
    print("Uploading filtered validation JSONL file...")
    try:
        result_val = upload_file(filtered_val_path, api_key)
        validation_file_id = result_val['id']
        print(f"Uploaded filtered validation JSONL file. File ID: {validation_file_id}")
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error during validation JSONL file upload: {e}")
        return

    # Create fine-tuning job
    print("Creating fine-tuning job...")
    try:
        job = create_fine_tuning_job(training_file_id, validation_file_id, model)
        job_id = job.id  # The SDK returns an object, so use attribute access
        print(f"Fine-tuning job created. Job ID: {job_id}")

        # Monitor fine-tuning job
        print("Monitoring fine-tuning job status...")
        check_fine_tuning_status(job_id)
    except Exception as e:
        print(f"Error while creating fine-tuning job: {e}")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()