Is image captioning currently working?

I saw a post from several weeks ago that said it is down across the board, if so is there an eta?

When I submit an image the api says it cant read images.

I have tried with both 4o and 4o-turbo models.

private void OpenFileButton_Click(object sender, EventArgs e)
{
	using OpenFileDialog openFileDialog = new OpenFileDialog
	{
		Title = "Select Images Files",
		Filter = "Image Files|*.jpg;*.jpeg;*.png;*.gif;*.bmp",
		Multiselect = true
	};

	if (openFileDialog.ShowDialog() == DialogResult.OK)
	{
		selectedImagePaths = new List<string>(openFileDialog.FileNames);
		UpdateSubmitButtonState();
	}            
}

private void PromptTextBox_TextChanged(object sender, EventArgs e)
{
	UpdateSubmitButtonState();
}

private void UpdateSubmitButtonState()
{
	submitButton.Enabled = !string.IsNullOrWhiteSpace(promptTextBox.Text) && selectedImagePaths.Count > 0;
}

private async void SubmitButton_Click(object sender, EventArgs e)
{
	submitButton.Enabled = false;
	progressBar.Minimum = 0;
	progressBar.Maximum = selectedImagePaths.Count;
	progressBar.Value = 0;

	for (int i = 0; i < selectedImagePaths.Count; i++)
	{
		string imagePath = selectedImagePaths[i];
		byte[] imageBytes = File.ReadAllBytes(imagePath);
		string base64Image = Convert.ToBase64String(imageBytes);

		var requestBody = new
		{
			model = "gpt-4-turbo",
			messages = new[]
			{
		new
		{
			role = "user",
			content = promptTextBox.Text,
			image = base64Image // Use the base64 image directly
		}
	},
			max_tokens = 512
		};

		var content = new StringContent(JsonSerializer.Serialize(requestBody), Encoding.UTF8, "application/json");

		try
		{
			var response = await httpClient.PostAsync("https://api.openai.com/v1/chat/completions", content);
			response.EnsureSuccessStatusCode(); // Throws an exception if the status code is not successful

			var responseBody = await response.Content.ReadAsStringAsync();
			var jsonResponse = JsonSerializer.Deserialize<JsonElement>(responseBody);

			if (jsonResponse.TryGetProperty("choices", out JsonElement choices) &&
				choices[0].TryGetProperty("message", out JsonElement message) &&
				message.TryGetProperty("content", out JsonElement contentElement))
			{
				string caption = contentElement.GetString();
				string textFilePath = Path.ChangeExtension(imagePath, ".txt");
				File.WriteAllText(textFilePath, caption);
			}
			else
			{
				MessageBox.Show("Unexpected response format.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
			}
		}

		progressBar.Value = i + 1;
	}

	MessageBox.Show("All captions have been generated and saved.", "Process Complete", MessageBoxButtons.OK, MessageBoxIcon.Information);
	submitButton.Enabled = true;
}

This may be a denial because the AI doesn’t understand it DOES have the ability. You can also send with that format of messages for images that will extend into the future.

Here’s the updated API message format presently used by OpenAI:

Create message list (showing an equivalent Python data object and variables, not the JSON that must be sent):

[
    {
        "role": "system",
        "content": [
            {"type": "text", "text": system_prompt}
        ]
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this art style."},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded_string}"}
            }
        ]
    }
]

Despite the field being “image_url”, it can accept base64 images in the format shown above.

Note: multiple images are allowed, and “text” and “image_url” can even alternate, although this application may not require that.


Here’s a refinement to send in this message format, that also prepares you for the capabilities available, so that you might send multiple images per message, and label each with some further looping (AI produced):


To refine the provided C# code and update the message format as per the new API structure, we’ll modify the function that generates the request body. The new structure will include a system message with a specific prompt and user messages containing both text and base64-encoded images.

Here’s the straightforward C# code that follows the specified message format:

Updated C# Code

private async void SubmitButton_Click(object sender, EventArgs e)
{
	submitButton.Enabled = false;
	progressBar.Minimum = 0;
	progressBar.Maximum = selectedImagePaths.Count;
	progressBar.Value = 0;

	// System message to be sent before the user message
	var systemPrompt = "You are ChatGPT. Built-in GPT-4 computer vision: Enabled.";

	for (int i = 0; i < selectedImagePaths.Count; i++)
	{
		string imagePath = selectedImagePaths[i];
		byte[] imageBytes = File.ReadAllBytes(imagePath);
		string base64Image = Convert.ToBase64String(imageBytes);

		// Construct the messages as per the updated API format
		var requestBody = new
		{
			model = "gpt-4-turbo",
			messages = new[]
			{
				new
				{
					role = "system",
					content = new[]
					{
						new { type = "text", text = systemPrompt }
					}
				},
				new
				{
					role = "user",
					content = new object[]
					{
						new { type = "text", text = "Describe this art style." },
						new { type = "image_url", image_url = new { url = $"data:image/jpeg;base64,{base64Image}" } }
					}
				}
			},
			max_tokens = 512
		};

		var content = new StringContent(JsonSerializer.Serialize(requestBody), Encoding.UTF8, "application/json");

		try
		{
			var response = await httpClient.PostAsync("https://api.openai.com/v1/chat/completions", content);
			response.EnsureSuccessStatusCode(); // Throws an exception if the status code is not successful

			var responseBody = await response.Content.ReadAsStringAsync();
			var jsonResponse = JsonSerializer.Deserialize<JsonElement>(responseBody);

			if (jsonResponse.TryGetProperty("choices", out JsonElement choices) &&
				choices[0].TryGetProperty("message", out JsonElement message) &&
				message.TryGetProperty("content", out JsonElement contentElement))
			{
				string caption = contentElement.GetString();
				string textFilePath = Path.ChangeExtension(imagePath, ".txt");
				File.WriteAllText(textFilePath, caption);
			}
			else
			{
				MessageBox.Show("Unexpected response format.", "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
			}
		}

		progressBar.Value = i + 1;
	}

	MessageBox.Show("All captions have been generated and saved.", "Process Complete", MessageBoxButtons.OK, MessageBoxIcon.Information);
	submitButton.Enabled = true;
}

(warning: AI can write garbage code, I have not looked at all)

Key Changes:

  1. System message: The system prompt "You are ChatGPT. Built-in GPT-4 computer vision: Enabled." is now included as the first message.
  2. User message: The user message now follows the format specified for newest vision models, containing both:
    • A text message (“Describe this art style.”)
    • An image_url with the base64-encoded image in the URL format: data:image/jpeg;base64,{base64Image}.
  3. The code logic loops through the selected images, sending them as base64 in the new image_url format while maintaining the existing functionality.

This code ensures that the image and prompt are sent correctly to the API, and the system message is included at the start of each request (so the AI doesn’t decide to deny its vision capability).

1 Like