Trying to call the Azure OpenAI GPT-4 model via the SDK. In the prompt we are sending the image as base64-encoded text and expecting key-value pairs extracted from it as text.
It is returning this response: "I'm sorry, but I'm unable to directly extract or interpret content from images or process images to extract text (Optical Character Recognition, OCR). However, I can assist with processing and interpreting text or structured data if you have any other requests or need assistance with a different type of query."
This is the code I am using:
public async Task<string> ExtractTextFromImageAsync(string base64String, double timeout)
{
_logger.LogInformation("Starting ExtractTextFromImageAsync.");
string deployment = _config.GetValue<string>(OpenAIServiceConstants.CompletionDeployment);
if (string.IsNullOrEmpty(deployment))
{
_logger.LogError("Configuration value for {CompletionDeployment} is missing", OpenAIServiceConstants.CompletionDeployment);
throw new InvalidOperationException($"Configuration value for {OpenAIServiceConstants.CompletionDeployment} is missing");
}
// Set a timeout for the operation
using var ct = new CancellationTokenSource(TimeSpan.FromSeconds(timeout));
// Create the chat client
ChatClient chatClient = _openAIClient.GetChatClient(deployment);
// Define the JSON content for the request
string prompt = @"{
""enhancements"": {
""ocr"": { ""enabled"": true },
""grounding"": { ""enabled"": true }
},
""messages"": [
{
""role"": ""system"",
""content"": [
{
""type"": ""text"",
""text"": ""You are an AI assistant that extracts data from documents and returns them as structured JSON objects.""
}
]
},
{
""role"": ""user"",
""content"": [
{
""type"": ""image_url"",
""image_url"": {
""url"": ""data:image/jpeg;base64,base64String""
}
},
{
""type"": ""text"",
""text"": ""Extract JSON data from the given image""
}
]
}
],
""temperature"": 0.7,
""top_p"": 0.95,
""max_tokens"": 4096
}";
// Convert the string into a JObject
JObject obj = JObject.Parse(prompt);
// Create the ChatMessage objects for the request
var chatMessages = new List<ChatMessage>();
// Parse system and user messages from the JObject (you can extend this to handle more dynamic message creation)
if (obj["messages"] is JArray messagesArray)
{
foreach (var message in messagesArray)
{
var role = message["role"]?.ToString();
var content = message["content"]?.ToString();
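// Note: ToString() on a JSON array returns its raw JSON text, so the user message below is
// sent as plain text that merely contains the data URL, not as an image content part; this is
// the likely cause of the "unable to process images" reply.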
if (role == "system" && content != null)
{
chatMessages.Add(new SystemChatMessage(content));
}
else if (role == "user" && content != null)
{
chatMessages.Add(new UserChatMessage(content));
}
}
}
// Set the ChatCompletionOptions
var options = new ChatCompletionOptions
{
Temperature = obj["temperature"]?.ToObject<float>() ?? 0.7f,
MaxOutputTokenCount = obj["max_tokens"]?.ToObject<int>() ?? 800,
TopP = obj["top_p"]?.ToObject<float>() ?? 0.95f
};
// Perform the request to the Azure OpenAI service
var responseBuilder = new StringBuilder();
_logger.LogInformation("Initiating OpenAI image text extraction.");
await ExecuteWithRetryAsync(async () =>
{
// Clear any partial output captured by a previous failed attempt before retrying.
responseBuilder.Clear();
await foreach (var completionUpdate in chatClient.CompleteChatStreamingAsync(chatMessages, options, ct.Token))
{
foreach (var content in completionUpdate.ContentUpdate)
{
responseBuilder.Append(content.Text);
}
}
return responseBuilder.ToString();
}, _logger);
_logger.LogInformation("Completed text extraction from image.");
return responseBuilder.ToString();
}
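For comparison, here is a minimal sketch of the same request built with the SDK's typed content parts instead of a hand-written JSON string, which is how image input normally reaches the model. It reuses chatClient, ct, and base64String from the method above and makes a non-streaming call for brevity; it assumes the OpenAI .NET SDK 2.x surface (ChatMessageContentPart.CreateImagePart / CreateTextPart) that the code above already compiles against, plus a vision-capable deployment such as gpt-4o:

// Build the messages with typed content parts so the request actually carries the image.
var messages = new List<ChatMessage>
{
    new SystemChatMessage(
        "You are an AI assistant that extracts data from documents and returns them as structured JSON objects."),
    new UserChatMessage(
        ChatMessageContentPart.CreateImagePart(
            BinaryData.FromBytes(Convert.FromBase64String(base64String)), "image/jpeg"),
        ChatMessageContentPart.CreateTextPart("Extract JSON data from the given image"))
};

var options = new ChatCompletionOptions
{
    Temperature = 0.7f,
    TopP = 0.95f,
    MaxOutputTokenCount = 4096
};

// Non-streaming call for brevity; the streaming loop above works the same way with these messages.
ChatCompletion completion = await chatClient.CompleteChatAsync(messages, options, ct.Token);
return completion.Content[0].Text;

If the hand-built JSON has to stay, the alternative is to walk the content array and call ChatMessageContentPart.CreateImagePart for the "image_url" entries instead of flattening everything through ToString().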