I am implementing a basic talking bot using the Realtime API in Unity 2022.3.33.
I can send input audio in PCM16 format and I successfully get an output, which I can also play back to verify it is correct. But as soon as I get the response.done event, the WebSocket gives me an error stating:
“Error committing input audio buffer: buffer too small. Expected at least 100ms of audio, but buffer only has 0.00ms of audio.”
I have verified that I am only sending one input audio clip, from which I get the correct output, and I have no idea where this error is coming from. It does not seem to be caused by any of my client events, because it does not return an event ID.
The order in which I am sending my client events is:
- session.update (a rough sketch of this payload is included below)
- input_audio_buffer.append
- conversation.item.create
- input_audio_buffer.commit
The docs say that conversation.item.create should be sent before input_audio_buffer.append, but with that order I was not getting any audio in the response, whereas the order above works. In both cases I get the error mentioned in this topic.
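For context, the session.update I send after session.created (from InitializeSession) is roughly of the shape below. This is a simplified sketch rather than my exact payload; the instructions, voice, and audio format values here are placeholders:
// Sketch only: field values are placeholders, not my real configuration.
var sessionUpdate = new
{
    type = "session.update",
    session = new
    {
        modalities = new[] { "text", "audio" },
        instructions = "Some prompt",
        voice = "alloy",
        input_audio_format = "pcm16",
        output_audio_format = "pcm16"
    }
};
string sessionJson = JsonConvert.SerializeObject(sessionUpdate);
await webSocket.SendAsync(
    new ArraySegment<byte>(Encoding.UTF8.GetBytes(sessionJson)),
    WebSocketMessageType.Text,
    true,
    cancellationTokenSource.Token
);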
I would really appreciate your help.
Here are the functions that I am using for the requests and responses.
PS: please ignore the messy code; I have been trying literally everything to fix this.
private async Task GetLLMResponse(AudioClip audioClip)
{
try
{
Debug.Log("Initializing LLM response...");
const string MODEL = "Model link";
// Create WebSocket connection
webSocket = new ClientWebSocket();
cancellationTokenSource = new CancellationTokenSource();
// Set headers
webSocket.Options.SetRequestHeader("Authorization", $"Bearer {openAIKey}");
webSocket.Options.SetRequestHeader("Content-Type", "application/json");
webSocket.Options.SetRequestHeader("openai-beta", "realtime=v1");
Uri uri = new Uri($"wss://api.openai.com/v1/realtime?model={MODEL}");
Debug.Log($"Connecting to WebSocket at: {uri}");
Debug.Log($"WebSocket state before connection: {webSocket.State}");
await webSocket.ConnectAsync(uri, cancellationTokenSource.Token);
Debug.Log("WebSocket connected successfully");
// Start receiving messages
_ = ReceiveWebSocketMessages();
// Convert audio data to PCM16 format
float[] audioData = new float[audioClip.samples];
audioClip.GetData(audioData, 0);
Debug.Log($"Audio frequency: {audioClip.frequency} Hz");
Debug.Log($"Audio Data Length (samples): {audioData.Length}");
// Ensure the audio clip is at least 100ms long (the API rejects commits shorter than that)
// Derive the threshold from the clip's actual sample rate instead of hard-coding 1600 samples (which assumes 16kHz)
int minSamples = audioClip.frequency / 10; // 100ms of samples at the clip's sample rate
if (audioClip.samples < minSamples)
{
Debug.LogError("Audio clip is too short (less than 100ms).");
ShowError("Audio clip is too short.");
return;
}
// Convert to 16-bit little-endian PCM (the Realtime API's pcm16 format is 16-bit mono little-endian; per the docs it expects 24kHz audio)
byte[] pcmData = new byte[audioData.Length * 2];
for (int i = 0; i < audioData.Length; i++)
{
// Clamp to [-1, 1] so loud samples don't overflow the short range
short value = (short)(Mathf.Clamp(audioData[i], -1f, 1f) * 32767f);
pcmData[i * 2] = (byte)(value & 0xFF);
pcmData[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
}
Debug.Log($"Converted audio data: {audioData.Length} samples, {pcmData.Length} bytes");
// First, send audio buffer append
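// Note: Newtonsoft.Json serializes a byte[] as a base64 string by default, which is the encoding the "audio" field of input_audio_buffer.append expects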
var appendRequest = new
{
type = "input_audio_buffer.append",
audio = pcmData
};
string appendJson = JsonConvert.SerializeObject(appendRequest);
Debug.Log($"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]: Sending audio buffer append request (length: {appendJson.Length})");
var appendBytes = Encoding.UTF8.GetBytes(appendJson);
Debug.Log($"WebSocket state before sending: {webSocket.State}");
Debug.Log($"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]: Sending message: {Encoding.UTF8.GetString(appendBytes)}");
await webSocket.SendAsync(
new ArraySegment<byte>(appendBytes),
WebSocketMessageType.Text,
true,
cancellationTokenSource.Token
);
await Task.Delay(100);
// Send system message
var systemMessage = new
{
type = "conversation.item.create",
item = new
{
type = "message",
role = "system",
content = new[]
{
new { type = "input_text", text = "Some prompt" }
}
}
};
string systemJson = JsonConvert.SerializeObject(systemMessage);
Debug.Log($"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]: Sending system message: {systemJson}");
var systemBytes = Encoding.UTF8.GetBytes(systemJson);
await webSocket.SendAsync(
new ArraySegment<byte>(systemBytes),
WebSocketMessageType.Text,
true,
cancellationTokenSource.Token
);
await Task.Delay(100);
// Send audio buffer commit
var commitRequest = new
{
type = "input_audio_buffer.commit"
};
string commitJson = JsonConvert.SerializeObject(commitRequest);
Debug.Log($"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]: Sending audio buffer commit request: {commitJson}");
Debug.Log($"[Audio Commit] Sending commit request at: {DateTime.UtcNow}");
var commitBytes = Encoding.UTF8.GetBytes(commitJson);
Debug.Log($"WebSocket state before sending: {webSocket.State}");
Debug.Log($"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]: Sending message: {Encoding.UTF8.GetString(commitBytes)}");
await webSocket.SendAsync(
new ArraySegment<byte>(commitBytes),
WebSocketMessageType.Text,
true,
cancellationTokenSource.Token
);
Debug.Log("All messages sent successfully");
// Wait for completion or timeout
// The response.done handler in ProcessWebSocketMessage sets isProcessing to false, which ends this wait
var timeoutTask = Task.Delay(30000); // 30 second timeout
while (isProcessing && !timeoutTask.IsCompleted)
{
if (webSocket.State != WebSocketState.Open)
{
Debug.LogError("WebSocket connection closed unexpectedly");
break;
}
await Task.Delay(100);
}
if (timeoutTask.IsCompleted && isProcessing)
{
Debug.LogError("Response timed out");
ShowError("Response timeout");
}
}
catch (Exception e)
{
Debug.LogError($"Error in GetLLMResponse: {e.Message}\nStack trace: {e.StackTrace}");
ShowError("Failed to get response");
}
finally
{
try
{
if (webSocket != null && webSocket.State == WebSocketState.Open)
{
await webSocket.CloseAsync(WebSocketCloseStatus.NormalClosure,
"Completing request",
CancellationToken.None);
}
}
catch (Exception e)
{
Debug.LogError($"Error closing WebSocket: {e.Message}");
}
webSocket?.Dispose();
webSocket = null;
if (cancellationTokenSource != null && !cancellationTokenSource.IsCancellationRequested)
{
cancellationTokenSource.Cancel();
}
}
}
private async Task ReceiveWebSocketMessages()
{
Debug.Log("Starting WebSocket message receiver...");
Debug.Log($"WebSocket initial state: {webSocket.State}");
var buffer = new byte[1024 * 4];
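// Messages larger than this buffer arrive in fragments; ProcessWebSocketMessage accumulates the partial JSON in bufferedData until it parses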
try
{
while (webSocket.State == WebSocketState.Open && !cancellationTokenSource.Token.IsCancellationRequested)
{
Debug.Log($"WebSocket State: {webSocket.State}, Cancellation Requested: {cancellationTokenSource.Token.IsCancellationRequested}");
var result = await webSocket.ReceiveAsync(new ArraySegment<byte>(buffer), cancellationTokenSource.Token);
if (result.MessageType == WebSocketMessageType.Close)
{
Debug.Log("Received WebSocket close message");
await webSocket.CloseAsync(
WebSocketCloseStatus.NormalClosure,
string.Empty,
CancellationToken.None
);
break;
}
var message = Encoding.UTF8.GetString(buffer, 0, result.Count);
Debug.Log($"Received message ({result.Count} bytes): {message}");
await ProcessWebSocketMessage(message);
}
}
catch (Exception e)
{
Debug.LogError($"WebSocket receive error: {e.Message}\nStack trace: {e.StackTrace}");
ShowError("Connection error occurred");
}
Debug.Log("WebSocket message receiver stopped");
}
private async Task ProcessWebSocketMessage(string message)
{
Debug.Log("Processing WebSocket message...");
try
{
// Append the incoming message to the global buffer
bufferedData += message;
JObject jsonObject = null;
try
{
// Attempt to parse the accumulated buffered data
jsonObject = JObject.Parse(bufferedData);
Debug.Log($"Successfully parsed JSON object: {jsonObject}");
}
catch (JsonReaderException ex)
{
// If parsing fails, the message is likely incomplete or malformed, so continue buffering
Debug.Log($"Failed to parse JSON: {ex.Message}");
Debug.Log($"Buffered data which failed: {bufferedData}");
return; // Exit early to wait for more data
}
if (jsonObject == null || !jsonObject.ContainsKey("type"))
{
Debug.LogWarning($"JSON object missing 'type' field: {jsonObject}");
return; // Exit early if the required 'type' field is missing
}
// Extract the message type
string messageType = jsonObject["type"]?.ToString();
Debug.Log($"Processing message type: {messageType}");
// Handle the message after successful parsing
string eventId = jsonObject["event_id"]?.ToString();
switch (messageType)
{
case "session.created":
Debug.Log("Session created, initializing...");
await InitializeSession();
break;
case "session.updated":
Debug.Log("Session updated successfully");
break;
case "conversation.item.created":
var item = jsonObject["item"] as JObject;
if (item != null)
{
string itemType = item["type"]?.ToString();
string status = item["status"]?.ToString();
Debug.Log($"Item type: {itemType}, status: {status}");
}
break;
case "input_audio_buffer.speech_started":
Debug.Log("Speech detected in input audio buffer");
break;
case "input_audio_buffer.speech_stopped":
Debug.Log("Speech stopped in input audio buffer");
break;
case "input_audio_buffer.committed":
Debug.Log("Input audio buffer committed successfully");
break;
case "response.audio_transcript.delta":
var transcriptDelta = jsonObject["delta"]?.ToString();
if (!string.IsNullOrEmpty(transcriptDelta))
{
UpdateResponseText(transcriptDelta);
}
break;
case "response.audio.delta":
var audioData = jsonObject["delta"]?.ToString();
if (!string.IsNullOrEmpty(audioData))
{
Debug.Log($"Received audio delta, length: {audioData.Length}");
// Store all chunks under the "current" key
audioChunkManager.AddChunk("current", audioData); // Use "current" to accumulate all chunks
}
break;
case "response.done":
Debug.Log("Response processing completed");
// Get the complete audio data from the chunks
string completeAudio = audioChunkManager.GetCompleteAudio("current");
if (string.IsNullOrEmpty(completeAudio))
{
Debug.LogError("No audio data found after response.done");
bufferedData = string.Empty;
return;
}
Debug.Log($"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]: Complete audio length: {completeAudio.Length}");
// Play the audio from the base64-encoded string
await PlayAudioFromBase64(completeAudio);
// Clear the audio buffer after playback to prepare for the next audio stream
audioChunkManager.Clear();
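// Also ask the server to drop anything left in its input audio buffer before the next turn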
var clearRequest = new
{
type = "input_audio_buffer.clear"
};
string clearJson = JsonConvert.SerializeObject(clearRequest);
Debug.Log($"Sending input_audio_buffer.clear request: {clearJson}");
var clearBytes = Encoding.UTF8.GetBytes(clearJson);
await webSocket.SendAsync(
new ArraySegment<byte>(clearBytes),
WebSocketMessageType.Text,
true,
cancellationTokenSource.Token
);
isProcessing = false;
UpdateUI(false);
break;
case "error":
var errorObj = jsonObject["error"] as JObject;
var errorMessage = errorObj?["message"]?.ToString() ?? "Unknown error occurred";
Debug.LogError($"[{DateTime.Now:yyyy-MM-dd HH:mm:ss.fff}]: API Error: {errorMessage}");
ShowError($"API Error: {errorMessage}");
break;
default:
Debug.Log($"Unhandled message type: {messageType}");
break;
}
// After processing the JSON successfully, reset the buffer for the next message
bufferedData = string.Empty;
}
catch (Exception e)
{
Debug.LogError($"Error processing message: {e.Message}\nStack trace: {e.StackTrace}");
jsonParser.Clear(); // Clear parser state on error
bufferedData = string.Empty; // Reset the accumulation buffer too so a failed message doesn't corrupt the next one
string truncatedMessage = message;
if (message != null && message.Length > 1000)
{
truncatedMessage = message.Substring(0, 1000) + "...";
}
Debug.LogError($"Problematic message: {truncatedMessage}");
}
}