Actually, what I'm doing is building a voice RAG: it takes voice input, but the issue is that the transcription doesn't come back with good accuracy.
The Whisper-related code is here:
"use client";
import useWebSocket from "react-use-websocket";
import { useEffect, useRef } from "react";
/**
* @typedef {Object} Parameters
* @property {boolean} [useDirectAoaiApi]
* @property {string} [aoaiEndpointOverride]
* @property {string} [aoaiApiKeyOverride]
* @property {string} [aoaiModelOverride]
* @property {boolean} [enableInputAudioTranscription]
* @property {Function} [onWebSocketOpen]
* @property {Function} [onWebSocketClose]
* @property {Function} [onWebSocketError]
* @property {Function} [onWebSocketMessage]
* @property {Function} [onReceivedResponseAudioDelta]
* @property {Function} [onReceivedInputAudioBufferSpeechStarted]
* @property {Function} [onReceivedResponseDone]
* @property {Function} [onReceivedExtensionMiddleTierToolResponse]
* @property {Function} [onReceivedResponseAudioTranscriptDelta]
* @property {Function} [onReceivedInputAudioTranscriptionCompleted]
* @property {Function} [onReceivedError]
* @property {Function} [onReceivedMicControl]
* @property {Function} [onReceivedResponseTextDelta]
* @property {Function} [onReceivedResponseTextComplete]
* @property {Function} [onReceivedAudioFormat]
* @property {Function} [onReceivedAudioStream]
* @property {Function} [onReceivedAudioComplete]
* @property {Function} [onReceivedSpeakerStatus]
* @property {Function} [onReceivedLanguageStatus]
* @property {boolean} [shouldConnect]
*/
export default function useRealTime({
    useDirectAoaiApi,
    aoaiEndpointOverride,
    aoaiApiKeyOverride,
    aoaiModelOverride,
    enableInputAudioTranscription,
    onWebSocketOpen,
    onWebSocketClose,
    onWebSocketError,
    onWebSocketMessage,
    onReceivedResponseDone,
    onReceivedResponseAudioDelta,
    onReceivedResponseAudioTranscriptDelta,
    onReceivedInputAudioBufferSpeechStarted,
    onReceivedExtensionMiddleTierToolResponse,
    onReceivedInputAudioTranscriptionCompleted,
    onReceivedError,
    onReceivedMicControl,
    onReceivedResponseTextDelta,
    onReceivedResponseTextComplete,
    onReceivedAudioFormat,
    onReceivedAudioStream,
    onReceivedAudioComplete,
    onReceivedSpeakerStatus,
    onReceivedLanguageStatus,
    shouldConnect = false
}) {
    const wsEndpoint = useDirectAoaiApi
        ? `${aoaiEndpointOverride}/openai/realtime?api-key=${aoaiApiKeyOverride}&deployment=${aoaiModelOverride}&api-version=2024-10-01-preview`
        : `${process.env.NEXT_PUBLIC_WS_URL}/realtime`;

    // readyState and lastJsonMessage were destructured but never used, so only
    // sendJsonMessage is kept here.
    const { sendJsonMessage } = useWebSocket(
        wsEndpoint,
        {
            onOpen: () => onWebSocketOpen?.(),
            onClose: () => onWebSocketClose?.(),
            onError: event => onWebSocketError?.(event),
            onMessage: event => {
                onMessageReceived(event);
                onWebSocketMessage?.(event);
            },
            shouldReconnect: () => shouldConnect,
            reconnectAttempts: 10,
            reconnectInterval: 3000
        },
        shouldConnect
    );

    // Simplified safeSendJsonMessage that doesn't rely on ReadyState.
    const safeSendJsonMessage = (message) => {
        try {
            sendJsonMessage(message);
        } catch (error) {
            console.error("Error sending message:", error);
        }
    };

    const startSession = (userLanguage = "en") => {
        const command = {
            type: "session.update",
            session: {
                turn_detection: {
                    type: "server_vad"
                },
                input_audio_transcription: {
                    model: "whisper-1",
                    language: userLanguage
                }
            }
        };
        safeSendJsonMessage(command);
    };
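
    // NOTE (assumption to verify, not something this code confirms): Whisper accuracy
    // over the Realtime API is very sensitive to the input audio format. The service
    // defaults to 24 kHz mono PCM16; if the browser captures at 44.1/48 kHz and the
    // raw bytes are appended unchanged, the decoded audio is sped up or distorted and
    // the transcript degrades. Declaring the format in session.update makes a mismatch
    // explicit (field support depends on the api-version), e.g.:
    //
    //     session: {
    //         input_audio_format: "pcm16",
    //         turn_detection: { type: "server_vad" },
    //         input_audio_transcription: { model: "whisper-1", language: userLanguage }
    //     }
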
    const addUserAudio = (base64Audio) => {
        if (!base64Audio || typeof base64Audio !== "string") {
            console.error("Invalid base64Audio data:", base64Audio);
            return;
        }
        const command = {
            type: "input_audio_buffer.append",
            audio: base64Audio
        };
        // Log the chunk size rather than the full base64 payload to keep the console usable.
        console.log(`Sending audio chunk (${base64Audio.length} base64 chars)`);
        safeSendJsonMessage(command);
    };

    const inputAudioBufferClear = () => {
        const command = {
            type: "input_audio_buffer.clear"
        };
        safeSendJsonMessage(command);
    };

    const stopSession = () => {
        // Note: this only clears the input buffer; the socket itself stays open
        // until shouldConnect flips to false and react-use-websocket closes it.
        inputAudioBufferClear();
    };

    // No await inside, so the stray async keyword has been dropped.
    const sendTextInput = (text) => {
        if (text.trim()) {
            const textCommand = {
                type: "conversation.item.create",
                item: {
                    type: "message",
                    role: "user",
                    content: [
                        {
                            type: "input_text",
                            text: text
                        }
                    ]
                }
            };
            safeSendJsonMessage(textCommand);
            const responseCommand = {
                type: "response.create"
            };
            safeSendJsonMessage(responseCommand);
        }
    };

    const onMessageReceived = (event) => {
        try {
            const message = JSON.parse(event.data);
            // [TEMPORARY DEBUGGING CODE - REMOVE AFTER TESTING]
            // Log all message types to help troubleshoot.
            console.log(`WebSocket received message type: ${message.type}`, message);
            switch (message.type) {
                case "response.done":
                    onReceivedResponseDone?.(message);
                    break;
                case "response.audio.delta":
                    onReceivedResponseAudioDelta?.(message);
                    break;
                case "response.audio_transcript.delta":
                    onReceivedResponseAudioTranscriptDelta?.(message);
                    break;
                case "response.text.delta":
                    onReceivedResponseTextDelta?.(message.delta);
                    break;
                case "response.text.done":
                    // The done event carries the full text; it is routed through
                    // the same handler as the deltas.
                    if (message.text) {
                        onReceivedResponseTextDelta?.(message.text);
                    }
                    break;
                case "input_audio_buffer.speech_started":
                    onReceivedInputAudioBufferSpeechStarted?.(message);
                    break;
                case "mic_control":
                    // Microphone control messages from the backend.
                    onReceivedMicControl?.(message);
                    break;
                case "speaker.status":
                    // Speaker status updates from the backend.
                    onReceivedSpeakerStatus?.(message);
                    break;
                case "language.status":
                    // Language status updates from the backend.
                    onReceivedLanguageStatus?.(message);
                    break;
case "conversation.item.input_audio_transcription.completed":
if (message.item?.content?.[0]?.transcript) {
console.log("Final transcript:", message.item.content[0].transcript);
onReceivedInputAudioTranscriptionCompleted?.({
transcript: message.item.content[0].transcript
});
} else {
console.warn("Received transcription event but no transcript found", message);
}
break;
case "conversation.item.create":
// [TEMPORARY DEBUGGING CODE - REMOVE AFTER TESTING]
console.log("DEBUG: Received conversation item create event", message);
if (message.item?.content?.[0]?.transcript) {
// [TEMPORARY DEBUGGING CODE - REMOVE AFTER TESTING]
console.log("DEBUG: Found transcript in conversation item:", message.item.content[0].transcript);
onReceivedInputAudioTranscriptionCompleted?.({
transcript: message.item.content[0].transcript
});
} else {
// [TEMPORARY DEBUGGING CODE - REMOVE AFTER TESTING]
console.warn("DEBUG: Received conversation item create but no transcript found", message);
}
break;
case "extension.middle_tier_tool_response":
onReceivedExtensionMiddleTierToolResponse?.(message);
break;
case "bot.text.complete":
onReceivedResponseTextComplete?.(message);
break;
case "bot.audio.format":
onReceivedAudioFormat?.(message);
break;
case "bot.audio.stream":
onReceivedAudioStream?.(message);
break;
case "bot.audio.complete":
onReceivedAudioComplete?.(message);
break;
case "error":
// Only process if there's actual content
if (message && Object.keys(message).length > 0 &&
Object.keys(message).some(key => key !== 'type')) {
onReceivedError?.(message);
}
break;
}
} catch (e) {
console.error('Error parsing WebSocket message:', e);
}
};
    return {
        startSession,
        addUserAudio,
        inputAudioBufferClear,
        stopSession,
        sendTextInput,
        sendJsonMessage: safeSendJsonMessage
    };
}
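
Since the hook just forwards whatever base64 audio it is given, I would check the capture path first: poor Whisper accuracy in this kind of pipeline is most often the audio arriving at the wrong sample rate or channel count, not the model. Below is a minimal capture sketch, assuming the server expects 24 kHz mono PCM16 (the Realtime API default; verify this against your middle tier). startMicCapture is a hypothetical helper for illustration, not part of the hook:

async function startMicCapture(addUserAudio, targetRate = 24000) {
    // Ask for mono audio with the browser's cleanup filters enabled.
    const stream = await navigator.mediaDevices.getUserMedia({
        audio: { channelCount: 1, echoCancellation: true, noiseSuppression: true }
    });
    const audioContext = new AudioContext();
    const source = audioContext.createMediaStreamSource(stream);
    const processor = audioContext.createScriptProcessor(4096, 1, 1);

    processor.onaudioprocess = (event) => {
        const input = event.inputBuffer.getChannelData(0);
        // Naive decimation from the context rate (often 44.1/48 kHz) down to
        // targetRate; fine for a sketch, a real resampler would anti-alias.
        const ratio = audioContext.sampleRate / targetRate;
        const outLength = Math.floor(input.length / ratio);
        const pcm16 = new Int16Array(outLength);
        for (let i = 0; i < outLength; i++) {
            const sample = Math.max(-1, Math.min(1, input[Math.floor(i * ratio)]));
            pcm16[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
        }
        // Base64-encode the little-endian PCM bytes and hand them to the hook.
        const bytes = new Uint8Array(pcm16.buffer);
        let binary = "";
        for (let i = 0; i < bytes.length; i++) {
            binary += String.fromCharCode(bytes[i]);
        }
        addUserAudio(btoa(binary));
    };

    source.connect(processor);
    processor.connect(audioContext.destination);

    // Return a cleanup function that tears down the capture chain.
    return () => {
        processor.disconnect();
        source.disconnect();
        stream.getTracks().forEach((track) => track.stop());
        audioContext.close();
    };
}

If the audio format checks out, the next things I would look at are the microphone/echo-cancellation settings above and whether the language code passed to startSession actually matches what the user speaks, since forcing the wrong language tends to make whisper-1 noticeably worse.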