I have set up a voice chat using the OpenAI Realtime API, but I'm running into an issue: once the assistant starts speaking its answer, I can't interrupt it (barge in) and have to wait until it finishes the entire response. I have already tried all the solutions mentioned on the forums, but none of them have worked.
I'm using WebSockets for both the browser ↔ backend and backend ↔ OpenAI connections.
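At a high level, the barge-in flow I've implemented looks like this (this is just a condensed restatement of the full handlers below, not new logic; `currentResponseId`, `isAssistantSpeaking` and `clear_audio_queue` are my own names, not part of the API):

// Backend: when server VAD reports speech while the assistant is talking,
// cancel the in-flight response and tell the browser to drop its queued audio.
case 'input_audio_buffer.speech_started':
  if (isAssistantSpeaking && currentResponseId) {
    openaiWs.send(JSON.stringify({ type: 'response.cancel', response_id: currentResponseId }));
    clientWs.send(JSON.stringify({ type: 'clear_audio_queue' }));
  }
  break;

// Frontend: on 'clear_audio_queue', empty the playback queue and re-enable the mic.
case 'clear_audio_queue':
  audioBufferQueue = [];
  isPlaying = false;
  isMicMuted = false;
  audioStreamRef.current?.getTracks().forEach(track => (track.enabled = true));
  break;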
Backend part:
router.ws('/audio', (clientWs: WebSocket, req: any) => {
console.log('Audio WebSocket Connection Established');
let sessionId: string | null = null;
let isAssistantSpeaking = false;
let currentResponseId: string | null = null; // Track current response ID
// Initialize OpenAI WebSocket connection
const openaiWs = new WebSocket('wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17', {
headers: {
Authorization: 'Bearer ' + process.env.OPENAI_API_KEY,
'OpenAI-Beta': 'realtime=v1',
},
});
// Handle OpenAI WebSocket connection open
openaiWs.on('open', () => {
console.log('Connected to OpenAI WebSocket');
// Send session update with refined system prompt
const sessionUpdate = {
type: 'session.update',
session: {
input_audio_format: 'pcm16',
output_audio_format: 'pcm16',
model: 'gpt-4o-realtime-preview-2024-12-17',
voice: 'verse',
modalities: ['text', 'audio'],
turn_detection: {
type: 'server_vad',
threshold: 0.60,
prefix_padding_ms: 50,
silence_duration_ms: 200,
create_response: true,
interrupt_response: true,
},
instructions: `
You are Assistant. Follow these rules strictly:
`,
},
};
openaiWs.send(JSON.stringify(sessionUpdate));
});
// Handle messages from client
clientWs.on('message', async (message: Buffer) => {
try {
if (!sessionId) {
console.log('Waiting for session initialization...');
return;
}
// Try to parse the message as JSON
let parsedMessage;
try {
parsedMessage = JSON.parse(message.toString());
} catch (e) {
parsedMessage = null;
}
if (parsedMessage && parsedMessage.type) {
console.log(`Received control message: ${parsedMessage.type}`);
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.send(JSON.stringify(parsedMessage));
} else {
throw new Error('OpenAI WebSocket connection lost');
}
} else {
// Handle audio data
console.log('Received audio data from client');
if (openaiWs.readyState === WebSocket.OPEN) {
const audioBase64 = message.toString();
console.log('Sending audio to OpenAI:', audioBase64.substring(0, 50) + '...');
const audioMessage = {
type: 'input_audio_buffer.append',
audio: audioBase64,
};
openaiWs.send(JSON.stringify(audioMessage));
} else {
throw new Error('OpenAI WebSocket connection lost');
}
}
} catch (error) {
console.error('Error processing message:', error);
clientWs.send(JSON.stringify({
type: 'error',
message: error instanceof Error ? error.message : 'Error processing audio data',
}));
}
});
// Handle messages from OpenAI
openaiWs.on('message', (message: Buffer) => {
try {
const response = JSON.parse(message.toString());
console.log('Received from OpenAI:', response);
switch (response.type) {
case 'session.updated':
sessionId = response?.session?.id;
console.log('Session configured:', sessionId);
// Send greeting after session is ready
const greetingEvent = {
type: 'response.create',
response: {
modalities: ['text', 'audio'],
instructions: 'Say exactly the following: Hey I am Your GeoHub Assistant, how can I help you today?',
},
};
openaiWs.send(JSON.stringify(greetingEvent));
break;
case 'response.created':
currentResponseId = response.response?.id ?? null; // Store response ID (in response.created it is nested as response.id, not response_id)
console.log('Response created with ID:', currentResponseId);
break;
case 'input_audio_buffer.speech_started':
console.log('Speech Start:', response.type);
if (isAssistantSpeaking && openaiWs.readyState === WebSocket.OPEN) {
// Commit current audio buffer
const commitMessage = {
type: 'input_audio_buffer.commit',
};
openaiWs.send(JSON.stringify(commitMessage));
console.log('Committed audio buffer.');
// Cancel ongoing assistant response
if (currentResponseId) {
const interruptMessage = {
type: 'response.cancel',
response_id: currentResponseId,
};
openaiWs.send(JSON.stringify(interruptMessage));
console.log('Cancelled assistant response with ID:', currentResponseId);
// Notify frontend to clear audio queue
clientWs.send(JSON.stringify({
type: 'clear_audio_queue',
message: 'Clearing audio playback queue due to interruption',
}));
currentResponseId = null; // Reset response ID
isAssistantSpeaking = false; // Reset speaking state
}
}
break;
case 'response.audio_transcript.delta':
const transcript = response.delta.toLowerCase().trim();
clientWs.send(JSON.stringify({
type: 'transcription',
text: response.delta,
}));
// Handle commands
if (transcript === 'stop' || transcript === 'end') {
console.log('Detected stop command');
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.send(JSON.stringify({ type: 'response.cancel' }));
clientWs.send(JSON.stringify({
type: 'conversation_stopped',
message: 'Conversation stopped by user',
}));
}
} else if (transcript === 'regiven' || transcript === 'repeat instruction') {
console.log('Detected regiven command');
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.send(JSON.stringify({ type: 'response.cancel' }));
const repeatGreeting = {
type: 'response.create',
response: {
modalities: ['text', 'audio'],
instructions: 'Say exactly the following: Hey I am Your GeoHub Assistant, how can I help you today?',
},
};
openaiWs.send(JSON.stringify(repeatGreeting));
}
}
break;
case 'response.audio.delta':
isAssistantSpeaking = true;
currentResponseId = response.response_id || currentResponseId; // Update response ID if provided
clientWs.send(JSON.stringify({
type: 'audio_response',
audio: response.delta,
}));
break;
case 'response.done':
isAssistantSpeaking = false;
currentResponseId = null; // Clear response ID
console.log('Response completed:', response);
break;
case 'response.canceled':
isAssistantSpeaking = false;
currentResponseId = null; // Clear response ID
console.log('Response canceled:', response);
break;
case 'error':
console.error('OpenAI error:', response);
clientWs.send(JSON.stringify({
type: 'error',
message: response.error?.message || 'Unknown error',
}));
break;
default:
console.log('Other message type:', response.type);
}
} catch (error) {
console.error('Error processing OpenAI response:', error);
}
});
// Handle client disconnection
clientWs.on('close', () => {
console.log('Client disconnected');
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.close();
}
sessionId = null;
currentResponseId = null;
});
// Handle OpenAI connection errors
openaiWs.on('error', (error) => {
console.error('OpenAI WebSocket error:', error);
clientWs.send(JSON.stringify({
type: 'error',
message: 'OpenAI service error',
}));
});
});
Frontend part:
const handleMicClick = async () => {
// Toggle the microphone modal
setIsMicModalOpen(true);
// If currently listening, stop all audio processing and clean up
if (isListening) {
setIsListening(false);
setIsMicModalOpen(false);
// Stop media recorder if active
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
mediaRecorderRef.current.stop();
}
// Stop audio tracks
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.stop());
audioStreamRef.current = null;
}
// Close WebSocket connection
if (webSocketRef.current?.readyState === WebSocket.OPEN) {
webSocketRef.current.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
webSocketRef.current.close();
webSocketRef.current = null;
}
// Close AudioContext
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
return;
}
try {
// Reset transcription text
setTranscriptionText('');
// Initialize AudioContext for audio processing
audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: 24000
});
let audioBufferQueue = [];
let isPlaying = false;
let isMicMuted = false; // Track microphone mute state
// Decode base64 audio and queue for playback
const decodeAndPlayAudio = (base64Audio) => {
try {
const cleanBase64 = base64Audio.replace(/\s+/g, '');
if (!/^[A-Za-z0-9+/=]+$/.test(cleanBase64)) {
throw new Error('Invalid base64 string');
}
const binaryString = atob(cleanBase64);
const bytes = new Uint8Array(binaryString.length);
for (let i = 0; i < binaryString.length; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
const audioData = new Int16Array(bytes.buffer);
const audioBuffer = audioContextRef.current.createBuffer(
1,
audioData.length,
audioContextRef.current.sampleRate
);
const floatData = new Float32Array(audioData.length);
for (let i = 0; i < audioData.length; i++) {
floatData[i] = audioData[i] / 32768;
}
audioBuffer.copyToChannel(floatData, 0);
audioBufferQueue.push(audioBuffer);
if (!isPlaying) {
playNextBuffer();
}
} catch (error) {
console.error('Error decoding audio:', error);
}
};
// Play the next audio buffer in the queue
const playNextBuffer = () => {
if (audioBufferQueue.length === 0) {
isPlaying = false;
isMicMuted = false; // Unmute microphone when playback ends
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = true);
}
console.log(`Audio playback ended at ${new Date().toISOString()}`);
return;
}
isPlaying = true;
isMicMuted = true; // Mute microphone during playback
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = false);
}
const buffer = audioBufferQueue.shift();
const source = audioContextRef.current.createBufferSource();
source.buffer = buffer;
source.connect(audioContextRef.current.destination);
source.onended = playNextBuffer;
source.start(0);
};
// Play AI audio response
const playAudioResponse = async (base64Audio) => {
try {
decodeAndPlayAudio(base64Audio);
} catch (error) {
console.error('Error playing audio response:', error);
}
};
// Resample audio to 24000 Hz
const resampleAudio = (inputData, inputSampleRate) => {
return new Promise((resolve) => {
const offlineContext = new OfflineAudioContext(
1,
Math.ceil((inputData.length * 24000) / inputSampleRate),
24000
);
const buffer = offlineContext.createBuffer(1, inputData.length, inputSampleRate);
buffer.copyToChannel(inputData, 0);
const source = offlineContext.createBufferSource();
source.buffer = buffer;
source.connect(offlineContext.destination);
source.start();
offlineContext.startRendering().then((renderedBuffer) => {
const resampledData = renderedBuffer.getChannelData(0);
resolve(resampledData);
});
});
};
// Convert audio to base64 for WebSocket transmission
const base64EncodeAudio = (floatArray) => {
const buffer = new ArrayBuffer(floatArray.length * 2);
const view = new DataView(buffer);
for (let i = 0; i < floatArray.length; i++) {
const s = Math.max(-1, Math.min(1, floatArray[i]));
view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
}
const binary = String.fromCharCode.apply(null, new Uint8Array(buffer));
return btoa(binary);
};
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: 24000,
sampleSize: 16,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: false
}
});
audioStreamRef.current = stream;
// Initialize WebSocket connection
const ws = new WebSocket('ws://localhost:3000/backend-api/conversations/audio');
webSocketRef.current = ws;
ws.onopen = () => {
console.log('WebSocket connection established at', new Date().toISOString());
setIsListening(true);
};
// Set up audio processing pipeline
const sourceNode = audioContextRef.current.createMediaStreamSource(stream);
const processorNode = audioContextRef.current.createScriptProcessor(4096, 1, 1);
sourceNode.connect(processorNode);
processorNode.connect(audioContextRef.current.destination);
// Process and send audio chunks
processorNode.onaudioprocess = async (e) => {
if (webSocketRef.current?.readyState === WebSocket.OPEN && !isMicMuted) {
const inputData = e.inputBuffer.getChannelData(0);
const inputSampleRate = e.inputBuffer.sampleRate;
const resampledData = await resampleAudio(inputData, inputSampleRate);
const base64Chunk = base64EncodeAudio(resampledData);
webSocketRef.current.send(base64Chunk);
}
};
// Handle WebSocket messages
ws.onmessage = async (event) => {
try {
const data = JSON.parse(event.data);
console.log('Received WebSocket message at', new Date().toISOString(), data);
switch (data.type) {
case 'clear_audio_queue':
console.log('Clearing audio queue due to user interruption at', new Date().toISOString());
audioBufferQueue = []; // Clear queued audio
isPlaying = false; // Stop playback
isMicMuted = false; // Unmute microphone
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = true);
}
break;
case 'audio_response':
if (data.audio) {
await playAudioResponse(data.audio);
}
break;
case 'error':
console.error('Server error:', data.message);
alert(`Server error: ${data.message}`);
setIsMicModalOpen(false);
break;
case 'transcription':
console.log('Transcription received:', data.text);
setTranscriptionText(data.text);
break;
case 'conversation_stopped':
console.log('Conversation stopped:', data.message);
setIsListening(false);
setIsMicModalOpen(false);
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.stop());
audioStreamRef.current = null;
}
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
if (webSocketRef.current?.readyState === WebSocket.OPEN) {
webSocketRef.current.close();
webSocketRef.current = null;
}
alert('Conversation stopped.');
break;
case 'response.canceled':
console.log('AI response canceled by server at', new Date().toISOString());
isMicMuted = false; // Unmute microphone
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = true);
}
break;
default:
console.log('Unknown message type:', data.type);
}
} catch (error) {
console.error('Error processing WebSocket message:', error);
}
};
ws.onerror = (error) => {
console.error('WebSocket error at', new Date().toISOString(), error);
setIsListening(false);
setIsMicModalOpen(false);
alert('WebSocket connection error. Please try again.');
};
ws.onclose = () => {
console.log('WebSocket connection closed at', new Date().toISOString());
setIsListening(false);
setIsMicModalOpen(false);
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.stop());
audioStreamRef.current = null;
}
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
};
} catch (error) {
console.error('Error initializing audio at', new Date().toISOString(), error);
setIsListening(false);
setIsMicModalOpen(false);
alert('Unable to access microphone. Please ensure you have granted permission.');
}
};
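One detail that may be relevant: while assistant audio is queued for playback, the frontend disables the microphone tracks and `onaudioprocess` skips sending chunks, so no user audio is forwarded to the backend until the queue drains. Condensed from `playNextBuffer` and `onaudioprocess` above:

// While any assistant audio is queued (isMicMuted === true), the mic tracks are
// disabled and onaudioprocess does not send anything to the backend.
isPlaying = true;
isMicMuted = true;
audioStreamRef.current?.getTracks().forEach(track => (track.enabled = false));

processorNode.onaudioprocess = async (e) => {
  if (webSocketRef.current?.readyState === WebSocket.OPEN && !isMicMuted) {
    // ...resample, base64-encode and send the chunk...
  }
};

I'm not sure whether this muting is related to the interruption problem, but I'm including it since it controls when audio is actually sent to the server.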