I have set up a voice chat using the OpenAI Realtime API, but I'm running into an issue: once the assistant starts speaking its answer, I can't interrupt it (barge in) and have to wait until it finishes the entire response. I have already tried all the solutions mentioned on the forums, but none of them have worked.
I'm using WebSockets for both the browser ↔ backend and backend ↔ OpenAI connections.
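At a high level, the barge-in flow I've implemented looks like this (this is just a condensed restatement of the full handlers below, not new logic; `currentResponseId`, `isAssistantSpeaking` and `clear_audio_queue` are my own names, not part of the API):

// Backend: when server VAD reports speech while the assistant is talking,
// cancel the in-flight response and tell the browser to drop its queued audio.
case 'input_audio_buffer.speech_started':
  if (isAssistantSpeaking && currentResponseId) {
    openaiWs.send(JSON.stringify({ type: 'response.cancel', response_id: currentResponseId }));
    clientWs.send(JSON.stringify({ type: 'clear_audio_queue' }));
  }
  break;

// Frontend: on 'clear_audio_queue', empty the playback queue and re-enable the mic.
case 'clear_audio_queue':
  audioBufferQueue = [];
  isPlaying = false;
  isMicMuted = false;
  audioStreamRef.current?.getTracks().forEach(track => (track.enabled = true));
  break;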
Backend part:
router.ws('/audio', (clientWs: WebSocket, req: any) => {
console.log('Audio WebSocket Connection Established');
let sessionId: string | null = null;
let isAssistantSpeaking = false;
let currentResponseId: string | null = null; // Track current response ID
// Initialize OpenAI WebSocket connection
const openaiWs = new WebSocket('wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17', {
headers: {
Authorization: 'Bearer ' + process.env.OPENAI_API_KEY,
'OpenAI-Beta': 'realtime=v1',
},
});
// Handle OpenAI WebSocket connection open
openaiWs.on('open', () => {
console.log('Connected to OpenAI WebSocket');
// Send session update with refined system prompt
const sessionUpdate = {
type: 'session.update',
session: {
input_audio_format: 'pcm16',
output_audio_format: 'pcm16',
model: 'gpt-4o-realtime-preview-2024-12-17',
voice: 'verse',
modalities: ['text', 'audio'],
turn_detection: {
type: 'server_vad',
threshold: 0.60,
prefix_padding_ms: 50,
silence_duration_ms: 200,
create_response: true,
interrupt_response: true,
},
instructions: `
You are Assistant. Follow these rules strictly:
`,
},
};
openaiWs.send(JSON.stringify(sessionUpdate));
});
// Handle messages from client
clientWs.on('message', async (message: Buffer) => {
try {
if (!sessionId) {
console.log('Waiting for session initialization...');
return;
}
// Try to parse the message as JSON
let parsedMessage;
try {
parsedMessage = JSON.parse(message.toString());
} catch (e) {
parsedMessage = null;
}
if (parsedMessage && parsedMessage.type) {
console.log(`Received control message: ${parsedMessage.type}`);
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.send(JSON.stringify(parsedMessage));
} else {
throw new Error('OpenAI WebSocket connection lost');
}
} else {
// Handle audio data
console.log('Received audio data from client');
if (openaiWs.readyState === WebSocket.OPEN) {
const audioBase64 = message.toString();
console.log('Sending audio to OpenAI:', audioBase64.substring(0, 50) + '...');
const audioMessage = {
type: 'input_audio_buffer.append',
audio: audioBase64,
};
openaiWs.send(JSON.stringify(audioMessage));
} else {
throw new Error('OpenAI WebSocket connection lost');
}
}
} catch (error) {
console.error('Error processing message:', error);
clientWs.send(JSON.stringify({
type: 'error',
message: error instanceof Error ? error.message : 'Error processing audio data',
}));
}
});
// Handle messages from OpenAI
openaiWs.on('message', (message: Buffer) => {
try {
const response = JSON.parse(message.toString());
console.log('Received from OpenAI:', response);
switch (response.type) {
case 'session.updated':
sessionId = response?.session?.id;
console.log('Session configured:', sessionId);
// Send greeting after session is ready
const greetingEvent = {
type: 'response.create',
response: {
modalities: ['text', 'audio'],
instructions: 'Say exactly the following: Hey I am Your GeoHub Assistant, how can I help you today?',
},
};
openaiWs.send(JSON.stringify(greetingEvent));
break;
case 'response.created':
currentResponseId = response.response?.id ?? null; // Store response ID (in response.created it is nested as response.id, not response_id)
console.log('Response created with ID:', currentResponseId);
break;
case 'input_audio_buffer.speech_started':
console.log('Speech Start:', response.type);
if (isAssistantSpeaking && openaiWs.readyState === WebSocket.OPEN) {
// Commit current audio buffer
const commitMessage = {
type: 'input_audio_buffer.commit',
};
openaiWs.send(JSON.stringify(commitMessage));
console.log('Committed audio buffer.');
// Cancel ongoing assistant response
if (currentResponseId) {
const interruptMessage = {
type: 'response.cancel',
response_id: currentResponseId,
};
openaiWs.send(JSON.stringify(interruptMessage));
console.log('Cancelled assistant response with ID:', currentResponseId);
// Notify frontend to clear audio queue
clientWs.send(JSON.stringify({
type: 'clear_audio_queue',
message: 'Clearing audio playback queue due to interruption',
}));
currentResponseId = null; // Reset response ID
isAssistantSpeaking = false; // Reset speaking state
}
}
break;
case 'response.audio_transcript.delta':
const transcript = response.delta.toLowerCase().trim();
clientWs.send(JSON.stringify({
type: 'transcription',
text: response.delta,
}));
// Handle commands
if (transcript === 'stop' || transcript === 'end') {
console.log('Detected stop command');
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.send(JSON.stringify({ type: 'response.cancel' }));
clientWs.send(JSON.stringify({
type: 'conversation_stopped',
message: 'Conversation stopped by user',
}));
}
} else if (transcript === 'regiven' || transcript === 'repeat instruction') {
console.log('Detected regiven command');
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.send(JSON.stringify({ type: 'response.cancel' }));
const repeatGreeting = {
type: 'response.create',
response: {
modalities: ['text', 'audio'],
instructions: 'Say exactly the following: Hey I am Your GeoHub Assistant, how can I help you today?',
},
};
openaiWs.send(JSON.stringify(repeatGreeting));
}
}
break;
case 'response.audio.delta':
isAssistantSpeaking = true;
currentResponseId = response.response_id || currentResponseId; // Update response ID if provided
clientWs.send(JSON.stringify({
type: 'audio_response',
audio: response.delta,
}));
break;
case 'response.done':
isAssistantSpeaking = false;
currentResponseId = null; // Clear response ID
console.log('Response completed:', response);
break;
case 'response.canceled':
isAssistantSpeaking = false;
currentResponseId = null; // Clear response ID
console.log('Response canceled:', response);
break;
case 'error':
console.error('OpenAI error:', response);
clientWs.send(JSON.stringify({
type: 'error',
message: response.error?.message || 'Unknown error',
}));
break;
default:
console.log('Other message type:', response.type);
}
} catch (error) {
console.error('Error processing OpenAI response:', error);
}
});
// Handle client disconnection
clientWs.on('close', () => {
console.log('Client disconnected');
if (openaiWs.readyState === WebSocket.OPEN) {
openaiWs.close();
}
sessionId = null;
currentResponseId = null;
});
// Handle OpenAI connection errors
openaiWs.on('error', (error) => {
console.error('OpenAI WebSocket error:', error);
clientWs.send(JSON.stringify({
type: 'error',
message: 'OpenAI service error',
}));
});
});
Frontend part:
const handleMicClick = async () => {
// Toggle the microphone modal
setIsMicModalOpen(true);
// If currently listening, stop all audio processing and clean up
if (isListening) {
setIsListening(false);
setIsMicModalOpen(false);
// Stop media recorder if active
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
mediaRecorderRef.current.stop();
}
// Stop audio tracks
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.stop());
audioStreamRef.current = null;
}
// Close WebSocket connection
if (webSocketRef.current?.readyState === WebSocket.OPEN) {
webSocketRef.current.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
webSocketRef.current.close();
webSocketRef.current = null;
}
// Close AudioContext
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
return;
}
try {
// Reset transcription text
setTranscriptionText('');
// Initialize AudioContext for audio processing
audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: 24000
});
let audioBufferQueue = [];
let isPlaying = false;
let isMicMuted = false; // Track microphone mute state
// Decode base64 audio and queue for playback
const decodeAndPlayAudio = (base64Audio) => {
try {
const cleanBase64 = base64Audio.replace(/\s+/g, '');
if (!/^[A-Za-z0-9+/=]+$/.test(cleanBase64)) {
throw new Error('Invalid base64 string');
}
const binaryString = atob(cleanBase64);
const bytes = new Uint8Array(binaryString.length);
for (let i = 0; i < binaryString.length; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
const audioData = new Int16Array(bytes.buffer);
const audioBuffer = audioContextRef.current.createBuffer(
1,
audioData.length,
audioContextRef.current.sampleRate
);
const floatData = new Float32Array(audioData.length);
for (let i = 0; i < audioData.length; i++) {
floatData[i] = audioData[i] / 32768;
}
audioBuffer.copyToChannel(floatData, 0);
audioBufferQueue.push(audioBuffer);
if (!isPlaying) {
playNextBuffer();
}
} catch (error) {
console.error('Error decoding audio:', error);
}
};
// Play the next audio buffer in the queue
const playNextBuffer = () => {
if (audioBufferQueue.length === 0) {
isPlaying = false;
isMicMuted = false; // Unmute microphone when playback ends
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = true);
}
console.log(`Audio playback ended at ${new Date().toISOString()}`);
return;
}
isPlaying = true;
isMicMuted = true; // Mute microphone during playback
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = false);
}
const buffer = audioBufferQueue.shift();
const source = audioContextRef.current.createBufferSource();
source.buffer = buffer;
source.connect(audioContextRef.current.destination);
source.onended = playNextBuffer;
source.start(0);
};
// Play AI audio response
const playAudioResponse = async (base64Audio) => {
try {
decodeAndPlayAudio(base64Audio);
} catch (error) {
console.error('Error playing audio response:', error);
}
};
// Resample audio to 24000 Hz
const resampleAudio = (inputData, inputSampleRate) => {
return new Promise((resolve) => {
const offlineContext = new OfflineAudioContext(
1,
Math.ceil((inputData.length * 24000) / inputSampleRate),
24000
);
const buffer = offlineContext.createBuffer(1, inputData.length, inputSampleRate);
buffer.copyToChannel(inputData, 0);
const source = offlineContext.createBufferSource();
source.buffer = buffer;
source.connect(offlineContext.destination);
source.start();
offlineContext.startRendering().then((renderedBuffer) => {
const resampledData = renderedBuffer.getChannelData(0);
resolve(resampledData);
});
});
};
// Convert audio to base64 for WebSocket transmission
const base64EncodeAudio = (floatArray) => {
const buffer = new ArrayBuffer(floatArray.length * 2);
const view = new DataView(buffer);
for (let i = 0; i < floatArray.length; i++) {
const s = Math.max(-1, Math.min(1, floatArray[i]));
view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
}
const binary = String.fromCharCode.apply(null, new Uint8Array(buffer));
return btoa(binary);
};
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: 24000,
sampleSize: 16,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: false
}
});
audioStreamRef.current = stream;
// Initialize WebSocket connection
const ws = new WebSocket('ws://localhost:3000/backend-api/conversations/audio');
webSocketRef.current = ws;
ws.onopen = () => {
console.log('WebSocket connection established at', new Date().toISOString());
setIsListening(true);
};
// Set up audio processing pipeline
const sourceNode = audioContextRef.current.createMediaStreamSource(stream);
const processorNode = audioContextRef.current.createScriptProcessor(4096, 1, 1);
sourceNode.connect(processorNode);
processorNode.connect(audioContextRef.current.destination);
// Process and send audio chunks
processorNode.onaudioprocess = async (e) => {
if (webSocketRef.current?.readyState === WebSocket.OPEN && !isMicMuted) {
const inputData = e.inputBuffer.getChannelData(0);
const inputSampleRate = e.inputBuffer.sampleRate;
const resampledData = await resampleAudio(inputData, inputSampleRate);
const base64Chunk = base64EncodeAudio(resampledData);
webSocketRef.current.send(base64Chunk);
}
};
// Handle WebSocket messages
ws.onmessage = async (event) => {
try {
const data = JSON.parse(event.data);
console.log('Received WebSocket message at', new Date().toISOString(), data);
switch (data.type) {
case 'clear_audio_queue':
console.log('Clearing audio queue due to user interruption at', new Date().toISOString());
audioBufferQueue = []; // Clear queued audio
isPlaying = false; // Stop playback
isMicMuted = false; // Unmute microphone
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = true);
}
break;
case 'audio_response':
if (data.audio) {
await playAudioResponse(data.audio);
}
break;
case 'error':
console.error('Server error:', data.message);
alert(`Server error: ${data.message}`);
setIsMicModalOpen(false);
break;
case 'transcription':
console.log('Transcription received:', data.text);
setTranscriptionText(data.text);
break;
case 'conversation_stopped':
console.log('Conversation stopped:', data.message);
setIsListening(false);
setIsMicModalOpen(false);
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.stop());
audioStreamRef.current = null;
}
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
if (webSocketRef.current?.readyState === WebSocket.OPEN) {
webSocketRef.current.close();
webSocketRef.current = null;
}
alert('Conversation stopped.');
break;
case 'response.canceled':
console.log('AI response canceled by server at', new Date().toISOString());
isMicMuted = false; // Unmute microphone
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.enabled = true);
}
break;
default:
console.log('Unknown message type:', data.type);
}
} catch (error) {
console.error('Error processing WebSocket message:', error);
}
};
ws.onerror = (error) => {
console.error('WebSocket error at', new Date().toISOString(), error);
setIsListening(false);
setIsMicModalOpen(false);
alert('WebSocket connection error. Please try again.');
};
ws.onclose = () => {
console.log('WebSocket connection closed at', new Date().toISOString());
setIsListening(false);
setIsMicModalOpen(false);
if (audioStreamRef.current) {
audioStreamRef.current.getTracks().forEach(track => track.stop());
audioStreamRef.current = null;
}
if (audioContextRef.current) {
audioContextRef.current.close();
audioContextRef.current = null;
}
};
} catch (error) {
console.error('Error initializing audio at', new Date().toISOString(), error);
setIsListening(false);
setIsMicModalOpen(false);
alert('Unable to access microphone. Please ensure you have granted permission.');
}
};
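One detail that may be relevant: while assistant audio is queued for playback, the frontend disables the microphone tracks and `onaudioprocess` skips sending chunks, so no user audio is forwarded to the backend until the queue drains. Condensed from `playNextBuffer` and `onaudioprocess` above:

// While any assistant audio is queued (isMicMuted === true), the mic tracks are
// disabled and onaudioprocess does not send anything to the backend.
isPlaying = true;
isMicMuted = true;
audioStreamRef.current?.getTracks().forEach(track => (track.enabled = false));

processorNode.onaudioprocess = async (e) => {
  if (webSocketRef.current?.readyState === WebSocket.OPEN && !isMicMuted) {
    // ...resample, base64-encode and send the chunk...
  }
};

I'm not sure whether this muting is related to the interruption problem, but I'm including it since it controls when audio is actually sent to the server.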