Hi, I set up a simple mobile app using react-native-webrtc
and the sample code provided for connecting to the Realtime API using WebRTC.
The only problem I’m facing is that the AI seems to be responding to itself.
I just say a “Hi”, and it greets me over and over with “Hey”, “Hello there”, … It seems like it’s hearing its own audio output and treating it as new input.
When I mute the speakers and read my logs, the AI responds only to my questions and never to itself, so the feedback apparently happens between the speaker and the microphone rather than inside the model.
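One thing I’m wondering about: I only pass audio: true to getUserMedia. Would explicit echo-cancellation constraints like these help? (Just a guess on my part; I’m not sure react-native-webrtc honors every one of these constraints on all platforms.)

const ms = await mediaDevices.getUserMedia({
  audio: {
    echoCancellation: true, // let the OS filter the speaker output out of the mic signal
    noiseSuppression: true,
    autoGainControl: true,
  },
});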
Here’s my code. Anyone know where I might be going wrong?
Client Side Code:
import { useState, useEffect } from "react";
import { View, TouchableOpacity, Text, Alert } from "react-native";
import { Audio } from "expo-av";
import {
RTCPeerConnection,
mediaDevices,
} from "react-native-webrtc";
export default function TalkToAIScreen() {
const [isRecording, setIsRecording] = useState(false);
const [audioPermission, setAudioPermission] = useState(false);
const [recordingStatus, setRecordingStatus] = useState("Idle");
const [ephemeralToken, setEphemeralToken] = useState(null);
const [connectionStatus, setConnectionStatus] = useState(null);
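// useEffect and Audio are imported above but never used, so presumably a
// permission request like this was intended; without it, audioPermission
// stays false and the button below stays disabled.
useEffect(() => {
(async () => {
const { granted } = await Audio.requestPermissionsAsync();
setAudioPermission(granted);
setRecordingStatus(granted ? "Ready" : "Mic permission denied");
})();
}, []);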
async function getEphemeralToken() {
// Get an ephemeral token from the server
try {
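// Note: on a physical device "localhost" is the phone itself; the
// Android emulator reaches the dev machine at 10.0.2.2, and a real
// device needs the machine's LAN IP instead.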
const tokenResponse = await fetch(
"http://localhost:3000/session"
);
const tokenData = await tokenResponse.json();
console.log("Token:", tokenData.client_secret.value);
setEphemeralToken(tokenData.client_secret.value);
return tokenData.client_secret.value;
} catch (error) {
Alert.alert("Error", "Couldn't retrieve ephemeral token");
return null;
}
}
async function setupWebRTC(EPHEMERAL_KEY) {
try {
// Create a peer connection
const pc = new RTCPeerConnection();
pc.ontrack = (event) => {
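// As far as I can tell, react-native-webrtc plays the remote audio
// track automatically, so nothing extra is needed here for playback.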
console.log("Received audio track from OpenAI");
};
// Add a local audio track for the microphone input
try {
const ms = await mediaDevices.getUserMedia({
audio: true,
});
pc.addTrack(ms.getTracks()[0]);
} catch (audioError) {
console.error("Audio setup failed:", audioError);
throw new Error(`Audio setup failed: ${audioError.message}`);
}
// Set up data channel for sending and receiving events
const dc = pc.createDataChannel("oai-events");
dc.addEventListener("message", (e) => {
try {
// Realtime server events appear here!
const message = JSON.parse(e.data);
console.log(
"Got a data channel message:",
JSON.stringify(message)
);
} catch (error) {
console.error("Failed to parse data channel message:", err);
}
});
// Start the session using the Session Description Protocol (SDP)
console.log("Creating offer...");
const offer = await pc.createOffer();
console.log(
"Offer created:",
offer.sdp.substring(0, 100) + "..."
);
await pc.setLocalDescription(offer);
console.log("Local description set successfully");
console.log("Sending offer to OpenAI...");
const baseUrl = "https://api.openai.com/v1/realtime";
const model = "gpt-4o-realtime-preview-2024-12-17";
const sdpResponse = await fetch(`${baseUrl}?model=${model}`, {
method: "POST",
body: offer.sdp,
headers: {
Authorization: `Bearer ${EPHEMERAL_KEY}`,
"Content-Type": "application/sdp",
},
});
if (!sdpResponse.ok) {
const errorText = await sdpResponse.text();
console.error("OpenAI Response Error:", {
status: sdpResponse.status,
statusText: sdpResponse.statusText,
body: errorText,
});
throw new Error(
`Failed to send offer to OpenAI: ${sdpResponse.status} - ${errorText}`
);
}
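// The response body is the answer SDP; applying it as the remote
// description completes the WebRTC handshake.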
const answer = {
type: "answer",
sdp: await sdpResponse.text(),
};
await pc.setRemoteDescription(answer);
console.log("Remote description set successfully");
console.log("Connection setup complete!");
setConnectionStatus("connected");
} catch (error) {
console.error("Failed to setup connection:", error);
setConnectionStatus("error");
Alert.alert(
"Connection Error",
"Failed to connect to AI service"
);
return false;
}
}
async function handleTalkToAI() {
const EPHEMERAL_KEY = await getEphemeralToken();
// Bail out if the token request failed (an alert was already shown)
if (!EPHEMERAL_KEY) return;
await setupWebRTC(EPHEMERAL_KEY);
}
return (
<View
style={{
flex: 1,
alignItems: "center",
justifyContent: "center",
paddingHorizontal: 20,
}}
>
{/* The main button that controls recording */}
<TouchableOpacity
onPress={handleTalkToAI}
disabled={!audioPermission}
style={{
backgroundColor: isRecording ? "#D9534F" : "#0275D8",
borderRadius: 8,
padding: 16,
marginBottom: 16,
opacity: audioPermission ? 1 : 0.5,
}}
>
<Text style={{ color: "#fff", fontSize: 16 }}>
{isRecording ? "Stop Talking" : "Talk to AI"}
</Text>
</TouchableOpacity>
{/* Shows what's currently happening */}
<Text style={{ marginTop: 20, color: "#666" }}>
Status: {recordingStatus}
</Text>
<Text style={{ marginTop: 20, color: "#666" }}>
Token: {ephemeralToken}
</Text>
<Text style={{ marginTop: 10, color: "#666" }}>
Connection: {connectionStatus}
</Text>
</View>
);
}
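I also wondered whether this could be a speaker-routing problem, since on the loudspeaker the microphone sits right next to the audio output. This is what I was planning to try with react-native-incall-manager to route audio to the earpiece instead (not wired in yet, so untested on my side):

import InCallManager from "react-native-incall-manager";

// Start a voice-call audio session and keep output on the earpiece,
// away from the microphone
InCallManager.start({ media: "audio" });
InCallManager.setForceSpeakerphoneOn(false);

// ...and when the session ends:
InCallManager.stop();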
Server Side Code:
const express = require("express");
const dotenv = require("dotenv");
const cors = require("cors");
dotenv.config();
const app = express();
// Register CORS before the routes so the /session endpoint gets the headers
app.use(cors());
// An endpoint that works with the client code above - it requests an
// ephemeral Realtime session token from OpenAI's protected REST API
// and returns the JSON to the client
app.get("/session", async (req, res) => {
console.log(
"Using API Key:",
process.env.OPENAI_API_KEY ? "Key exists" : "No key found"
);
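// Uses the global fetch built into Node 18+ (no node-fetch import needed)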
const r = await fetch(
"https://api.openai.com/v1/realtime/sessions",
{
method: "POST",
headers: {
Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: "gpt-4o-realtime-preview-2024-12-17",
voice: "verse",
}),
}
);
const data = await r.json();
console.log("OpenAI Response For Ephemeral Token:", data);
// Send back the JSON we received from the OpenAI REST API
res.send(data);
});
app.listen(3000, () => {
console.log("Server is running on port 3000");
});
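For what it’s worth, this is how I’ve been sanity-checking the server, and it does return the session JSON with a client_secret:

curl http://localhost:3000/session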