I’m using the code below, but the transcription that comes back is not always in English, even though I set language: "en" in the session config. With the same parameters, the Whisper-1 model works fine.
What am I doing wrong, and how can I make the output language consistent? It looks like a bug.
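For comparison, the only thing I change when testing Whisper-1 is the transcription model; the rest of the session config stays identical and the output then stays in English. A trimmed sketch of the swapped block (prompt omitted, same string as in the full code below):

input_audio_transcription: {
  model: 'whisper-1',        // only difference vs. the gpt-4o-transcribe config below
  prompt: '...',             // same prompt string as below
  temperature: 0.5,
  response_format: 'text',
  language: 'en'
},

The full service code follows.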
/**
 * OpenAI Service for handling WebRTC connections and authentication
 */
import { API_KEY } from '../config';

interface SessionConfig {
  type: string;
  input_audio_format: string;
  input_audio_transcription: {
    model: string;
    prompt: string;
    temperature: number;
    response_format: string;
    language: string;
  };
  turn_detection: {
    type: string;
    eagerness: string;
    // threshold: number;
    // prefix_padding_ms: number;
    // silence_duration_ms: number;
  };
  input_audio_noise_reduction: {
    type: string;
  };
  include: string[];
}
// Get ephemeral token for WebRTC authentication
const getEphemeralToken = async (logPrefix: string = ''): Promise<string> => {
  console.log(`${logPrefix} Getting ephemeral token for WebRTC authentication...`);

  const response = await fetch('https://api.openai.com/v1/realtime/sessions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: "gpt-4o-transcribe",
      input_audio_transcription: {
        model: 'gpt-4o-transcribe',
        language: "en"
      }
    })
  });

  if (!response.ok) {
    throw new Error(`Failed to get ephemeral token: ${response.statusText}`);
  }

  const data = await response.json();
  console.log(`${logPrefix} Successfully obtained ephemeral token`);
  return data.client_secret.value;
};
// Connect to OpenAI's WebRTC API
export const connectToOpenAI = async (stream: MediaStream, logPrefix: string = ''): Promise<{ pc: RTCPeerConnection, dc: RTCDataChannel }> => {
  console.log(`${logPrefix} Initializing WebRTC connection to OpenAI...`);

  try {
    // Get ephemeral token
    const EPHEMERAL_KEY = await getEphemeralToken(logPrefix);

    // Create a peer connection
    const pc = new RTCPeerConnection();

    // Add local audio track for input
    stream.getAudioTracks().forEach(track => {
      pc.addTrack(track, stream);
    });

    // Set up data channel for sending and receiving events
    const dc = pc.createDataChannel("oai-events");

    // Set up data channel event handlers
    dc.onopen = () => {
      console.log(`${logPrefix} Data channel opened`);

      // Set up session configuration
      const sessionConfig: SessionConfig = {
        type: 'session.create',
        input_audio_format: 'pcm16',
        input_audio_transcription: {
          model: 'gpt-4o-transcribe',
          prompt: 'You are actively listening to an English interview. Transcribe only in English. Transcribe each and every word from the audio input in English!!! DO NOT MISS ANY WORD!!!',
          temperature: 0.5,
          response_format: 'text',
          language: "en"
        },
        turn_detection: {
          type: 'semantic_vad',
          eagerness: "high"
          // threshold: 0.5,
          // prefix_padding_ms: 300,
          // silence_duration_ms: 800
        },
        input_audio_noise_reduction: {
          type: 'near_field'
        },
        include: ['input_audio_buffer.committed', 'transcription.delta', 'transcription.final']
      };

      console.log(`${logPrefix} Sending session configuration:`, sessionConfig);
      dc.send(JSON.stringify(sessionConfig));
    };
    dc.onclose = () => {
      console.log(`${logPrefix} Data channel closed`);
    };

    dc.onerror = (error) => {
      console.error(`${logPrefix} Data channel error:`, error);
    };

    dc.onmessage = (e) => {
      // Parse and handle transcription events
      const message = JSON.parse(e.data);
      console.log(`${logPrefix} Raw message:`, message);

      // Extract transcript from various possible message formats
      let transcript = '';
      if (message.type === 'response.audio_transcript.delta' && message.delta?.text) {
        transcript = message.delta.text;
        console.log(`${logPrefix} Transcription delta:`, transcript);
      } else if (message.type === 'conversation.item.input_audio_transcription.completed' && message.transcript) {
        transcript = message.transcript;
        console.log(`${logPrefix} Transcription completed:`, transcript);
      } else if (message.content?.text) {
        transcript = message.content.text;
        console.log(`${logPrefix} Content text:`, transcript);
      }

      // Emit transcript if we have one
      if (transcript) {
        const transcriptionEvent = new CustomEvent('transcription', {
          detail: { transcript, source: logPrefix }
        });
        window.dispatchEvent(transcriptionEvent);
      }

      // Handle different types of transcription events
      switch (message.type) {
        case 'input_audio_buffer.speech_started':
          console.log(`${logPrefix} Speech started at:`, message.audio_start_ms);
          break;
        case 'input_audio_buffer.speech_stopped':
          console.log(`${logPrefix} Speech stopped at:`, message.audio_end_ms);
          break;
        case 'input_audio_buffer.committed':
          console.log(`${logPrefix} Audio buffer committed`);
          break;
        case 'conversation.item.created':
          if (message.item) {
            console.log(`${logPrefix} Conversation item created:`, message.item);
          }
          break;
        case 'response.created':
          console.log(`${logPrefix} Response created`);
          break;
        case 'response.done':
          console.log(`${logPrefix} Response completed`);
          break;
        case 'session.created':
          console.log(`${logPrefix} Session created`);
          break;
        case 'session.updated':
          console.log(`${logPrefix} Session updated`);
          break;
        case 'error':
          console.error(`${logPrefix} OpenAI WebRTC error:`, message.error);
          break;
        default:
          // Log any unhandled message types with their full content
          console.log(`${logPrefix} Message type: ${message.type}`, message);
      }
    };
    // Start the session using SDP
    const offer = await pc.createOffer();
    await pc.setLocalDescription(offer);

    const baseUrl = "https://api.openai.com/v1/realtime";
    const sdpResponse = await fetch(`${baseUrl}?intent=transcription`, {
      method: "POST",
      body: offer.sdp,
      headers: {
        Authorization: `Bearer ${EPHEMERAL_KEY}`,
        "Content-Type": "application/sdp"
      },
    });

    if (!sdpResponse.ok) {
      throw new Error(`Failed to get SDP answer: ${sdpResponse.statusText}`);
    }

    const sdpText = await sdpResponse.text();
    const answer: RTCSessionDescriptionInit = {
      type: 'answer' as RTCSdpType,
      sdp: sdpText
    };
    await pc.setRemoteDescription(answer);

    return { pc, dc };
  } catch (error) {
    console.error(`${logPrefix} Failed to establish WebRTC connection:`, error);
    throw error;
  }
};
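In case it matters, this is roughly how the service is consumed on the page. It's a simplified sketch: the import path, the '[interview]' log prefix, and the cleanup wiring are placeholders, but it matches what the service exposes (a MediaStream in, the returned pc/dc pair, and the 'transcription' CustomEvent dispatched on window).

import { connectToOpenAI } from '../services/openai';   // import path is a placeholder

const startTranscription = async () => {
  // Capture microphone audio for the input track
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

  // Listen for the transcripts dispatched by the service above
  window.addEventListener('transcription', (e: Event) => {
    const { transcript, source } = (e as CustomEvent).detail;
    console.log(`[${source}]`, transcript);
  });

  const { pc, dc } = await connectToOpenAI(stream, '[interview]');

  // Return a teardown callback for when the user stops the session
  return () => {
    dc.close();
    pc.close();
    stream.getTracks().forEach(track => track.stop());
  };
};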