I’m using the code below, but the transcription that comes back is not always in English, even though I set language: "en" in the session config. With the same parameters, the Whisper-1 model works fine.
What am I doing wrong, and how can I make the output language consistent? It looks like a bug.
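For comparison, the only thing I change when testing Whisper-1 is the transcription model; the rest of the session config stays identical and the output then stays in English. A trimmed sketch of the swapped block (prompt omitted, same string as in the full code below):

input_audio_transcription: {
  model: 'whisper-1',        // only difference vs. the gpt-4o-transcribe config below
  prompt: '...',             // same prompt string as below
  temperature: 0.5,
  response_format: 'text',
  language: 'en'
},

The full service code follows.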
/**
 * OpenAI Service for handling WebRTC connections and authentication
 */
import { API_KEY } from '../config';

interface SessionConfig {
  type: string;
  input_audio_format: string;
  input_audio_transcription: {
    model: string;
    prompt: string;
    temperature: number;
    response_format: string;
    language: string;
  };
  turn_detection: {
    type: string;
    eagerness: string;
    // threshold: number;
    // prefix_padding_ms: number;
    // silence_duration_ms: number;
  };
  input_audio_noise_reduction: {
    type: string;
  };
  include: string[];
}
// Get ephemeral token for WebRTC authentication
const getEphemeralToken = async (logPrefix: string = ''): Promise<string> => {
  console.log(`${logPrefix} Getting ephemeral token for WebRTC authentication...`);

  const response = await fetch('https://api.openai.com/v1/realtime/sessions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${API_KEY}`,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      model: "gpt-4o-transcribe",
      input_audio_transcription: {
        model: 'gpt-4o-transcribe',
        language: "en"
      }
    })
  });

  if (!response.ok) {
    throw new Error(`Failed to get ephemeral token: ${response.statusText}`);
  }

  const data = await response.json();
  console.log(`${logPrefix} Successfully obtained ephemeral token`);
  return data.client_secret.value;
};
// Connect to OpenAI's WebRTC API
export const connectToOpenAI = async (stream: MediaStream, logPrefix: string = ''): Promise<{ pc: RTCPeerConnection, dc: RTCDataChannel }> => {
  console.log(`${logPrefix} Initializing WebRTC connection to OpenAI...`);

  try {
    // Get ephemeral token
    const EPHEMERAL_KEY = await getEphemeralToken(logPrefix);

    // Create a peer connection
    const pc = new RTCPeerConnection();

    // Add local audio track for input
    stream.getAudioTracks().forEach(track => {
      pc.addTrack(track, stream);
    });

    // Set up data channel for sending and receiving events
    const dc = pc.createDataChannel("oai-events");

    // Set up data channel event handlers
    dc.onopen = () => {
      console.log(`${logPrefix} Data channel opened`);

      // Set up session configuration
      const sessionConfig: SessionConfig = {
        type: 'session.create',
        input_audio_format: 'pcm16',
        input_audio_transcription: {
          model: 'gpt-4o-transcribe',
          prompt: 'You are actively listening to an English interview. Transcribe only in English. Transcribe each and every word from the audio input in English!!! DO NOT MISS ANY WORD!!!',
          temperature: 0.5,
          response_format: 'text',
          language: "en"
        },
        turn_detection: {
          type: 'semantic_vad',
          eagerness: "high"
          // threshold: 0.5,
          // prefix_padding_ms: 300,
          // silence_duration_ms: 800
        },
        input_audio_noise_reduction: {
          type: 'near_field'
        },
        include: ['input_audio_buffer.committed', 'transcription.delta', 'transcription.final']
      };

      console.log(`${logPrefix} Sending session configuration:`, sessionConfig);
      dc.send(JSON.stringify(sessionConfig));
    };
    dc.onclose = () => {
      console.log(`${logPrefix} Data channel closed`);
    };

    dc.onerror = (error) => {
      console.error(`${logPrefix} Data channel error:`, error);
    };

    dc.onmessage = (e) => {
      // Parse and handle transcription events
      const message = JSON.parse(e.data);
      console.log(`${logPrefix} Raw message:`, message);

      // Extract transcript from various possible message formats
      let transcript = '';
      if (message.type === 'response.audio_transcript.delta' && message.delta?.text) {
        transcript = message.delta.text;
        console.log(`${logPrefix} Transcription delta:`, transcript);
      } else if (message.type === 'conversation.item.input_audio_transcription.completed' && message.transcript) {
        transcript = message.transcript;
        console.log(`${logPrefix} Transcription completed:`, transcript);
      } else if (message.content?.text) {
        transcript = message.content.text;
        console.log(`${logPrefix} Content text:`, transcript);
      }

      // Emit transcript if we have one
      if (transcript) {
        const transcriptionEvent = new CustomEvent('transcription', {
          detail: { transcript, source: logPrefix }
        });
        window.dispatchEvent(transcriptionEvent);
      }

      // Handle different types of transcription events
      switch (message.type) {
        case 'input_audio_buffer.speech_started':
          console.log(`${logPrefix} Speech started at:`, message.audio_start_ms);
          break;
        case 'input_audio_buffer.speech_stopped':
          console.log(`${logPrefix} Speech stopped at:`, message.audio_end_ms);
          break;
        case 'input_audio_buffer.committed':
          console.log(`${logPrefix} Audio buffer committed`);
          break;
        case 'conversation.item.created':
          if (message.item) {
            console.log(`${logPrefix} Conversation item created:`, message.item);
          }
          break;
        case 'response.created':
          console.log(`${logPrefix} Response created`);
          break;
        case 'response.done':
          console.log(`${logPrefix} Response completed`);
          break;
        case 'session.created':
          console.log(`${logPrefix} Session created`);
          break;
        case 'session.updated':
          console.log(`${logPrefix} Session updated`);
          break;
        case 'error':
          console.error(`${logPrefix} OpenAI WebRTC error:`, message.error);
          break;
        default:
          // Log any unhandled message types with their full content
          console.log(`${logPrefix} Message type: ${message.type}`, message);
      }
    };
    // Start the session using SDP
    const offer = await pc.createOffer();
    await pc.setLocalDescription(offer);

    const baseUrl = "https://api.openai.com/v1/realtime";
    const sdpResponse = await fetch(`${baseUrl}?intent=transcription`, {
      method: "POST",
      body: offer.sdp,
      headers: {
        Authorization: `Bearer ${EPHEMERAL_KEY}`,
        "Content-Type": "application/sdp"
      },
    });

    if (!sdpResponse.ok) {
      throw new Error(`Failed to get SDP answer: ${sdpResponse.statusText}`);
    }

    const sdpText = await sdpResponse.text();
    const answer: RTCSessionDescriptionInit = {
      type: 'answer' as RTCSdpType,
      sdp: sdpText
    };
    await pc.setRemoteDescription(answer);

    return { pc, dc };
  } catch (error) {
    console.error(`${logPrefix} Failed to establish WebRTC connection:`, error);
    throw error;
  }
};
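In case it matters, this is roughly how the service is consumed on the page. It's a simplified sketch: the import path, the '[interview]' log prefix, and the cleanup wiring are placeholders, but it matches what the service exposes (a MediaStream in, the returned pc/dc pair, and the 'transcription' CustomEvent dispatched on window).

import { connectToOpenAI } from '../services/openai';   // import path is a placeholder

const startTranscription = async () => {
  // Capture microphone audio for the input track
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

  // Listen for the transcripts dispatched by the service above
  window.addEventListener('transcription', (e: Event) => {
    const { transcript, source } = (e as CustomEvent).detail;
    console.log(`[${source}]`, transcript);
  });

  const { pc, dc } = await connectToOpenAI(stream, '[interview]');

  // Return a teardown callback for when the user stops the session
  return () => {
    dc.close();
    pc.close();
    stream.getTracks().forEach(track => track.stop());
  };
};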