Okay, so as I’m developing for a German company, I tried the voice in German.
I have now tried the same code again, but speaking English.
This works flawlessly.
I imagine that my audio is still bad, but the model can better “fill in the blanks” in English.
I tried exporting the audio that I’m sending to the WebSocket, but I’m having trouble doing this correctly. Either the AI’s audio is pitched up or mine is.
However, this just means I’m using the wrong sample rate. I know I have to send 24000 Hz, mono, 16-bit PCM audio frames as Base64-encoded chunks, but I have no clue what I’m doing wrong.
I’m working with a SIP VoIP SDK, so the audio goes through that as well.
However when listening to the audio, everything sounds okay to me.
I am so stumped and wish I could resolve this seemingly simple issue.
I will post my code here as it might be a good starting point to help me with my issue.
Keep in mind, it’s Java, so you may want to use ChatGPT to convert to a more readable language or to let it summarize my code.
Thank you so much for your help!
import java.awt.event.KeyEvent;
import java.awt.event.KeyListener;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.WebSocket;
import java.net.http.WebSocket.Listener;
import java.util.Base64;
import java.util.concurrent.CompletionStage;
import java.util.concurrent.CountDownLatch;
import javax.sound.sampled.*;
import javax.swing.JFrame;
import org.json.JSONArray;
import org.json.JSONObject;
import io.github.cdimascio.dotenv.Dotenv;
import webphone.*;
public class WebSocketClient {
private static final String URL = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview";
private static final String AUTHORIZATION = "Bearer " + getApiKey();
private static final String OPENAI_BETA = "realtime=v1";
private static CountDownLatch latch = new CountDownLatch(1);
public static WebSocket webSocket;
private static webphone wobj;
private static boolean isWebSocketConnected = false;
private static ByteArrayOutputStream audioOutputStream = new ByteArrayOutputStream();
private static ByteArrayOutputStream webSocketAudioReceived = new ByteArrayOutputStream();
private static ByteArrayOutputStream webSocketAudioSent = new ByteArrayOutputStream();
private static ByteArrayOutputStream phoneAudioReceived = new ByteArrayOutputStream();
private static ByteArrayOutputStream phoneAudioSent = new ByteArrayOutputStream();
/**
 * Program entry point.
 * In the lines visible here it only blocks on {@code latch} so the JVM stays alive.
 * NOTE(review): no setup calls (SIP init, media streaming, key listener) are visible in this
 * excerpt — presumably they run before the await; confirm against the full source.
 */
public static void main(String[] args) {
try {
latch.await(); // Keep the main thread alive
} catch (InterruptedException e) {
// NOTE(review): the handler body is not visible here; restoring the interrupt flag with
// Thread.currentThread().interrupt() is the usual choice.
// Load API key from environment variables
private static String getApiKey() {
Dotenv dotenv = Dotenv.load();
return dotenv.get("OPENAI_API_KEY");
// Connect to WebSocket if not already connected
public static void connectToWebSocket() {
if (!isWebSocketConnected) {
HttpClient client = HttpClient.newHttpClient();
webSocket = client.newWebSocketBuilder()
.header("Authorization", AUTHORIZATION)
.header("OpenAI-Beta", OPENAI_BETA)
.buildAsync(URI.create(URL), new WebSocketListener())
isWebSocketConnected = true;
// Reconnect to WebSocket: clears the connected flag, which is the condition
// connectToWebSocket() checks before opening a new connection.
// NOTE(review): only the flag reset is visible in this excerpt; presumably
// connectToWebSocket() is invoked right after — confirm against the full source.
private static void reconnect() {
isWebSocketConnected = false;
// Initialize SIP settings: creates the webphone instance and sets its parameters.
// NOTE(review): no registration/start call and no use of `listener` is visible in this
// excerpt — presumably the listener is attached and the phone started afterwards; confirm.
private static void initializeSIP() {
try {
wobj = new webphone(0);
MyNotificationListener listener = new MyNotificationListener();
wobj.API_SetParameter("loglevel", 1);
wobj.API_SetParameter("logtoconsole", true);
// NOTE(review): server address is an empty string here — presumably redacted for the post.
wobj.API_SetParameter("serveraddress", "");
wobj.API_SetParameter("username", "AIPhone");
// NOTE(review): SIP password is hard-coded in source; it should be loaded from the
// environment like the OpenAI key, and this value should be treated as leaked.
wobj.API_SetParameter("password", "9aGi28axZrgbtimA");
wobj.API_SetParameter("useaudiodevicerecord", false); // Disable recording from local audio device
wobj.API_SetParameter("sendmedia_mode", 2); // Use API_GetMedia for media streaming
// NOTE(review): the sample rate of the PCM delivered by the SDK is not set here; the rest
// of the file assumes 16000 Hz on input and 8000 Hz on output — verify with the SDK docs.
wobj.API_SetParameter("sendmedia_atype", 3); // PCM 16-bit
wobj.API_SetParameter("sendmedia_mtype", 1); // Audio only
wobj.API_SetParameter("sendmedia_dir", 1); // Incoming only
} catch (Exception e) {
// NOTE(review): handler body is not visible in this excerpt.
// Start media streaming in a separate thread: the thread loops forever, polling
// wobj.API_GetMedia() and sleeping 10 ms between iterations.
private static void startMediaStreaming() {
new Thread(() -> {
while (true) {
byte[] mediaData = wobj.API_GetMedia();
// Only non-empty buffers are processed.
if (mediaData != null && mediaData.length > 0) {
try {
// NOTE(review): the body of this try is not visible in this excerpt; presumably it records
// mediaData and forwards it via streamAudioToWebSocket(mediaData) — confirm.
} catch (IOException e) {
try {
Thread.sleep(10); // Sleep to avoid busy waiting
} catch (InterruptedException e) {
// NOTE(review): handler body is not visible; the loop has no visible exit condition.
private static void sendTextToWS(String text) {
if (isWebSocketConnected) {
webSocket.sendText(new JSONObject()
.put("type", "response.create")
.put("response", new JSONObject()
.put("modalities", new JSONArray().put("text").put("audio"))
.put("instructions", text))
.toString(), true);
// Stream audio data to WebSocket: resamples the given PCM bytes from 16000 Hz to 24000 Hz,
// Base64-encodes them and sends them as an "input_audio_buffer.append" event.
// Nothing is sent while the WebSocket is not flagged as connected.
private static void streamAudioToWebSocket(byte[] audioBytes) {
if (isWebSocketConnected) {
// NOTE(review): the 16000 Hz source rate is hard-coded; if the SIP SDK delivers a different
// rate (streamAudioToPhone targets 8000 Hz), the model receives mis-pitched audio — verify.
// NOTE(review): resampleAudio returns null on failure and that is not checked here, so
// the encode call below would throw a NullPointerException.
byte[] resampledAudio = WebSocketListener.resampleAudio(audioBytes, 16000, 24000);
String base64Audio = Base64.getEncoder().encodeToString(resampledAudio);
webSocket.sendText(new JSONObject()
.put("type", "input_audio_buffer.append")
.put("audio", base64Audio)
.toString(), true);
try {
// NOTE(review): the body of this try is not visible in this excerpt; presumably it appends
// the sent audio to one of the debug ByteArrayOutputStreams — confirm.
} catch (IOException e) {
// Save audio streams to files: writes each of the five debug buffers to its own WAV file.
// NOTE(review): every file is written with a 24000 Hz header. The header must match the
// rate of the bytes actually stored in each buffer. Elsewhere in this file the phone side is
// treated as 16000 Hz (input) and 8000 Hz (output); if phoneAudioReceived / phoneAudioSent
// hold audio at those rates, these files will play back pitched up — confirm what is
// written to each buffer and pass the matching rate.
private static void saveAudioToFile() {
try {
saveAudioStreamToFile(audioOutputStream, "FullConversation.wav", 24000);
saveAudioStreamToFile(webSocketAudioReceived, "WebSocketAudioReceived.wav", 24000);
saveAudioStreamToFile(webSocketAudioSent, "WebSocketAudioSent.wav", 24000);
saveAudioStreamToFile(phoneAudioReceived, "PhoneAudioReceived.wav", 24000);
saveAudioStreamToFile(phoneAudioSent, "PhoneAudioSent.wav", 24000);
} catch (IOException e) {
// NOTE(review): handler body is not visible in this excerpt.
// Save a specific audio stream to a file
private static void saveAudioStreamToFile(ByteArrayOutputStream audioStream, String fileName, float sampleRate)
throws IOException {
byte[] audioData = audioStream.toByteArray();
AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
ByteArrayInputStream bais = new ByteArrayInputStream(audioData);
AudioInputStream audioInputStream = new AudioInputStream(bais, format,
audioData.length / format.getFrameSize());
File wavFile = new File(fileName);
AudioSystem.write(audioInputStream, AudioFileFormat.Type.WAVE, wavFile);
// WebSocket listener class
public static class WebSocketListener implements Listener {
private StringBuilder messageBuffer = new StringBuilder();
private ByteArrayOutputStream audioBuffer = new ByteArrayOutputStream();
private String lastItemId = null;
public void onOpen(WebSocket webSocket) {
System.out.println("Connected to WebSocket.");
webSocket.sendText(new JSONObject()
.put("type", "response.create")
.put("response", new JSONObject()
.put("modalities", new JSONArray().put("text").put("audio"))
.put("instructions", "Assist the user.")
).toString(), true);
webSocket.sendText(new JSONObject()
.put("type", "session.update")
.put("session", new JSONObject()
.put("voice", "alloy")
.put("input_audio_format", "pcm16")
.put("output_audio_format", "pcm16")
.put("input_audio_transcription", new JSONObject()
.put("model", "whisper-1")
).toString(), true);
// Handles incoming text frames. When `last` is true, the accumulated text in
// messageBuffer is parsed as one JSON event and dispatched on its "type" field;
// the buffer is cleared afterwards in the finally block.
// NOTE(review): in this excerpt `data` is never appended to messageBuffer, no case ends
// with a visible `break`, and webSocket.request(1) is never called — presumably these
// lines exist in the full source; confirm, since each one is required for correct behavior.
public CompletionStage<?> onText(WebSocket webSocket, CharSequence data, boolean last) {
if (last) {
try {
JSONObject event = new JSONObject(messageBuffer.toString());
if (event.has("type")) {
String eventType = event.getString("type");
switch (eventType) {
// Audio chunk from the model: Base64 text in "delta", grouped by "item_id".
case "response.audio.delta":
String itemId = event.getString("item_id");
if (!itemId.equals(lastItemId)) {
lastItemId = itemId;
audioBuffer.reset(); // Clear the buffer for new item_id
String base64Audio = event.getString("delta");
byte[] audioBytes = Base64.getDecoder().decode(base64Audio);
try {
// NOTE(review): the body of this try is not visible; presumably audioBytes is buffered
// and forwarded via streamAudioToPhone(audioBytes) — confirm.
} catch (IOException e) {
// Transcript of the caller's audio is printed to stdout.
case "conversation.item.input_audio_transcription.completed":
String transcript = event.getString("transcript");
System.out.println("Transcription completed: " + transcript);
// Transcription failure: every key/value of the "error" object is printed.
case "conversation.item.input_audio_transcription.failed":
JSONObject error = event.getJSONObject("error");
System.out.println("Transcription failed:");
error.keys().forEachRemaining(key -> {
System.out.println(key + ": " + error.get(key));
// Incremental text output from the model is printed as it arrives.
case "response.text.delta":
String deltaText = event.getString("delta");
System.out.println("Text delta received: " + deltaText);
// API error event: "type" and "message" are required, the other fields default to "N/A".
case "error":
JSONObject errorDetails = event.getJSONObject("error");
String errorType = errorDetails.getString("type");
String errorCode = errorDetails.optString("code", "N/A");
String errorMessage = errorDetails.getString("message");
String errorParam = errorDetails.optString("param", "N/A");
String errorEventId = errorDetails.optString("event_id", "N/A");
System.out.println("Error occurred:");
System.out.println("Type: " + errorType);
System.out.println("Code: " + errorCode);
System.out.println("Message: " + errorMessage);
System.out.println("Param: " + errorParam);
System.out.println("Event ID: " + errorEventId);
//System.out.println("Unknown event type: " + eventType);
} catch (Exception e) {
// NOTE(review): handler body is not visible in this excerpt.
} finally {
messageBuffer.setLength(0); // Clear the buffer
return null;
// Called when the WebSocket reports an error.
// NOTE(review): the method body is not visible in this excerpt; at minimum the error should
// be logged so that connection failures are not silent.
public void onError(WebSocket webSocket, Throwable error) {
public CompletionStage<?> onClose(WebSocket webSocket, int statusCode, String reason) {
System.out.println("Connection closed: " + reason);
return null;
// Resample audio data to a different sample rate
private static byte[] resampleAudio(byte[] audioData, float fromSampleRate, float toSampleRate) {
try {
AudioFormat originalFormat = new AudioFormat(fromSampleRate, 16, 1, true, false);
AudioInputStream originalStream = new AudioInputStream(
new ByteArrayInputStream(audioData), originalFormat, audioData.length / originalFormat.getFrameSize());
AudioFormat targetFormat = new AudioFormat(toSampleRate, 16, 1, true, false);
AudioInputStream resampledStream = AudioSystem.getAudioInputStream(targetFormat, originalStream);
ByteArrayOutputStream resampledOut = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int bytesRead;
while ((bytesRead = resampledStream.read(buffer)) != -1) {
resampledOut.write(buffer, 0, bytesRead);
return resampledOut.toByteArray();
} catch (Exception e) {
System.out.println("Failed to resample audio.");
return null;
// Stream audio data to phone: resamples the model's PCM output from 24000 Hz to 8000 Hz and
// hands it to the SIP SDK. Nothing happens while the WebSocket is not flagged as connected,
// or when resampling fails (resampleAudio returns null).
// NOTE(review): the phone side is treated as 8000 Hz here but as 16000 Hz in
// streamAudioToWebSocket; one of the two is presumably wrong — verify the SDK's PCM rate.
private static void streamAudioToPhone(byte[] audioBytes) {
if (isWebSocketConnected) {
// Resample audio from WebSocket sample rate (24000 Hz) to phone sample rate (8000 Hz)
byte[] resampledAudio = resampleAudio(audioBytes, 24000, 8000);
if (resampledAudio != null) {
wobj.API_StreamSoundBuff(1, -1, resampledAudio, resampledAudio.length); // Stream resampled audio buffer to phone
try {
// NOTE(review): the body of this try is not visible in this excerpt; presumably it appends
// the audio to one of the debug ByteArrayOutputStreams — confirm.
} catch (IOException e) {
// SIP notification listener class: reacts to call status changes reported by the SIP SDK.
public static class MyNotificationListener extends SIPNotificationListener {
// True between "incoming call connected" and "call finished"; guards against handling
// the same status twice.
private boolean isCallConnected = false;
public void onStatus(SIPNotification.Status e) {
// Status events with line -1 are ignored.
if (e.getLine() == -1) return;
if (e.getStatus() == SIPNotification.Status.STATUS_CALL_RINGING && e.getEndpointType() == SIPNotification.Status.DIRECTION_IN) {
// NOTE(review): only the log line is visible; presumably the call is accepted here — confirm.
System.out.println("Incoming call from " + e.getPeerDisplayname());
} else if (e.getStatus() == SIPNotification.Status.STATUS_CALL_CONNECT && e.getEndpointType() == SIPNotification.Status.DIRECTION_IN) {
if (!isCallConnected) {
// NOTE(review): presumably the WebSocket connection and media streaming are started
// here; those calls are not visible in this excerpt.
System.out.println("Incoming call connected");
isCallConnected = true;
} else if (e.getStatus() == SIPNotification.Status.STATUS_CALL_FINISHED) {
if (isCallConnected) {
System.out.println("Call finished");
// Close the realtime session together with the call and clear both flags.
if (webSocket != null) {
webSocket.sendClose(WebSocket.NORMAL_CLOSURE, "Call ended");
isWebSocketConnected = false;
isCallConnected = false;
// Setup key listener: creates a 300x200 JFrame whose key listener sends a fixed German
// prompt to the model when the R key is pressed (manual test hook).
// NOTE(review): no call that shows the frame is visible in this excerpt; a frame that is
// not visible and focused receives no key events — confirm against the full source.
private static void setupKeyListener() {
JFrame frame = new JFrame();
frame.setSize(300, 200);
frame.addKeyListener(new KeyListener() {
public void keyTyped(KeyEvent e) {}
public void keyPressed(KeyEvent e) {
if (e.getKeyCode() == KeyEvent.VK_R) {
System.out.println("R key pressed");
sendTextToWS("Wer bist du?");
public void keyReleased(KeyEvent e) {}
For Documentation about the SIP: