Can`t get the right audio format for recording in web application with whisper on IOS

hedtmann.phil.87 · November 20, 2024, 10:14pm

I have a serious problem on the non ios systems everythig is working finde and i record a voice it transcribes with whisper and gives me a summary. The video/audio file is converting the right way. On IOS no matter what i do it always gives me an error invalid file format although the file format shbould be right and is audio/mp4 or webm tried so many now. I am getting crazy i need some help please. here is my code

html:

<div>
  <button type="button" class="mic">
    <i class="fa fa-microphone"></i>
  </button>

</div>

the js


var mic = document.querySelector(".mic");
var isRecording = false;
var mediaRecorder;
var audioChunks = [];
var startTime, endTime;

// Event-Listener für den Mikrofon-Button
mic.addEventListener("click", function () {
  if (!isRecording) {
    startRecording();
    mic.classList.add("listening");
  } else {
    stopRecording();
    mic.classList.remove("listening");
  }
  isRecording = !isRecording;
});

// Aufnahme starten
function startRecording() {
  navigator.mediaDevices.getUserMedia({ audio: true })
    .then(function (stream) {
      // Für iOS und Whisper-kompatibles Format (z. B. mp4/m4a)
      mediaRecorder = new MediaRecorder(stream, { mimeType: "audio/mp4" });
      mediaRecorder.start();
      startTime = new Date();

      mediaRecorder.ondataavailable = function (event) {
        audioChunks.push(event.data);
      };

      console.log("Recording started...");
    })
    .catch(function (error) {
      console.error("Error accessing the microphone: ", error);
    });
}

// Aufnahme stoppen
function stopRecording() {
  if (mediaRecorder) {
    mediaRecorder.stop();
    endTime = new Date();
    console.log("Recording stopped...");
    mediaRecorder.onstop = () => {
      sendAudioToServer();
    };
  }
}

// Audio an den Server senden
function sendAudioToServer() {
  mic.disabled = true;
  mic.innerHTML = `<i class="fa fa-spinner fa-spin"></i>`;

  // Audio-Daten als Whisper-kompatibles Format erstellen
  var audioBlob = new Blob(audioChunks, { type: "audio/mp4" });
  audioChunks = []; // Sicherstellen, dass alte Daten entfernt werden
  var duration = Math.round((endTime - startTime) / 1000);
  var user_id = 1;

  var formData = new FormData();
  formData.append("audio", audioBlob);

  // Verhindern von Caching-Problemen durch Hinzufügen eines Zeitstempels zur URL
  fetch("assets/process_audio.php?" + new Date().getTime(), {
    method: "POST",
    body: formData
  })
    .then(response => response.json())
    .then(data => {
      if (data.originalMimeType) {
        alert("Original MIME-Type: " + data.originalMimeType); // Zeige den ursprünglichen MIME-Type an
      }
      if (data.response) {
        displayPopup(data.response, duration, user_id);
      } else {
        console.error("Server Error:", data.error);
        alert("Fehler: " + data.error); // Fehleranzeige für den Benutzer
      }
      resetMicButton();
    })
    .catch(error => {
      console.error("Error sending audio to server:", error);
      alert("Fehler beim Senden der Audiodatei: " + error.message);
      resetMicButton();
    });
}

// Pop-up für Titel und Kursanzeige
function displayPopup(transcription, duration, user_id) {
  const title = prompt("Gib den Titel der Aufnahme ein:");
  const course = prompt("Gib den Kurs der Aufnahme ein:");

  if (title && course) {
    saveRecording(title, course, transcription, duration, user_id);
  } else {
    alert("Titel und Kurs sind erforderlich!");
  }
}

// Funktion zum Speichern in der Datenbank
function saveRecording(title, course, transcription, duration, user_id) {
  fetch("assets/save_recording.php", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ title, course, transcription, duration, user_id }),
  })
    .then(response => response.json())
    .then(data => {
      if (data.success) {
        alert("Aufnahme erfolgreich gespeichert!");
      } else {
        console.error("Fehler beim Speichern der Aufnahme:", data.error);
        alert("Fehler: " + data.error);
      }
    })
    .catch(error => {
      console.error("Error saving recording:", error);
      alert("Fehler beim Speichern der Aufnahme: " + error.message);
    });
}

// Funktion zum Zurücksetzen des Mikrofon-Buttons
function resetMicButton() {
  mic.disabled = false;
  mic.innerHTML = `<i class="fa fa-microphone"></i>`;
}

and the php

if ($_SERVER['REQUEST_METHOD'] === 'POST') {
    if (isset($_FILES['audio'])) {
        $audioFile = $_FILES['audio']['tmp_name'];
        $originalMimeType = mime_content_type($audioFile);

        // Unterstützte Formate
        $supportedFormats = ['audio/flac', 'audio/m4a', 'audio/mp3', 'audio/mp4', 'audio/mpeg', 'audio/mpga', 'audio/oga', 'audio/ogg', 'audio/wav', 'audio/webm'];

        // Falls der MIME-Typ "video/webm" oder "video/mp4" ist, konvertieren wir zu den entsprechenden Audioformaten
        if ($originalMimeType === 'video/webm') {
            $originalMimeType = 'audio/webm'; // Neuer Typ für Whisper
        }
        
        if ($originalMimeType === 'video/mp4') {
            $originalMimeType = 'audio/mp4'; // Neuer Typ für Whisper
        }

        if (!in_array($originalMimeType, $supportedFormats)) {
            echo json_encode(["error" => "Ungültiges Dateiformat: $originalMimeType. Unterstützte Formate: " . implode(', ', $supportedFormats)]);
            exit;
        }

        // Whisper API - Transkription
        $whisperApiKey = "sk-proj-.....";
        $whisperUrl = "https://api.openai.com/v1/audio/transcriptions";

        $audioData = curl_file_create($audioFile, $originalMimeType, 'audio.webm');

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $whisperUrl);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_HTTPHEADER, [
            "Authorization: Bearer $whisperApiKey",
            "Content-Type: multipart/form-data"
        ]);
        curl_setopt($ch, CURLOPT_POSTFIELDS, [
            'file' => $audioData,
            'model' => 'whisper-1'
        ]);

        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        $transcription = json_decode($response, true);

        if ($httpCode >= 400 || !isset($transcription['text'])) {
            $errorMessage = $transcription['error']['message'] ?? 'Whisper konnte den Text nicht transkribieren';
            echo json_encode(["error" => $errorMessage, "originalMimeType" => $originalMimeType]);
            exit;
        }

        $text = $transcription['text'];

        // GPT-4 API - Textverarbeitung
        $gptApiKey = "sk-proj-...";
        $gptUrl = "https://api.openai.com/v1/chat/completions";

        $chunks = str_split($text, 4000);
        $summaries = [];

        foreach ($chunks as $chunk) {
            $prompt = "Fasse den folgenden Text basierend auf einer akademischen Vorlesung zusammen. 
                       Identifiziere die Kernaussagen, wichtigsten Punkte und generiere eine strukturierte Übersicht. 
                       Die Antwort sollte in Markdown formatiert sein, mit Überschriften, Aufzählungen und Absätzen. 
                       Hier ist der Text: $chunk";

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $gptUrl);
            curl_setopt($ch, CURLOPT_POST, true);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($ch, CURLOPT_HTTPHEADER, [
                "Authorization: Bearer $gptApiKey",
                "Content-Type: application/json"
            ]);
            curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode([
                "model" => "gpt-4o-mini",
                "messages" => [
                    ["role" => "system", "content" => "Du bist ein hilfreicher Assistent."],
                    ["role" => "user", "content" => $prompt]
                ],
                "max_tokens" => 1000,
                "temperature" => 0.7
            ]));

            $gptResponse = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            curl_close($ch);
            $gptData = json_decode($gptResponse, true);

            if ($httpCode >= 400 || !isset($gptData['choices'][0]['message']['content'])) {
                $errorMessage = $gptData['error']['message'] ?? 'GPT konnte den Text nicht verarbeiten';
                echo json_encode(["error" => $errorMessage, "originalMimeType" => $originalMimeType]);
                exit;
            }

            $summaries[] = trim($gptData['choices'][0]['message']['content']);
        }

        $finalSummary = implode("\n\n", $summaries);

        echo json_encode([
            "response" => $finalSummary,
            "originalMimeType" => $originalMimeType
        ]);
    } else {
        echo json_encode(["error" => "Keine Audiodatei hochgeladen"]);
    }
} else {
    echo json_encode(["error" => "Ungültige Anforderung"]);
}

i hipe someone can help me with this.

Topic		Replies	Views
Whisper API not transcribing audio files coming from an iphone API ios , whisper , javascript	10	1771	December 18, 2024
Issues with audio files from IOS and the x-m4a format API whisper	14	1254	July 21, 2024
Trying to send a transcription request with an audio file results in error in react native with expo go API	0	229	June 20, 2024
Whisper issues with mp4 saved by Safari API whisper	5	2040	December 16, 2023
Whisper API rejecting MP4 from safari - but works with webm on chrome and edge Bugs whisper	2	824	February 12, 2024

Can`t get the right audio format for recording in web application with whisper on IOS

Related topics