Hello,
I have noticed that producing a transcript with a mapping of characters to voices with TTS no longer works… It looks as if the chunks are no longer all iterated and only the last chunk of speech ends up in the audio, or they are all generated but each one overwrites the previous.
The code I use is this:
// Maps a character name (as typed in the form) to the voice selected for it.
const voiceMapping = {};

// Convert script to audio using chosen character voices
document.getElementById("convertButton").addEventListener("click", () => {
  // Drop mappings left over from a previous click, so a character whose name
  // field has since been cleared or renamed no longer resolves to a voice.
  for (const staleName of Object.keys(voiceMapping)) {
    delete voiceMapping[staleName];
  }

  // Fill the voiceMapping with characters and selected voices
  // (reads the inputs character1..character6 and voiceSelect1..voiceSelect6).
  for (let i = 1; i <= 6; i++) {
    const characterName = document.getElementById(`character${i}`).value.trim();
    const selectedVoice = document.getElementById(`voiceSelect${i}`).value;
    if (characterName) {
      voiceMapping[characterName] = selectedVoice;
    }
  }

  // Get the script input, one entry per line
  const scriptInput = document
    .getElementById("scriptInput")
    .value.trim()
    .split("\n");
  console.log(scriptInput);

  // convertToAudio is async: report a rejection instead of leaving the
  // returned promise floating.
  convertToAudio(scriptInput).catch((error) => {
    console.error("Error while converting script to audio:", error);
  });
});
/**
 * Generates speech for every "[Character]: dialogue" line of the script and
 * passes the resulting object URLs to playAudioChunks.
 *
 * Lines are requested one at a time, in order, so the chunks keep the script
 * order. Lines that do not match the expected format, characters without a
 * mapped voice, and failed requests are logged and skipped.
 *
 * @param {string[]} script - Script lines, one utterance per line.
 * @returns {Promise<void>} Settles after playAudioChunks has been called, or
 *   immediately when no API key is stored.
 */
async function convertToAudio(script) {
  const apikey = localStorage.getItem("openaikey"); // Ensure the API key is saved in local storage
  if (!apikey) {
    // Without a key the header would read "Bearer null" and every request
    // would fail; report the real cause once instead.
    console.error(
      "Error while converting TTS:",
      new Error('No API key found in local storage under "openaikey"')
    );
    return;
  }

  const linePattern = /^\[(.+?)\]: (.+)$/;
  const audioChunks = [];

  for (const line of script) {
    const match = line.match(linePattern);
    if (!match) {
      console.warn(`Line not in expected format: ${line}`);
      continue;
    }

    const character = match[1].trim(); // Extract character name
    const dialogue = match[2].trim(); // Extract dialogue
    // Own-property lookup: a character called e.g. "constructor" must not
    // pick up a value inherited from Object.prototype.
    const selectedVoice = Object.prototype.hasOwnProperty.call(voiceMapping, character)
      ? voiceMapping[character]
      : undefined;
    console.log(dialogue);

    if (!selectedVoice) {
      console.warn(`No voice mapping found for character: ${character}`);
      continue;
    }

    try {
      const response = await fetch("https://api.openai.com/v1/audio/speech", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apikey}`,
          "Content-Type": "application/json"
        },
        body: JSON.stringify({
          model: "tts-1",
          input: dialogue,
          voice: selectedVoice
        })
      });
      if (!response.ok) {
        // statusText can be empty, so always include the numeric status.
        throw new Error(`Error: ${response.status} ${response.statusText}`.trim());
      }
      const blob = await response.blob();
      audioChunks.push(URL.createObjectURL(blob)); // Store each audio chunk
    } catch (error) {
      // A failed line is skipped; the remaining lines are still converted.
      console.error("Error while converting TTS:", error);
    }
  }

  // Play the audio in sequence
  playAudioChunks(audioChunks);
}
/**
 * Plays the given audio URLs one after another on the "audioPlayer" element.
 *
 * @param {string[]} chunks - Audio source URLs, in playback order.
 */
function playAudioChunks(chunks) {
  // Nothing to play: leave the player (and any playback in progress) untouched.
  if (chunks.length === 0) {
    return;
  }

  const audioPlayer = document.getElementById("audioPlayer");

  const playNext = (index) => {
    if (index >= chunks.length) {
      // Sequence finished: detach the handler so replaying the last clip from
      // the player controls does not re-enter this chain.
      audioPlayer.onended = null;
      return;
    }

    audioPlayer.src = chunks[index];
    // Register the handler before starting playback so the end of the clip is
    // always observed.
    audioPlayer.onended = () => playNext(index + 1); // Play next audio when the current one ends

    const started = audioPlayer.play();
    // play() returns a promise in current browsers; it rejects when playback
    // cannot start. Handle that instead of leaving an unhandled rejection.
    if (started !== undefined) {
      started.catch((error) => {
        console.error("Error playing audio:", error);
      });
    }
  };

  playNext(0); // Start playing from the first chunk
}
in a script like:
[Mark]: Hey June how are you?
[June]: Hello Mark, very good!
I map:
character 1 → Mark → Alloy
character 2 → June → Nova
When I generate the speech I can hear both sentences, but in the audio file only the last sentence is present.
The given code is designed to play the audio clips received from the API call, one after the other, using the browser’s audio player. However, it does not contain any code to assemble or collect the audio pieces into a single file.
The audioChunks array stores the audio clips as individual blobs, and these blobs are played sequentially by the playAudioChunks function.
The blobs in audioChunks are converted into URLs using URL.createObjectURL(blob), and these URLs are then passed as the src attribute to the audio player in the playNext function. However, this approach does not concatenate the audio blobs into a single file; it simply stores each blob as a separate URL that is played in sequence.
So, it seems that the existing code is designed only to play audio segments sequentially, and there is no indication of any other function or use of the audio data. To assemble or collect all the received audio into a single file, some additional coding would be needed.
The default format is mp3. While the mp3 format of raw frames is streamable, and frames can depend on the previous frame, what we think of as an mp3 file should not be trusted to be appendable, as some too-clever API developer could decide to add ID3 tags to the start or end.
Other API formats on TTS are simply standalone files, or streamable non-files like AAC. RAW, which is 24 kHz mono PCM audio, is the only one clear of any complication, but it is high-bandwidth, and connections might not keep up with it.
You are right, there is nothing uniting the chunks, but what bugs me is that when I created the page I produced many transcripts of up to 10 lines, and the resulting files downloaded via the ‘download’ command of the audio player were complete.
Anyway, I have fixed it by uniting the chunks and generating a download link.
I leave the full code here for the sake of clarity, in case someone wants to achieve the same goal.
// Maps a character name (as typed in the form) to the voice selected for it.
const voiceMapping = {};
let audioChunks = []; // Store audio chunks globally (object URLs, in script order)

document.getElementById("convertButton").addEventListener("click", () => {
  // Release the object URLs of the previous run before dropping them;
  // otherwise the blobs behind them stay referenced.
  audioChunks.forEach((chunkUrl) => URL.revokeObjectURL(chunkUrl));
  // Clear previous audio chunks
  audioChunks = [];

  // Drop mappings left over from a previous click, so a character whose name
  // field has since been cleared or renamed no longer resolves to a voice.
  Object.keys(voiceMapping).forEach((staleName) => {
    delete voiceMapping[staleName];
  });

  // Fill the voiceMapping with characters and selected voices
  // (reads the inputs character1..character6 and voiceSelect1..voiceSelect6).
  for (let i = 1; i <= 6; i++) {
    const characterName = document.getElementById(`character${i}`).value.trim();
    const selectedVoice = document.getElementById(`voiceSelect${i}`).value;
    if (characterName) {
      voiceMapping[characterName] = selectedVoice;
    }
  }

  // Get the script input, one entry per line
  const scriptInput = document.getElementById("scriptInput").value.trim().split("\n");

  // convertToAudio is async: report a rejection instead of leaving the
  // returned promise floating.
  convertToAudio(scriptInput).catch((error) => {
    console.error("Error while converting script to audio:", error);
  });
});
/**
 * Generates speech for every "[Character]: dialogue" line of the script and
 * appends the resulting object URLs to the global audioChunks list, then
 * updates the Play and Download buttons.
 *
 * Lines are requested one at a time, in order, so the chunks keep the script
 * order. Lines that do not match the expected format, characters without a
 * mapped voice, and failed requests are logged and skipped.
 *
 * @param {string[]} script - Script lines, one utterance per line.
 * @returns {Promise<void>} Settles once all lines have been processed.
 */
async function convertToAudio(script) {
  // Play/Download are only useful when at least one chunk exists.
  const syncButtons = () => {
    const hasAudio = audioChunks.length > 0;
    document.getElementById("playButton").disabled = !hasAudio;
    document.getElementById("downloadButton").disabled = !hasAudio;
  };

  const apikey = localStorage.getItem("openaikey");
  if (!apikey) {
    // Without a key the header would read "Bearer null" and every request
    // would fail; report the real cause once instead.
    console.error(
      "Error while converting TTS:",
      new Error('No API key found in local storage under "openaikey"')
    );
    syncButtons();
    return;
  }

  const linePattern = /^\[(.+?)\]: (.+)$/;

  for (const line of script) {
    const match = line.match(linePattern);
    if (!match) {
      console.warn(`Line not in expected format: ${line}`);
      continue;
    }

    const character = match[1].trim(); // Extract character name
    const dialogue = match[2].trim(); // Extract dialogue
    // Own-property lookup: a character called e.g. "constructor" must not
    // pick up a value inherited from Object.prototype.
    const selectedVoice = Object.prototype.hasOwnProperty.call(voiceMapping, character)
      ? voiceMapping[character]
      : undefined;

    if (!selectedVoice) {
      console.warn(`No voice mapping found for character: ${character}`);
      continue;
    }

    try {
      const response = await fetch("https://api.openai.com/v1/audio/speech", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apikey}`,
          "Content-Type": "application/json"
        },
        body: JSON.stringify({
          model: "tts-1",
          input: dialogue,
          voice: selectedVoice
        })
      });
      if (!response.ok) {
        // statusText can be empty, so always include the numeric status.
        throw new Error(`Error: ${response.status} ${response.statusText}`.trim());
      }
      const blob = await response.blob();
      const audioUrl = URL.createObjectURL(blob);
      audioChunks.push(audioUrl); // Store each audio chunk
    } catch (error) {
      // A failed line is skipped; the remaining lines are still converted.
      console.error("Error while converting TTS:", error);
    }
  }

  console.log("Audio chunks generated:", audioChunks.length);
  // Enable the play and download buttons once there is something to play
  syncButtons();
}
/**
 * Plays every URL in the global audioChunks list back-to-back on the
 * "audioPlayer" element.
 */
function playAudioChunks() {
  const audioPlayer = document.getElementById("audioPlayer");

  function playFrom(position) {
    // Past the last chunk there is nothing more to do.
    if (position >= audioChunks.length) {
      return;
    }

    audioPlayer.src = audioChunks[position];
    const playback = audioPlayer.play();
    playback
      .then(() => {
        // Chain the following chunk only once this one has started playing.
        audioPlayer.onended = () => {
          playFrom(position + 1);
        };
      })
      .catch((error) => {
        console.error('Error playing audio:', error);
      });
  }

  // Discard a handler left over from an earlier run, then start at the first chunk.
  audioPlayer.onended = null;
  playFrom(0);
}
document.getElementById("playButton").addEventListener("click", playAudioChunks);

// Joins all generated chunks into one file and offers it as a download.
document.getElementById("downloadButton").addEventListener("click", () => {
  if (audioChunks.length === 0) {
    // Nothing generated yet: do not offer an empty file.
    console.warn("No audio chunks available for download");
    return;
  }

  // Build one promise per chunk that reads the object URL back into a Blob
  const promises = audioChunks.map((chunkUrl) => fetch(chunkUrl).then((res) => res.blob()));

  Promise.all(promises)
    .then((blobs) => {
      // The speech requests do not set response_format, so the chunks are MP3
      // data (the API default); label the joined file accordingly, not as WAV.
      const audioBlob = new Blob(blobs, { type: "audio/mpeg" });
      const url = URL.createObjectURL(audioBlob);

      const link = document.createElement("a");
      link.href = url;
      link.download = "conversation.mp3"; // Name of the downloaded file
      document.body.appendChild(link);
      link.click();
      document.body.removeChild(link);

      // Release the created URL. Deferred by one task as a precaution, in case
      // the browser has not yet picked up the blob when click() returns.
      setTimeout(() => URL.revokeObjectURL(url), 0);
    })
    .catch((error) => {
      console.error("Error creating audio file for download:", error);
    });
});
Obviously, this needs a front end, which in my case is the following: