Producing a multi-voice audio file with TTS?

I wonder if the API allows producing a single audio file in which two or more voices alternate, like a movie script, instead of producing distinct audio files from partial prompts and having to assemble them manually…
Has anyone tested this? I can't seem to make it work.

I came up with this:

// Maps each script character to an OpenAI TTS voice name.
const voiceMapping = {
  Mark: "alloy",
  Joan: "nova",
};

// Dialogue lines in "[Speaker]: text" form. "Character 3" deliberately has
// no entry in voiceMapping, so the unmapped-speaker path gets exercised.
const script = [
  "[Mark]: Hello, how are you?",
  "[Joan]: I'm good, thanks! How about you?",
  "[Mark]: I'm doing well!",
  "[Character 3]: Did you hear about the news?",
  "[Joan]: No, what happened?",
];

/**
 * Converts a scripted dialogue into speech via the OpenAI TTS endpoint,
 * issuing one request per line, then plays the resulting clips in order.
 *
 * @param {string[]} script - Lines in "[Character]: dialogue" form.
 * @returns {Promise<void>}
 */
async function convertToAudio(script) {
  const apikey = localStorage.getItem("openaikey");
  if (!apikey) {
    // Without a key every request would fail with 401; bail out early
    // with a clear message instead of one error per line.
    console.error("No OpenAI API key found in localStorage under 'openaikey'.");
    return;
  }

  // Compile the "[Name]: dialogue" pattern once, not on every iteration.
  const linePattern = /^\[(.+?)\]: (.+)$/;
  const audioChunks = [];

  for (const line of script) {
    const match = line.match(linePattern);
    if (!match) {
      console.log(`Line not in expected format: ${line}`);
      continue;
    }

    const character = match[1].trim(); // Extract character name
    const dialogue = match[2].trim(); // Extract dialogue
    const selectedVoice = voiceMapping[character]; // Look up voice

    if (!selectedVoice) {
      console.log(`No voice mapping found for character: ${character}`);
      continue;
    }

    try {
      const response = await fetch("https://api.openai.com/v1/audio/speech", {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apikey}`,
          "Content-Type": "application/json"
        },
        body: JSON.stringify({
          model: "tts-1",
          input: dialogue,
          voice: selectedVoice
        })
      });

      if (!response.ok) {
        // statusText is often empty over HTTP/2; include the numeric
        // status code so "Bad Request" style failures are diagnosable.
        throw new Error(`Error: ${response.status} ${response.statusText}`);
      }

      const blob = await response.blob();
      audioChunks.push(URL.createObjectURL(blob));
    } catch (error) {
      // Log and keep going so one failed line doesn't abort the rest.
      console.error("Error while converting TTS: ", error);
    }
  }

  playAudioChunks(audioChunks);
}

/**
 * Plays a list of blob URLs sequentially through the #audioPlayer element,
 * chaining via the `ended` event. Each object URL is revoked once its clip
 * has finished so the underlying blobs can be garbage-collected (the
 * original never revoked them, leaking one blob per line).
 *
 * @param {string[]} chunks - Object URLs created with URL.createObjectURL.
 */
function playAudioChunks(chunks) {
  const audioPlayer = document.getElementById("audioPlayer");

  const playNext = (index) => {
    if (index >= chunks.length) {
      return;
    }
    audioPlayer.src = chunks[index];
    audioPlayer.play();
    audioPlayer.onended = () => {
      // Release the blob backing the clip we just finished.
      URL.revokeObjectURL(chunks[index]);
      playNext(index + 1);
    };
  };

  playNext(0);
}

The routine is called via a button that calls convertToAudio(script);
The fact is that I keep getting a Bad Request error and the chunks are not assembled.

It seems to work. I adapted your code for React Native:

/**
 * Plays an audio file from a local URI using expo-av.
 *
 * Fixes vs. the original: the audio mode is configured *before* the sound
 * is created and played (setting it afterwards can race with playback on
 * iOS), and the loaded Sound is unloaded once it finishes so we don't leak
 * one native player per dialogue line.
 *
 * @param {string} uri - Local file URI of the clip to play.
 */
const playAudio = async (uri) => {
    try {
        // Configure playback first so the clip is audible even when the
        // iOS hardware silent switch is on.
        await Audio.setAudioModeAsync({ playsInSilentModeIOS: true })

        const { sound } = await Audio.Sound.createAsync({
            uri: uri,
        })

        // Release the native player when the clip ends.
        sound.setOnPlaybackStatusUpdate((status) => {
            if (status.didJustFinish) {
                sound.unloadAsync()
            }
        })

        await sound.playAsync()

    } catch(e) {
        console.log(e.message)
    }
}

    /**
     * Generates TTS audio for each scripted line using its mapped voice,
     * writes the MP3 bytes to the app's document directory as base64, and
     * plays each file in turn.
     *
     * @returns {Promise<void>}
     */
    const handleConvo = async () => {

        // Speaker name -> OpenAI TTS voice.
        const voiceMapping = {
            "Mark": "echo",
            "Joan": "nova",
            "Betty": "fable"
        }

        const script = [
            {speaker: "Mark", content: "Hello, Joan. How are you?"},
            {speaker: "Joan", content: "Hi, Mark. I'm good, thanks! How about you?"},
            {speaker: "Mark", content: "I'm doing well!"},
            {speaker: "Betty", content: "Hey, guys! Did you hear about the news?"},
            {speaker: "Joan", content: "No, what happened?"},
        ]

        for (const item of script) {

            const selectedVoice = voiceMapping[item.speaker]
            const selectedDialogue = item.content

            try {

                const response = await fetch('https://api.openai.com/v1/audio/speech', {
                    method: 'POST',
                    headers: {
                      'Authorization': `Bearer ${process.env.EXPO_PUBLIC_OPENAI_API_KEY}`,
                      'Content-Type': 'application/json'
                    },
                    body: JSON.stringify({
                      model: 'tts-1',
                      input: selectedDialogue,
                      voice: selectedVoice,
                    })
                })

                if(!response.ok) {
                    throw new Error(`HTTP error! status: ${response.status}`)
                }

                // The response body is raw MP3 bytes, not base64 text.
                // Buffer.from(arrayBuffer, 'base64') silently ignores the
                // encoding argument; encode the binary data directly.
                const buffer = await response.arrayBuffer()
                const base64Data = Buffer.from(buffer).toString('base64')

                const uri = FileSystem.documentDirectory + `speech-${selectedVoice}-${Date.now()}.mp3`

                await FileSystem.writeAsStringAsync(uri, base64Data, {
                    encoding: FileSystem.EncodingType.Base64,
                })

                await playAudio(uri)

            } catch (e) {

                console.log(e.message)

            }
        }

    }

It is probably a good idea to add a small delay after playing each audio file, as the transition is too fast.

3 Likes

I produced such a file in December 2023, where the contents you can hear are radio hosts discussing the forum itself and posts from that day.

The API voices are pretty sleepy, but this was all automatic and Python scripted.

1 Like

Thank you. I asked ChatGPT to analyze my code; it proposed a small change, and now it works.