Hi! First of all, thanks for all the great work and continued support.
Also, I'm sure this request has probably been made before, but I looked and didn't see it expressed quite the way I'd like to put it here, so here it is:
The Structured JSON outputs are very powerful, and an amazing innovation. I truly hope this feature is here to last (100 years into the future!).
I use Structured JSON and also TTS API calls quite frequently. They both work great.
My feature request is to leverage the Structured JSON roles that have already been implemented, and to integrate them with the TTS models.
The idea would be for the TTS models to use Two Voices in one audio, perhaps starting simply with an A:B Dialogue between the two voices.
The API call could be made using a JSON schema for the A:B roles and speakers, much as is already done for structured outputs.
The TTS model would then simply assign a different voice to each of the A:B roles in the JSON schema it is given.
Anyway, this would be a great feature, and really take the TTS models to the next level.
Example Output:
"dialogue":[{"speaker":"Alice","line":"I’m always jogging in the morning."},{"speaker":"Bob","line":"That’s healthy! I’m always eating breakfast."},{"speaker":"Alice","line":"Yes, breakfast is important. I’m always reading news."},{"speaker":"Bob","line":"I’m always studying for my exams."},{"speaker":"Alice","line":"Studying is good! At least you have a plan."},{"speaker":"Bob","line":"True! I’m always practicing math."},{"speaker":"Alice","line":"At least math improves your skills."},
… etc.
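For reference, dialogue JSON in exactly that shape can already be produced today with the existing Structured Outputs feature on the chat side. Here is a rough sketch of the kind of schema I have in mind; the model name and the prompt are just placeholders I made up, and a recent openai Python package is assumed:

import json
from openai import OpenAI

client = OpenAI()

# Schema constraining the output to alternating speaker/line pairs
dialogue_schema = {
    "type": "object",
    "properties": {
        "dialogue": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "speaker": {"type": "string", "enum": ["Alice", "Bob"]},
                    "line": {"type": "string"},
                },
                "required": ["speaker", "line"],
                "additionalProperties": False,
            },
        }
    },
    "required": ["dialogue"],
    "additionalProperties": False,
}

response = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder; any model that supports Structured Outputs
    messages=[{"role": "user", "content": "Write a short A:B dialogue between Alice and Bob."}],
    response_format={"type": "json_schema",
                     "json_schema": {"name": "dialogue", "strict": True, "schema": dialogue_schema}},
)

dialogue = json.loads(response.choices[0].message.content)["dialogue"]

A TTS endpoint that accepted a "dialogue" list like this directly is the feature being requested here.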
This can already be done (by writing your own code to make multiple API calls). The spacing between the voices is something that you would need to tune yourself, adding some application-specific silence, because immediately appending the audio segments can seem rushed.
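For example, here is a minimal sketch of that do-it-yourself approach, assuming pydub is installed (with ffmpeg on your PATH for MP3 handling); the function name and the 500 ms gap are just illustrative values to tune for your application:

import io
from openai import OpenAI
from pydub import AudioSegment

client = OpenAI()

def synthesize_dialogue(turns, gap_ms=500, out_path="dialogue.mp3"):
    """turns is a list of (voice, text) pairs; gap_ms is the silence inserted between turns."""
    silence = AudioSegment.silent(duration=gap_ms)
    combined = AudioSegment.empty()
    for i, (voice, text) in enumerate(turns):
        # one TTS API call per turn, each with its own voice
        speech = client.audio.speech.create(model="tts-1", voice=voice, input=text)
        segment = AudioSegment.from_file(io.BytesIO(speech.content), format="mp3")
        combined += segment if i == 0 else silence + segment
    combined.export(out_path, format="mp3")
    return out_path

synthesize_dialogue([
    ("alloy", "Hi there, how are you today?"),
    ("echo", "Doing great, thanks for asking!"),
])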
A more API-centric solution would be to take a batch input.
Now:
response = openai.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="The quick brown fox jumped over the lazy dog."
)
Possibility:
response = openai.audio.speech.create(
    model="tts-1",
    turn_gap=500,  # milliseconds
    turns=[
        {"voice": "alloy", "input": "Hey Echo, how are you today?"},
        {"voice": "echo", "input": "I'm doing great, thanks for asking! How about you?"},
        {"voice": "alloy", "input": "I'm good, just finishing up some work."},
        {"voice": "echo", "input": "That sounds productive! What are you working on?"},
        {"voice": "alloy", "input": "I'm organizing some files for a big project."},
        {"voice": "echo", "input": "Nice, it must feel satisfying to get things in order!"}
    ]
)
Wow! This is amazing! Thank you so much for the reply and the example. I will look into this now, and post here again afterward! Thanks again! Very much appreciated!
That’s not actually a working example. That’s what it might look like if OpenAI implemented your idea logically.
Here’s the kind of code you can write right now; I’m just grabbing an example off my drive.
'''
text-to-speech example with API, openai > 1.2.2
'''
import time
from pathlib import Path
from openai import OpenAI
import shutil
class botDate:
    """ .start/.now : object creation date/time; current date/time (formatted)
        .set/.get   : start/reset timer, return elapsed seconds
    """
    def __init__(self, format_spec="%Y-%m-%d %H:%M%p"):
        self.format_spec = format_spec
        self.created_time = time.time()
        self.start_time = time.perf_counter()

    def start(self):
        return self.format_time(self.created_time)

    def now(self):
        return self.format_time(time.time())

    def format_time(self, epoch_seconds):
        formatted_time = time.strftime(self.format_spec, time.localtime(epoch_seconds))
        return formatted_time

    def set(self):
        '''Record the current time when .set is called'''
        self.start_time = time.perf_counter()

    def get(self):
        '''seconds since object creation, or since .set was called'''
        elapsed_time = time.perf_counter() - self.start_time
        return round(elapsed_time, 3)
class ttsFile:
    def __init__(self, text="x",
                 voice="alloy",
                 response_format="mp3",
                 file_prefix="tts",
                 hd=False
                 ):
        self.timeout = 240
        self.timer = botDate("%Y%m%d_%H%M%S")
        self.input = text
        self.voice = voice
        self.response_format = response_format
        self.file_prefix = file_prefix
        self.model = "tts-1-hd" if hd else "tts-1"
        self.client = OpenAI(timeout=self.timeout)

    def _setparams(self):
        self.params = {
            "voice": self.voice,
            "model": self.model,
            "response_format": self.response_format,
            "input": self.input,
        }

    def tofile(self, text):
        self.input = text
        filename = (f"{self.file_prefix}_{self.model}_"
                    f"{self.voice}_{self.timer.now()}.{self.response_format}")
        out_file = Path(__file__).parent / filename
        self.timer.set()
        self._setparams()
        self.client = OpenAI(timeout=self.timeout)
        print(str(self.params))
        response = self.client.audio.speech.create(**self.params)
        response.stream_to_file(out_file)
        # with open(out_file, "wb") as file:
        #     file.write(response.content)  # alternate way to write the file
        response = None
        cost = round(len(self.input) *
                     (3/1000 if self.params['model'] == "tts-1-hd" else 1.5/1000), 4)
        return {"time": self.timer.get(),  # just some stats
                "length": len(self.input),
                "cost": cost,
                "filename": filename,
                "response": response}
from pydub import AudioSegment  # imported but not used below; handy if you prefer pydub-based joining
def generate_voice_turn(voice_choice, text):
    tts = ttsFile(hd=False)
    voices = {
        5: 'echo',
        6: 'onyx',
        1: 'shimmer',
    }
    tts.voice = voices[voice_choice]
    tts.response_format = "mp3"
    tts.file_prefix = f"radio_tts-{voice_choice}_{tts.voice}_"
    results = tts.tofile(text)
    print(results['filename'] + f" took {results['time']:0.2f} seconds")
    print(f"{results['length']} characters, cost {results['cost']} cents.")
    return results['filename']
def join_audio_files(files, output_filename):
    # naive byte-level concatenation of the MP3 files (no silence inserted between turns)
    with open(output_filename, 'wb') as output_file:
        for file in files:
            with open(file, 'rb') as input_file:
                shutil.copyfileobj(input_file, output_file)
if __name__ == "__main__":
    # Generate voice turns
    # Generate an extended and more engaging voice turns script for the AI co-hosts Sam and Greg's radio show
    voice_turns = [
(6, "Today’s discussion features Carolyn Finney, a scholar focused on issues related to race, space, and nature, particularly within the context of national parks and Donald Worster, an environmental historian known for his work on the ecological consequences of economic and technological change."),
(1, "Donald, the renaming of Mount Evans to Mount Blue Sky is more than a mere change of nomenclature. It represents a critical reevaluation of our collective memory and the symbols we choose to honor in our public spaces. By renaming the mountain, we’re acknowledging the historical wrongs associated with John Evans and the Sand Creek Massacre, and we’re choosing to honor the cultural heritage of the indigenous Arapaho and Cheyenne tribes instead."),
(5, "I understand your point, Carolyn, and I agree that acknowledging past injustices is crucial. However, we must also consider the potential drawbacks of renaming natural landmarks. Firstly, such renamings can lead to confusion and a loss of historical continuity. Names like Mount Evans are deeply ingrained in the local culture and history, and changing them might disrupt the community’s connection to the landscape. Moreover, renaming could be seen as an oversimplification of history, potentially erasing the multifaceted narratives associated with these places."),
(1, "That’s a valid perspective, but consider this: the act of naming is powerful and reflects who has the authority to shape narratives and histories. The original naming of Mount Evans was itself an act that imposed a certain narrative, one that ignored the indigenous peoples’ connection to the land and the atrocities they suffered. Renaming to Mount Blue Sky is a way of reclaiming that narrative, offering a form of restorative justice, and providing an opportunity for education and reflection about the past."),
(5, "While that’s a compelling argument, we should also ponder the practical implications. Renaming landmarks requires updating maps, signs, and literature, which involves significant time, effort, and resources. It’s not just about changing a name; it’s about altering a vast array of information systems and public consciousness. This process, though symbolic, might divert attention and resources from more direct forms of support and reparations to indigenous communities."),
(1, "Those are logistical challenges indeed, but they don’t outweigh the symbolic and educational value of such a change. Renaming is part of a broader movement to decolonize our understanding of history and geography. It’s not just about the past; it’s about shaping a more inclusive and just future. While it’s true that renaming alone is not sufficient and must be part of broader reparative actions, it’s a significant step toward acknowledging the narratives and rights of indigenous peoples."),
(5, "Your points are well-taken, Carolyn. It’s clear that the renaming of Mount Evans to Mount Blue Sky is not just about changing a word on a map; it’s about confronting our history, rectifying past wrongs, and deciding how we want to remember and honor our shared spaces. It’s a complex issue that requires careful consideration of both the symbolic significance and the practical consequences."),
(1, "Precisely, Donald. It’s a dialogue between the past and the present, a negotiation of memory, identity, and justice. As we move forward, it’s essential that such decisions are made inclusively, with the voices of all stakeholders, especially those of indigenous communities, being heard and respected."),
(5, "Exactly! But in all seriousness, the community's been loving it. It's the simplicity for me. What's your take on it?"),
(6, "Expert characters are selected by AI only to imbue answering skills in field, without likeness rights. AI responses neither represent actual expert opinions nor positions."),
]
    audio_files = []
    for turn in voice_turns:
        voice_choice, text = turn
        audio_files.append(generate_voice_turn(voice_choice, text))

    # Join audio files
    join_audio_files(audio_files, "combined_audio_about_mountain.mp3")
Just running it again:
(The speakers aren’t that engaging.)
You’d need to look at the imports inside the code and install those packages in Python with pip.
I was able to get it working. Thanks for your help. Just sharing here a generalized version of what ultimately worked for me (JavaScript). Your example was very helpful in providing a foundation for the eventual solution, so thanks again!
// FUNCTION TO PARSE DIALOGUE JSON FOR TTS
function parseDialogueJSON(jsonData) {
  let voices = ['female', 'male']; // Define alternating voices
  let voiceIndex = 0;              // Start with the first voice (female)
  let dialogueArray = [];          // Array to hold the dialogue with voices

  // Loop through the dialogue array directly
  jsonData.forEach((line, index) => {
    let speakerLine = typeof line.line === 'string' ? line.line.replace(/___/g, '<pause>') : '';

    // Add a pause between different speakers
    if (index > 0 && line.speaker !== jsonData[index - 1].speaker) {
      speakerLine += '<pause=1000>'; // Add a 1000ms (1 second) pause
    }

    // Push the dialogue line with its assigned voice
    dialogueArray.push({
      speaker: line.speaker,
      line: speakerLine,
      voice: voices[voiceIndex]
    });

    // Alternate voices between female and male
    voiceIndex = (voiceIndex + 1) % voices.length;
  });

  return dialogueArray; // Return the dialogue with voices
}
// FUNCTION TO CALL OPENAI TTS API FOR AUDIO GENERATION
function generateAudioFromText(simpleText, apiEndpoint, apiKey) {
  Logger.log('Initializing OpenAI TTS API call...');
  var audioChunks = []; // Array to store the audio chunks

  // Loop through the dialogue lines and call OpenAI TTS for each speaker
  for (let line of simpleText) {
    let data = {
      "model": "text-to-speech-model", // Replace with your model name
      "input": line.line,
      "voice": line.voice === 'female' ? 'female-voice-id' : 'male-voice-id', // Replace with actual voice IDs
      "response_format": "mp3"
    };

    var options = {
      'method': 'post',
      'contentType': 'application/json',
      'headers': {
        'Authorization': 'Bearer ' + apiKey
      },
      'muteHttpExceptions': true,
      'payload': JSON.stringify(data)
    };

    try {
      Logger.log('Fetching audio from OpenAI TTS API...');
      var response = UrlFetchApp.fetch(apiEndpoint, options);
      if (response.getResponseCode() !== 200) {
        Logger.log('Failed to fetch audio. Response code: ' + response.getResponseCode());
        Logger.log('Response content: ' + response.getContentText());
        return null;
      }
      var audioContent = response.getContent();
      audioChunks.push(audioContent); // Store each audio chunk
      Logger.log('Audio fetched successfully.');
    } catch (error) {
      Logger.log('Error calling OpenAI TTS API: ' + error.toString());
      return null;
    }
  }

  // Combine the audio chunks into a single byte array
  var combinedAudio = [];
  for (var i = 0; i < audioChunks.length; i++) {
    combinedAudio = combinedAudio.concat(Array.from(audioChunks[i]));
  }

  // Create a final Blob from the combined audio byte array
  var finalAudioBlob = Utilities.newBlob(new Uint8Array(combinedAudio), 'audio/mp3', 'TTS_Audio.mp3');
  return finalAudioBlob; // Return the combined audio blob
}