I just started using the speech API to convert text to speech with tts-1 (and tts-1-hd), and more often than not the audio it generates (or at least the audio as it is uploaded to GCS) is nonsensical. The text is in one of the supported languages (Greek), but the result is often total gibberish. This is a Node.js application, and I’m uploading the audio as an MP3 to Google Cloud Storage. Pardon the sloppy code; I'm just trying to get it to work. Has anyone who has used the text-to-speech models experienced this before?
/**
 * HTTP handler: converts `req.body.text` to speech with the OpenAI
 * `tts-1-hd` model (voice "echo"), uploads the resulting MP3 through
 * `this.cloudStorage.uploadAudioFile`, and responds with the returned
 * bucket destination as JSON.
 *
 * Responds with 400 when `text` is missing or not a non-empty string, and
 * with 500 when the speech request or the upload fails, so the client never
 * hangs on an unhandled rejection.
 *
 * NOTE(review): assumes `Request`/`Response` are the Express types — confirm.
 *
 * @param req Incoming request; the text to synthesize is read from `req.body.text`.
 * @param res Response used to send the destination or an error payload.
 */
async textToSpeech(req: Request, res: Response): Promise<void> {
  // Validate up front: the body is untrusted input and the API needs a string.
  const text: unknown = req.body?.text;
  if (typeof text !== "string" || text.trim().length === 0) {
    res.status(400).json({ error: "Request body must include a non-empty 'text' string." });
    return;
  }

  try {
    console.log("Getting audio file from openAI");
    const mp3 = await openai.audio.speech.create({
      model: "tts-1-hd",
      voice: "echo",
      input: text,
    });

    // The SDK response exposes the audio as an ArrayBuffer; wrap it for upload.
    const buffer = Buffer.from(await mp3.arrayBuffer());
    const bucketDestination = await this.cloudStorage.uploadAudioFile(buffer, text);

    console.log("Responding with " + bucketDestination);
    res.json(bucketDestination);
  } catch (err: unknown) {
    // Without this, a failed API call or upload would leave the request unanswered.
    console.error("Text-to-speech request failed.", err);
    res.status(500).json({ error: "Failed to generate or store audio." });
  }
}
/**
 * Uploads an MP3 buffer to the configured bucket and resolves once the
 * upload has actually finished.
 *
 * The object name is `text-to-speech/<sha256 hex of text>.mp3`, so the name
 * depends only on `text`: uploading audio for the same text again writes to
 * the same object name.
 *
 * @param buffer Encoded audio bytes to store (written with content type `audio/mpeg`).
 * @param text Source text; only used to derive the object name.
 * @returns `<bucketName>/<destination>` of the uploaded object.
 * @throws Rejects with the stream error if the upload fails.
 */
async uploadAudioFile(buffer: Buffer, text: string): Promise<string> {
  const hashText = crypto.createHash('sha256').update(text, 'utf8').digest('hex');
  const destination = 'text-to-speech/' + hashText + '.mp3';
  const file = this.storage.bucket(this.bucketName).file(destination);

  console.log("Attempting to upload file to " + this.bucketName + "/" + destination);

  // Wrap the stream events in a promise: returning before 'finish' lets callers
  // hand out a path to an object that is missing or only partially written, and
  // throwing inside an 'error' listener would surface as an uncaught exception
  // instead of rejecting this method's promise.
  await new Promise<void>((resolve, reject) => {
    const stream = file.createWriteStream({
      metadata: {
        contentType: 'audio/mpeg',
      },
    });
    stream.on('error', (err: Error) => {
      console.error('Upload failed.', err);
      reject(err);
    });
    stream.on('finish', () => {
      console.log(`${destination} uploaded to ${this.bucketName}.`);
      resolve();
    });
    // Listeners are attached before writing so no event can be missed.
    stream.end(buffer);
  });

  return this.bucketName + "/" + destination;
}