I have been trying to use gpt-4o-transcribe-diarize with both WAV and Opus-encoded (WebM) files, but I always get back only the first ~32 seconds of the transcript, as if only the first chunk is processed on the backend. Any clue what I am doing wrong? I tried with and without `extra_body`, same result.
```js
try {
  stream = fh.createReadStream();
  const filePart = await toFile(stream, inputFileName || "input_audio");

  speakerRefs = await buildSpeakerReferences({
    interviewerPath: interviewerSamplePath,
    panelistPath: panelistSamplePath,
    fallbackPath: monoSamplePath,
    durationSec
  });

  const request = {
    model: "gpt-4o-transcribe-diarize",
    file: filePart,
    response_format: "diarized_json",
    chunking_strategy: "auto"
  };

  // Optional speaker hints, passed through extra_body
  if (speakerRefs.names.length) {
    request.extra_body = {
      known_speaker_names: speakerRefs.names,
      known_speaker_references: speakerRefs.references
    };
  }

  console.log("DEBUG0", JSON.stringify(request, null, 2));

  response = await openai.audio.transcriptions.create({ ...request });
  response._speaker_reference_debug = speakerRefs.debug;

  console.log("DEBUG1", JSON.stringify(response, null, 2));
} finally {
  if (stream) stream.destroy();
  await fh.close();
}
```