I submitted an audio file of nonsense words to the Whisper API and asked for the results as verbose_json.
Below is the data that was returned. Is there any API documentation that describes all of the fields and data types returned? I am trying to determine how I can use this data.
Thank you.
{
"task": "transcribe",
"language": "english",
"duration": 9.28,
"segments": [
{
"id": 0,
"seek": 0,
"start": 0.0,
"end": 10.6,
"text": " Fubar, Malfo, Beatrix, Sing Nog, Klet, Fark.",
"tokens": [
50364,
479,
836,
289,
11,
376,
1678,
78,
11,
16031,
6579,
11,
7474,
426,
664,
11,
591,
2631,
11,
479,
809,
13,
50894
],
"temperature": 0.0,
"avg_logprob": -0.522584597269694,
"compression_ratio": 0.8461538461538461,
"no_speech_prob": 0.34786054491996765,
"transient": false
}
],
"text": "Fubar, Malfo, Beatrix, Sing Nog, Klet, Fark."
}