Speech & TTS-1 model corrupted PCM Data using stream

Hello, I’m trying to use the speech API to stream PCM data from a fetch request, but when I play the received chunks they sound like scrambled noise alongside the voice.

Could it be a bug with OpenAI’s PCM data? Has anyone else experienced this?
Demo: Vimeo ID: 1039061425?share=copy (can’t paste links)

I’m using Dart/Flutter with soloud package to play buffered audio.

import 'dart:async';

import 'package:example/tts_service_web.dart';
import 'package:flutter/material.dart';
import 'package:flutter_soloud/flutter_soloud.dart';

void main() async {
  // Required before any plugin (here: SoLoud) is touched prior to runApp.
  WidgetsFlutterBinding.ensureInitialized();

  /// Initialize the player once for the whole app lifetime.
  await SoLoud.instance.init();

  runApp(const MyApp());
}

/// Root widget: hosts a single [MaterialApp] whose home is the demo screen.
class MyApp extends StatelessWidget {
  const MyApp({super.key});

  @override
  Widget build(BuildContext context) =>
      const MaterialApp(home: AudioStreamScreen());
}

/// Screen with a single button that fetches TTS audio and streams it.
class AudioStreamScreen extends StatefulWidget {
  const AudioStreamScreen({super.key});

  @override
  State<AudioStreamScreen> createState() => _AudioStreamScreenState();
}

class _AudioStreamScreenState extends State<AudioStreamScreen> {
  // NOTE(review): never ship an API key inside client code — proxy the
  // request through your own backend. Hard-coded here for the demo only.
  final openAIKey = 'OPEN_AI_KEY';

  /// Active subscription to the TTS byte stream, cancelled on dispose so
  /// late chunks are not pushed into a player that is being torn down.
  StreamSubscription<dynamic>? _ttsSubscription;

  @override
  void dispose() {
    unawaited(_ttsSubscription?.cancel());
    unawaited(SoLoud.instance.disposeAllSources());
    super.dispose();
  }

  /// Requests TTS audio from OpenAI as raw PCM and feeds it into a SoLoud
  /// buffer stream, starting playback as soon as the first chunk arrives.
  Future<void> _fetchAndPlayAudio() async {
    final stream = TTSServiceWeb(openAIKey).tts(
      'https://api.openai.com/v1/audio/speech',
      {
        'model': 'tts-1',
        'voice': 'alloy',
        'speed': 1,
        'input':
            '''1. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.''',
        'response_format': 'pcm',
        // The speech endpoint always returns PCM as 24 kHz, 16-bit signed
        // little-endian, mono; it has no documented `sample_rate` knob, so
        // the 16000 requested before was ignored by the server. Playing
        // 24 kHz data at a 16 kHz player rate is exactly what produces the
        // "scrambled" audio described above.
        "sample_rate": 24000,
        'stream': true,
      },
    );

    final currentSound = SoLoud.instance.setBufferStream(
      maxBufferSize: 1024 * 1024 * 5, // 5 MB
      // Must match the format OpenAI actually sends: 24 kHz s16le mono.
      sampleRate: 24000,
      channels: Channels.mono,
      pcmFormat: BufferPcmType.s16le,
      onBuffering: (isBuffering, handle, time) async {
        debugPrint('buffering');
      },
    );

    int chunkNumber = 0;
    _ttsSubscription = stream.listen(
      (chunk) async {
        try {
          SoLoud.instance.addAudioDataStream(
            currentSound,
            chunk,
          );
          // Start playback once the first chunk is buffered; later chunks
          // just keep the buffer fed.
          if (chunkNumber == 0) {
            await SoLoud.instance.play(currentSound);
          }
          chunkNumber++;
          debugPrint('chunk number: $chunkNumber');
          debugPrint('chunk length: ${chunk.length}');
        } on SoLoudPcmBufferFullCppException {
          debugPrint('pcm buffer full or stream already set '
              'to be ended');
        } catch (e) {
          debugPrint(e.toString());
        }
      },
      onError: (Object e) {
        // Without this, a network error would leave the buffer stream
        // open forever and the player waiting for data.
        debugPrint(e.toString());
        SoLoud.instance.setDataIsEnded(currentSound);
      },
      onDone: () {
        // Tell the player no more data is coming so it can drain and stop.
        SoLoud.instance.setDataIsEnded(currentSound);
      },
    );
  }

  @override
  Widget build(BuildContext context) {
    return Scaffold(
      appBar: AppBar(
        title: const Text('Audio Stream Example'),
      ),
      body: Center(
        child: ElevatedButton(
          onPressed: _fetchAndPlayAudio,
          child: const Text('Play Audio'),
        ),
      ),
    );
  }
}

I’m in a Next.js project, but I found that when I was streaming PCM and managing a playback queue, the buffer sizes would be pretty inconsistent, which resulted in distortion/artifacts on playback. I made a stream processor to guarantee a minimum buffer size, and that alleviated my playback issues.
example:

/**
 * Wraps a Node readable stream in a web `TransformStream` whose readable
 * side emits chunks of at least `bufferSize` bytes (the final chunk may be
 * smaller).
 *
 * Re-chunking to a guaranteed minimum size avoids the tiny, inconsistent
 * buffers that cause distortion/artifacts when streaming raw PCM.
 *
 * @param readableStream source Node stream (e.g. the TTS response body)
 * @param bufferSize minimum chunk size in bytes (defaults to 32 KB)
 */
async function bufferChatStream(readableStream: NodeJS.ReadableStream, bufferSize?: number) {
  const MIN_BUFFER_SIZE = bufferSize || 32 * 1024 // 32 KB

  return new TransformStream({
    async start(controller) {
      // Accumulates bytes until at least MIN_BUFFER_SIZE are available.
      // Buffer.alloc(0) keeps the accumulator a Buffer throughout, instead
      // of starting as a plain Uint8Array and relying on Buffer.concat to
      // coerce it.
      let bufferStore: Buffer = Buffer.alloc(0)

      try {
        readableStream.on('data', (chunk: Buffer) => {
          bufferStore = Buffer.concat([bufferStore, chunk])

          // Emit as many full-size chunks as the accumulator holds.
          // subarray() replaces the deprecated Buffer#slice and returns a
          // view; concat() above always allocates fresh storage, so the
          // enqueued views are never overwritten later.
          while (bufferStore.length >= MIN_BUFFER_SIZE) {
            const bufferToProcess = bufferStore.subarray(0, MIN_BUFFER_SIZE)
            bufferStore = bufferStore.subarray(MIN_BUFFER_SIZE)

            // Enqueue the buffer chunk
            controller.enqueue(bufferToProcess)
          }
        })

        readableStream.on('end', () => {
          // Flush any remaining tail (may be under MIN_BUFFER_SIZE).
          if (bufferStore.length > 0) {
            controller.enqueue(bufferStore)
          }
          controller.terminate()
        })

        readableStream.on('error', (error) => {
          console.error('Stream error:', error)
          controller.error(error)
        })
      } catch (error) {
        console.error('Failed to create speech stream:', error)
        controller.error(error)
      }
    }
  })
}
2 Likes