# miku-discord/soprano_to_rvc/websocket_client_example.py

#!/usr/bin/env python3
"""
WebSocket Client Example for Soprano + RVC Streaming API

This demonstrates how to use the WebSocket endpoint from a Discord bot
to stream audio as LLM tokens arrive.

Usage:
    python websocket_client_example.py "Hello! How are you today?"
"""
import asyncio
import websockets
import json
import sys
import numpy as np
import sounddevice as sd

async def stream_tts(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Stream TTS audio token-by-token, simulating LLM token streaming.

    The text is split on whitespace and every word is sent to the server as a
    JSON message with ``token`` and ``pitch_shift`` keys. Binary messages
    coming back are interpreted as raw float32 mono samples and written to a
    48 kHz sounddevice output stream. After the last token a ``flush``
    message is sent and the remaining audio is drained until the server stays
    silent for one second or closes the connection.

    If the server closes the connection early, streaming stops, the flush is
    skipped, and whatever audio is already queued is still played.

    Args:
        text: The text to synthesize
        server_url: WebSocket server URL
    """
    print(f"Connecting to {server_url}...")

    async with websockets.connect(server_url) as websocket:
        print("Connected! Streaming tokens...")

        # Simulate token-by-token streaming
        # In real Discord bot, these come from llamacpp streaming response
        tokens = text.split()  # Simple word-by-word tokenization

        # Audio playback setup
        sample_rate = 48000
        audio_queue = asyncio.Queue()

        async def play_audio():
            """Play audio chunks as they arrive"""
            loop = asyncio.get_running_loop()
            stream = sd.OutputStream(
                samplerate=sample_rate,
                channels=1,
                dtype='float32'
            )
            stream.start()

            try:
                while True:
                    audio_bytes = await audio_queue.get()
                    if audio_bytes is None:  # Sentinel for end
                        break

                    # Convert bytes back to numpy array
                    audio_data = np.frombuffer(audio_bytes, dtype=np.float32)
                    # stream.write() blocks until the samples are handed to the
                    # device; run it in a worker thread so the event loop can
                    # keep sending tokens and receiving audio meanwhile.
                    await loop.run_in_executor(None, stream.write, audio_data)
                    print(f"  ♪ Playing {len(audio_data)} samples")
            finally:
                stream.stop()
                stream.close()

        async def queue_next_message(timeout):
            """Wait up to *timeout* seconds for one message; queue it if it is audio."""
            incoming = await asyncio.wait_for(websocket.recv(), timeout=timeout)
            if isinstance(incoming, str):
                # Text frames cannot be decoded as float32 samples; skipping
                # them keeps the playback task alive.
                print(f"  (ignoring text message: {incoming!r})")
                return
            await audio_queue.put(incoming)

        # Start playback task
        playback_task = asyncio.create_task(play_audio())
        connection_open = True

        try:
            # Send tokens
            for i, token in enumerate(tokens):
                # Add space except for first token
                token_with_space = token if i == 0 else " " + token

                message = {
                    "token": token_with_space,
                    "pitch_shift": 0  # Adjust pitch if needed (-12 to +12 semitones)
                }

                try:
                    await websocket.send(json.dumps(message))
                    print(f"→ Sent token: '{token_with_space}'")

                    # Receive and queue audio (bounded wait, not every token
                    # produces a chunk immediately)
                    await queue_next_message(timeout=2.0)
                except asyncio.TimeoutError:
                    print("  (no audio yet, continuing...)")
                except websockets.exceptions.ConnectionClosed:
                    print("  (connection closed by server)")
                    connection_open = False
                    break

            # Flush remaining buffer; pointless (and an error) on a closed socket
            if connection_open:
                print("→ Flushing buffer...")
                try:
                    await websocket.send(json.dumps({"flush": True}))

                    # Receive remaining audio chunks until the server goes quiet
                    while True:
                        await queue_next_message(timeout=1.0)
                except asyncio.TimeoutError:
                    print("  (flush complete)")
                except websockets.exceptions.ConnectionClosed:
                    print("  (connection closed by server)")
        finally:
            # Always signal end of audio so the playback task terminates and
            # releases the output stream, even if streaming failed part-way.
            await audio_queue.put(None)
            await playback_task

        print("✓ Done!")

async def stream_tts_simple(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Minimal async-generator variant intended for Discord bot integration.

    Splits ``text`` on whitespace, sends each word to the server as a JSON
    token message, and yields every message received back. Once all words
    are sent, a flush message is issued and further messages are yielded
    until the server is silent for one second.

    Args:
        text: The text to synthesize
        server_url: WebSocket server URL

    Yields:
        Each message returned by the server, in arrival order.
    """
    async with websockets.connect(server_url) as ws:
        # In a real bot these pieces come from the LLM stream. Every word
        # except the first carries a leading space.
        pieces = [
            word if position == 0 else " " + word
            for position, word in enumerate(text.split())
        ]

        for piece in pieces:
            payload = json.dumps({
                "token": piece,
                "pitch_shift": 0
            })
            await ws.send(payload)

            # Wait at most two seconds for a chunk before sending the next token
            try:
                chunk = await asyncio.wait_for(ws.recv(), timeout=2.0)
                yield chunk
            except asyncio.TimeoutError:
                continue

        # Ask the server to emit whatever is still buffered
        flush_request = json.dumps({"flush": True})
        await ws.send(flush_request)

        # Drain until one second passes without a message
        try:
            while True:
                chunk = await asyncio.wait_for(ws.recv(), timeout=1.0)
                yield chunk
        except asyncio.TimeoutError:
            pass

# Example Discord.py integration
"""
# In your Discord bot cog:

import asyncio
import io
import json

import discord
import websockets
from discord.ext import commands

class VoiceCog(commands.Cog):
    async def speak_streaming(self, ctx, text: str):
        '''Stream TTS to Discord voice channel'''

        # Connect to voice if not already connected
        if not ctx.voice_client:
            await ctx.author.voice.channel.connect()

        vc = ctx.voice_client

        # Stream audio via WebSocket
        async with websockets.connect('ws://localhost:8765/ws/stream') as ws:
            # Simulate token streaming (replace with actual LLM streaming)
            tokens = text.split()

            for token in tokens:
                # Send token
                await ws.send(json.dumps({
                    "token": " " + token,
                    "pitch_shift": 0
                }))

                # Receive audio
                try:
                    audio_bytes = await asyncio.wait_for(ws.recv(), timeout=2.0)

                    # Convert to Discord audio format
                    audio_source = discord.PCMAudio(io.BytesIO(audio_bytes))

                    # Play (non-blocking)
                    if not vc.is_playing():
                        vc.play(audio_source)

                except asyncio.TimeoutError:
                    continue

            # Flush
            await ws.send(json.dumps({"flush": True}))
"""

if __name__ == "__main__":
    # Everything after the script name is joined into the text to speak;
    # with no arguments a built-in demo sentence is used instead.
    cli_words = sys.argv[1:]
    text = (
        " ".join(cli_words)
        if cli_words
        else "Hello! This is a test of the WebSocket streaming API. It should feel very natural!"
    )

    print(f"Text: {text}\n")
    asyncio.run(stream_tts(text))