# soprano_to_rvc/websocket_client_example.py

#!/usr/bin/env python3
"""
WebSocket Client Example for Soprano + RVC Streaming API

This demonstrates how to use the WebSocket endpoint from a Discord bot
to stream audio as LLM tokens arrive.

Usage:
    python websocket_client_example.py "Hello! How are you today?"
"""
import asyncio
import websockets
import json
import sys
import numpy as np
import sounddevice as sd

async def stream_tts(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Stream TTS audio token-by-token, simulating LLM token streaming.

    The text is split on whitespace and each word is sent as one JSON
    message ``{"token": ..., "pitch_shift": 0}``. After every send the
    client waits up to 2 seconds for one message back and queues it for
    playback. When all tokens are sent, ``{"flush": true}`` is sent and
    messages are read until none arrives for 1 second or the server closes
    the connection. Binary messages are played as mono float32 samples at
    48 kHz through ``sounddevice``.

    If the server closes the connection early, streaming stops, the flush
    is skipped, and the audio already queued is still played. Any other
    error propagates after the playback task has been shut down.

    Args:
        text: The text to synthesize
        server_url: WebSocket server URL
    """
    print(f"Connecting to {server_url}...")

    async with websockets.connect(server_url) as websocket:
        print("Connected! Streaming tokens...")

        # Simulate token-by-token streaming
        # In real Discord bot, these come from llamacpp streaming response
        tokens = text.split()  # Simple word-by-word tokenization

        # Audio playback setup
        sample_rate = 48000
        audio_queue = asyncio.Queue()

        async def play_audio():
            """Play audio chunks as they arrive"""
            loop = asyncio.get_running_loop()
            stream = sd.OutputStream(
                samplerate=sample_rate,
                channels=1,
                dtype='float32'
            )
            stream.start()

            try:
                while True:
                    audio_bytes = await audio_queue.get()
                    if audio_bytes is None:  # Sentinel for end
                        break

                    # Convert bytes back to numpy array
                    audio_data = np.frombuffer(audio_bytes, dtype=np.float32)
                    # A blocking-mode write waits for the device to consume
                    # the samples; run it in a worker thread so the event
                    # loop can keep sending tokens and receiving audio.
                    await loop.run_in_executor(None, stream.write, audio_data)
                    print(f"  ♪ Playing {len(audio_data)} samples")
            finally:
                stream.stop()
                stream.close()

        def queue_audio(message):
            """Queue a binary frame for playback; skip text frames."""
            if isinstance(message, str):
                # recv() returns str for text frames, which cannot be
                # decoded as float32 samples.
                print(f"  (ignoring non-audio message: {message!r})")
                return
            audio_queue.put_nowait(message)

        # Start playback task
        playback_task = asyncio.create_task(play_audio())

        connection_open = True
        try:
            # Send tokens
            for i, token in enumerate(tokens):
                # Add space except for first token
                token_with_space = token if i == 0 else " " + token

                message = {
                    "token": token_with_space,
                    "pitch_shift": 0  # Adjust pitch if needed (-12 to +12 semitones)
                }

                try:
                    await websocket.send(json.dumps(message))
                    print(f"→ Sent token: '{token_with_space}'")

                    # Wait briefly for audio; a timeout just means the
                    # server has not produced a chunk for this token yet.
                    received = await asyncio.wait_for(
                        websocket.recv(),
                        timeout=2.0
                    )
                except asyncio.TimeoutError:
                    print("  (no audio yet, continuing...)")
                except websockets.exceptions.ConnectionClosed:
                    print("  (connection closed by server)")
                    connection_open = False
                    break
                else:
                    queue_audio(received)

            # Flush remaining buffer (pointless on a closed connection)
            if connection_open:
                print("→ Flushing buffer...")
                try:
                    await websocket.send(json.dumps({"flush": True}))

                    # Receive remaining audio chunks until the server
                    # goes quiet or hangs up.
                    while True:
                        received = await asyncio.wait_for(
                            websocket.recv(),
                            timeout=1.0
                        )
                        queue_audio(received)
                except asyncio.TimeoutError:
                    print("  (flush complete)")
                except websockets.exceptions.ConnectionClosed:
                    print("  (connection closed by server)")
        finally:
            # Always signal end of audio so the playback task drains the
            # queue, closes the output stream and terminates, even when
            # streaming failed part-way.
            audio_queue.put_nowait(None)
            await playback_task

        print("✓ Done!")

async def stream_tts_simple(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Simplified version for Discord bot integration.

    This is an async generator: iterate it with ``async for`` to get each
    message received from the server as it arrives.

    The text is split on whitespace and each word is sent as one JSON
    message ``{"token": ..., "pitch_shift": 0}``; after every send the
    generator waits up to 2 seconds for one message and yields it. After
    the last token ``{"flush": true}`` is sent and messages are yielded
    until none arrives for 1 second.

    A normal close of the connection by the server
    (``ConnectionClosedOK``) ends the iteration cleanly; an abnormal close
    still propagates to the caller.

    Args:
        text: The text to synthesize
        server_url: WebSocket server URL
    """
    async with websockets.connect(server_url) as websocket:
        # Send tokens (in real bot, these come from LLM stream)
        tokens = text.split()

        for i, token in enumerate(tokens):
            token_with_space = token if i == 0 else " " + token

            try:
                await websocket.send(json.dumps({
                    "token": token_with_space,
                    "pitch_shift": 0
                }))

                audio_bytes = await asyncio.wait_for(
                    websocket.recv(),
                    timeout=2.0
                )
            except asyncio.TimeoutError:
                # No audio for this token yet; move on to the next one.
                continue
            except websockets.exceptions.ConnectionClosedOK:
                return

            # Yield outside the try block so that exceptions thrown into
            # the generator are never mistaken for a receive timeout.
            yield audio_bytes

        # Flush and get remaining audio
        try:
            await websocket.send(json.dumps({"flush": True}))
        except websockets.exceptions.ConnectionClosedOK:
            return

        while True:
            try:
                audio_bytes = await asyncio.wait_for(
                    websocket.recv(),
                    timeout=1.0
                )
            except (asyncio.TimeoutError, websockets.exceptions.ConnectionClosedOK):
                # Server went quiet or closed normally: end of stream.
                return

            yield audio_bytes

# Example Discord.py integration
"""
# In your Discord bot cog:

import asyncio
import io
import json

import discord
import websockets
from discord.ext import commands

class VoiceCog(commands.Cog):
    async def speak_streaming(self, ctx, text: str):
        '''Stream TTS to Discord voice channel'''
        
        # Connect to voice if not already connected
        if not ctx.voice_client:
            await ctx.author.voice.channel.connect()
        
        vc = ctx.voice_client
        
        # Stream audio via WebSocket
        async with websockets.connect('ws://localhost:8765/ws/stream') as ws:
            # Simulate token streaming (replace with actual LLM streaming)
            tokens = text.split()
            
            for token in tokens:
                # Send token
                await ws.send(json.dumps({
                    "token": " " + token,
                    "pitch_shift": 0
                }))
                
                # Receive audio
                try:
                    audio_bytes = await asyncio.wait_for(ws.recv(), timeout=2.0)
                    
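                    # Convert to Discord audio format.
                    # NOTE: discord.PCMAudio expects 16-bit 48 kHz stereo PCM;
                    # the client code above decodes chunks as mono float32, so
                    # convert the samples first if the server sends that format.
                    audio_source = discord.PCMAudio(io.BytesIO(audio_bytes))

                    # Play (non-blocking). A chunk that arrives while the
                    # previous one is still playing is skipped here; queue
                    # chunks instead if gapless playback is needed.
                    if not vc.is_playing():
                        vc.play(audio_source)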
                    
                except asyncio.TimeoutError:
                    continue
            
            # Flush
            await ws.send(json.dumps({"flush": True}))
"""

if __name__ == "__main__":
    # Everything after the script name is joined into the text to speak;
    # with no arguments a built-in demo sentence is used instead.
    text = (
        " ".join(sys.argv[1:])
        if len(sys.argv) >= 2
        else "Hello! This is a test of the WebSocket streaming API. It should feel very natural!"
    )

    print(f"Text: {text}\n")
    asyncio.run(stream_tts(text))
add: absorb soprano_to_rvc as regular subdirectory Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as bare gitlink; removed .git/ directories and absorbed into main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (.pth, .pt, .onnx, .index). 287 files (3.1GB of ML weights properly excluded via gitignore). 2026-03-04 00:24:53 +02:00			`#!/usr/bin/env python3`
			`"""`
			`WebSocket Client Example for Soprano + RVC Streaming API`

			`This demonstrates how to use the WebSocket endpoint from a Discord bot`
			`to stream audio as LLM tokens arrive.`

			`Usage:`
			`python websocket_client_example.py "Hello! How are you today?"`
			`"""`
			`import asyncio`
			`import websockets`
			`import json`
			`import sys`
			`import numpy as np`
			`import sounddevice as sd`

			`async def stream_tts(text: str, server_url: str = "ws://localhost:8765/ws/stream"):`
			`"""`
			`Stream TTS audio token-by-token, simulating LLM token streaming.`

			`Args:`
			`text: The text to synthesize`
			`server_url: WebSocket server URL`
			`"""`
			`print(f"Connecting to {server_url}...")`

			`async with websockets.connect(server_url) as websocket:`
			`print("Connected! Streaming tokens...")`

			`# Simulate token-by-token streaming`
			`# In real Discord bot, these come from llamacpp streaming response`
			`tokens = text.split() # Simple word-by-word tokenization`

			`# Audio playback setup`
			`sample_rate = 48000`
			`audio_queue = asyncio.Queue()`

			`# Start audio playback task`
			`async def play_audio():`
			`"""Play audio chunks as they arrive"""`
			`stream = sd.OutputStream(`
			`samplerate=sample_rate,`
			`channels=1,`
			`dtype='float32'`
			`)`
			`stream.start()`

			`try:`
			`while True:`
			`audio_bytes = await audio_queue.get()`
			`if audio_bytes is None: # Sentinel for end`
			`break`

			`# Convert bytes back to numpy array`
			`audio_data = np.frombuffer(audio_bytes, dtype=np.float32)`
			`stream.write(audio_data)`
			`print(f" ♪ Playing {len(audio_data)} samples")`
			`finally:`
			`stream.stop()`
			`stream.close()`

			`# Start playback task`
			`playback_task = asyncio.create_task(play_audio())`

			`# Send tokens`
			`for i, token in enumerate(tokens):`
			`# Add space except for first token`
			`token_with_space = token if i == 0 else " " + token`

			`message = {`
			`"token": token_with_space,`
			`"pitch_shift": 0 # Adjust pitch if needed (-12 to +12 semitones)`
			`}`

			`await websocket.send(json.dumps(message))`
			`print(f"→ Sent token: '{token_with_space}'")`

			`# Receive and queue audio`
			`try:`
			`# Non-blocking receive with timeout`
			`audio_bytes = await asyncio.wait_for(`
			`websocket.recv(),`
			`timeout=2.0`
			`)`
			`await audio_queue.put(audio_bytes)`
			`except asyncio.TimeoutError:`
			`print(" (no audio yet, continuing...)")`
			`except websockets.exceptions.ConnectionClosed:`
			`break`

			`# Flush remaining buffer`
			`print("→ Flushing buffer...")`
			`await websocket.send(json.dumps({"flush": True}))`

			`# Receive remaining audio chunks`
			`try:`
			`while True:`
			`audio_bytes = await asyncio.wait_for(`
			`websocket.recv(),`
			`timeout=1.0`
			`)`
			`await audio_queue.put(audio_bytes)`
			`except asyncio.TimeoutError:`
			`print(" (flush complete)")`

			`# Signal end of audio`
			`await audio_queue.put(None)`
			`await playback_task`

			`print("✓ Done!")`

			`async def stream_tts_simple(text: str, server_url: str = "ws://localhost:8765/ws/stream"):`
			`"""`
			`Simplified version for Discord bot integration.`
			`Returns audio chunks as they're generated.`
			`"""`
			`async with websockets.connect(server_url) as websocket:`
			`# Send tokens (in real bot, these come from LLM stream)`
			`tokens = text.split()`

			`for i, token in enumerate(tokens):`
			`token_with_space = token if i == 0 else " " + token`

			`await websocket.send(json.dumps({`
			`"token": token_with_space,`
			`"pitch_shift": 0`
			`}))`

			`# Yield audio chunks as they arrive`
			`try:`
			`audio_bytes = await asyncio.wait_for(`
			`websocket.recv(),`
			`timeout=2.0`
			`)`
			`yield audio_bytes`
			`except asyncio.TimeoutError:`
			`continue`

			`# Flush and get remaining audio`
			`await websocket.send(json.dumps({"flush": True}))`

			`try:`
			`while True:`
			`audio_bytes = await asyncio.wait_for(`
			`websocket.recv(),`
			`timeout=1.0`
			`)`
			`yield audio_bytes`
			`except asyncio.TimeoutError:`
			`pass`

			`# Example Discord.py integration`
			`"""`
			`# In your Discord bot cog:`

			`import discord`
			`import websockets`
			`import json`

			`class VoiceCog(commands.Cog):`
			`async def speak_streaming(self, ctx, text: str):`
			`'''Stream TTS to Discord voice channel'''`

			`# Connect to voice if not already connected`
			`if not ctx.voice_client:`
			`await ctx.author.voice.channel.connect()`

			`vc = ctx.voice_client`

			`# Stream audio via WebSocket`
			`async with websockets.connect('ws://localhost:8765/ws/stream') as ws:`
			`# Simulate token streaming (replace with actual LLM streaming)`
			`tokens = text.split()`

			`for token in tokens:`
			`# Send token`
			`await ws.send(json.dumps({`
			`"token": " " + token,`
			`"pitch_shift": 0`
			`}))`

			`# Receive audio`
			`try:`
			`audio_bytes = await asyncio.wait_for(ws.recv(), timeout=2.0)`

			`# Convert to Discord audio format`
			`audio_source = discord.PCMAudio(io.BytesIO(audio_bytes))`

			`# Play (non-blocking)`
			`if not vc.is_playing():`
			`vc.play(audio_source)`

			`except asyncio.TimeoutError:`
			`continue`

			`# Flush`
			`await ws.send(json.dumps({"flush": True}))`
			`"""`

			`if __name__ == "__main__":`
			`if len(sys.argv) < 2:`
			`text = "Hello! This is a test of the WebSocket streaming API. It should feel very natural!"`
			`else:`
			`text = " ".join(sys.argv[1:])`

			`print(f"Text: {text}\n")`
			`asyncio.run(stream_tts(text))`