add: absorb soprano_to_rvc as regular subdirectory

Voice conversion pipeline (Soprano TTS → RVC) with Docker support. Previously tracked as bare gitlink; removed .git/ directories and absorbed into main repo for unified tracking. Includes: Soprano TTS, RVC WebUI integration, Docker configs, WebSocket API, and benchmark scripts. Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index). 287 files (3.1GB of ML weights properly excluded via gitignore).
2026-03-04 00:24:53 +02:00
parent 34b184a05a
commit 8ca716029e
287 changed files with 47102 additions and 0 deletions
--- a/soprano_to_rvc/websocket_client_example.py
+++ b/soprano_to_rvc/websocket_client_example.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+"""
+WebSocket Client Example for Soprano + RVC Streaming API
+
+This demonstrates how to use the WebSocket endpoint from a Discord bot
+to stream audio as LLM tokens arrive.
+
+Usage:
+    python websocket_client_example.py "Hello! How are you today?"
+"""
+import asyncio
+import websockets
+import json
+import sys
+import numpy as np
+import sounddevice as sd
+
+async def stream_tts(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
+    """
+    Stream TTS audio token-by-token, simulating LLM token streaming.
+    
+    Args:
+        text: The text to synthesize
+        server_url: WebSocket server URL
+    """
+    print(f"Connecting to {server_url}...")
+    
+    async with websockets.connect(server_url) as websocket:
+        print("Connected! Streaming tokens...")
+        
+        # Simulate token-by-token streaming
+        # In real Discord bot, these come from llamacpp streaming response
+        tokens = text.split()  # Simple word-by-word tokenization
+        
+        # Audio playback setup
+        sample_rate = 48000
+        audio_queue = asyncio.Queue()
+        
+        # Start audio playback task
+        async def play_audio():
+            """Play audio chunks as they arrive"""
+            stream = sd.OutputStream(
+                samplerate=sample_rate,
+                channels=1,
+                dtype='float32'
+            )
+            stream.start()
+            
+            try:
+                while True:
+                    audio_bytes = await audio_queue.get()
+                    if audio_bytes is None:  # Sentinel for end
+                        break
+                    
+                    # Convert bytes back to numpy array
+                    audio_data = np.frombuffer(audio_bytes, dtype=np.float32)
+                    stream.write(audio_data)
+                    print(f"  ♪ Playing {len(audio_data)} samples")
+            finally:
+                stream.stop()
+                stream.close()
+        
+        # Start playback task
+        playback_task = asyncio.create_task(play_audio())
+        
+        # Send tokens
+        for i, token in enumerate(tokens):
+            # Add space except for first token
+            token_with_space = token if i == 0 else " " + token
+            
+            message = {
+                "token": token_with_space,
+                "pitch_shift": 0  # Adjust pitch if needed (-12 to +12 semitones)
+            }
+            
+            await websocket.send(json.dumps(message))
+            print(f"→ Sent token: '{token_with_space}'")
+            
+            # Receive and queue audio
+            try:
+                # Non-blocking receive with timeout
+                audio_bytes = await asyncio.wait_for(
+                    websocket.recv(),
+                    timeout=2.0
+                )
+                await audio_queue.put(audio_bytes)
+            except asyncio.TimeoutError:
+                print("  (no audio yet, continuing...)")
+            except websockets.exceptions.ConnectionClosed:
+                break
+        
+        # Flush remaining buffer
+        print("→ Flushing buffer...")
+        await websocket.send(json.dumps({"flush": True}))
+        
+        # Receive remaining audio chunks
+        try:
+            while True:
+                audio_bytes = await asyncio.wait_for(
+                    websocket.recv(),
+                    timeout=1.0
+                )
+                await audio_queue.put(audio_bytes)
+        except asyncio.TimeoutError:
+            print("  (flush complete)")
+        
+        # Signal end of audio
+        await audio_queue.put(None)
+        await playback_task
+        
+        print("✓ Done!")
+
+async def stream_tts_simple(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
+    """
+    Simplified version for Discord bot integration.
+    Returns audio chunks as they're generated.
+    """
+    async with websockets.connect(server_url) as websocket:
+        # Send tokens (in real bot, these come from LLM stream)
+        tokens = text.split()
+        
+        for i, token in enumerate(tokens):
+            token_with_space = token if i == 0 else " " + token
+            
+            await websocket.send(json.dumps({
+                "token": token_with_space,
+                "pitch_shift": 0
+            }))
+            
+            # Yield audio chunks as they arrive
+            try:
+                audio_bytes = await asyncio.wait_for(
+                    websocket.recv(),
+                    timeout=2.0
+                )
+                yield audio_bytes
+            except asyncio.TimeoutError:
+                continue
+        
+        # Flush and get remaining audio
+        await websocket.send(json.dumps({"flush": True}))
+        
+        try:
+            while True:
+                audio_bytes = await asyncio.wait_for(
+                    websocket.recv(),
+                    timeout=1.0
+                )
+                yield audio_bytes
+        except asyncio.TimeoutError:
+            pass
+
+# Example Discord.py integration
+"""
+# In your Discord bot cog (also requires: import asyncio, import io, from discord.ext import commands):
+
+import discord
+import websockets
+import json
+
+class VoiceCog(commands.Cog):
+    async def speak_streaming(self, ctx, text: str):
+        '''Stream TTS to Discord voice channel'''
+        
+        # Connect to voice if not already connected
+        if not ctx.voice_client:
+            await ctx.author.voice.channel.connect()
+        
+        vc = ctx.voice_client
+        
+        # Stream audio via WebSocket
+        async with websockets.connect('ws://localhost:8765/ws/stream') as ws:
+            # Simulate token streaming (replace with actual LLM streaming)
+            tokens = text.split()
+            
+            for i, token in enumerate(tokens):
+                # Send token (leading space on every token except the first)
+                await ws.send(json.dumps({
+                    "token": token if i == 0 else " " + token,
+                    "pitch_shift": 0
+                }))
+                
+                # Receive audio
+                try:
+                    audio_bytes = await asyncio.wait_for(ws.recv(), timeout=2.0)
+                    
+                    # Wrap for Discord playback (NOTE: discord.PCMAudio expects 16-bit 48 kHz stereo PCM; convert float32 mono audio first)
+                    audio_source = discord.PCMAudio(io.BytesIO(audio_bytes))
+                    
+                    # Play (non-blocking)
+                    if not vc.is_playing():
+                        vc.play(audio_source)
+                    
+                except asyncio.TimeoutError:
+                    continue
+            
+            # Flush
+            await ws.send(json.dumps({"flush": True}))
+"""
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        text = "Hello! This is a test of the WebSocket streaming API. It should feel very natural!"
+    else:
+        text = " ".join(sys.argv[1:])
+    
+    print(f"Text: {text}\n")
+    asyncio.run(stream_tts(text))