Files
miku-discord/soprano_to_rvc/websocket_client_example.py
koko210Serve 8ca716029e add: absorb soprano_to_rvc as regular subdirectory
Voice conversion pipeline (Soprano TTS → RVC) with Docker support.
Previously tracked as bare gitlink; removed .git/ directories and
absorbed into main repo for unified tracking.

Includes: Soprano TTS, RVC WebUI integration, Docker configs,
WebSocket API, and benchmark scripts.
Updated .gitignore to exclude large model weights (*.pth, *.pt, *.onnx, *.index).
287 files (3.1GB of ML weights properly excluded via gitignore).
2026-03-04 00:24:53 +02:00

209 lines
6.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
WebSocket Client Example for Soprano + RVC Streaming API
This demonstrates how to use the WebSocket endpoint from a Discord bot
to stream audio as LLM tokens arrive.
Usage:
python websocket_client_example.py "Hello! How are you today?"
"""
import asyncio
import websockets
import json
import sys
import numpy as np
import sounddevice as sd
async def stream_tts(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Stream TTS audio token-by-token, simulating LLM token streaming.

    The text is split on whitespace and each word is sent to the server as a
    JSON message of the form ``{"token": ..., "pitch_shift": 0}``. After each
    send the client waits up to 2 seconds for one audio message; once all
    tokens are sent it sends ``{"flush": true}`` and drains further messages
    until 1 second passes without one. Received audio is interpreted as
    float32 samples and played through a mono 48 kHz sounddevice output stream.

    Args:
        text: The text to synthesize
        server_url: WebSocket server URL
    """
    print(f"Connecting to {server_url}...")
    async with websockets.connect(server_url) as websocket:
        print("Connected! Streaming tokens...")

        # Simulate token-by-token streaming
        # In real Discord bot, these come from llamacpp streaming response
        tokens = text.split()  # Simple word-by-word tokenization

        # Audio playback setup
        sample_rate = 48000
        audio_queue = asyncio.Queue()

        async def play_audio():
            """Play audio chunks as they arrive, until a None sentinel is queued."""
            loop = asyncio.get_running_loop()
            stream = sd.OutputStream(
                samplerate=sample_rate,
                channels=1,
                dtype='float32'
            )
            stream.start()
            try:
                while True:
                    audio_bytes = await audio_queue.get()
                    if audio_bytes is None:  # Sentinel for end
                        break
                    # Convert bytes back to numpy array
                    audio_data = np.frombuffer(audio_bytes, dtype=np.float32)
                    # stream.write() blocks while the device consumes the
                    # samples; run it in a worker thread so the event loop
                    # can keep sending tokens and receiving audio meanwhile.
                    await loop.run_in_executor(None, stream.write, audio_data)
                    print(f" ♪ Playing {len(audio_data)} samples")
            finally:
                stream.stop()
                stream.close()

        def queue_audio(message):
            """Queue a binary message for playback; report anything else."""
            if isinstance(message, (bytes, bytearray)):
                # The queue is unbounded, so put_nowait() cannot fail here.
                audio_queue.put_nowait(message)
            else:
                # A text frame cannot be decoded as float32 samples.
                print(f" (ignoring non-audio message: {message!r})")

        # Start playback task
        playback_task = asyncio.create_task(play_audio())
        connection_open = True
        try:
            # Send tokens
            for i, token in enumerate(tokens):
                # Add space except for first token
                token_with_space = token if i == 0 else " " + token
                message = {
                    "token": token_with_space,
                    "pitch_shift": 0  # Adjust pitch if needed (-12 to +12 semitones)
                }
                try:
                    await websocket.send(json.dumps(message))
                    print(f"→ Sent token: '{token_with_space}'")
                    # Wait briefly for audio; the server may not have
                    # produced any yet, in which case we just move on.
                    audio_bytes = await asyncio.wait_for(
                        websocket.recv(),
                        timeout=2.0
                    )
                except asyncio.TimeoutError:
                    print(" (no audio yet, continuing...)")
                except websockets.exceptions.ConnectionClosed:
                    print(" (connection closed by server)")
                    connection_open = False
                    break
                else:
                    queue_audio(audio_bytes)

            # Flush remaining buffer -- only if the socket is still usable,
            # otherwise send() would raise on the closed connection.
            if connection_open:
                print("→ Flushing buffer...")
                try:
                    await websocket.send(json.dumps({"flush": True}))
                    # Receive remaining audio chunks until the server goes quiet
                    while True:
                        audio_bytes = await asyncio.wait_for(
                            websocket.recv(),
                            timeout=1.0
                        )
                        queue_audio(audio_bytes)
                except asyncio.TimeoutError:
                    print(" (flush complete)")
                except websockets.exceptions.ConnectionClosed:
                    print(" (connection closed during flush)")
        finally:
            # Always signal end of audio, even on error, so the playback
            # task terminates and the output stream is closed.
            await audio_queue.put(None)
            await playback_task
        print("✓ Done!")
async def stream_tts_simple(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Minimal async generator variant intended for Discord bot integration.

    Sends each whitespace-separated word of ``text`` to the server as a
    ``{"token": ..., "pitch_shift": 0}`` JSON message and yields whatever the
    websocket returns, one message at a time, as soon as it is received.
    """
    async with websockets.connect(server_url) as websocket:
        # In a real bot the words would come from the LLM stream instead.
        for index, word in enumerate(text.split()):
            # Every word except the first carries a leading space.
            piece = word if index == 0 else " " + word
            payload = json.dumps({
                "token": piece,
                "pitch_shift": 0
            })
            await websocket.send(payload)

            # Hand back one chunk if it shows up within 2 seconds;
            # otherwise move straight on to the next word.
            try:
                chunk = await asyncio.wait_for(websocket.recv(), timeout=2.0)
                yield chunk
            except asyncio.TimeoutError:
                continue

        # Ask the server to flush, then drain until 1 second of silence.
        flush_request = json.dumps({"flush": True})
        await websocket.send(flush_request)
        draining = True
        while draining:
            try:
                chunk = await asyncio.wait_for(websocket.recv(), timeout=1.0)
                yield chunk
            except asyncio.TimeoutError:
                draining = False
# Example Discord.py integration
"""
# In your Discord bot cog:
import asyncio
import io
import json
import discord
import websockets
from discord.ext import commands
class VoiceCog(commands.Cog):
async def speak_streaming(self, ctx, text: str):
'''Stream TTS to Discord voice channel'''
# Connect to voice if not already connected
if not ctx.voice_client:
await ctx.author.voice.channel.connect()
vc = ctx.voice_client
# Stream audio via WebSocket
async with websockets.connect('ws://localhost:8765/ws/stream') as ws:
# Simulate token streaming (replace with actual LLM streaming)
tokens = text.split()
for token in tokens:
# Send token
await ws.send(json.dumps({
"token": " " + token,
"pitch_shift": 0
}))
# Receive audio
try:
audio_bytes = await asyncio.wait_for(ws.recv(), timeout=2.0)
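# Wrap the raw bytes as a Discord audio source.
# NOTE: discord.PCMAudio expects 16-bit 48 kHz stereo PCM; convert the
# server's audio first if it is not already in that format.
audio_source = discord.PCMAudio(io.BytesIO(audio_bytes))
# Start playback (vc.play returns immediately). NOTE: a chunk that arrives
# while the previous one is still playing is skipped here; a real bot
# should queue chunks instead.
if not vc.is_playing():
vc.play(audio_source)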
except asyncio.TimeoutError:
continue
# Flush
await ws.send(json.dumps({"flush": True}))
"""
if __name__ == "__main__":
    # All command-line arguments after the script name are joined into the
    # text to speak; with no arguments a built-in demo sentence is used.
    cli_words = sys.argv[1:]
    text = (
        " ".join(cli_words)
        if cli_words
        else "Hello! This is a test of the WebSocket streaming API. It should feel very natural!"
    )
    print(f"Text: {text}\n")
    asyncio.run(stream_tts(text))