209 lines
6.6 KiB
Python
209 lines
6.6 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
WebSocket Client Example for Soprano + RVC Streaming API
|
||
|
|
|
||
|
|
This demonstrates how to use the WebSocket endpoint from a Discord bot
|
||
|
|
to stream audio as LLM tokens arrive.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python websocket_client_example.py "Hello! How are you today?"
|
||
|
|
"""
|
||
|
|
import asyncio
|
||
|
|
import websockets
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
import numpy as np
|
||
|
|
import sounddevice as sd
|
||
|
|
|
||
|
|
async def stream_tts(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Stream TTS audio token-by-token, simulating LLM token streaming.

    The text is split on whitespace and each word is sent to the server as a
    JSON message ``{"token": ..., "pitch_shift": 0}``. After every send the
    client waits up to 2 seconds for one reply and queues it for playback.
    Once all tokens are sent, ``{"flush": true}`` is sent and replies are
    drained until the server stays silent for 1 second or closes the
    connection.

    Replies are played as mono float32 samples at 48 kHz.
    NOTE(review): this assumes the server emits raw float32 PCM at that
    rate -- confirm against the server implementation.

    Args:
        text: The text to synthesize
        server_url: WebSocket server URL

    Raises:
        Errors from ``websockets.connect`` and from the audio device are not
        caught here and propagate to the caller.
    """
    print(f"Connecting to {server_url}...")

    async with websockets.connect(server_url) as websocket:
        print("Connected! Streaming tokens...")

        # Simulate token-by-token streaming.
        # In a real Discord bot, these come from the llamacpp streaming response.
        tokens = text.split()  # Simple word-by-word tokenization

        # Audio playback setup
        sample_rate = 48000
        audio_queue = asyncio.Queue()

        async def play_audio():
            """Play queued audio chunks until the ``None`` sentinel arrives."""
            loop = asyncio.get_running_loop()
            stream = sd.OutputStream(
                samplerate=sample_rate,
                channels=1,
                dtype='float32',
            )
            stream.start()

            try:
                while True:
                    chunk = await audio_queue.get()
                    if chunk is None:  # Sentinel for end
                        break

                    if isinstance(chunk, str):
                        # Text frames arrive as str; np.frombuffer would raise
                        # TypeError on them, so report and skip instead.
                        print(f" (ignoring text message: {chunk!r})")
                        continue

                    # Convert bytes back to numpy array
                    samples = np.frombuffer(chunk, dtype=np.float32)
                    # OutputStream.write blocks until the device accepts the
                    # data; run it in a worker thread so the event loop keeps
                    # servicing the websocket while audio plays.
                    await loop.run_in_executor(None, stream.write, samples)
                    print(f" ♪ Playing {len(samples)} samples")
            finally:
                stream.stop()
                stream.close()

        # Start playback task
        playback_task = asyncio.create_task(play_audio())

        try:
            connection_open = True

            # Send tokens
            for i, token in enumerate(tokens):
                # Add space except for first token
                token_with_space = token if i == 0 else " " + token

                message = {
                    "token": token_with_space,
                    "pitch_shift": 0,  # Adjust pitch if needed (-12 to +12 semitones)
                }

                try:
                    await websocket.send(json.dumps(message))
                    print(f"→ Sent token: '{token_with_space}'")

                    # Wait briefly for one reply; a timeout just means the
                    # server has not produced audio for this token yet.
                    reply = await asyncio.wait_for(
                        websocket.recv(),
                        timeout=2.0,
                    )
                    await audio_queue.put(reply)
                except asyncio.TimeoutError:
                    print(" (no audio yet, continuing...)")
                except websockets.exceptions.ConnectionClosed:
                    # Nothing more can be sent or received; skip the flush.
                    connection_open = False
                    break

            if connection_open:
                try:
                    # Flush remaining buffer
                    print("→ Flushing buffer...")
                    await websocket.send(json.dumps({"flush": True}))

                    # Receive remaining audio chunks
                    while True:
                        reply = await asyncio.wait_for(
                            websocket.recv(),
                            timeout=1.0,
                        )
                        await audio_queue.put(reply)
                except asyncio.TimeoutError:
                    print(" (flush complete)")
                except websockets.exceptions.ConnectionClosed:
                    # The server closing the socket also ends the drain.
                    print(" (connection closed)")
        finally:
            # Always signal end of audio so the playback task shuts the
            # output stream down, even when an error is propagating.
            await audio_queue.put(None)
            await playback_task

        print("✓ Done!")
|
||
|
|
|
||
|
|
async def stream_tts_simple(text: str, server_url: str = "ws://localhost:8765/ws/stream"):
    """
    Simplified version for Discord bot integration.

    Async generator: splits ``text`` on whitespace, sends each word as a
    ``{"token": ..., "pitch_shift": 0}`` JSON message, and yields each reply
    from the server as it arrives. After the last token, ``{"flush": true}``
    is sent and replies are yielded until the server stays silent for
    1 second.

    Args:
        text: The text to synthesize
        server_url: WebSocket server URL

    Yields:
        Each message received from the server, unmodified.

    Raises:
        Errors from ``websockets.connect`` and abnormal connection closures
        propagate to the consumer. A normal close by the server simply ends
        the iteration.
    """
    async with websockets.connect(server_url) as websocket:
        try:
            # Send tokens (in real bot, these come from LLM stream)
            for i, token in enumerate(text.split()):
                # Add space except for first token
                token_with_space = token if i == 0 else " " + token

                await websocket.send(json.dumps({
                    "token": token_with_space,
                    "pitch_shift": 0,
                }))

                # Yield at most one chunk per token; a timeout means no audio
                # is ready yet, so move on to the next token.
                try:
                    chunk = await asyncio.wait_for(
                        websocket.recv(),
                        timeout=2.0,
                    )
                except asyncio.TimeoutError:
                    continue
                yield chunk

            # Flush and get remaining audio
            await websocket.send(json.dumps({"flush": True}))

            while True:
                try:
                    chunk = await asyncio.wait_for(
                        websocket.recv(),
                        timeout=1.0,
                    )
                except asyncio.TimeoutError:
                    # Server has gone quiet: treat the flush as complete.
                    break
                yield chunk
        except websockets.exceptions.ConnectionClosedOK:
            # Server closed the connection normally; no more audio will come.
            return
|
||
|
|
|
||
|
|
# Example Discord.py integration
|
||
|
|
"""
|
||
|
|
# In your Discord bot cog:
|
||
|
|
|
||
|
|
import asyncio
import io
import json

import discord
import websockets
from discord.ext import commands
|
||
|
|
|
||
|
|
class VoiceCog(commands.Cog):
|
||
|
|
async def speak_streaming(self, ctx, text: str):
|
||
|
|
'''Stream TTS to Discord voice channel'''
|
||
|
|
|
||
|
|
# Connect to voice if not already connected
|
||
|
|
if not ctx.voice_client:
|
||
|
|
await ctx.author.voice.channel.connect()
|
||
|
|
|
||
|
|
vc = ctx.voice_client
|
||
|
|
|
||
|
|
# Stream audio via WebSocket
|
||
|
|
async with websockets.connect('ws://localhost:8765/ws/stream') as ws:
|
||
|
|
# Simulate token streaming (replace with actual LLM streaming)
|
||
|
|
tokens = text.split()
|
||
|
|
|
||
|
|
for token in tokens:
|
||
|
|
# Send token
|
||
|
|
await ws.send(json.dumps({
|
||
|
|
"token": " " + token,
|
||
|
|
"pitch_shift": 0
|
||
|
|
}))
|
||
|
|
|
||
|
|
# Receive audio
|
||
|
|
try:
|
||
|
|
audio_bytes = await asyncio.wait_for(ws.recv(), timeout=2.0)
|
||
|
|
|
||
|
|
# Convert to Discord audio format
|
||
|
|
audio_source = discord.PCMAudio(io.BytesIO(audio_bytes))
|
||
|
|
|
||
|
|
# Play (non-blocking)
|
||
|
|
if not vc.is_playing():
|
||
|
|
vc.play(audio_source)
|
||
|
|
|
||
|
|
except asyncio.TimeoutError:
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Flush
|
||
|
|
await ws.send(json.dumps({"flush": True}))
|
||
|
|
"""
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Use the command-line words as the text when any were given;
    # otherwise fall back to the built-in demo sentence.
    text = (
        " ".join(sys.argv[1:])
        if len(sys.argv) >= 2
        else "Hello! This is a test of the WebSocket streaming API. It should feel very natural!"
    )

    print(f"Text: {text}\n")
    asyncio.run(stream_tts(text))
|