Phase 4 STT pipeline implemented — Silero VAD + faster-whisper — still not working well at all
This commit is contained in:
@@ -19,6 +19,7 @@ import json
|
||||
import os
|
||||
from typing import Optional
|
||||
import discord
|
||||
from discord.ext import voice_recv
|
||||
import globals
|
||||
from utils.logger import get_logger
|
||||
|
||||
@@ -97,12 +98,12 @@ class VoiceSessionManager:
|
||||
# 10. Create voice session
|
||||
self.active_session = VoiceSession(guild_id, voice_channel, text_channel)
|
||||
|
||||
# 11. Connect to Discord voice channel
|
||||
# 11. Connect to Discord voice channel with VoiceRecvClient
|
||||
try:
|
||||
voice_client = await voice_channel.connect()
|
||||
voice_client = await voice_channel.connect(cls=voice_recv.VoiceRecvClient)
|
||||
self.active_session.voice_client = voice_client
|
||||
self.active_session.active = True
|
||||
logger.info(f"✓ Connected to voice channel: {voice_channel.name}")
|
||||
logger.info(f"✓ Connected to voice channel: {voice_channel.name} (with audio receiving)")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to voice channel: {e}", exc_info=True)
|
||||
raise
|
||||
@@ -387,7 +388,9 @@ class VoiceSession:
|
||||
self.voice_client: Optional[discord.VoiceClient] = None
|
||||
self.audio_source: Optional['MikuVoiceSource'] = None # Forward reference
|
||||
self.tts_streamer: Optional['TTSTokenStreamer'] = None # Forward reference
|
||||
self.voice_receiver: Optional['VoiceReceiver'] = None # STT receiver
|
||||
self.active = False
|
||||
self.miku_speaking = False # Track if Miku is currently speaking
|
||||
|
||||
logger.info(f"VoiceSession created for {voice_channel.name} in guild {guild_id}")
|
||||
|
||||
@@ -433,6 +436,207 @@ class VoiceSession:
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error stopping audio streaming: {e}", exc_info=True)
|
||||
|
||||
async def start_listening(self, user: discord.User):
    """
    Start listening to a user's voice (STT).

    Creates the receiver sink on first use, makes sure that sink is the one
    attached to the voice client, and then registers ``user`` with it.

    Args:
        user: Discord user to listen to

    Raises:
        Exception: Any error raised while attaching the sink or registering
            the user is logged and re-raised unchanged.
    """
    from utils.voice_receiver import VoiceReceiverSink

    try:
        # Create receiver if not exists
        sink_created = False
        if not self.voice_receiver:
            self.voice_receiver = VoiceReceiverSink(self)
            sink_created = True

        # Start receiving audio from Discord using discord-ext-voice-recv
        if self.voice_client:
            # listen() refuses to run while a sink is already attached, so
            # calling it unconditionally breaks the second start_listening().
            # A freshly created sink replaces whatever stale sink is still
            # attached; an existing sink is attached only if it is not
            # already receiving.
            if sink_created and self.voice_client.is_listening():
                self.voice_client.stop_listening()
            if not self.voice_client.is_listening():
                self.voice_client.listen(self.voice_receiver)
                logger.info("✓ Discord voice receive started (discord-ext-voice-recv)")

        # Start listening to specific user
        await self.voice_receiver.start_listening(user.id, user)
        logger.info(f"✓ Started listening to {user.name}")

    except Exception as e:
        logger.error(f"Failed to start listening to {user.name}: {e}", exc_info=True)
        raise
|
||||
|
||||
async def stop_listening(self, user_id: int):
    """
    Stop listening to a single user.

    Does nothing when no receiver has been created.

    Args:
        user_id: Discord user ID
    """
    receiver = self.voice_receiver
    if not receiver:
        return

    await receiver.stop_listening(user_id)
    logger.info(f"✓ Stopped listening to user {user_id}")
|
||||
|
||||
async def stop_all_listening(self):
    """Stop listening to every user and discard the receiver."""
    receiver = self.voice_receiver
    if not receiver:
        return

    await receiver.stop_all()
    # Drop the reference so the next start_listening() builds a fresh sink
    self.voice_receiver = None
    logger.info("✓ Stopped all listening")
|
||||
|
||||
async def on_user_vad_event(self, user_id: int, event: dict):
    """
    Log a VAD speech-state change reported for a user.

    Args:
        user_id: Discord user ID the event belongs to
        event: Event payload; only its 'event' key is read here
    """
    logger.debug(f"User {user_id} VAD: {event.get('event')}")
|
||||
|
||||
async def on_partial_transcript(self, user_id: int, text: str):
    """
    Log an interim transcript for a user.

    Args:
        user_id: Discord user ID
        text: Partial transcription text
    """
    # Could show "User is saying..." in chat
    partial_message = f"Partial from user {user_id}: {text}"
    logger.info(partial_message)
|
||||
|
||||
async def on_final_transcript(self, user_id: int, text: str):
    """
    Handle a final transcript for a user.

    Echoes what the user said into the text channel, then hands the text to
    the LLM/TTS response path. Returns early with a warning when the user
    cannot be resolved to a guild member.

    Args:
        user_id: Discord user ID
        text: Final transcription text
    """
    logger.info(f"Final from user {user_id}: {text}")

    # Resolve the speaker through the voice channel's guild
    guild = self.voice_channel.guild
    speaker = guild.get_member(user_id)

    if speaker:
        # Show what user said
        await self.text_channel.send(f"🎤 {speaker.name}: *\"{text}\"*")

        # Generate LLM response and speak it
        await self._generate_voice_response(speaker, text)
    else:
        logger.warning(f"User {user_id} not found in guild")
|
||||
|
||||
async def on_user_interruption(self, user_id: int, probability: float):
    """
    React to a user talking over Miku.

    Ignored unless Miku is currently speaking; otherwise cancels TTS and
    notes the interruption in the text channel.

    Args:
        user_id: Discord user ID of the interrupting user
        probability: Value reported with the interruption (logged only)
    """
    if self.miku_speaking:
        logger.info(f"User {user_id} interrupted Miku (prob={probability:.3f})")

        # Cancel Miku's speech
        await self._cancel_tts()

        # Show interruption in chat; fall back to a generic label when the
        # member cannot be resolved
        member = self.voice_channel.guild.get_member(user_id)
        who = member.name if member else 'User'
        await self.text_channel.send(f"⚠️ *{who} interrupted Miku*")
|
||||
|
||||
async def _generate_voice_response(self, user: discord.User, text: str):
    """
    Generate LLM response and speak it.

    Streams a chat completion from the current LLM endpoint, forwards each
    content token to ``self.audio_source`` as it arrives, flushes the audio
    source, and posts the complete reply in the text channel.

    ``self.miku_speaking`` is set to True on entry and always reset to False
    on exit. If it is cleared while tokens are streaming (an interruption),
    the token loop stops and the final flush is skipped.

    Errors are logged and reported to the text channel with an apology
    message rather than raised to the caller.

    Args:
        user: User who spoke (currently not used in the prompt)
        text: Transcribed text
    """
    try:
        self.miku_speaking = True

        # Fail fast with a clear error instead of an AttributeError on the
        # first streamed token, after the LLM request was already made.
        if self.audio_source is None:
            raise RuntimeError("No audio source available for voice response")

        # Show processing
        await self.text_channel.send("💭 *Miku is thinking...*")

        # Import here to avoid circular imports
        import json

        import aiohttp

        import globals
        from utils.llm import get_current_gpu_url

        # Simple system prompt for voice
        system_prompt = (
            "You are Hatsune Miku, the virtual singer.\n"
            "Respond naturally and concisely as Miku would in a voice conversation.\n"
            "Keep responses short (1-3 sentences) since they will be spoken aloud."
        )

        payload = {
            "model": globals.TEXT_MODEL,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            "stream": True,
            "temperature": 0.8,
            "max_tokens": 200,
        }

        headers = {'Content-Type': 'application/json'}
        llama_url = get_current_gpu_url()

        # Stream LLM response to TTS; collect pieces and join once at the end
        response_parts = []
        async with aiohttp.ClientSession() as http_session:
            async with http_session.post(
                f"{llama_url}/v1/chat/completions",
                json=payload,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=60),
            ) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise RuntimeError(f"LLM error {response.status}: {error_text}")

                # Stream tokens to TTS
                async for raw_line in response.content:
                    if not self.miku_speaking:
                        # Interrupted
                        break

                    line = raw_line.decode('utf-8').strip()
                    if not line.startswith('data: '):
                        continue

                    data_str = line[6:]
                    if data_str == '[DONE]':
                        break

                    try:
                        data = json.loads(data_str)
                    except json.JSONDecodeError:
                        # Skip lines that are not valid JSON
                        continue

                    choices = data.get('choices') if isinstance(data, dict) else None
                    if not choices:
                        continue

                    delta = choices[0].get('delta') or {}
                    content = delta.get('content')
                    if content:
                        await self.audio_source.send_token(content)
                        response_parts.append(content)

        full_response = "".join(response_parts).strip()

        # Flush TTS (skipped when the response was interrupted)
        if self.miku_speaking:
            await self.audio_source.flush()

        # Show response
        await self.text_channel.send(f"🎤 Miku: *\"{full_response}\"*")
        logger.info(f"✓ Voice response complete: {full_response}")

    except Exception as e:
        logger.error(f"Voice response failed: {e}", exc_info=True)
        await self.text_channel.send("❌ Sorry, I had trouble responding")

    finally:
        self.miku_speaking = False
|
||||
|
||||
async def _cancel_tts(self):
    """
    Cancel current TTS synthesis.

    Clears ``self.miku_speaking``, stops Discord playback if any is active,
    and posts an interrupt request to the RVC service. Failures of the
    interrupt request are logged and not raised.
    """
    logger.info("Canceling TTS synthesis")

    # Clear the flag before any awaiting: the token loop in
    # _generate_voice_response checks it between chunks, so clearing it only
    # after the HTTP round-trip would keep feeding tokens to TTS meanwhile.
    self.miku_speaking = False

    # Stop Discord playback
    if self.voice_client and self.voice_client.is_playing():
        self.voice_client.stop()

    # Send interrupt to RVC
    try:
        import aiohttp

        async with aiohttp.ClientSession() as session:
            async with session.post(
                "http://172.25.0.1:8765/interrupt",
                # Bounded wait so an unresponsive service cannot stall the
                # interruption path
                timeout=aiohttp.ClientTimeout(total=5),
            ) as resp:
                if resp.status == 200:
                    logger.info("✓ TTS interrupted")
                else:
                    logger.warning(f"TTS interrupt returned HTTP {resp.status}")
    except Exception as e:
        logger.error(f"Failed to interrupt TTS: {e}")
|
||||
|
||||
|
||||
# Global singleton instance
|
||||
|
||||
Reference in New Issue
Block a user