Decided on the Parakeet ONNX Runtime; it works well. Real-time voice chat is now possible, though the UX is still lacking.

This commit is contained in:
2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions

View File

@@ -391,6 +391,12 @@ class VoiceSession:
self.voice_receiver: Optional['VoiceReceiver'] = None # STT receiver
self.active = False
self.miku_speaking = False # Track if Miku is currently speaking
self.llm_stream_task: Optional[asyncio.Task] = None # Track LLM streaming task for cancellation
self.last_interruption_time: float = 0 # Track when last interruption occurred
self.interruption_silence_duration = 0.8 # Seconds of silence after interruption before next response
# Voice chat conversation history (last 8 exchanges)
self.conversation_history = [] # List of {"role": "user"/"assistant", "content": str}
logger.info(f"VoiceSession created for {voice_channel.name} in guild {guild_id}")
@@ -496,8 +502,23 @@ class VoiceSession:
"""
Called when final transcript is received.
This triggers LLM response and TTS.
Note: If user interrupted Miku, miku_speaking will already be False
by the time this is called, so the response will proceed normally.
"""
logger.info(f"Final from user {user_id}: {text}")
logger.info(f"📝 Final transcript from user {user_id}: {text}")
# Check if Miku is STILL speaking (not interrupted)
# This prevents queueing if user speaks briefly but not long enough to interrupt
if self.miku_speaking:
logger.info(f"⏭️ Ignoring short input while Miku is speaking (user didn't interrupt long enough)")
# Get user info for notification
user = self.voice_channel.guild.get_member(user_id)
user_name = user.name if user else f"User {user_id}"
await self.text_channel.send(f"💬 *{user_name} said: \"{text}\" (interrupted but too brief - talk longer to interrupt)*")
return
logger.info(f"✓ Processing final transcript (miku_speaking={self.miku_speaking})")
# Get user info
user = self.voice_channel.guild.get_member(user_id)
@@ -505,26 +526,79 @@ class VoiceSession:
logger.warning(f"User {user_id} not found in guild")
return
# Check for stop commands (don't generate response if user wants silence)
stop_phrases = ["stop talking", "be quiet", "shut up", "stop speaking", "silence"]
if any(phrase in text.lower() for phrase in stop_phrases):
logger.info(f"🤫 Stop command detected: {text}")
await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")
await self.text_channel.send(f"🤫 *Miku goes quiet*")
return
# Show what user said
await self.text_channel.send(f"🎤 {user.name}: *\"{text}\"*")
# Generate LLM response and speak it
await self._generate_voice_response(user, text)
async def on_user_interruption(self, user_id: int):
    """
    Called when the user interrupts Miku's speech.

    This is triggered when the user speaks over Miku for long enough
    (0.8s+ with 8+ chunks). Immediately cancels LLM streaming and TTS
    synthesis, clears audio buffers, then pauses briefly so the next
    response does not start right on top of the interruption.

    Args:
        user_id: Discord user ID who interrupted.
    """
    # Ignore spurious triggers when Miku is not actually speaking.
    if not self.miku_speaking:
        return

    logger.info(f"🛑 User {user_id} interrupted Miku - canceling everything immediately")

    # Resolve the interrupting user's display name for the chat notice.
    # NOTE(review): get_member may return None for uncached members.
    user = self.voice_channel.guild.get_member(user_id)
    user_name = user.name if user else f"User {user_id}"

    # 1. Mark that Miku is no longer speaking (stops LLM streaming loop check).
    self.miku_speaking = False

    # 2. Cancel the LLM streaming task if it's running.
    if self.llm_stream_task and not self.llm_stream_task.done():
        self.llm_stream_task.cancel()
        try:
            await self.llm_stream_task
        except asyncio.CancelledError:
            logger.info("✓ LLM streaming task cancelled")
        except Exception as e:
            logger.error(f"Error cancelling LLM task: {e}")
    # Drop the reference so a finished/cancelled task is never awaited twice.
    self.llm_stream_task = None

    # 3. Cancel TTS/RVC synthesis and playback, and clear audio buffers.
    await self._cancel_tts()

    # 4. Record the interruption marker and notify chat BEFORE the cooldown
    #    sleep, so users get immediate feedback and the conversation history
    #    stays consistent even if this handler is cancelled while sleeping.
    self.conversation_history.append({
        "role": "assistant",
        "content": "[INTERRUPTED - user started speaking]"
    })
    await self.text_channel.send(f"⚠️ *{user_name} interrupted Miku*")

    # 5. Brief pause to create audible separation; gives a fade-out effect
    #    and makes the interruption less jarring.
    import time
    self.last_interruption_time = time.time()
    logger.info(f"⏸️ Pausing for {self.interruption_silence_duration}s after interruption")
    await asyncio.sleep(self.interruption_silence_duration)

    logger.info(f"✓ Interruption handled, ready for next input")
async def on_user_interruption_old(self, user_id: int, probability: float):
    """
    Legacy interruption handler (kept for compatibility).

    Called when VAD-based interruption detection is used. The VAD
    probability is accepted only for interface compatibility; handling
    is delegated entirely to the current interruption handler, which
    also posts the chat notification.
    """
    await self.on_user_interruption(user_id)
@@ -537,7 +611,18 @@ class VoiceSession:
text: Transcribed text
"""
try:
# Check if we need to wait due to recent interruption
import time
if self.last_interruption_time > 0:
time_since_interruption = time.time() - self.last_interruption_time
remaining_pause = self.interruption_silence_duration - time_since_interruption
if remaining_pause > 0:
logger.info(f"⏸️ Waiting {remaining_pause:.2f}s more before responding (interruption cooldown)")
await asyncio.sleep(remaining_pause)
logger.info(f"🎙️ Starting voice response generation (setting miku_speaking=True)")
self.miku_speaking = True
logger.info(f" → miku_speaking is now: {self.miku_speaking}")
# Show processing
await self.text_channel.send(f"💭 *Miku is thinking...*")
@@ -547,17 +632,53 @@ class VoiceSession:
import aiohttp
import globals
# Simple system prompt for voice
system_prompt = """You are Hatsune Miku, the virtual singer.
Respond naturally and concisely as Miku would in a voice conversation.
Keep responses short (1-3 sentences) since they will be spoken aloud."""
# Load personality and lore
miku_lore = ""
miku_prompt = ""
try:
with open('/app/miku_lore.txt', 'r', encoding='utf-8') as f:
miku_lore = f.read().strip()
with open('/app/miku_prompt.txt', 'r', encoding='utf-8') as f:
miku_prompt = f.read().strip()
except Exception as e:
logger.warning(f"Could not load personality files: {e}")
# Build voice chat system prompt
system_prompt = f"""{miku_prompt}
{miku_lore}
VOICE CHAT CONTEXT:
- You are currently in a voice channel speaking with {user.name} and others
- Your responses will be spoken aloud via text-to-speech
- Keep responses natural and conversational - vary your length based on context:
* Quick reactions: 1 sentence ("Oh wow!" or "That's amazing!")
* Normal chat: 2-3 sentences (share a thought or feeling)
* Stories/explanations: 4-6 sentences when asked for details
- Match the user's energy and conversation style
- IMPORTANT: Only respond in ENGLISH! The TTS system cannot handle Japanese or other languages well.
- Be expressive and use casual language, but stay in character as Miku
- If user says "stop talking" or "be quiet", acknowledge briefly and stop
Remember: This is a live voice conversation - be natural, not formulaic!"""
# Add user message to history
self.conversation_history.append({
"role": "user",
"content": f"{user.name}: {text}"
})
# Keep only last 8 exchanges (16 messages = 8 user + 8 assistant)
if len(self.conversation_history) > 16:
self.conversation_history = self.conversation_history[-16:]
# Build messages for LLM
messages = [{"role": "system", "content": system_prompt}]
messages.extend(self.conversation_history)
payload = {
"model": globals.TEXT_MODEL,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": text}
],
"messages": messages,
"stream": True,
"temperature": 0.8,
"max_tokens": 200
@@ -566,50 +687,74 @@ Keep responses short (1-3 sentences) since they will be spoken aloud."""
headers = {'Content-Type': 'application/json'}
llama_url = get_current_gpu_url()
# Stream LLM response to TTS
full_response = ""
async with aiohttp.ClientSession() as http_session:
async with http_session.post(
f"{llama_url}/v1/chat/completions",
json=payload,
headers=headers,
timeout=aiohttp.ClientTimeout(total=60)
) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"LLM error {response.status}: {error_text}")
# Stream tokens to TTS
async for line in response.content:
if not self.miku_speaking:
# Interrupted
break
# Create streaming task so we can cancel it if interrupted
async def stream_llm_to_tts():
"""Stream LLM tokens to TTS. Can be cancelled."""
full_response = ""
async with aiohttp.ClientSession() as http_session:
async with http_session.post(
f"{llama_url}/v1/chat/completions",
json=payload,
headers=headers,
timeout=aiohttp.ClientTimeout(total=60)
) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"LLM error {response.status}: {error_text}")
line = line.decode('utf-8').strip()
if line.startswith('data: '):
data_str = line[6:]
if data_str == '[DONE]':
# Stream tokens to TTS
async for line in response.content:
if not self.miku_speaking:
# Interrupted - exit gracefully
logger.info("🛑 LLM streaming stopped (miku_speaking=False)")
break
try:
import json
data = json.loads(data_str)
if 'choices' in data and len(data['choices']) > 0:
delta = data['choices'][0].get('delta', {})
content = delta.get('content', '')
if content:
await self.audio_source.send_token(content)
full_response += content
except json.JSONDecodeError:
continue
line = line.decode('utf-8').strip()
if line.startswith('data: '):
data_str = line[6:]
if data_str == '[DONE]':
break
try:
import json
data = json.loads(data_str)
if 'choices' in data and len(data['choices']) > 0:
delta = data['choices'][0].get('delta', {})
content = delta.get('content', '')
if content:
await self.audio_source.send_token(content)
full_response += content
except json.JSONDecodeError:
continue
return full_response
# Run streaming as a task that can be cancelled
self.llm_stream_task = asyncio.create_task(stream_llm_to_tts())
try:
full_response = await self.llm_stream_task
except asyncio.CancelledError:
logger.info("✓ LLM streaming cancelled by interruption")
# Don't re-raise - just return early to avoid breaking STT client
return
# Flush TTS
if self.miku_speaking:
await self.audio_source.flush()
# Add Miku's complete response to history
self.conversation_history.append({
"role": "assistant",
"content": full_response.strip()
})
# Show response
await self.text_channel.send(f"🎤 Miku: *\"{full_response.strip()}\"*")
logger.info(f"✓ Voice response complete: {full_response.strip()}")
else:
# Interrupted - don't add incomplete response to history
# (interruption marker already added by on_user_interruption)
logger.info(f"✓ Response interrupted after {len(full_response)} chars")
except Exception as e:
logger.error(f"Voice response failed: {e}", exc_info=True)
@@ -619,24 +764,50 @@ Keep responses short (1-3 sentences) since they will be spoken aloud."""
self.miku_speaking = False
async def _cancel_tts(
    self,
    interrupt_url: str = "http://172.25.0.1:8765/interrupt",
    attempts: int = 3,
):
    """
    Immediately cancel TTS synthesis and clear all audio buffers.

    This sends interrupt signals to:
      1. Local audio buffer (clears queued audio)
      2. RVC TTS server (stops synthesis pipeline)

    Does NOT stop voice_client (that would disconnect the voice receiver).

    Args:
        interrupt_url: RVC TTS server interrupt endpoint. Defaults to the
            docker-bridge address previously hard-coded here.
        attempts: Maximum number of interrupt POSTs to try before giving up.
    """
    logger.info("🛑 Canceling TTS synthesis immediately")

    # 1. FIRST: clear the local audio buffer so already-queued audio stops.
    if self.audio_source:
        try:
            await self.audio_source.clear_buffer()
            logger.info("✓ Audio buffer cleared")
        except Exception as e:
            logger.error(f"Failed to clear audio buffer: {e}")

    # 2. SECOND: tell the RVC server to stop its synthesis pipeline.
    try:
        import aiohttp
        async with aiohttp.ClientSession() as session:
            # Retry rapidly to ensure the interrupt is received.
            for i in range(attempts):
                try:
                    async with session.post(
                        interrupt_url,
                        timeout=aiohttp.ClientTimeout(total=2.0)
                    ) as resp:
                        if resp.status == 200:
                            data = await resp.json()
                            logger.info(f"✓ TTS interrupted (flushed {data.get('zmq_chunks_flushed', 0)} chunks)")
                            break
                except asyncio.TimeoutError:
                    if i < attempts - 1:  # Don't warn on the last attempt
                        logger.warning("Interrupt request timed out, retrying...")
                    continue
    except Exception as e:
        logger.error(f"Failed to interrupt TTS: {e}")

    self.miku_speaking = False

    # Note: We do NOT call voice_client.stop() because that would stop the
    # entire voice system including the receiver! The audio source will
    # just play silence until new tokens arrive.
# Global singleton instance