Decided on Parakeet ONNX Runtime. It works well, and real-time voice chat is now possible, though the UX is still lacking.

2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions
--- a/bot/utils/voice_receiver.py
+++ b/bot/utils/voice_receiver.py
@@ -27,13 +27,13 @@ class VoiceReceiverSink(voice_recv.AudioSink):
    decodes/resamples as needed, and sends to STT clients for transcription.
    """
    
-    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8000/ws/stt"):
+    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8766/ws/stt"):
        """
-        Initialize voice receiver sink.
+        Initialize Voice Receiver.
        
        Args:
-            voice_manager: Reference to VoiceManager for callbacks
-            stt_url: Base URL for STT WebSocket server with path (port 8000 inside container)
+            voice_manager: The voice manager instance
+            stt_url: Base URL for STT WebSocket server with path (port 8766 inside container)
        """
        super().__init__()
        self.voice_manager = voice_manager
@@ -56,6 +56,17 @@ class VoiceReceiverSink(voice_recv.AudioSink):
        # User info (for logging)
        self.users: Dict[int, discord.User] = {}
        
+        # Silence tracking for detecting end of speech
+        self.last_audio_time: Dict[int, float] = {}
+        self.silence_tasks: Dict[int, asyncio.Task] = {}
+        self.silence_timeout = 1.0  # seconds of silence before sending "final"
+        
+        # Interruption detection
+        self.interruption_start_time: Dict[int, float] = {}
+        self.interruption_audio_count: Dict[int, int] = {}
+        self.interruption_threshold_time = 0.8  # seconds of speech to count as interruption
+        self.interruption_threshold_chunks = 8  # minimum audio chunks to count as interruption
+        
        # Active flag
        self.active = False
        
@@ -232,6 +243,17 @@ class VoiceReceiverSink(voice_recv.AudioSink):
        if user_id in self.users:
            del self.users[user_id]
        
+        # Cancel silence detection task
+        if user_id in self.silence_tasks and not self.silence_tasks[user_id].done():
+            self.silence_tasks[user_id].cancel()
+            del self.silence_tasks[user_id]
+        if user_id in self.last_audio_time:
+            del self.last_audio_time[user_id]
+        
+        # Clear interruption tracking
+        self.interruption_start_time.pop(user_id, None)
+        self.interruption_audio_count.pop(user_id, None)
+        
        # Cleanup opus decoder for this user
        if hasattr(self, '_opus_decoders') and user_id in self._opus_decoders:
            del self._opus_decoders[user_id]
@@ -299,10 +321,95 @@ class VoiceReceiverSink(voice_recv.AudioSink):
                    else:
                        # Put remaining partial chunk back in buffer
                        buffer.append(chunk)
+                
+                # Track audio time for silence detection
+                import time
+                current_time = time.time()
+                self.last_audio_time[user_id] = current_time
+                
+                # ===== INTERRUPTION DETECTION =====
+                # Check if Miku is speaking and user is interrupting
+                # Note: self.voice_manager IS the VoiceSession, not the VoiceManager singleton
+                miku_speaking = self.voice_manager.miku_speaking
+                logger.debug(f"[INTERRUPTION CHECK] user={user_id}, miku_speaking={miku_speaking}")
+                
+                if miku_speaking:
+                    # Track interruption
+                    if user_id not in self.interruption_start_time:
+                        # First chunk during Miku's speech
+                        self.interruption_start_time[user_id] = current_time
+                        self.interruption_audio_count[user_id] = 1
+                    else:
+                        # Increment chunk count
+                        self.interruption_audio_count[user_id] += 1
+                    
+                    # Calculate interruption duration
+                    interruption_duration = current_time - self.interruption_start_time[user_id]
+                    chunk_count = self.interruption_audio_count[user_id]
+                    
+                    # Check if interruption threshold is met
+                    if (interruption_duration >= self.interruption_threshold_time and 
+                        chunk_count >= self.interruption_threshold_chunks):
+                        
+                        # Trigger interruption!
+                        logger.info(f"🛑 User {user_id} interrupted Miku (duration={interruption_duration:.2f}s, chunks={chunk_count})")
+                        logger.info(f"   → Stopping Miku's TTS and LLM, will process user's speech when finished")
+                        
+                        # Reset interruption tracking
+                        self.interruption_start_time.pop(user_id, None)
+                        self.interruption_audio_count.pop(user_id, None)
+                        
+                        # Call interruption handler (this sets miku_speaking=False)
+                        asyncio.create_task(
+                            self.voice_manager.on_user_interruption(user_id)
+                        )
+                else:
+                    # Miku not speaking, clear interruption tracking
+                    self.interruption_start_time.pop(user_id, None)
+                    self.interruption_audio_count.pop(user_id, None)
+                
+                # Cancel existing silence task if any
+                if user_id in self.silence_tasks and not self.silence_tasks[user_id].done():
+                    self.silence_tasks[user_id].cancel()
+                
+                # Start new silence detection task
+                self.silence_tasks[user_id] = asyncio.create_task(
+                    self._detect_silence(user_id)
+                )
                        
        except Exception as e:
            logger.error(f"Failed to send audio chunk for user {user_id}: {e}")
    
+    async def _detect_silence(self, user_id: int):
+        """
+        Wait for silence timeout and send 'final' command to STT.
+        
+        This is called after each audio chunk. If no more audio arrives within
+        the silence_timeout period, we send the 'final' command to get the
+        complete transcription.
+        
+        Args:
+            user_id: Discord user ID
+        """
+        try:
+            # Wait for silence timeout
+            await asyncio.sleep(self.silence_timeout)
+            
+            # Check if we still have an active STT client
+            stt_client = self.stt_clients.get(user_id)
+            if not stt_client or not stt_client.is_connected():
+                return
+            
+            # Send final command to get complete transcription
+            logger.debug(f"Silence detected for user {user_id}, requesting final transcript")
+            await stt_client.send_final()
+            
+        except asyncio.CancelledError:
+            # Task was cancelled because new audio arrived
+            pass
+        except Exception as e:
+            logger.error(f"Error in silence detection for user {user_id}: {e}")
+    
    async def _on_vad_event(self, user_id: int, event: dict):
        """
        Handle VAD event from STT.