Implemented an experimental, real production-ready voice chat and relegated the old flow to voice debug mode. Added a new Web UI panel for Voice Chat.

2026-01-20 23:06:17 +02:00
parent 362108f4b0
commit 2934efba22
31 changed files with 5408 additions and 357 deletions
--- a/bot/utils/voice_receiver.py
+++ b/bot/utils/voice_receiver.py
@@ -8,6 +8,8 @@ Uses the discord-ext-voice-recv extension for proper audio receiving support.
 import asyncio
 import audioop
 import logging
+import struct
+import array
 from typing import Dict, Optional
 from collections import deque

@@ -27,13 +29,13 @@ class VoiceReceiverSink(voice_recv.AudioSink):
    decodes/resamples as needed, and sends to STT clients for transcription.
    """
    
-    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8766/ws/stt"):
+    def __init__(self, voice_manager, stt_url: str = "ws://miku-stt:8766"):
        """
        Initialize Voice Receiver.
        
        Args:
            voice_manager: The voice manager instance
-            stt_url: Base URL for STT WebSocket server with path (port 8766 inside container)
+            stt_url: WebSocket URL for RealtimeSTT server (port 8766 inside container)
        """
        super().__init__()
        self.voice_manager = voice_manager
@@ -72,6 +74,68 @@ class VoiceReceiverSink(voice_recv.AudioSink):
        
        logger.info("VoiceReceiverSink initialized")
    
+    @staticmethod
+    def _preprocess_audio(pcm_data: bytes) -> bytes:
+        """
+        Preprocess one PCM chunk for better STT accuracy (stateless per call).
+        
+        Applies:
+        1. DC offset removal (clamped so it cannot overflow int16)
+        2. First-order high-pass / DC-blocking filter to remove rumble
+        3. RMS normalization towards ~10% of full scale (gain capped at 4x)
+        
+        Args:
+            pcm_data: Raw PCM audio (16-bit mono, 16kHz)
+            
+        Returns:
+            Preprocessed PCM audio; pcm_data unchanged if processing fails
+        """
+        try:
+            # Convert bytes to array of int16 samples
+            samples = array.array('h', pcm_data)
+            
+            # 1. Remove DC offset (mean); clamp, since s - mean can leave int16 range
+            mean = sum(samples) / len(samples) if samples else 0
+            samples = array.array('h', [max(-32768, min(32767, int(s - mean))) for s in samples])
+            
+            # 2. Simple first-order high-pass filter (filter state restarts every call)
+            # y[n] = x[n] - x[n-1] + alpha * y[n-1]
+            alpha = 0.95  # Pole position; cutoff is about (1 - alpha) * 16000 / (2*pi) ~ 127Hz
+            filtered = array.array('h')
+            prev_input = 0
+            prev_output = 0
+            
+            for sample in samples:
+                output = sample - prev_input + alpha * prev_output
+                filtered.append(int(max(-32768, min(32767, output))))  # Clamp to int16 range
+                prev_input = sample
+                prev_output = output
+            
+            # 3. RMS normalization to target level
+            # Calculate RMS
+            sum_squares = sum(s * s for s in filtered)
+            rms = (sum_squares / len(filtered)) ** 0.5 if filtered else 1.0
+            
+            # Target RMS (roughly -20dB)
+            target_rms = 3276.8  # 10% of max int16 range
+            
+            # Normalize (boost or attenuate) only when there is actual signal
+            if rms > 100:  # Below this the chunk is treated as silence/noise
+                gain = target_rms / rms
+                # Limit gain to prevent over-amplification of noise
+                gain = min(gain, 4.0)  # Max 12dB boost
+                normalized = array.array('h', [
+                    int(max(-32768, min(32767, s * gain))) for s in filtered
+                ])
+                return normalized.tobytes()
+            else:
+                # Signal too weak, return filtered without normalization
+                return filtered.tobytes()
+                
+        except Exception as e:
+            logger.debug(f"Audio preprocessing failed, using raw audio: {e}")
+            return pcm_data
+    
    def wants_opus(self) -> bool:
        """
        Tell discord-ext-voice-recv we want Opus data, NOT decoded PCM.
@@ -144,6 +208,10 @@ class VoiceReceiverSink(voice_recv.AudioSink):
            # Discord sends 20ms chunks: 960 samples @ 48kHz → 320 samples @ 16kHz
            pcm_16k, _ = audioop.ratecv(pcm_mono, 2, 1, 48000, 16000, None)
            
+            # Preprocess audio for better STT accuracy
+            # (DC offset removal, high-pass filter, RMS normalization)
+            pcm_16k = self._preprocess_audio(pcm_16k)
+            
            # Send to STT client (schedule on event loop thread-safely)
            asyncio.run_coroutine_threadsafe(
                self._send_audio_chunk(user_id, pcm_16k),
@@ -184,21 +252,16 @@ class VoiceReceiverSink(voice_recv.AudioSink):
        self.audio_buffers[user_id] = deque(maxlen=1000)
        
        # Create STT client with callbacks
+        # RealtimeSTT handles VAD internally, so we only need partial/final callbacks
        stt_client = STTClient(
            user_id=user_id,
            stt_url=self.stt_url,
-            on_vad_event=lambda event: asyncio.create_task(
-                self._on_vad_event(user_id, event)
-            ),
            on_partial_transcript=lambda text, timestamp: asyncio.create_task(
                self._on_partial_transcript(user_id, text)
            ),
            on_final_transcript=lambda text, timestamp: asyncio.create_task(
                self._on_final_transcript(user_id, text, user)
            ),
-            on_interruption=lambda prob: asyncio.create_task(
-                self._on_interruption(user_id, prob)
-            )
        )
        
        # Connect to STT server
@@ -279,16 +342,16 @@ class VoiceReceiverSink(voice_recv.AudioSink):
        """
        Send audio chunk to STT client.
        
-        Buffers audio until we have 512 samples (32ms @ 16kHz) which is what
-        Silero VAD expects. Discord sends 320 samples (20ms), so we buffer
-        2 chunks and send 640 samples, then the STT server can split it.
+        RealtimeSTT expects 16kHz mono 16-bit PCM audio.
+        We buffer audio to send larger chunks for efficiency.
+        VAD and silence detection is handled by RealtimeSTT.
        
        Args:
            user_id: Discord user ID
-            audio_data: PCM audio (int16, 16kHz mono, 320 samples = 640 bytes)
+            audio_data: PCM audio (int16, 16kHz mono)
        """
        stt_client = self.stt_clients.get(user_id)
-        if not stt_client or not stt_client.is_connected():
+        if not stt_client or not stt_client.connected:
            return
        
        try:
@@ -299,11 +362,9 @@ class VoiceReceiverSink(voice_recv.AudioSink):
            buffer = self.audio_buffers[user_id]
            buffer.append(audio_data)
            
-            # Silero VAD expects 512 samples @ 16kHz (1024 bytes)
-            # Discord gives us 320 samples (640 bytes) every 20ms
-            # Buffer 2 chunks = 640 samples = 1280 bytes, send as one chunk
-            SAMPLES_NEEDED = 512  # What VAD wants
-            BYTES_NEEDED = SAMPLES_NEEDED * 2  # int16 = 2 bytes per sample
+            # Buffer and send in larger chunks for efficiency
+            # RealtimeSTT will handle VAD internally
+            BYTES_NEEDED = 1024  # 512 samples * 2 bytes
            
            # Check if we have enough buffered audio
            total_bytes = sum(len(chunk) for chunk in buffer)
@@ -313,16 +374,10 @@ class VoiceReceiverSink(voice_recv.AudioSink):
                combined = b''.join(buffer)
                buffer.clear()
                
-                # Send in 512-sample (1024-byte) chunks
-                for i in range(0, len(combined), BYTES_NEEDED):
-                    chunk = combined[i:i+BYTES_NEEDED]
-                    if len(chunk) == BYTES_NEEDED:
-                        await stt_client.send_audio(chunk)
-                    else:
-                        # Put remaining partial chunk back in buffer
-                        buffer.append(chunk)
+                # Send all audio to STT (RealtimeSTT handles VAD internally)
+                await stt_client.send_audio(combined)
                
-                # Track audio time for silence detection
+                # Track audio time for interruption detection
                import time
                current_time = time.time()
                self.last_audio_time[user_id] = current_time
@@ -331,103 +386,57 @@ class VoiceReceiverSink(voice_recv.AudioSink):
                # Check if Miku is speaking and user is interrupting
                # Note: self.voice_manager IS the VoiceSession, not the VoiceManager singleton
                miku_speaking = self.voice_manager.miku_speaking
-                logger.debug(f"[INTERRUPTION CHECK] user={user_id}, miku_speaking={miku_speaking}")
                
                if miku_speaking:
-                    # Track interruption
-                    if user_id not in self.interruption_start_time:
-                        # First chunk during Miku's speech
-                        self.interruption_start_time[user_id] = current_time
-                        self.interruption_audio_count[user_id] = 1
+                    # Calculate RMS to detect if user is actually speaking
+                    # (not just silence/background noise)
+                    rms = audioop.rms(combined, 2)
+                    RMS_THRESHOLD = 500  # Adjust threshold - higher = less sensitive
+                    
+                    if rms > RMS_THRESHOLD:
+                        # User is actually speaking - track as potential interruption
+                        if user_id not in self.interruption_start_time:
+                            # First chunk during Miku's speech with actual audio
+                            self.interruption_start_time[user_id] = current_time
+                            self.interruption_audio_count[user_id] = 1
+                            logger.debug(f"Potential interruption start (rms={rms})")
+                        else:
+                            # Increment chunk count
+                            self.interruption_audio_count[user_id] += 1
+                        
+                        # Calculate interruption duration
+                        interruption_duration = current_time - self.interruption_start_time[user_id]
+                        chunk_count = self.interruption_audio_count[user_id]
+                        
+                        # Check if interruption threshold is met
+                        if (interruption_duration >= self.interruption_threshold_time and 
+                            chunk_count >= self.interruption_threshold_chunks):
+                            
+                            # Trigger interruption!
+                            logger.info(f"🛑 User {user_id} interrupted Miku (duration={interruption_duration:.2f}s, chunks={chunk_count}, rms={rms})")
+                            logger.info(f"   → Stopping Miku's TTS and LLM, will process user's speech when finished")
+                            
+                            # Reset interruption tracking
+                            self.interruption_start_time.pop(user_id, None)
+                            self.interruption_audio_count.pop(user_id, None)
+                            
+                            # Call interruption handler (this sets miku_speaking=False)
+                            asyncio.create_task(
+                                self.voice_manager.on_user_interruption(user_id)
+                            )
                    else:
-                        # Increment chunk count
-                        self.interruption_audio_count[user_id] += 1
-                    
-                    # Calculate interruption duration
-                    interruption_duration = current_time - self.interruption_start_time[user_id]
-                    chunk_count = self.interruption_audio_count[user_id]
-                    
-                    # Check if interruption threshold is met
-                    if (interruption_duration >= self.interruption_threshold_time and 
-                        chunk_count >= self.interruption_threshold_chunks):
-                        
-                        # Trigger interruption!
-                        logger.info(f"🛑 User {user_id} interrupted Miku (duration={interruption_duration:.2f}s, chunks={chunk_count})")
-                        logger.info(f"   → Stopping Miku's TTS and LLM, will process user's speech when finished")
-                        
-                        # Reset interruption tracking
+                        # Audio below RMS threshold (silence) - reset interruption tracking
+                        # This ensures brief pauses in speech reset the counter
                        self.interruption_start_time.pop(user_id, None)
                        self.interruption_audio_count.pop(user_id, None)
-                        
-                        # Call interruption handler (this sets miku_speaking=False)
-                        asyncio.create_task(
-                            self.voice_manager.on_user_interruption(user_id)
-                        )
                else:
                    # Miku not speaking, clear interruption tracking
                    self.interruption_start_time.pop(user_id, None)
                    self.interruption_audio_count.pop(user_id, None)
-                
-                # Cancel existing silence task if any
-                if user_id in self.silence_tasks and not self.silence_tasks[user_id].done():
-                    self.silence_tasks[user_id].cancel()
-                
-                # Start new silence detection task
-                self.silence_tasks[user_id] = asyncio.create_task(
-                    self._detect_silence(user_id)
-                )
                        
        except Exception as e:
            logger.error(f"Failed to send audio chunk for user {user_id}: {e}")
    
-    async def _detect_silence(self, user_id: int):
-        """
-        Wait for silence timeout and send 'final' command to STT.
-        
-        This is called after each audio chunk. If no more audio arrives within
-        the silence_timeout period, we send the 'final' command to get the
-        complete transcription.
-        
-        Args:
-            user_id: Discord user ID
-        """
-        try:
-            # Wait for silence timeout
-            await asyncio.sleep(self.silence_timeout)
-            
-            # Check if we still have an active STT client
-            stt_client = self.stt_clients.get(user_id)
-            if not stt_client or not stt_client.is_connected():
-                return
-            
-            # Send final command to get complete transcription
-            logger.debug(f"Silence detected for user {user_id}, requesting final transcript")
-            await stt_client.send_final()
-            
-        except asyncio.CancelledError:
-            # Task was cancelled because new audio arrived
-            pass
-        except Exception as e:
-            logger.error(f"Error in silence detection for user {user_id}: {e}")
-    
-    async def _on_vad_event(self, user_id: int, event: dict):
-        """
-        Handle VAD event from STT.
-        
-        Args:
-            user_id: Discord user ID
-            event: VAD event dictionary with 'event' and 'probability' keys
-        """
-        user = self.users.get(user_id)
-        event_type = event.get('event', 'unknown')
-        probability = event.get('probability', 0.0)
-        
-        logger.debug(f"VAD [{user.name if user else user_id}]: {event_type} (prob={probability:.3f})")
-        
-        # Notify voice manager - pass the full event dict
-        if hasattr(self.voice_manager, 'on_user_vad_event'):
-            await self.voice_manager.on_user_vad_event(user_id, event)
-    
    async def _on_partial_transcript(self, user_id: int, text: str):
        """
        Handle partial transcript from STT.
@@ -438,7 +447,6 @@ class VoiceReceiverSink(voice_recv.AudioSink):
        """
        user = self.users.get(user_id)
        logger.info(f"[VOICE_RECEIVER] Partial [{user.name if user else user_id}]: {text}")
-        print(f"[DEBUG] PARTIAL TRANSCRIPT RECEIVED: {text}")  # Extra debug
        
        # Notify voice manager
        if hasattr(self.voice_manager, 'on_partial_transcript'):
@@ -456,29 +464,11 @@ class VoiceReceiverSink(voice_recv.AudioSink):
            user: Discord user object
        """
        logger.info(f"[VOICE_RECEIVER] Final [{user.name if user else user_id}]: {text}")
-        print(f"[DEBUG] FINAL TRANSCRIPT RECEIVED: {text}")  # Extra debug
        
        # Notify voice manager - THIS TRIGGERS LLM RESPONSE
        if hasattr(self.voice_manager, 'on_final_transcript'):
            await self.voice_manager.on_final_transcript(user_id, text)
    
-    async def _on_interruption(self, user_id: int, probability: float):
-        """
-        Handle interruption detection from STT.
-        
-        This cancels Miku's current speech if user interrupts.
-        
-        Args:
-            user_id: Discord user ID
-            probability: Interruption confidence probability
-        """
-        user = self.users.get(user_id)
-        logger.info(f"Interruption from [{user.name if user else user_id}] (prob={probability:.3f})")
-        
-        # Notify voice manager - THIS CANCELS MIKU'S SPEECH
-        if hasattr(self.voice_manager, 'on_user_interruption'):
-            await self.voice_manager.on_user_interruption(user_id, probability)
-    
    def get_listening_users(self) -> list:
        """
        Get list of users currently being listened to.
@@ -489,30 +479,10 @@ class VoiceReceiverSink(voice_recv.AudioSink):
        return [
            {
                'user_id': user_id,
-                'username': user.name if user else 'Unknown',
-                'connected': client.is_connected()
+                'username': self.users.get(user_id, {}).name if self.users.get(user_id) else 'Unknown',
+                'connected': self.stt_clients.get(user_id, {}).connected if self.stt_clients.get(user_id) else False
            }
-            for user_id, (user, client) in 
-            [(uid, (self.users.get(uid), self.stt_clients.get(uid))) 
-             for uid in self.stt_clients.keys()]
+            for user_id in self.stt_clients.keys()
        ]
    
-    @voice_recv.AudioSink.listener()
-    def on_voice_member_speaking_start(self, member: discord.Member):
-        """
-        Called when a member starts speaking (green circle appears).
-        
-        This is a virtual event from discord-ext-voice-recv based on packet activity.
-        """
-        if member.id in self.stt_clients:
-            logger.debug(f"🎤 {member.name} started speaking")
-    
-    @voice_recv.AudioSink.listener()
-    def on_voice_member_speaking_stop(self, member: discord.Member):
-        """
-        Called when a member stops speaking (green circle disappears).
-        
-        This is a virtual event from discord-ext-voice-recv based on packet activity.
-        """
-        if member.id in self.stt_clients:
-            logger.debug(f"🔇 {member.name} stopped speaking")
+    # Discord VAD events removed - we rely entirely on RealtimeSTT's VAD for speech detection