Implemented experimental real production ready voice chat, relegated old flow to voice debug mode. New Web UI panel for Voice Chat.

2026-01-20 23:06:17 +02:00
parent 362108f4b0
commit 2934efba22
31 changed files with 5408 additions and 357 deletions
--- a/stt-realtime/Dockerfile
+++ b/stt-realtime/Dockerfile
@@ -0,0 +1,58 @@
+# RealtimeSTT Container
+# Uses Faster-Whisper with CUDA for GPU-accelerated inference
+# Includes dual VAD (WebRTC + Silero) for robust voice detection
+
+FROM nvidia/cuda:12.6.2-cudnn-runtime-ubuntu22.04
+
+# Prevent interactive prompts during build
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.11 \
+    python3.11-venv \
+    python3.11-dev \
+    python3-pip \
+    build-essential \
+    ffmpeg \
+    libsndfile1 \
+    libportaudio2 \
+    portaudio19-dev \
+    git \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip
+RUN python3.11 -m pip install --upgrade pip
+
+# Copy requirements first (for Docker layer caching)
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN python3.11 -m pip install --no-cache-dir -r requirements.txt
+
+# Install PyTorch with CUDA 12.1 support (compatible with CUDA 12.6)
+RUN python3.11 -m pip install --no-cache-dir \
+    torch==2.5.1+cu121 \
+    torchaudio==2.5.1+cu121 \
+    --index-url https://download.pytorch.org/whl/cu121
+
+# Copy application code
+COPY stt_server.py .
+
+# Create models directory (models will be downloaded on first run)
+RUN mkdir -p /root/.cache/huggingface
+
+# Expose WebSocket port
+EXPOSE 8766
+
+# Health check - use netcat to check if port is listening
+HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
+    CMD python3.11 -c "import socket; s=socket.socket(); s.settimeout(2); s.connect(('localhost', 8766)); s.close()" || exit 1
+
+# Run the server
+CMD ["python3.11", "stt_server.py"]
--- a/stt-realtime/requirements.txt
+++ b/stt-realtime/requirements.txt
@@ -0,0 +1,19 @@
+# RealtimeSTT dependencies
+RealtimeSTT>=0.3.104
+websockets>=12.0
+numpy>=1.24.0
+
+# For faster-whisper backend (GPU accelerated)
+faster-whisper>=1.0.0
+ctranslate2>=4.4.0
+
+# Audio processing
+soundfile>=0.12.0
+librosa>=0.10.0
+
+# VAD dependencies (included with RealtimeSTT but explicit)
+webrtcvad>=2.0.10
+silero-vad>=5.1
+
+# Utilities
+aiohttp>=3.9.0
--- a/stt-realtime/stt_server.py
+++ b/stt-realtime/stt_server.py
@@ -0,0 +1,525 @@
+#!/usr/bin/env python3
+"""
+RealtimeSTT WebSocket Server
+
+Provides real-time speech-to-text transcription using Faster-Whisper.
+Receives audio chunks via WebSocket and streams back partial/final transcripts.
+
+Protocol:
+- Client sends: binary audio data (16kHz, 16-bit mono PCM)
+- Client sends: JSON {"command": "reset"} to reset state
+- Server sends: JSON {"type": "partial", "text": "...", "timestamp": float}
+- Server sends: JSON {"type": "final", "text": "...", "timestamp": float}
+"""
+
+import asyncio
+import json
+import logging
+import time
+import threading
+import queue
+from typing import Optional, Dict, Any
+import numpy as np
+import websockets
+from websockets.server import serve
+from aiohttp import web
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s %(levelname)s [%(name)s] %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger('stt-realtime')
+
+# Import RealtimeSTT
+from RealtimeSTT import AudioToTextRecorder
+
+# Global warmup state
+warmup_complete = False
+warmup_lock = threading.Lock()
+warmup_recorder = None
+
+
+class STTSession:
+    """
+    Manages a single STT session for a WebSocket client.
+    Uses RealtimeSTT's AudioToTextRecorder with feed_audio() method.
+    """
+    
+    def __init__(self, websocket, session_id: str, config: Dict[str, Any]):
+        self.websocket = websocket
+        self.session_id = session_id
+        self.config = config
+        self.recorder: Optional[AudioToTextRecorder] = None
+        self.running = False
+        self.audio_queue = queue.Queue()
+        self.feed_thread: Optional[threading.Thread] = None
+        self.last_partial = ""
+        self.last_stabilized = ""  # Track last stabilized partial
+        self.last_text_was_stabilized = False  # Track which came last
+        self.recording_active = False  # Track if currently recording
+        
+        logger.info(f"[{session_id}] Session created")
+    
+    def _on_realtime_transcription(self, text: str):
+        """Called when partial transcription is available."""
+        if text and text != self.last_partial:
+            self.last_partial = text
+            self.last_text_was_stabilized = False  # Partial came after stabilized
+            logger.info(f"[{self.session_id}] 📝 Partial: {text}")
+            asyncio.run_coroutine_threadsafe(
+                self._send_transcript("partial", text),
+                self.loop
+            )
+    
+    def _on_realtime_stabilized(self, text: str):
+        """Called when a stabilized partial is available (high confidence)."""
+        if text and text.strip():
+            self.last_stabilized = text
+            self.last_text_was_stabilized = True  # Stabilized came after partial
+            logger.info(f"[{self.session_id}] 🔒 Stabilized: {text}")
+            asyncio.run_coroutine_threadsafe(
+                self._send_transcript("partial", text),
+                self.loop
+            )
+    
+    def _on_recording_stop(self):
+        """Called when recording stops (silence detected)."""
+        logger.info(f"[{self.session_id}] ⏹️ Recording stopped")
+        self.recording_active = False
+        
+        # Use the most recent text: prioritize whichever came last
+        if self.last_text_was_stabilized:
+            final_text = self.last_stabilized or self.last_partial
+            source = "stabilized" if self.last_stabilized else "partial"
+        else:
+            final_text = self.last_partial or self.last_stabilized
+            source = "partial" if self.last_partial else "stabilized"
+        
+        if final_text:
+            logger.info(f"[{self.session_id}] ✅ Final (from {source}): {final_text}")
+            asyncio.run_coroutine_threadsafe(
+                self._send_transcript("final", final_text),
+                self.loop
+            )
+        else:
+            # No transcript means VAD false positive (detected "speech" in pure noise)
+            logger.warning(f"[{self.session_id}] ⚠️  Recording stopped but no transcript available (VAD false positive)")
+            logger.info(f"[{self.session_id}] 🔄 Clearing audio buffer to recover")
+            
+            # Clear the audio queue to prevent stale data
+            try:
+                while not self.audio_queue.empty():
+                    self.audio_queue.get_nowait()
+            except Exception:
+                pass
+        
+        # Reset state
+        self.last_stabilized = ""
+        self.last_partial = ""
+        self.last_text_was_stabilized = False
+    
+    def _on_recording_start(self):
+        """Called when recording starts (speech detected)."""
+        logger.info(f"[{self.session_id}] 🎙️ Recording started")
+        self.recording_active = True
+        self.last_stabilized = ""
+        self.last_partial = ""
+    
+    def _on_transcription(self, text: str):
+        """Not used - we use stabilized partials as finals."""
+        pass
+    
+    async def _send_transcript(self, transcript_type: str, text: str):
+        """Send transcript to client via WebSocket."""
+        try:
+            message = {
+                "type": transcript_type,
+                "text": text,
+                "timestamp": time.time()
+            }
+            await self.websocket.send(json.dumps(message))
+        except Exception as e:
+            logger.error(f"[{self.session_id}] Failed to send transcript: {e}")
+    
+    def _feed_audio_thread(self):
+        """Thread that feeds audio to the recorder."""
+        logger.info(f"[{self.session_id}] Audio feed thread started")
+        while self.running:
+            try:
+                # Get audio chunk with timeout
+                audio_chunk = self.audio_queue.get(timeout=0.1)
+                if audio_chunk is not None and self.recorder:
+                    self.recorder.feed_audio(audio_chunk)
+            except queue.Empty:
+                continue
+            except Exception as e:
+                logger.error(f"[{self.session_id}] Error feeding audio: {e}")
+        logger.info(f"[{self.session_id}] Audio feed thread stopped")
+    
+    async def start(self, loop: asyncio.AbstractEventLoop):
+        """Start the STT session."""
+        self.loop = loop
+        self.running = True
+        
+        logger.info(f"[{self.session_id}] Starting RealtimeSTT recorder...")
+        logger.info(f"[{self.session_id}] Model: {self.config['model']}")
+        logger.info(f"[{self.session_id}] Device: {self.config['device']}")
+        
+        try:
+            # Create recorder in a thread to avoid blocking
+            def init_recorder():
+                self.recorder = AudioToTextRecorder(
+                    # Model settings - using same model for both partial and final
+                    model=self.config['model'],
+                    language=self.config['language'],
+                    compute_type=self.config['compute_type'],
+                    device=self.config['device'],
+                    
+                    # Disable microphone - we feed audio manually
+                    use_microphone=False,
+                    
+                    # Real-time transcription - use same model for everything
+                    enable_realtime_transcription=True,
+                    realtime_model_type=self.config['model'],  # Use same model
+                    realtime_processing_pause=0.05,  # 50ms between updates
+                    on_realtime_transcription_update=self._on_realtime_transcription,
+                    on_realtime_transcription_stabilized=self._on_realtime_stabilized,
+                    
+                    # VAD settings - very permissive, rely on Discord's VAD for speech detection
+                    # Our VAD is only for silence detection, not filtering audio content
+                    silero_sensitivity=0.05,  # Very low = barely filters anything
+                    silero_use_onnx=True,  # Faster
+                    webrtc_sensitivity=3,
+                    post_speech_silence_duration=self.config['silence_duration'],
+                    min_length_of_recording=self.config['min_recording_length'],
+                    min_gap_between_recordings=self.config['min_gap'],
+                    pre_recording_buffer_duration=1.0,  # Capture more audio before/after speech
+                    
+                    # Callbacks
+                    on_recording_start=self._on_recording_start,
+                    on_recording_stop=self._on_recording_stop,
+                    on_vad_detect_start=lambda: logger.debug(f"[{self.session_id}] VAD listening"),
+                    on_vad_detect_stop=lambda: logger.debug(f"[{self.session_id}] VAD stopped"),
+                    
+                    # Other settings
+                    spinner=False,  # No spinner in container
+                    level=logging.WARNING,  # Reduce internal logging
+                    
+                    # Beam search settings
+                    beam_size=5,  # Higher beam = better accuracy (used for final processing)
+                    beam_size_realtime=5,  # Increased from 3 for better real-time accuracy
+                    
+                    # Batch sizes
+                    batch_size=16,
+                    realtime_batch_size=8,
+                    
+                    initial_prompt="",  # Can add context here if needed
+                )
+                logger.info(f"[{self.session_id}] ✅ Recorder initialized")
+            
+            # Run initialization in thread pool
+            await asyncio.get_event_loop().run_in_executor(None, init_recorder)
+            
+            # Start audio feed thread
+            self.feed_thread = threading.Thread(target=self._feed_audio_thread, daemon=True)
+            self.feed_thread.start()
+            
+            # Start the recorder's text processing loop in a thread
+            def run_text_loop():
+                while self.running:
+                    try:
+                        # This blocks until speech is detected and transcribed
+                        text = self.recorder.text(self._on_transcription)
+                    except Exception as e:
+                        if self.running:
+                            logger.error(f"[{self.session_id}] Text loop error: {e}")
+                        break
+            
+            self.text_thread = threading.Thread(target=run_text_loop, daemon=True)
+            self.text_thread.start()
+            
+            logger.info(f"[{self.session_id}] ✅ Session started successfully")
+            
+        except Exception as e:
+            logger.error(f"[{self.session_id}] Failed to start session: {e}", exc_info=True)
+            raise
+    
+    def feed_audio(self, audio_data: bytes):
+        """Feed audio data to the recorder."""
+        if self.running:
+            # Convert bytes to numpy array (16-bit PCM)
+            audio_np = np.frombuffer(audio_data, dtype=np.int16)
+            self.audio_queue.put(audio_np)
+    
+    def reset(self):
+        """Reset the session state."""
+        logger.info(f"[{self.session_id}] Resetting session")
+        self.last_partial = ""
+        # Clear audio queue
+        while not self.audio_queue.empty():
+            try:
+                self.audio_queue.get_nowait()
+            except queue.Empty:
+                break
+    
+    async def stop(self):
+        """Stop the session and cleanup."""
+        logger.info(f"[{self.session_id}] Stopping session...")
+        self.running = False
+        
+        # Wait for threads to finish
+        if self.feed_thread and self.feed_thread.is_alive():
+            self.feed_thread.join(timeout=2)
+        
+        # Shutdown recorder
+        if self.recorder:
+            try:
+                self.recorder.shutdown()
+            except Exception as e:
+                logger.error(f"[{self.session_id}] Error shutting down recorder: {e}")
+        
+        logger.info(f"[{self.session_id}] Session stopped")
+
+
+class STTServer:
+    """
+    WebSocket server for RealtimeSTT.
+    Handles multiple concurrent clients (one per Discord user).
+    """
+    
+    def __init__(self, host: str = "0.0.0.0", port: int = 8766):
+        self.host = host
+        self.port = port
+        self.sessions: Dict[str, STTSession] = {}
+        self.session_counter = 0
+        
+        # Default configuration
+        self.config = {
+            # Model - using small.en (English-only, more accurate than multilingual small)
+            'model': 'small.en',
+            'language': 'en',
+            'compute_type': 'float16',  # FP16 for GPU efficiency
+            'device': 'cuda',
+            
+            # VAD settings
+            'silero_sensitivity': 0.6,
+            'webrtc_sensitivity': 3,
+            'silence_duration': 0.8,  # Shorter to improve responsiveness
+            'min_recording_length': 0.5,
+            'min_gap': 0.3,
+        }
+        
+        logger.info("=" * 60)
+        logger.info("RealtimeSTT Server Configuration:")
+        logger.info(f"  Host: {host}:{port}")
+        logger.info(f"  Model: {self.config['model']} (English-only, optimized)")
+        logger.info(f"  Beam size: 5 (higher accuracy)")
+        logger.info(f"  Strategy: Use last partial as final (instant response)")
+        logger.info(f"  Language: {self.config['language']}")
+        logger.info(f"  Device: {self.config['device']}")
+        logger.info(f"  Compute Type: {self.config['compute_type']}")
+        logger.info(f"  Silence Duration: {self.config['silence_duration']}s")
+        logger.info("=" * 60)
+    
+    async def handle_client(self, websocket):
+        """Handle a WebSocket client connection."""
+        self.session_counter += 1
+        session_id = f"session_{self.session_counter}"
+        session = None
+        
+        try:
+            logger.info(f"[{session_id}] Client connected from {websocket.remote_address}")
+            
+            # Create session
+            session = STTSession(websocket, session_id, self.config)
+            self.sessions[session_id] = session
+            
+            # Start session
+            await session.start(asyncio.get_event_loop())
+            
+            # Process messages
+            async for message in websocket:
+                try:
+                    if isinstance(message, bytes):
+                        # Binary audio data
+                        session.feed_audio(message)
+                    else:
+                        # JSON command
+                        data = json.loads(message)
+                        command = data.get('command', '')
+                        
+                        if command == 'reset':
+                            session.reset()
+                        elif command == 'ping':
+                            await websocket.send(json.dumps({
+                                'type': 'pong',
+                                'timestamp': time.time()
+                            }))
+                        else:
+                            logger.warning(f"[{session_id}] Unknown command: {command}")
+                            
+                except json.JSONDecodeError:
+                    logger.warning(f"[{session_id}] Invalid JSON message")
+                except Exception as e:
+                    logger.error(f"[{session_id}] Error processing message: {e}")
+        
+        except websockets.exceptions.ConnectionClosed:
+            logger.info(f"[{session_id}] Client disconnected")
+        except Exception as e:
+            logger.error(f"[{session_id}] Error: {e}", exc_info=True)
+        finally:
+            # Cleanup
+            if session:
+                await session.stop()
+                del self.sessions[session_id]
+    
+    async def run(self):
+        """Run the WebSocket server."""
+        logger.info(f"Starting RealtimeSTT server on ws://{self.host}:{self.port}")
+        
+        async with serve(
+            self.handle_client,
+            self.host,
+            self.port,
+            ping_interval=30,
+            ping_timeout=10,
+            max_size=10 * 1024 * 1024,  # 10MB max message size
+        ):
+            logger.info("✅ Server ready and listening for connections")
+            await asyncio.Future()  # Run forever
+
+
+async def warmup_model(config: Dict[str, Any]):
+    """
+    Warm up the STT model by loading it and processing test audio.
+    This ensures the model is cached in memory before handling real requests.
+    """
+    global warmup_complete, warmup_recorder
+    
+    with warmup_lock:
+        if warmup_complete:
+            logger.info("Model already warmed up")
+            return
+        
+        logger.info("🔥 Starting model warmup...")
+        try:
+            # Generate silent test audio (1 second of silence, 16kHz)
+            test_audio = np.zeros(16000, dtype=np.int16)
+            
+            # Initialize a temporary recorder to load the model
+            logger.info("Loading Faster-Whisper model...")
+            
+            def dummy_callback(text):
+                pass
+            
+            # This will trigger model loading and compilation
+            warmup_recorder = AudioToTextRecorder(
+                model=config['model'],
+                language=config['language'],
+                compute_type=config['compute_type'],
+                device=config['device'],
+                silero_sensitivity=config['silero_sensitivity'],
+                webrtc_sensitivity=config['webrtc_sensitivity'],
+                post_speech_silence_duration=config['silence_duration'],
+                min_length_of_recording=config['min_recording_length'],
+                min_gap_between_recordings=config['min_gap'],
+                enable_realtime_transcription=True,
+                realtime_processing_pause=0.1,
+                on_realtime_transcription_update=dummy_callback,
+                on_realtime_transcription_stabilized=dummy_callback,
+                spinner=False,
+                level=logging.WARNING,
+                beam_size=5,
+                beam_size_realtime=5,
+                batch_size=16,
+                realtime_batch_size=8,
+                initial_prompt="",
+            )
+            
+            logger.info("✅ Model loaded and warmed up successfully")
+            warmup_complete = True
+            
+        except Exception as e:
+            logger.error(f"❌ Warmup failed: {e}", exc_info=True)
+            warmup_complete = False
+
+
+async def health_handler(request):
+    """HTTP health check endpoint"""
+    if warmup_complete:
+        return web.json_response({
+            "status": "ready",
+            "warmed_up": True,
+            "model": "small.en",
+            "device": "cuda"
+        })
+    else:
+        return web.json_response({
+            "status": "warming_up",
+            "warmed_up": False,
+            "model": "small.en",
+            "device": "cuda"
+        }, status=503)
+
+
+async def start_http_server(host: str, http_port: int):
+    """Start HTTP server for health checks"""
+    app = web.Application()
+    app.router.add_get('/health', health_handler)
+    
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, host, http_port)
+    await site.start()
+    
+    logger.info(f"✅ HTTP health server listening on http://{host}:{http_port}")
+
+
+def main():
+    """Main entry point."""
+    import os
+    
+    # Get configuration from environment
+    host = os.environ.get('STT_HOST', '0.0.0.0')
+    port = int(os.environ.get('STT_PORT', '8766'))
+    http_port = int(os.environ.get('STT_HTTP_PORT', '8767'))  # HTTP health check port
+    
+    # Configuration
+    config = {
+        'model': 'small.en',
+        'language': 'en',
+        'compute_type': 'float16',
+        'device': 'cuda',
+        'silero_sensitivity': 0.6,
+        'webrtc_sensitivity': 3,
+        'silence_duration': 0.8,
+        'min_recording_length': 0.5,
+        'min_gap': 0.3,
+    }
+    
+    # Create and run server
+    server = STTServer(host=host, port=port)
+    
+    async def run_all():
+        # Start warmup in background
+        asyncio.create_task(warmup_model(config))
+        
+        # Start HTTP health server
+        asyncio.create_task(start_http_server(host, http_port))
+        
+        # Start WebSocket server
+        await server.run()
+    
+    try:
+        asyncio.run(run_all())
+    except KeyboardInterrupt:
+        logger.info("Server shutdown requested")
+    except Exception as e:
+        logger.error(f"Server error: {e}", exc_info=True)
+        raise
+
+
+if __name__ == '__main__':
+    main()