Compare commits

2 Commits: d1e6b21508 ... 0a8910fff8

| Author | SHA1 | Date |
|---|---|---|
| | 0a8910fff8 | |
| | 50e4f7a5f2 | |
bot/utils/error_handler.py (267 lines, Normal file)
@@ -0,0 +1,267 @@
# utils/error_handler.py

import aiohttp
import traceback
import datetime
import re
from utils.logger import get_logger

logger = get_logger('error_handler')

# Webhook URL for error notifications
ERROR_WEBHOOK_URL = "https://discord.com/api/webhooks/1462216811293708522/4kdGenpxZFsP0z3VBgebYENODKmcRrmEzoIwCN81jCirnAxuU2YvxGgwGCNBb6TInA9Z"

# User-friendly error message that Miku will say
MIKU_ERROR_MESSAGE = "Someone tell Koko-nii there is a problem with my AI."


def is_error_response(response_text: str) -> bool:
    """
    Detect if a response text is an error message.

    Args:
        response_text: The response text to check

    Returns:
        bool: True if the response appears to be an error message
    """
    if not response_text or not isinstance(response_text, str):
        return False

    response_lower = response_text.lower().strip()

    # Common error patterns
    error_patterns = [
        r'^error:?\s*\d{3}',  # "Error: 502" or "Error 502"
        r'^error:?\s+',  # "Error: " or "Error "
        r'^\d{3}\s+error',  # "502 Error"
        r'^sorry,?\s+(there\s+was\s+)?an?\s+error',  # "Sorry, an error" or "Sorry, there was an error"
        r'^sorry,?\s+the\s+response\s+took\s+too\s+long',  # Timeout error
        r'connection\s+(refused|failed|error|timeout)',
        r'timed?\s*out',
        r'failed\s+to\s+(connect|respond|process)',
        r'service\s+unavailable',
        r'internal\s+server\s+error',
        r'bad\s+gateway',
        r'gateway\s+timeout',
    ]

    # Check if response matches any error pattern
    for pattern in error_patterns:
        if re.search(pattern, response_lower):
            return True

    # Check for HTTP status codes indicating errors
    if re.match(r'^\d{3}$', response_text.strip()):
        status_code = int(response_text.strip())
        if status_code >= 400:  # HTTP error codes
            return True

    return False


async def send_error_webhook(error_message: str, context: dict = None):
    """
    Send error notification to the webhook.

    Args:
        error_message: The error message or exception details
        context: Optional dictionary with additional context (user, channel, etc.)
    """
    try:
        def truncate_field(text: str, max_length: int = 1000) -> str:
            """Truncate text to fit Discord's field value limit (1024 chars)."""
            if not text:
                return "N/A"
            text = str(text)
            if len(text) > max_length:
                return text[:max_length - 20] + "\n...(truncated)"
            return text

        # Build embed for webhook
        embed = {
            "title": "🚨 Miku Bot Error",
            "color": 0xFF0000,  # Red color
            "timestamp": datetime.datetime.utcnow().isoformat(),
            "fields": []
        }

        # Add error message (limit to 1000 chars to leave room for code blocks)
        error_value = f"```\n{truncate_field(error_message, 900)}\n```"
        embed["fields"].append({
            "name": "Error Message",
            "value": error_value,
            "inline": False
        })

        # Add context if provided
        if context:
            if 'user' in context and context['user']:
                embed["fields"].append({
                    "name": "User",
                    "value": truncate_field(context['user'], 200),
                    "inline": True
                })

            if 'channel' in context and context['channel']:
                embed["fields"].append({
                    "name": "Channel",
                    "value": truncate_field(context['channel'], 200),
                    "inline": True
                })

            if 'guild' in context and context['guild']:
                embed["fields"].append({
                    "name": "Server",
                    "value": truncate_field(context['guild'], 200),
                    "inline": True
                })

            if 'prompt' in context and context['prompt']:
                prompt_value = f"```\n{truncate_field(context['prompt'], 400)}\n```"
                embed["fields"].append({
                    "name": "User Prompt",
                    "value": prompt_value,
                    "inline": False
                })

            if 'exception_type' in context and context['exception_type']:
                embed["fields"].append({
                    "name": "Exception Type",
                    "value": f"`{truncate_field(context['exception_type'], 200)}`",
                    "inline": True
                })

            if 'traceback' in context and context['traceback']:
                tb_value = f"```python\n{truncate_field(context['traceback'], 800)}\n```"
                embed["fields"].append({
                    "name": "Traceback",
                    "value": tb_value,
                    "inline": False
                })

        # Ensure we have at least one field (Discord requirement)
        if not embed["fields"]:
            embed["fields"].append({
                "name": "Status",
                "value": "Error occurred with no additional context",
                "inline": False
            })

        # Send webhook
        payload = {
            "content": "<@344584170839236608>",  # Mention Koko-nii
            "embeds": [embed]
        }

        async with aiohttp.ClientSession() as session:
            async with session.post(ERROR_WEBHOOK_URL, json=payload) as response:
                if response.status in [200, 204]:
                    logger.info(f"✅ Error webhook sent successfully")
                else:
                    error_text = await response.text()
                    logger.error(f"❌ Failed to send error webhook: {response.status} - {error_text}")

    except Exception as e:
        logger.error(f"❌ Exception while sending error webhook: {e}")
        logger.error(traceback.format_exc())


async def handle_llm_error(
    error: Exception,
    user_prompt: str = None,
    user_id: str = None,
    guild_id: str = None,
    author_name: str = None
) -> str:
    """
    Handle LLM errors by logging them and sending webhook notification.

    Args:
        error: The exception that occurred
        user_prompt: The user's prompt (if available)
        user_id: The user ID (if available)
        guild_id: The guild ID (if available)
        author_name: The user's display name (if available)

    Returns:
        str: User-friendly error message for Miku to say
    """
    logger.error(f"🚨 LLM Error occurred: {type(error).__name__}: {str(error)}")

    # Build context
    context = {
        "exception_type": type(error).__name__,
        "traceback": traceback.format_exc()
    }

    if user_prompt:
        context["prompt"] = user_prompt

    if author_name:
        context["user"] = author_name
    elif user_id:
        context["user"] = f"User ID: {user_id}"

    if guild_id:
        context["guild"] = f"Guild ID: {guild_id}"

    # Get full error message
    error_message = f"{type(error).__name__}: {str(error)}"

    # Send webhook notification
    await send_error_webhook(error_message, context)

    return MIKU_ERROR_MESSAGE


async def handle_response_error(
    response_text: str,
    user_prompt: str = None,
    user_id: str = None,
    guild_id: str = None,
    author_name: str = None,
    channel_name: str = None
) -> str:
    """
    Handle error responses from the LLM by checking if the response is an error message.

    Args:
        response_text: The response text from the LLM
        user_prompt: The user's prompt (if available)
        user_id: The user ID (if available)
        guild_id: The guild ID (if available)
        author_name: The user's display name (if available)
        channel_name: The channel name (if available)

    Returns:
        str: Either the original response (if not an error) or user-friendly error message
    """
    if not is_error_response(response_text):
        return response_text

    logger.error(f"🚨 Error response detected: {response_text}")

    # Build context
    context = {}

    if user_prompt:
        context["prompt"] = user_prompt

    if author_name:
        context["user"] = author_name
    elif user_id:
        context["user"] = f"User ID: {user_id}"

    if channel_name:
        context["channel"] = channel_name
    elif guild_id:
        context["channel"] = f"Guild ID: {guild_id}"

    if guild_id:
        context["guild"] = f"Guild ID: {guild_id}"

    # Send webhook notification
    await send_error_webhook(f"LLM returned error response: {response_text}", context)

    return MIKU_ERROR_MESSAGE
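For a quick sense of what `is_error_response()` flags, here is a minimal sketch (not part of either commit; it assumes the bot package is on the import path so `utils.error_handler` resolves):

```python
# Illustrative only: exercises is_error_response() from bot/utils/error_handler.py above.
from utils.error_handler import is_error_response

print(is_error_response("Error: 502"))       # True  - matches r'^error:?\s*\d{3}'
print(is_error_response("502"))              # True  - bare HTTP status code >= 400
print(is_error_response("Hi, I'm Miku!"))    # False - normal replies pass through unchanged
```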
@@ -11,6 +11,7 @@ from utils.context_manager import get_context_for_response_type, get_complete_co
 from utils.moods import load_mood_description
 from utils.conversation_history import conversation_history
 from utils.logger import get_logger
+from utils.error_handler import handle_llm_error, handle_response_error
 
 logger = get_logger('llm')
 
@@ -281,8 +282,18 @@ Please respond in a way that reflects this emotional tone.{pfp_context}"""
                    # Escape asterisks for actions (e.g., *adjusts hair* becomes \*adjusts hair\*)
                    reply = _escape_markdown_actions(reply)
 
+                   # Check if the reply is an error response and handle it
+                   reply = await handle_response_error(
+                       reply,
+                       user_prompt=user_prompt,
+                       user_id=str(user_id),
+                       guild_id=str(guild_id) if guild_id else None,
+                       author_name=author_name
+                   )
+
                    # Save to conversation history (only if both prompt and reply are non-empty)
-                   if user_prompt and user_prompt.strip() and reply and reply.strip():
+                   # Don't save error messages to history
+                   if user_prompt and user_prompt.strip() and reply and reply.strip() and reply != "Someone tell Koko-nii there is a problem with my AI.":
                        # Add user message to history
                        conversation_history.add_message(
                            channel_id=channel_id,
@@ -298,21 +309,44 @@ Please respond in a way that reflects this emotional tone.{pfp_context}"""
                            is_bot=True
                        )
 
-                   # Also save to legacy globals for backward compatibility
-                   if user_prompt and user_prompt.strip() and reply and reply.strip():
+                   # Also save to legacy globals for backward compatibility (skip error messages)
+                   if user_prompt and user_prompt.strip() and reply and reply.strip() and reply != "Someone tell Koko-nii there is a problem with my AI.":
                        globals.conversation_history[user_id].append((user_prompt, reply))
 
                    return reply
                else:
                    error_text = await response.text()
                    logger.error(f"Error from llama-swap: {response.status} - {error_text}")
+
+                   # Send webhook notification for HTTP errors
+                   await handle_response_error(
+                       f"Error: {response.status}",
+                       user_prompt=user_prompt,
+                       user_id=str(user_id),
+                       guild_id=str(guild_id) if guild_id else None,
+                       author_name=author_name
+                   )
+
                    # Don't save error responses to conversation history
-                   return f"Error: {response.status}"
+                   return "Someone tell Koko-nii there is a problem with my AI."
     except asyncio.TimeoutError:
-        return "Sorry, the response took too long. Please try again."
+        logger.error("Timeout error in query_llama")
+        return await handle_llm_error(
+            asyncio.TimeoutError("Request timed out after 300 seconds"),
+            user_prompt=user_prompt,
+            user_id=str(user_id),
+            guild_id=str(guild_id) if guild_id else None,
+            author_name=author_name
+        )
     except Exception as e:
        logger.error(f"Error in query_llama: {e}")
-       return f"Sorry, there was an error: {str(e)}"
+       return await handle_llm_error(
+           e,
+           user_prompt=user_prompt,
+           user_id=str(user_id),
+           guild_id=str(guild_id) if guild_id else None,
+           author_name=author_name
+       )
 
 # Backward compatibility alias for existing code
 query_ollama = query_llama
@@ -62,6 +62,7 @@ COMPONENTS = {
     'voice_manager': 'Voice channel session management',
     'voice_commands': 'Voice channel commands',
     'voice_audio': 'Voice audio streaming and TTS',
+    'error_handler': 'Error detection and webhook notifications',
 }
 
 # Global configuration
@@ -9,13 +9,22 @@ RUN apt-get update && apt-get install -y \
     python3-pip \
     ffmpeg \
     libsndfile1 \
+    sox \
+    libsox-dev \
+    libsox-fmt-all \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements
 COPY requirements.txt .
 
-# Install Python dependencies
-RUN pip3 install --no-cache-dir -r requirements.txt
+# Upgrade pip to avoid dependency resolution issues
+RUN pip3 install --upgrade pip
+
+# Install dependencies for sox package (required by NeMo) in correct order
+RUN pip3 install --no-cache-dir numpy==2.2.2 typing-extensions
+
+# Install Python dependencies with legacy resolver (NeMo has complex dependencies)
+RUN pip3 install --no-cache-dir --use-deprecated=legacy-resolver -r requirements.txt
 
 # Copy application code
 COPY . .
stt/PARAKEET_MIGRATION.md (114 lines, Normal file)
@@ -0,0 +1,114 @@
# NVIDIA Parakeet Migration

## Summary

Replaced Faster-Whisper with NVIDIA Parakeet TDT (Token-and-Duration Transducer) for real-time speech transcription.

## Changes Made

### 1. New Transcriber: `parakeet_transcriber.py`
- **Model**: `nvidia/parakeet-tdt-0.6b-v3` (600M parameters)
- **Features**:
  - Real-time streaming transcription
  - Word-level timestamps for LLM pre-computation
  - GPU-accelerated (CUDA)
  - Lower latency than Faster-Whisper
  - Native PyTorch (no CTranslate2 dependency)

### 2. Requirements Updated
**Removed**:
- `faster-whisper==1.2.1`
- `ctranslate2==4.5.0`

**Added**:
- `transformers==4.47.1` - HuggingFace model loading
- `accelerate==1.2.1` - GPU optimization
- `sentencepiece==0.2.0` - Tokenization

**Kept**:
- `torch==2.9.1` & `torchaudio==2.9.1` - Core ML framework
- `silero-vad==5.1.2` - VAD still uses Silero (CPU)

### 3. Server Updates: `stt_server.py`
**Changes**:
- Import `ParakeetTranscriber` instead of `WhisperTranscriber`
- Partial transcripts now include `words` array with timestamps
- Final transcripts include `words` array for LLM pre-computation
- Startup logs show "Loading NVIDIA Parakeet TDT model"

**Word-level Token Format**:
```json
{
  "type": "partial",
  "text": "hello world",
  "words": [
    {"word": "hello", "start_time": 0.0, "end_time": 0.5},
    {"word": "world", "start_time": 0.5, "end_time": 1.0}
  ],
  "user_id": "123",
  "timestamp": 1234.56
}
```

## Advantages Over Faster-Whisper

1. **Real-time Performance**: TDT architecture designed for streaming
2. **No cuDNN Issues**: Native PyTorch, no CTranslate2 library loading problems
3. **Word-level Tokens**: Enables LLM prompt pre-computation during speech
4. **Lower Latency**: Optimized for real-time use cases
5. **Better GPU Utilization**: Uses standard PyTorch CUDA
6. **Simpler Dependencies**: No external compiled libraries

## Deployment

1. **Build Container**:
```bash
docker-compose build miku-stt
```

2. **First Run** (downloads model ~600MB):
```bash
docker-compose up miku-stt
```
Model will be cached in `/models` volume for subsequent runs.

3. **Verify GPU Usage**:
```bash
docker exec miku-stt nvidia-smi
```
You should see `python3` process using VRAM (~1.5GB for model + inference).

## Testing

Same test procedure as before:
1. Join voice channel
2. `!miku listen`
3. Speak clearly
4. Check logs for "Parakeet model loaded"
5. Verify transcripts appear faster than before

## Bot-Side Compatibility

No changes needed to bot code - STT WebSocket protocol is identical. The bot will automatically receive word-level tokens in partial/final transcript messages.

### Future Enhancement: LLM Pre-computation
The `words` array can be used to start LLM inference before full transcript completes:
- Send partial words to LLM as they arrive
- LLM begins processing prompt tokens
- Faster response time when user finishes speaking

## Rollback (if needed)

To revert to Faster-Whisper:
1. Restore `requirements.txt` from git
2. Restore `stt_server.py` from git
3. Delete `parakeet_transcriber.py`
4. Rebuild container

## Performance Expectations

- **Model Load Time**: ~5-10 seconds (first time downloads from HuggingFace)
- **VRAM Usage**: ~1.5GB (vs ~800MB for Whisper small)
- **Latency**: ~200-500ms for 2-second audio chunks
- **GPU Utilization**: 30-60% during active transcription
- **Accuracy**: Similar to Whisper small (designed for English)
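As a rough illustration of the "Future Enhancement: LLM Pre-computation" idea described in PARAKEET_MIGRATION.md above, the sketch below assembles a running prompt from the `words` array of a partial transcript. The `precompute_prompt` helper is hypothetical and does not exist in these commits; only the message schema comes from the migration notes.

```python
# Hypothetical sketch: build a running prompt from partial-transcript word tokens.
from typing import Dict, List

def precompute_prompt(words: List[Dict]) -> str:
    """Join word tokens (schema from PARAKEET_MIGRATION.md) into a running prompt."""
    return " ".join(w["word"] for w in words)

partial = {
    "type": "partial",
    "text": "hello world",
    "words": [
        {"word": "hello", "start_time": 0.0, "end_time": 0.5},
        {"word": "world", "start_time": 0.5, "end_time": 1.0},
    ],
}
running_prompt = precompute_prompt(partial["words"])  # "hello world"
# The running prompt could be sent to the LLM ahead of the final transcript so prompt
# tokens are already processed when the user stops speaking.
```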
@@ -0,0 +1 @@
6d590f77001d318fb17a0b5bf7ee329a91b52598
stt/parakeet_transcriber.py (209 lines, Normal file)
@@ -0,0 +1,209 @@
"""
NVIDIA Parakeet TDT Transcriber

Real-time streaming ASR using NVIDIA's Parakeet TDT (Token-and-Duration Transducer) model.
Supports streaming transcription with word-level timestamps for LLM pre-computation.

Model: nvidia/parakeet-tdt-0.6b-v3
- 600M parameters
- Real-time capable on GPU
- Word-level timestamps
- Streaming support via NVIDIA NeMo
"""

import numpy as np
import torch
from nemo.collections.asr.models import EncDecRNNTBPEModel
from typing import Optional, List, Dict
import logging
import asyncio
from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger('parakeet')


class ParakeetTranscriber:
    """
    NVIDIA Parakeet-based streaming transcription with word-level tokens.

    Uses NVIDIA NeMo for proper model loading and inference.
    """

    def __init__(
        self,
        model_name: str = "nvidia/parakeet-tdt-0.6b-v3",
        device: str = "cuda",
        language: str = "en"
    ):
        """
        Initialize Parakeet transcriber.

        Args:
            model_name: HuggingFace model identifier
            device: Device to run on (cuda or cpu)
            language: Language code (Parakeet primarily supports English)
        """
        self.model_name = model_name
        self.device = device
        self.language = language

        logger.info(f"Loading Parakeet model: {model_name} on {device}...")

        # Load model via NeMo from HuggingFace
        self.model = EncDecRNNTBPEModel.from_pretrained(
            model_name=model_name,
            map_location=device
        )

        self.model.eval()
        if device == "cuda":
            self.model = self.model.cuda()

        # Thread pool for blocking transcription calls
        self.executor = ThreadPoolExecutor(max_workers=2)

        logger.info(f"Parakeet model loaded on {device}")

    async def transcribe_async(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
        return_timestamps: bool = False
    ) -> str:
        """
        Transcribe audio asynchronously (non-blocking).

        Args:
            audio: Audio data as numpy array (float32)
            sample_rate: Audio sample rate (Parakeet expects 16kHz)
            return_timestamps: Whether to return word-level timestamps

        Returns:
            Transcribed text (or dict with timestamps if return_timestamps=True)
        """
        loop = asyncio.get_event_loop()

        # Run transcription in thread pool to avoid blocking
        result = await loop.run_in_executor(
            self.executor,
            self._transcribe_blocking,
            audio,
            sample_rate,
            return_timestamps
        )

        return result

    def _transcribe_blocking(
        self,
        audio: np.ndarray,
        sample_rate: int,
        return_timestamps: bool
    ):
        """
        Blocking transcription call (runs in thread pool).
        """
        # Convert to float32 if needed
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32) / 32768.0

        # Ensure correct sample rate (Parakeet expects 16kHz)
        if sample_rate != 16000:
            logger.warning(f"Audio sample rate is {sample_rate}Hz, Parakeet expects 16kHz. Resampling...")
            import torchaudio
            audio_tensor = torch.from_numpy(audio).unsqueeze(0)
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            audio_tensor = resampler(audio_tensor)
            audio = audio_tensor.squeeze(0).numpy()
            sample_rate = 16000

        # Transcribe using NeMo model
        with torch.no_grad():
            # Convert to tensor
            audio_signal = torch.from_numpy(audio).unsqueeze(0)
            audio_signal_len = torch.tensor([len(audio)])

            if self.device == "cuda":
                audio_signal = audio_signal.cuda()
                audio_signal_len = audio_signal_len.cuda()

            # Get transcription with timestamps
            # NeMo returns list of Hypothesis objects when timestamps=True
            transcriptions = self.model.transcribe(
                audio=[audio_signal.squeeze(0).cpu().numpy()],
                batch_size=1,
                timestamps=True  # Enable timestamps to get word-level data
            )

        # Extract text from Hypothesis object
        hypothesis = transcriptions[0] if transcriptions else None
        if hypothesis is None:
            text = ""
            words = []
        else:
            # Hypothesis object has .text attribute
            text = hypothesis.text.strip() if hasattr(hypothesis, 'text') else str(hypothesis).strip()

            # Extract word-level timestamps if available
            words = []
            if hasattr(hypothesis, 'timestamp') and hypothesis.timestamp:
                # timestamp is a dict with 'word' key containing list of word timestamps
                word_timestamps = hypothesis.timestamp.get('word', [])
                for word_info in word_timestamps:
                    words.append({
                        "word": word_info.get('word', ''),
                        "start_time": word_info.get('start', 0.0),
                        "end_time": word_info.get('end', 0.0)
                    })

        logger.debug(f"Transcribed: '{text}' with {len(words)} words")

        if return_timestamps:
            return {
                "text": text,
                "words": words
            }
        else:
            return text

    async def transcribe_streaming(
        self,
        audio_chunks: List[np.ndarray],
        sample_rate: int = 16000,
        chunk_size_ms: int = 500
    ) -> Dict[str, any]:
        """
        Transcribe audio chunks with streaming support.

        Args:
            audio_chunks: List of audio chunks to process
            sample_rate: Audio sample rate
            chunk_size_ms: Size of each chunk in milliseconds

        Returns:
            Dict with partial and word-level results
        """
        if not audio_chunks:
            return {"text": "", "words": []}

        # Concatenate all chunks
        audio_data = np.concatenate(audio_chunks)

        # Transcribe with timestamps for streaming
        result = await self.transcribe_async(
            audio_data,
            sample_rate,
            return_timestamps=True
        )

        return result

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes."""
        # Parakeet TDT v3 primarily supports English
        return ["en"]

    def cleanup(self):
        """Cleanup resources."""
        self.executor.shutdown(wait=True)
        logger.info("Parakeet transcriber cleaned up")
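For orientation, a minimal usage sketch of the `ParakeetTranscriber` API defined above (not part of the commits; it assumes a CUDA-capable host and that the model downloads on first use):

```python
# Illustrative only: exercise ParakeetTranscriber.transcribe_async with one second of silence.
import asyncio
import numpy as np
from parakeet_transcriber import ParakeetTranscriber

async def main():
    transcriber = ParakeetTranscriber(device="cuda")   # defaults to nvidia/parakeet-tdt-0.6b-v3
    audio = np.zeros(16000, dtype=np.float32)          # 1 s of 16 kHz mono audio
    result = await transcriber.transcribe_async(audio, sample_rate=16000, return_timestamps=True)
    print(result["text"], result["words"])             # text plus word-level timestamps
    transcriber.cleanup()

asyncio.run(main())
```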
@@ -6,7 +6,7 @@ uvicorn[standard]==0.32.1
 websockets==14.1
 aiohttp==3.11.11
 
-# Audio processing
+# Audio processing (install numpy first for sox dependency)
 numpy==2.2.2
 soundfile==0.12.1
 librosa==0.10.2.post1
@@ -16,9 +16,12 @@ torch==2.9.1 # Latest PyTorch
 torchaudio==2.9.1
 silero-vad==5.1.2
 
-# STT (GPU)
-faster-whisper==1.2.1 # Latest version (Oct 31, 2025)
-ctranslate2==4.5.0 # Required by faster-whisper
+# STT (GPU) - NVIDIA NeMo for Parakeet
+# Parakeet TDT 0.6b-v3 requires NeMo 2.4
+# Fix huggingface-hub version conflict with transformers
+huggingface-hub>=0.30.0,<1.0
+nemo_toolkit[asr]==2.4.0
+omegaconf==2.3.0
 
 # Utilities
 python-multipart==0.0.20
@@ -2,13 +2,13 @@
 STT Server
 
 FastAPI WebSocket server for real-time speech-to-text.
-Combines Silero VAD (CPU) and Faster-Whisper (GPU) for efficient transcription.
+Combines Silero VAD (CPU) and NVIDIA Parakeet (GPU) for efficient transcription.
 
 Architecture:
 - VAD runs continuously on every audio chunk (CPU)
-- Whisper transcribes only when VAD detects speech (GPU)
+- Parakeet transcribes only when VAD detects speech (GPU)
 - Supports multiple concurrent users
-- Sends partial and final transcripts via WebSocket
+- Sends partial and final transcripts via WebSocket with word-level tokens
 """
 
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
@@ -20,7 +20,7 @@ from typing import Dict, Optional
 from datetime import datetime
 
 from vad_processor import VADProcessor
-from whisper_transcriber import WhisperTranscriber
+from parakeet_transcriber import ParakeetTranscriber
 
 # Configure logging
 logging.basicConfig(
@@ -34,7 +34,7 @@ app = FastAPI(title="Miku STT Server", version="1.0.0")
 
 # Global instances (initialized on startup)
 vad_processor: Optional[VADProcessor] = None
-whisper_transcriber: Optional[WhisperTranscriber] = None
+parakeet_transcriber: Optional[ParakeetTranscriber] = None
 
 # User session tracking
 user_sessions: Dict[str, dict] = {}
@@ -117,39 +117,40 @@ class UserSTTSession:
         self.audio_buffer.append(audio_np)
 
     async def _transcribe_partial(self):
-        """Transcribe accumulated audio and send partial result."""
+        """Transcribe accumulated audio and send partial result with word tokens."""
         if not self.audio_buffer:
             return
 
         # Concatenate audio
         audio_full = np.concatenate(self.audio_buffer)
 
-        # Transcribe asynchronously
+        # Transcribe asynchronously with word-level timestamps
         try:
-            text = await whisper_transcriber.transcribe_async(
+            result = await parakeet_transcriber.transcribe_async(
                 audio_full,
                 sample_rate=16000,
-                initial_prompt=self.last_transcript  # Use previous for context
+                return_timestamps=True
             )
 
-            if text and text != self.last_transcript:
-                self.last_transcript = text
+            if result and result.get("text") and result["text"] != self.last_transcript:
+                self.last_transcript = result["text"]
 
-                # Send partial transcript
+                # Send partial transcript with word tokens for LLM pre-computation
                 await self.websocket.send_json({
                     "type": "partial",
-                    "text": text,
+                    "text": result["text"],
+                    "words": result.get("words", []),  # Word-level tokens
                    "user_id": self.user_id,
                    "timestamp": self.timestamp_ms
                 })
 
-                logger.info(f"Partial [{self.user_id}]: {text}")
+                logger.info(f"Partial [{self.user_id}]: {result['text']}")
 
         except Exception as e:
             logger.error(f"Partial transcription failed: {e}", exc_info=True)
 
     async def _transcribe_final(self):
-        """Transcribe final accumulated audio."""
+        """Transcribe final accumulated audio with word tokens."""
         if not self.audio_buffer:
             return
 
@@ -157,23 +158,25 @@ class UserSTTSession:
         audio_full = np.concatenate(self.audio_buffer)
 
         try:
-            text = await whisper_transcriber.transcribe_async(
+            result = await parakeet_transcriber.transcribe_async(
                 audio_full,
-                sample_rate=16000
+                sample_rate=16000,
+                return_timestamps=True
             )
 
-            if text:
-                self.last_transcript = text
+            if result and result.get("text"):
+                self.last_transcript = result["text"]
 
-                # Send final transcript
+                # Send final transcript with word tokens
                 await self.websocket.send_json({
                     "type": "final",
-                    "text": text,
+                    "text": result["text"],
+                    "words": result.get("words", []),  # Word-level tokens for LLM
                    "user_id": self.user_id,
                    "timestamp": self.timestamp_ms
                 })
 
-                logger.info(f"Final [{self.user_id}]: {text}")
+                logger.info(f"Final [{self.user_id}]: {result['text']}")
 
         except Exception as e:
             logger.error(f"Final transcription failed: {e}", exc_info=True)
@@ -206,7 +209,7 @@ class UserSTTSession:
 @app.on_event("startup")
 async def startup_event():
     """Initialize models on server startup."""
-    global vad_processor, whisper_transcriber
+    global vad_processor, parakeet_transcriber
 
     logger.info("=" * 50)
     logger.info("Initializing Miku STT Server")
@@ -222,15 +225,14 @@ async def startup_event():
     )
     logger.info("✓ VAD ready")
 
-    # Initialize Whisper (GPU with cuDNN)
-    logger.info("Loading Faster-Whisper model (GPU)...")
-    whisper_transcriber = WhisperTranscriber(
-        model_size="small",
+    # Initialize Parakeet (GPU)
+    logger.info("Loading NVIDIA Parakeet TDT model (GPU)...")
+    parakeet_transcriber = ParakeetTranscriber(
+        model_name="nvidia/parakeet-tdt-0.6b-v3",
         device="cuda",
-        compute_type="float16",
         language="en"
     )
-    logger.info("✓ Whisper ready")
+    logger.info("✓ Parakeet ready")
 
     logger.info("=" * 50)
     logger.info("STT Server ready to accept connections")
@@ -242,8 +244,8 @@ async def shutdown_event():
     """Cleanup on server shutdown."""
     logger.info("Shutting down STT server...")
 
-    if whisper_transcriber:
-        whisper_transcriber.cleanup()
+    if parakeet_transcriber:
+        parakeet_transcriber.cleanup()
 
     logger.info("STT server shutdown complete")
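To make the new transcript messages concrete, here is a hedged client sketch that prints partial and final transcripts with their word tokens. The WebSocket URL is an assumption (the route definition is outside this diff); only the message fields come from the hunks above, and `websockets` is the version pinned in stt/requirements.txt.

```python
# Hypothetical client: the URL below is an assumption; only the message schema comes from this diff.
import asyncio
import json
import websockets

async def consume(user_id: str = "123"):
    # Assumed endpoint path and port; adjust to the actual route exposed by stt_server.py.
    async with websockets.connect(f"ws://localhost:8000/ws/{user_id}") as ws:
        async for raw in ws:
            msg = json.loads(raw)
            if msg.get("type") in ("partial", "final"):
                print(msg["type"], msg["text"], msg.get("words", []))

asyncio.run(consume())
```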