Decided on Parakeet with ONNX Runtime. It works well, and realtime voice chat is now possible, but the UX is still lacking.

2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions
--- a/stt-parakeet/vad/init.py
+++ b/stt-parakeet/vad/init.py
@@ -0,0 +1,6 @@
+"""
+VAD package: re-exports the Silero VAD wrapper built on the onnx-asr library
+"""
+from .silero_vad import SileroVAD, load_vad
+
+__all__ = ["SileroVAD", "load_vad"]
--- a/stt-parakeet/vad/silero_vad.py
+++ b/stt-parakeet/vad/silero_vad.py
@@ -0,0 +1,114 @@
+"""
+Silero VAD wrapper using onnx-asr library
+"""
+import numpy as np
+import onnx_asr
+from typing import Optional, Tuple
+import logging
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class SileroVAD:
+    """
+    Thin wrapper around the Silero VAD model loaded through onnx-asr.
+
+    The constructor loads the model and records the tuning parameters as
+    attributes. ``detect_speech`` and ``is_speech`` are placeholders that only
+    log a warning and return a fixed "no speech" result.
+    """
+
+    def __init__(
+        self,
+        providers: Optional[list] = None,
+        threshold: float = 0.5,
+        min_speech_duration_ms: int = 250,
+        min_silence_duration_ms: int = 100,
+        window_size_samples: int = 512,
+        speech_pad_ms: int = 30,
+    ):
+        """
+        Load the Silero VAD model and record the VAD settings.
+
+        Args:
+            providers: ONNX runtime providers; None means CUDA first, then CPU
+            threshold: Speech probability threshold (0.0-1.0)
+            min_speech_duration_ms: Minimum duration of a speech segment
+            min_silence_duration_ms: Minimum silence that splits segments
+            window_size_samples: Window size for VAD processing
+            speech_pad_ms: Padding around speech segments
+
+        Note: only ``providers`` is passed to ``onnx_asr.load_vad`` here.
+        """
+        execution_providers = providers
+        if execution_providers is None:
+            execution_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+
+        logger.info("Loading Silero VAD model...")
+        self.vad = onnx_asr.load_vad("silero", providers=execution_providers)
+
+        # Tuning values are kept as plain attributes
+        self.threshold = threshold
+        self.min_speech_duration_ms = min_speech_duration_ms
+        self.min_silence_duration_ms = min_silence_duration_ms
+        self.window_size_samples = window_size_samples
+        self.speech_pad_ms = speech_pad_ms
+
+        logger.info("Silero VAD initialized successfully")
+
+    def detect_speech(
+        self,
+        audio: np.ndarray,
+        sample_rate: int = 16000,
+    ) -> list:
+        """
+        Placeholder for direct speech-segment detection.
+
+        Args:
+            audio: Audio data as numpy array (float32)
+            sample_rate: Sample rate of audio
+
+        Returns:
+            Always an empty list; a real implementation would return
+            (start_sample, end_sample) tuples for speech segments
+        """
+        # Segmentation normally happens inside the recognition pipeline via
+        # model.with_vad(); this method exists mainly for interface compatibility.
+        logger.warning("Direct VAD detection - consider using model.with_vad() instead")
+        no_segments: list = []
+        return no_segments
+
+    def is_speech(
+        self,
+        audio_chunk: np.ndarray,
+        sample_rate: int = 16000,
+    ) -> Tuple[bool, float]:
+        """
+        Placeholder for a per-chunk speech check.
+
+        Args:
+            audio_chunk: Audio chunk as numpy array (float32)
+            sample_rate: Sample rate
+
+        Returns:
+            Tuple of (is_speech: bool, probability: float), currently (False, 0.0)
+        """
+        # Not implemented: use model.with_vad() for automatic segmentation
+        logger.warning("Direct speech detection not implemented - use model.with_vad()")
+        speech_detected, probability = False, 0.0
+        return speech_detected, probability
+
+    def get_vad(self):
+        """
+        Expose the wrapped onnx_asr VAD model.
+
+        Returns:
+            The object stored in ``self.vad`` by the constructor
+        """
+        return self.vad
+
+
+def load_vad(**kwargs):
+    """Convenience factory: build a SileroVAD from the given keyword arguments."""
+    wrapper = SileroVAD(**kwargs)
+    return wrapper