# stt-parakeet/vad/silero_vad.py

"""
Silero VAD wrapper using onnx-asr library
"""
import numpy as np
import onnx_asr
from typing import Optional, Tuple
import logging

logger = logging.getLogger(__name__)


class SileroVAD:
    """
    Thin wrapper around the Silero VAD model loaded through onnx-asr.

    The instance holds the loaded model in ``self.vad`` together with the
    tuning values handed to the constructor. The direct-detection methods
    (``detect_speech`` / ``is_speech``) are stubs: they log a warning and
    return a fixed "no speech" result without looking at the audio.
    """

    def __init__(
        self,
        providers: Optional[list] = None,
        threshold: float = 0.5,
        min_speech_duration_ms: int = 250,
        min_silence_duration_ms: int = 100,
        window_size_samples: int = 512,
        speech_pad_ms: int = 30,
    ):
        """
        Load the Silero VAD model and remember the VAD settings.

        The settings are only stored as attributes here; nothing in this
        class passes them on to the model.

        Args:
            providers: ONNX runtime providers to use. When None, CUDA is
                listed first and CPU second.
            threshold: Speech probability threshold (0.0-1.0)
            min_speech_duration_ms: Minimum duration of a speech segment
            min_silence_duration_ms: Minimum duration of silence that
                splits segments
            window_size_samples: Window size for VAD processing
            speech_pad_ms: Padding around speech segments
        """
        selected_providers = (
            ["CUDAExecutionProvider", "CPUExecutionProvider"]
            if providers is None
            else providers
        )

        logger.info("Loading Silero VAD model...")
        self.vad = onnx_asr.load_vad("silero", providers=selected_providers)

        # Detection sensitivity and windowing
        self.threshold = threshold
        self.window_size_samples = window_size_samples

        # Segment shaping (durations and padding)
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.speech_pad_ms = speech_pad_ms

        logger.info("Silero VAD initialized successfully")

    def detect_speech(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
    ) -> list:
        """
        Compatibility stub for direct speech-segment detection.

        No detection is performed: the arguments are ignored, a warning is
        logged, and an empty list is returned. The intended interface is a
        list of (start_sample, end_sample) tuples.

        Args:
            audio: Audio data as numpy array (float32)
            sample_rate: Sample rate of audio

        Returns:
            Always a new empty list
        """
        # Segmentation is expected to happen inside the recognition
        # pipeline (model.with_vad()); this method exists so callers that
        # expect a direct VAD interface still have something to call.
        no_segments: list = []
        logger.warning("Direct VAD detection - consider using model.with_vad() instead")
        return no_segments

    def is_speech(
        self,
        audio_chunk: np.ndarray,
        sample_rate: int = 16000,
    ) -> Tuple[bool, float]:
        """
        Placeholder for a per-chunk speech check.

        The arguments are ignored; a warning is logged and a fixed
        "not speech" answer is returned.

        Args:
            audio_chunk: Audio chunk as numpy array (float32)
            sample_rate: Sample rate

        Returns:
            Always ``(False, 0.0)`` as (is_speech, probability)
        """
        # Not implemented: callers should rely on model.with_vad() for
        # automatic segmentation instead.
        not_speech = (False, 0.0)
        logger.warning("Direct speech detection not implemented - use model.with_vad()")
        return not_speech

    def get_vad(self):
        """
        Expose the wrapped onnx_asr VAD model.

        Returns:
            The object stored in ``self.vad`` by the constructor
        """
        model = self.vad
        return model


# Convenience function
def load_vad(**kwargs):
    """
    Build a SileroVAD wrapper.

    All keyword arguments are forwarded unchanged to the SileroVAD
    constructor, and the new instance is returned.
    """
    vad_wrapper = SileroVAD(**kwargs)
    return vad_wrapper
# Commit note (2026-01-19 00:29:44 +02:00): Decided on Parakeet ONNX Runtime. Works pretty great. Realtime voice chat possible now. UX lacking.
			`"""`
			`Silero VAD wrapper using onnx-asr library`
			`"""`
			`import numpy as np`
			`import onnx_asr`
			`from typing import Optional, Tuple`
			`import logging`

			`logger = logging.getLogger(__name__)`


			`class SileroVAD:`
			`"""`
			`Voice Activity Detection using Silero VAD via onnx-asr.`
			`"""`

			`def __init__(`
			`self,`
			`providers: Optional[list] = None,`
			`threshold: float = 0.5,`
			`min_speech_duration_ms: int = 250,`
			`min_silence_duration_ms: int = 100,`
			`window_size_samples: int = 512,`
			`speech_pad_ms: int = 30,`
			`):`
			`"""`
			`Initialize Silero VAD.`

			`Args:`
			`providers: Optional ONNX runtime providers`
			`threshold: Speech probability threshold (0.0-1.0)`
			`min_speech_duration_ms: Minimum duration of speech segment`
			`min_silence_duration_ms: Minimum duration of silence to split segments`
			`window_size_samples: Window size for VAD processing`
			`speech_pad_ms: Padding around speech segments`
			`"""`
			`if providers is None:`
			`providers = [`
			`"CUDAExecutionProvider",`
			`"CPUExecutionProvider",`
			`]`

			`logger.info("Loading Silero VAD model...")`
			`self.vad = onnx_asr.load_vad("silero", providers=providers)`

			`# VAD parameters`
			`self.threshold = threshold`
			`self.min_speech_duration_ms = min_speech_duration_ms`
			`self.min_silence_duration_ms = min_silence_duration_ms`
			`self.window_size_samples = window_size_samples`
			`self.speech_pad_ms = speech_pad_ms`

			`logger.info("Silero VAD initialized successfully")`

			`def detect_speech(`
			`self,`
			`audio: np.ndarray,`
			`sample_rate: int = 16000,`
			`) -> list:`
			`"""`
			`Detect speech segments in audio.`

			`Args:`
			`audio: Audio data as numpy array (float32)`
			`sample_rate: Sample rate of audio`

			`Returns:`
			`List of tuples (start_sample, end_sample) for speech segments`
			`"""`
			`# Note: The actual VAD processing is typically done within`
			`# the onnx_asr model.with_vad() method, but we provide`
			`# this interface for direct VAD usage`

			`# For direct VAD detection, you would use the vad model directly`
			`# However, onnx-asr integrates VAD into the recognition pipeline`
			`# So this is mainly for compatibility`

			`logger.warning("Direct VAD detection - consider using model.with_vad() instead")`
			`return []`

			`def is_speech(`
			`self,`
			`audio_chunk: np.ndarray,`
			`sample_rate: int = 16000,`
			`) -> Tuple[bool, float]:`
			`"""`
			`Check if audio chunk contains speech.`

			`Args:`
			`audio_chunk: Audio chunk as numpy array (float32)`
			`sample_rate: Sample rate`

			`Returns:`
			`Tuple of (is_speech: bool, probability: float)`
			`"""`
			`# Placeholder for direct VAD probability check`
			`# In practice, use model.with_vad() for automatic segmentation`
			`logger.warning("Direct speech detection not implemented - use model.with_vad()")`
			`return False, 0.0`

			`def get_vad(self):`
			`"""`
			`Get the underlying onnx_asr VAD model.`

			`Returns:`
			`The onnx_asr VAD model instance`
			`"""`
			`return self.vad`


			`# Convenience function`
			`def load_vad(**kwargs):`
			`"""Load and return Silero VAD with given configuration."""`
			`return SileroVAD(**kwargs)`