Decided on Parakeet running on ONNX Runtime. It works very well, and real-time voice chat is now possible, although the UX is still lacking.

This commit is contained in:
2026-01-19 00:29:44 +02:00
parent 0a8910fff8
commit 362108f4b0
34 changed files with 4593 additions and 73 deletions

View File

@@ -0,0 +1,6 @@
"""
VAD module using onnx-asr library
"""
# Re-export the wrapper class and its convenience loader from the
# silero_vad submodule so they can be imported from this package directly.
from .silero_vad import SileroVAD, load_vad
# Names exposed by a star-import of this package.
__all__ = ["SileroVAD", "load_vad"]

View File

@@ -0,0 +1,114 @@
"""
Silero VAD wrapper using onnx-asr library
"""
import numpy as np
import onnx_asr
from typing import Optional, Tuple
import logging
logger = logging.getLogger(__name__)
class SileroVAD:
    """
    Thin wrapper around the Silero VAD model loaded through onnx-asr.

    The model is loaded once in the constructor and the segmentation
    settings are kept as plain instance attributes. This class does not
    apply those settings itself; they are only stored. The two direct
    detection methods are placeholders that log a warning and return a
    fixed "no speech" result.
    """

    def __init__(
        self,
        providers: Optional[list] = None,
        threshold: float = 0.5,
        min_speech_duration_ms: int = 250,
        min_silence_duration_ms: int = 100,
        window_size_samples: int = 512,
        speech_pad_ms: int = 30,
    ):
        """
        Load the Silero VAD model and record the VAD settings.

        Args:
            providers: ONNX Runtime execution providers to pass to
                onnx-asr. When None, CUDA followed by CPU is requested.
            threshold: Speech probability threshold (0.0-1.0).
            min_speech_duration_ms: Minimum duration of a speech segment.
            min_silence_duration_ms: Minimum silence that splits segments.
            window_size_samples: Window size for VAD processing.
            speech_pad_ms: Padding around speech segments.
        """
        # Only None selects the defaults; any list the caller passes
        # (including an empty one) is forwarded untouched.
        execution_providers = (
            ["CUDAExecutionProvider", "CPUExecutionProvider"]
            if providers is None
            else providers
        )

        logger.info("Loading Silero VAD model...")
        self.vad = onnx_asr.load_vad("silero", providers=execution_providers)

        # Settings are stored verbatim; no validation is performed here.
        self.threshold = threshold
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.window_size_samples = window_size_samples
        self.speech_pad_ms = speech_pad_ms

        logger.info("Silero VAD initialized successfully")

    def detect_speech(self, audio: np.ndarray, sample_rate: int = 16000) -> list:
        """
        Placeholder for direct speech-segment detection.

        Args:
            audio: Audio data as a numpy array (float32 expected).
            sample_rate: Sample rate of the audio.

        Returns:
            Always an empty list. The intended contract is a list of
            (start_sample, end_sample) tuples, but no detection is
            performed; both arguments are currently ignored.
        """
        # Segmentation is expected to happen inside the recognition
        # pipeline (the message below points callers at model.with_vad()),
        # so this method exists for interface compatibility only.
        logger.warning("Direct VAD detection - consider using model.with_vad() instead")
        no_segments = []
        return no_segments

    def is_speech(
        self, audio_chunk: np.ndarray, sample_rate: int = 16000
    ) -> Tuple[bool, float]:
        """
        Placeholder for a per-chunk speech check.

        Args:
            audio_chunk: Audio chunk as a numpy array (float32 expected).
            sample_rate: Sample rate of the chunk.

        Returns:
            Always (False, 0.0), i.e. (is_speech, probability). The chunk
            is not inspected; a warning is logged on every call.
        """
        # Not implemented: callers are directed to model.with_vad() for
        # automatic segmentation.
        logger.warning("Direct speech detection not implemented - use model.with_vad()")
        verdict = (False, 0.0)
        return verdict

    def get_vad(self):
        """
        Return the underlying onnx_asr VAD model instance.

        Returns:
            The object created by onnx_asr.load_vad() in the constructor.
        """
        underlying_model = self.vad
        return underlying_model
# Convenience function
def load_vad(**kwargs):
    """
    Construct a SileroVAD wrapper from keyword configuration.

    Args:
        **kwargs: Forwarded unchanged to the SileroVAD constructor.

    Returns:
        The newly created SileroVAD instance.
    """
    vad_wrapper = SileroVAD(**kwargs)
    return vad_wrapper