Decided on Parakeet with ONNX Runtime. It works very well: realtime voice chat is now possible, though the UX is still lacking.
This commit is contained in:
6
stt-parakeet/vad/__init__.py
Normal file
6
stt-parakeet/vad/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""
Voice activity detection (VAD) package built on the onnx-asr library.

Re-exports ``SileroVAD`` and ``load_vad`` from the ``silero_vad`` submodule.
"""
from .silero_vad import (
    SileroVAD,
    load_vad,
)

# Names made available by ``from vad import *``.
__all__ = [
    "SileroVAD",
    "load_vad",
]
|
||||
114
stt-parakeet/vad/silero_vad.py
Normal file
114
stt-parakeet/vad/silero_vad.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
Silero VAD wrapper using onnx-asr library
|
||||
"""
|
||||
import numpy as np
|
||||
import onnx_asr
|
||||
from typing import Optional, Tuple
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SileroVAD:
    """
    Thin wrapper that loads the Silero VAD model through onnx-asr.

    An instance holds the loaded model (``self.vad``) together with the
    segmentation settings it was constructed with. Nothing in this class
    applies those settings; they are only stored as attributes.
    ``detect_speech`` and ``is_speech`` are placeholders that log a warning
    and return fixed "no speech" values.
    """

    def __init__(
        self,
        providers: Optional[list] = None,
        threshold: float = 0.5,
        min_speech_duration_ms: int = 250,
        min_silence_duration_ms: int = 100,
        window_size_samples: int = 512,
        speech_pad_ms: int = 30,
    ):
        """
        Load the Silero VAD model and remember the VAD settings.

        Args:
            providers: ONNX Runtime execution providers handed to
                ``onnx_asr.load_vad``. When ``None``, CUDA is listed first,
                followed by CPU.
            threshold: Speech probability threshold (0.0-1.0)
            min_speech_duration_ms: Minimum duration of a speech segment
            min_silence_duration_ms: Minimum silence duration that splits
                two segments
            window_size_samples: Window size for VAD processing
            speech_pad_ms: Padding added around speech segments
        """
        execution_providers = (
            ["CUDAExecutionProvider", "CPUExecutionProvider"]
            if providers is None
            else providers
        )

        logger.info("Loading Silero VAD model...")
        self.vad = onnx_asr.load_vad("silero", providers=execution_providers)

        # Settings are kept verbatim; this class does not interpret them.
        self.threshold = threshold
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.window_size_samples = window_size_samples
        self.speech_pad_ms = speech_pad_ms

        logger.info("Silero VAD initialized successfully")

    def detect_speech(
        self,
        audio: np.ndarray,
        sample_rate: int = 16000,
    ) -> list:
        """
        Placeholder for direct speech-segment detection.

        No detection is performed: the method logs a warning that points at
        ``model.with_vad()`` and returns an empty list regardless of input.

        Args:
            audio: Audio data as numpy array (float32); currently unused
            sample_rate: Sample rate of the audio; currently unused

        Returns:
            Always an empty list. The intended element shape is a
            (start_sample, end_sample) tuple per speech segment.
        """
        # onnx-asr integrates VAD into its recognition pipeline through
        # model.with_vad(), so this entry point exists mainly for
        # interface compatibility.
        logger.warning("Direct VAD detection - consider using model.with_vad() instead")
        no_segments: list = []
        return no_segments

    def is_speech(
        self,
        audio_chunk: np.ndarray,
        sample_rate: int = 16000,
    ) -> Tuple[bool, float]:
        """
        Placeholder for a per-chunk speech check.

        The chunk is not analysed: the method logs a warning and reports
        "no speech" with probability zero for every input.

        Args:
            audio_chunk: Audio chunk as numpy array (float32); currently unused
            sample_rate: Sample rate; currently unused

        Returns:
            Tuple of (is_speech: bool, probability: float); always
            ``(False, 0.0)``.
        """
        # Automatic segmentation is expected to go through model.with_vad().
        logger.warning("Direct speech detection not implemented - use model.with_vad()")
        verdict = (False, 0.0)
        return verdict

    def get_vad(self):
        """
        Return the wrapped onnx_asr VAD model.

        Returns:
            The object produced by ``onnx_asr.load_vad`` in ``__init__``
        """
        underlying_model = self.vad
        return underlying_model
|
||||
|
||||
|
||||
# Convenience function
|
||||
def load_vad(**kwargs):
    """Build a ``SileroVAD`` from keyword options and return it.

    All keyword arguments are forwarded unchanged to the ``SileroVAD``
    constructor.
    """
    vad_wrapper = SileroVAD(**kwargs)
    return vad_wrapper
|
||||
Reference in New Issue
Block a user